deepdoc-lib 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. deepdoc/README.md +122 -0
  2. deepdoc/README_zh.md +116 -0
  3. deepdoc/__init__.py +43 -0
  4. deepdoc/_version.py +34 -0
  5. deepdoc/common/__init__.py +52 -0
  6. deepdoc/common/config_utils.py +63 -0
  7. deepdoc/common/connection_utils.py +73 -0
  8. deepdoc/common/file_utils.py +19 -0
  9. deepdoc/common/misc_utils.py +44 -0
  10. deepdoc/common/model_store.py +369 -0
  11. deepdoc/common/settings.py +42 -0
  12. deepdoc/common/tiktoken_cache.py +84 -0
  13. deepdoc/common/token_utils.py +96 -0
  14. deepdoc/config.py +149 -0
  15. deepdoc/depend/find_codec.py +42 -0
  16. deepdoc/depend/nltk_manager.py +114 -0
  17. deepdoc/depend/prompts/vision_llm_describe_prompt.md +23 -0
  18. deepdoc/depend/prompts/vision_llm_figure_describe_prompt.md +24 -0
  19. deepdoc/depend/prompts.py +35 -0
  20. deepdoc/depend/rag_tokenizer.py +578 -0
  21. deepdoc/depend/simple_cv_model.py +469 -0
  22. deepdoc/depend/surname.py +91 -0
  23. deepdoc/depend/timeout.py +73 -0
  24. deepdoc/depend/vision_llm_chunk.py +35 -0
  25. deepdoc/dict/README.md +19 -0
  26. deepdoc/dict/huqie.txt +555629 -0
  27. deepdoc/download_models.py +169 -0
  28. deepdoc/llm_adapter/__init__.py +15 -0
  29. deepdoc/llm_adapter/adapter.py +223 -0
  30. deepdoc/llm_adapter/utils.py +104 -0
  31. deepdoc/llm_adapter/vision.py +163 -0
  32. deepdoc/parser/__init__.py +42 -0
  33. deepdoc/parser/docling_parser.py +889 -0
  34. deepdoc/parser/docx_parser.py +150 -0
  35. deepdoc/parser/excel_parser.py +270 -0
  36. deepdoc/parser/figure_parser.py +182 -0
  37. deepdoc/parser/html_parser.py +221 -0
  38. deepdoc/parser/json_parser.py +179 -0
  39. deepdoc/parser/markdown_parser.py +321 -0
  40. deepdoc/parser/mineru_parser.py +646 -0
  41. deepdoc/parser/pdf_parser.py +1591 -0
  42. deepdoc/parser/ppt_parser.py +96 -0
  43. deepdoc/parser/resume/__init__.py +109 -0
  44. deepdoc/parser/resume/entities/__init__.py +15 -0
  45. deepdoc/parser/resume/entities/corporations.py +128 -0
  46. deepdoc/parser/resume/entities/degrees.py +44 -0
  47. deepdoc/parser/resume/entities/industries.py +712 -0
  48. deepdoc/parser/resume/entities/regions.py +789 -0
  49. deepdoc/parser/resume/entities/res/corp.tks.freq.json +65 -0
  50. deepdoc/parser/resume/entities/res/corp_baike_len.csv +31480 -0
  51. deepdoc/parser/resume/entities/res/corp_tag.json +14939 -0
  52. deepdoc/parser/resume/entities/res/good_corp.json +911 -0
  53. deepdoc/parser/resume/entities/res/good_sch.json +595 -0
  54. deepdoc/parser/resume/entities/res/school.rank.csv +1627 -0
  55. deepdoc/parser/resume/entities/res/schools.csv +5713 -0
  56. deepdoc/parser/resume/entities/schools.py +91 -0
  57. deepdoc/parser/resume/step_one.py +189 -0
  58. deepdoc/parser/resume/step_two.py +692 -0
  59. deepdoc/parser/tcadp_parser.py +538 -0
  60. deepdoc/parser/txt_parser.py +64 -0
  61. deepdoc/parser/utils.py +33 -0
  62. deepdoc/vision/__init__.py +90 -0
  63. deepdoc/vision/layout_recognizer.py +481 -0
  64. deepdoc/vision/ocr.py +757 -0
  65. deepdoc/vision/operators.py +733 -0
  66. deepdoc/vision/postprocess.py +370 -0
  67. deepdoc/vision/recognizer.py +451 -0
  68. deepdoc/vision/seeit.py +87 -0
  69. deepdoc/vision/t_ocr.py +101 -0
  70. deepdoc/vision/t_recognizer.py +186 -0
  71. deepdoc/vision/table_structure_recognizer.py +617 -0
  72. deepdoc_lib-0.2.0.dist-info/METADATA +246 -0
  73. deepdoc_lib-0.2.0.dist-info/RECORD +78 -0
  74. deepdoc_lib-0.2.0.dist-info/WHEEL +5 -0
  75. deepdoc_lib-0.2.0.dist-info/entry_points.txt +2 -0
  76. deepdoc_lib-0.2.0.dist-info/licenses/LICENSE +201 -0
  77. deepdoc_lib-0.2.0.dist-info/top_level.txt +2 -0
  78. scripts/download_models.py +10 -0
@@ -0,0 +1,96 @@
1
+ #
2
+ # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+
17
+ import logging
18
+ from io import BytesIO
19
+ from pptx import Presentation
20
+
21
+
22
class RAGFlowPptParser:
    """Extract the text of a PowerPoint presentation, one string per slide.

    Calling an instance returns a list with one entry per slide in the
    requested range. Within a slide (and within a group shape) shapes are
    visited top-to-bottom, then left-to-right.
    """

    def __init__(self):
        super().__init__()

    @staticmethod
    def _position_key(shape):
        """Return the ``(row, left)`` key used to order shapes for reading.

        ``top`` is bucketed by 10 so shapes at nearly the same height are
        ordered by ``left``. A missing (``None``) coordinate counts as 0, so
        shapes without an explicit position cannot break the sort.
        """
        top = shape.top if shape.top is not None else 0
        left = shape.left if shape.left is not None else 0
        return (top // 10, left)

    def __get_bulleted_text(self, paragraph):
        """Return the paragraph text, prefixed with a level indent and '.' if bulleted."""
        p_elm = paragraph._p
        bullet_xpaths = (
            "./a:pPr/a:buChar",
            "./a:pPr/a:buAutoNum",
            "./a:pPr/a:buBlip",
        )
        is_bulleted = any(bool(p_elm.xpath(xp)) for xp in bullet_xpaths)
        if is_bulleted:
            return f"{' '* paragraph.level}.{paragraph.text}"
        return paragraph.text

    def __extract(self, shape):
        """Return the text carried by ``shape``, or "" if it has none.

        Handles, in order: shapes with a text frame, shapes whose
        ``shape_type`` is not implemented (falls back to ``shape.text``),
        tables (type 19) and group shapes (type 6, processed recursively).
        Any exception is logged and turned into an empty string so that one
        bad shape does not abort the whole slide.
        """
        try:
            # First try to get text content
            if hasattr(shape, 'has_text_frame') and shape.has_text_frame:
                return "\n".join(
                    self.__get_bulleted_text(paragraph)
                    for paragraph in shape.text_frame.paragraphs
                    if paragraph.text.strip()
                )

            # Safely get shape_type
            try:
                shape_type = shape.shape_type
            except NotImplementedError:
                # If shape_type is not available, try to get text content
                if hasattr(shape, 'text'):
                    return shape.text.strip()
                return ""

            # Handle table: row 0 supplies the column headers, every later
            # row is rendered as "header: value; header: value".
            if shape_type == 19:
                tb = shape.table
                n_cols = len(tb.columns)
                rows = []
                for i in range(1, len(tb.rows)):
                    rows.append("; ".join(
                        tb.cell(0, j).text + ": " + tb.cell(i, j).text
                        for j in range(n_cols)
                        if tb.cell(i, j)
                    ))
                return "\n".join(rows)

            # Handle group shape. The None-safe key matters here: a child
            # without top/left used to raise TypeError inside sorted(), which
            # the handler below swallowed, dropping the whole group's text.
            if shape_type == 6:
                texts = []
                for child in sorted(shape.shapes, key=self._position_key):
                    t = self.__extract(child)
                    if t:
                        texts.append(t)
                return "\n".join(texts)

            return ""

        except Exception as e:
            logging.error(f"Error processing shape: {str(e)}")
            return ""

    def __call__(self, fnm, from_page, to_page, callback=None):
        """Return the extracted text of slides ``from_page`` .. ``to_page - 1``.

        Args:
            fnm: Path to the presentation (``str``); any other value is
                wrapped in ``BytesIO`` and treated as the file content.
            from_page: Zero-based index of the first slide to include.
            to_page: Zero-based index at which to stop (exclusive).
            callback: Accepted for interface compatibility; not used here.

        Returns:
            A list of strings, one per processed slide, each joining the
            slide's non-empty shape texts with newlines.

        Side effects:
            Sets ``self.total_page`` to the number of slides in the file.
        """
        ppt = Presentation(fnm) if isinstance(fnm, str) else Presentation(BytesIO(fnm))
        txts = []
        self.total_page = len(ppt.slides)
        for i, slide in enumerate(ppt.slides):
            if i < from_page:
                continue
            if i >= to_page:
                break
            texts = []
            for shape in sorted(slide.shapes, key=self._position_key):
                txt = self.__extract(shape)
                if txt:
                    texts.append(txt)
            txts.append("\n".join(texts))

        return txts
@@ -0,0 +1,109 @@
1
+ #
2
+ # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+
17
+ import datetime
18
+
19
+
20
def refactor(cv):
    """Normalize a parsed resume dict in place and return it.

    Steps, in order:
      * drop parser bookkeeping fields and ``basic.photo2``;
      * turn each list/dict section (education, work, ...) into a dict keyed
        by "0", "1", ... and strip each entry's ``external`` field;
      * rename two salary fields inside ``basic``;
      * copy summary values from the work and education entries (ordered by
        ``start_time``) into ``basic``;
      * stamp ``basic.updated_at`` and make sure ``contact.name`` is set.

    Args:
        cv: The resume dict; it is mutated.

    Returns:
        The same ``cv`` object.
    """
    for key in (
        "raw_txt",
        "parser_name",
        "inference",
        "ori_text",
        "use_time",
        "time_stat",
    ):
        # Keys whose value is already None are left in place.
        if cv.get(key) is not None:
            del cv[key]
    cv["is_deleted"] = 0
    basic = cv.setdefault("basic", {})
    if basic.get("photo2"):
        del basic["photo2"]

    for section in (
        "education",
        "work",
        "certificate",
        "project",
        "language",
        "skill",
        "training",
    ):
        entries = cv.get(section)
        if entries is None:
            continue
        if isinstance(entries, dict):
            entries = list(entries.values())
        if not isinstance(entries, list):
            # Neither a list nor a dict: the section is unusable, drop it.
            del cv[section]
            continue
        for entry in entries:
            if "external" in entry and entry["external"] is not None:
                del entry["external"]
        cv[section] = {str(i): entry for i, entry in enumerate(entries)}

    renames = (
        ("basic_salary_month", "salary_month"),
        ("expect_annual_salary_from", "expect_annual_salary"),
    )
    for old, new in renames:
        if basic.get(old):
            basic[new] = basic.pop(old)

    # ``or {}`` covers a section that is present but None: the loop above
    # deliberately leaves such a section untouched, and calling .items() on
    # it used to raise AttributeError.
    work = sorted(
        (cv.get("work") or {}).values(),
        key=lambda x: x.get("start_time", ""),
    )
    edu = sorted(
        (cv.get("education") or {}).values(),
        key=lambda x: x.get("start_time", ""),
    )

    if work:
        latest = work[-1]
        basic["work_start_time"] = work[0].get("start_time", "")
        basic["management_experience"] = (
            "Y"
            if any(w.get("management_experience", "") == "Y" for w in work)
            else "N"
        )
        basic["annual_salary"] = latest.get("annual_salary_from", "0")

        for field in (
            "annual_salary_from",
            "annual_salary_to",
            "industry_name",
            "position_name",
            "responsibilities",
            "corporation_type",
            "scale",
            "corporation_name",
        ):
            basic[field] = latest.get(field, "")

    if edu:
        latest_edu = edu[-1]
        for field in ("school_name", "discipline_name"):
            if field in latest_edu:
                basic[field] = latest_edu[field]

    basic["updated_at"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    contact = cv.setdefault("contact", {})
    if not contact.get("name"):
        contact["name"] = basic.get("name", "")
    return cv
@@ -0,0 +1,15 @@
1
+ #
2
+ # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
@@ -0,0 +1,128 @@
1
+ #
2
+ # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+
17
+ import logging
18
+ import re
19
+ import json
20
+ import os
21
+ import pandas as pd
22
+ from ....depend import rag_tokenizer
23
+ from . import regions
24
+
25
+
26
# Directory holding this module; the bundled resources live under "res/".
current_file_path = os.path.dirname(os.path.abspath(__file__))

# Tab-separated table indexed by company id (as a string); missing cells
# become 0.
GOODS = pd.read_csv(
    os.path.join(current_file_path, "res/corp_baike_len.csv"), sep="\t", header=0
).fillna(0)
GOODS["cid"] = GOODS["cid"].astype(str)
GOODS = GOODS.set_index(["cid"])

# The JSON resources are read inside ``with`` blocks so each file handle is
# closed as soon as it has been parsed.
with open(os.path.join(current_file_path, "res/corp.tks.freq.json"), "r", encoding="utf-8") as _fp:
    CORP_TKS = json.load(_fp)
with open(os.path.join(current_file_path, "res/good_corp.json"), "r", encoding="utf-8") as _fp:
    GOOD_CORP = json.load(_fp)
with open(os.path.join(current_file_path, "res/corp_tag.json"), "r", encoding="utf-8") as _fp:
    CORP_TAG = json.load(_fp)
37
+
38
+
39
def baike(cid, default_v=0):
    """Return the "len" value stored in ``GOODS`` for company id ``cid``.

    The id is converted to ``str`` before the lookup. If the lookup fails
    for any reason, ``default_v`` is returned instead.
    """
    try:
        value = GOODS.loc[str(cid), "len"]
    except Exception:
        return default_v
    return value
46
+
47
+
48
def corpNorm(nm, add_region=True):
    """Reduce a company name to a normalized core form.

    Args:
        nm: Raw company name. Anything falsy or not a ``str`` yields "".
        add_region: When true and a region token was found in the name, the
            first such token is appended as "(region)".

    Returns:
        The normalized name (possibly empty).
    """
    if not (nm and isinstance(nm, str)):
        return ""

    # Character normalization is delegated to rag_tokenizer (presumably
    # full-width -> half-width, then traditional -> simplified; confirm
    # against that module), followed by lower-casing.
    name = rag_tokenizer.strQ2B(nm)
    name = rag_tokenizer.tradi2simp(name).lower()
    name = re.sub(r"&amp;", "&", name)
    # Brackets, quotes, '+', '*', backslashes, dashes and whitespace runs
    # collapse to a single space.
    name = re.sub(r"[\(\)()\+'\"\t \*\\【】-]+", " ", name)
    # Cut everything from a dash or an English legal suffix onwards.
    name = re.sub(
        r"([—-]+.*| +co\..*|corp\..*| +inc\..*| +ltd.*)",
        "",
        name,
        count=10000,
        flags=re.IGNORECASE,
    )
    # Drop one trailing generic Chinese suffix.
    name = re.sub(
        r"(计算机|技术|(技术|科技|网络)*有限公司|公司|有限|研发中心|中国|总部)$",
        "",
        name,
        count=10000,
        flags=re.IGNORECASE,
    )
    if not name:
        return name
    # Short names are returned as-is unless they start with a region name.
    if len(name) < 5 and not regions.isName(name[0:2]):
        return name

    tokens = rag_tokenizer.tokenize(name).split()
    # Region tokens, ignoring a leading "中国".
    region_hits = [
        tok
        for pos, tok in enumerate(tokens)
        if regions.isName(tok) and (tok != "中国" or pos > 0)
    ]

    # Rebuild the name from tokens that are neither regions nor listed in
    # CORP_TKS, keeping a space between adjacent alphanumeric tokens.
    core = ""
    for tok in tokens:
        if regions.isName(tok) or tok in CORP_TKS:
            continue
        if re.match(r"[0-9a-zA-Z\\,.]+", tok) and re.match(r".*[0-9a-zA-Z\,.]+$", core):
            core += " "
        core += tok

    # For a name mixing a non-Latin part and a Latin part, keep the leading one.
    found = re.search(r"^([^a-z0-9 \(\)&]{2,})[a-z ]{4,}$", core.strip())
    if found:
        core = found.group(1)
    found = re.search(r"^([a-z ]{3,})[^a-z0-9 \(\)&]{2,}$", core.strip())
    if found:
        core = found.group(1)

    core = core.strip()
    if add_region and region_hits:
        core += "(%s)" % region_hits[0]
    return core
85
+
86
+
87
def rmNoise(n):
    """Strip parenthesized fragments and punctuation noise from a name.

    First removes every "(...)" group that contains no nested parenthesis,
    then deletes all commas, periods, spaces, ampersands and stray
    parentheses.
    """
    # NOTE(review): the parentheses are duplicated inside the character
    # classes below; presumably one of each pair was a full-width variant
    # upstream -- verify against the original source.
    without_groups = re.sub(r"[\((][^()()]+[))]", "", n)
    return re.sub(r"[,. &()()]+", "", without_groups)
91
+
92
+
93
# Normalize the bundled company lists once at import time so that lookups
# can compare normalized names directly.
GOOD_CORP = {corpNorm(rmNoise(c), False) for c in GOOD_CORP}

# Single pass over CORP_TAG: each key is normalized exactly once (previously
# it was normalized in a logging loop and again in a dict comprehension).
# Keys that normalize to the same string keep the value of the later entry.
_normalized_corp_tag = {}
for c, v in CORP_TAG.items():
    cc = corpNorm(rmNoise(c), False)
    if not cc:
        # Surface names that vanish entirely under normalization.
        logging.debug(c)
    _normalized_corp_tag[cc] = v
CORP_TAG = _normalized_corp_tag
99
+
100
+
101
def is_good(nm):
    """Return True if ``nm`` matches an entry of ``GOOD_CORP``.

    Names containing "外派" are rejected outright. Otherwise the name is
    normalized with ``rmNoise`` and ``corpNorm`` and compared with each
    entry: purely alphanumeric entries must match exactly, all other
    entries match when they occur as a substring.
    """
    if "外派" in nm:
        return False
    nm = corpNorm(rmNoise(nm), False)
    for n in GOOD_CORP:
        if not n:
            # An entry that normalized to "" is a substring of every name
            # and would make every company look "good".
            continue
        if re.match(r"[0-9a-zA-Z]+$", n):
            if n == nm:
                return True
        elif n in nm:
            return True
    return False
114
+
115
+
116
def corp_tag(nm):
    """Return the tag value of the first ``CORP_TAG`` key matching ``nm``.

    The name is normalized with ``rmNoise`` and ``corpNorm``. Keys made only
    of ASCII letters, digits, '.', ',' and spaces must match exactly; other
    keys match as a substring, except that a key shorter than 3 characters
    is ignored when the name is at least twice as long.

    Returns:
        The stored tag value, or ``[]`` when nothing matches.
    """
    nm = corpNorm(rmNoise(nm), False)
    for n, tags in CORP_TAG.items():
        if not n:
            # A key that normalized to "" matches every name as a substring
            # and would then divide by zero in the length-ratio check below.
            continue
        if re.match(r"[0-9a-zA-Z., ]+$", n):
            if n == nm:
                return tags
        elif n in nm:
            if len(n) < 3 and len(nm) / len(n) >= 2:
                continue
            return tags
    return []
@@ -0,0 +1,44 @@
1
+ #
2
+ # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+
17
# Degree id (as a string) -> degree name.
TBL = {
    "94": "EMBA",
    "6": "MBA",
    "95": "MPA",
    "92": "专升本",
    "4": "专科",
    "90": "中专",
    "91": "中技",
    "86": "初中",
    "3": "博士",
    "10": "博士后",
    "1": "本科",
    "2": "硕士",
    "87": "职高",
    "89": "高中",
}

# Reverse lookup: degree name -> degree id.
TBL_ = dict(zip(TBL.values(), TBL.keys()))
35
+
36
+
37
def get_name(id):
    """Return the degree name for ``id`` (stringified), or "" if unknown."""
    key = str(id)
    if key in TBL:
        return TBL[key]
    return ""
39
+
40
+
41
def get_id(nm):
    """Return the degree id for the name ``nm``, or "" if empty or unknown.

    The name is upper-cased and stripped of surrounding whitespace before
    the lookup.
    """
    if nm:
        normalized = nm.upper().strip()
        return TBL_.get(normalized, "")
    return ""