PyPI - deepdoc-lib - Versions diffs - 0.2.0__py3-none-any.whl - Mend

deepdoc-lib 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (78) hide show

deepdoc/README.md +122 -0
deepdoc/README_zh.md +116 -0
deepdoc/__init__.py +43 -0
deepdoc/_version.py +34 -0
deepdoc/common/__init__.py +52 -0
deepdoc/common/config_utils.py +63 -0
deepdoc/common/connection_utils.py +73 -0
deepdoc/common/file_utils.py +19 -0
deepdoc/common/misc_utils.py +44 -0
deepdoc/common/model_store.py +369 -0
deepdoc/common/settings.py +42 -0
deepdoc/common/tiktoken_cache.py +84 -0
deepdoc/common/token_utils.py +96 -0
deepdoc/config.py +149 -0
deepdoc/depend/find_codec.py +42 -0
deepdoc/depend/nltk_manager.py +114 -0
deepdoc/depend/prompts/vision_llm_describe_prompt.md +23 -0
deepdoc/depend/prompts/vision_llm_figure_describe_prompt.md +24 -0
deepdoc/depend/prompts.py +35 -0
deepdoc/depend/rag_tokenizer.py +578 -0
deepdoc/depend/simple_cv_model.py +469 -0
deepdoc/depend/surname.py +91 -0
deepdoc/depend/timeout.py +73 -0
deepdoc/depend/vision_llm_chunk.py +35 -0
deepdoc/dict/README.md +19 -0
deepdoc/dict/huqie.txt +555629 -0
deepdoc/download_models.py +169 -0
deepdoc/llm_adapter/__init__.py +15 -0
deepdoc/llm_adapter/adapter.py +223 -0
deepdoc/llm_adapter/utils.py +104 -0
deepdoc/llm_adapter/vision.py +163 -0
deepdoc/parser/__init__.py +42 -0
deepdoc/parser/docling_parser.py +889 -0
deepdoc/parser/docx_parser.py +150 -0
deepdoc/parser/excel_parser.py +270 -0
deepdoc/parser/figure_parser.py +182 -0
deepdoc/parser/html_parser.py +221 -0
deepdoc/parser/json_parser.py +179 -0
deepdoc/parser/markdown_parser.py +321 -0
deepdoc/parser/mineru_parser.py +646 -0
deepdoc/parser/pdf_parser.py +1591 -0
deepdoc/parser/ppt_parser.py +96 -0
deepdoc/parser/resume/__init__.py +109 -0
deepdoc/parser/resume/entities/__init__.py +15 -0
deepdoc/parser/resume/entities/corporations.py +128 -0
deepdoc/parser/resume/entities/degrees.py +44 -0
deepdoc/parser/resume/entities/industries.py +712 -0
deepdoc/parser/resume/entities/regions.py +789 -0
deepdoc/parser/resume/entities/res/corp.tks.freq.json +65 -0
deepdoc/parser/resume/entities/res/corp_baike_len.csv +31480 -0
deepdoc/parser/resume/entities/res/corp_tag.json +14939 -0
deepdoc/parser/resume/entities/res/good_corp.json +911 -0
deepdoc/parser/resume/entities/res/good_sch.json +595 -0
deepdoc/parser/resume/entities/res/school.rank.csv +1627 -0
deepdoc/parser/resume/entities/res/schools.csv +5713 -0
deepdoc/parser/resume/entities/schools.py +91 -0
deepdoc/parser/resume/step_one.py +189 -0
deepdoc/parser/resume/step_two.py +692 -0
deepdoc/parser/tcadp_parser.py +538 -0
deepdoc/parser/txt_parser.py +64 -0
deepdoc/parser/utils.py +33 -0
deepdoc/vision/__init__.py +90 -0
deepdoc/vision/layout_recognizer.py +481 -0
deepdoc/vision/ocr.py +757 -0
deepdoc/vision/operators.py +733 -0
deepdoc/vision/postprocess.py +370 -0
deepdoc/vision/recognizer.py +451 -0
deepdoc/vision/seeit.py +87 -0
deepdoc/vision/t_ocr.py +101 -0
deepdoc/vision/t_recognizer.py +186 -0
deepdoc/vision/table_structure_recognizer.py +617 -0
deepdoc_lib-0.2.0.dist-info/METADATA +246 -0
deepdoc_lib-0.2.0.dist-info/RECORD +78 -0
deepdoc_lib-0.2.0.dist-info/WHEEL +5 -0
deepdoc_lib-0.2.0.dist-info/entry_points.txt +2 -0
deepdoc_lib-0.2.0.dist-info/licenses/LICENSE +201 -0
deepdoc_lib-0.2.0.dist-info/top_level.txt +2 -0
scripts/download_models.py +10 -0

deepdoc/parser/ppt_parser.py ADDED Viewed

@@ -0,0 +1,96 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import logging
+from io import BytesIO
+from pptx import Presentation
+class RAGFlowPptParser:
+    def __init__(self):
+        super().__init__()
+    def __get_bulleted_text(self, paragraph):
+        is_bulleted = bool(paragraph._p.xpath("./a:pPr/a:buChar")) or bool(paragraph._p.xpath("./a:pPr/a:buAutoNum")) or bool(paragraph._p.xpath("./a:pPr/a:buBlip"))
+        if is_bulleted:
+            return f"{'  '* paragraph.level}.{paragraph.text}"
+        else:
+            return paragraph.text
+    def __extract(self, shape):
+        try:
+            # First try to get text content
+            if hasattr(shape, 'has_text_frame') and shape.has_text_frame:
+                text_frame = shape.text_frame
+                texts = []
+                for paragraph in text_frame.paragraphs:
+                    if paragraph.text.strip():
+                        texts.append(self.__get_bulleted_text(paragraph))
+                return "\n".join(texts)
+            # Safely get shape_type
+            try:
+                shape_type = shape.shape_type
+            except NotImplementedError:
+                # If shape_type is not available, try to get text content
+                if hasattr(shape, 'text'):
+                    return shape.text.strip()
+                return ""
+            # Handle table
+            if shape_type == 19:
+                tb = shape.table
+                rows = []
+                for i in range(1, len(tb.rows)):
+                    rows.append("; ".join([tb.cell(
+                        0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
+                return "\n".join(rows)
+            # Handle group shape
+            if shape_type == 6:
+                texts = []
+                for p in sorted(shape.shapes, key=lambda x: (x.top // 10, x.left)):
+                    t = self.__extract(p)
+                    if t:
+                        texts.append(t)
+                return "\n".join(texts)
+            return ""
+        except Exception as e:
+            logging.error(f"Error processing shape: {str(e)}")
+            return ""
+    def __call__(self, fnm, from_page, to_page, callback=None):
+        ppt = Presentation(fnm) if isinstance(
+            fnm, str) else Presentation(
+            BytesIO(fnm))
+        txts = []
+        self.total_page = len(ppt.slides)
+        for i, slide in enumerate(ppt.slides):
+            if i < from_page:
+                continue
+            if i >= to_page:
+                break
+            texts = []
+            for shape in sorted(
+                    slide.shapes, key=lambda x: ((x.top if x.top is not None else 0) // 10, x.left if x.left is not None else 0)):
+                txt = self.__extract(shape)
+                if txt:
+                    texts.append(txt)
+            txts.append("\n".join(texts))
+        return txts

deepdoc/parser/resume/__init__.py ADDED Viewed

@@ -0,0 +1,109 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import datetime
+def refactor(cv):
+    for n in [
+        "raw_txt",
+        "parser_name",
+        "inference",
+        "ori_text",
+        "use_time",
+        "time_stat",
+    ]:
+        if n in cv and cv[n] is not None:
+            del cv[n]
+    cv["is_deleted"] = 0
+    if "basic" not in cv:
+        cv["basic"] = {}
+    if cv["basic"].get("photo2"):
+        del cv["basic"]["photo2"]
+    for n in [
+        "education",
+        "work",
+        "certificate",
+        "project",
+        "language",
+        "skill",
+        "training",
+    ]:
+        if n not in cv or cv[n] is None:
+            continue
+        if isinstance(cv[n], dict):
+            cv[n] = [v for _, v in cv[n].items()]
+        if not isinstance(cv[n], list):
+            del cv[n]
+            continue
+        vv = []
+        for v in cv[n]:
+            if "external" in v and v["external"] is not None:
+                del v["external"]
+            vv.append(v)
+        cv[n] = {str(i): vv[i] for i in range(len(vv))}
+    basics = [
+        ("basic_salary_month", "salary_month"),
+        ("expect_annual_salary_from", "expect_annual_salary"),
+    ]
+    for n, t in basics:
+        if cv["basic"].get(n):
+            cv["basic"][t] = cv["basic"][n]
+            del cv["basic"][n]
+    work = sorted(
+        [v for _, v in cv.get("work", {}).items()],
+        key=lambda x: x.get("start_time", ""),
+    )
+    edu = sorted(
+        [v for _, v in cv.get("education", {}).items()],
+        key=lambda x: x.get("start_time", ""),
+    )
+    if work:
+        cv["basic"]["work_start_time"] = work[0].get("start_time", "")
+        cv["basic"]["management_experience"] = (
+            "Y"
+            if any([w.get("management_experience", "") == "Y" for w in work])
+            else "N"
+        )
+        cv["basic"]["annual_salary"] = work[-1].get("annual_salary_from", "0")
+        for n in [
+            "annual_salary_from",
+            "annual_salary_to",
+            "industry_name",
+            "position_name",
+            "responsibilities",
+            "corporation_type",
+            "scale",
+            "corporation_name",
+        ]:
+            cv["basic"][n] = work[-1].get(n, "")
+    if edu:
+        for n in ["school_name", "discipline_name"]:
+            if n in edu[-1]:
+                cv["basic"][n] = edu[-1][n]
+    cv["basic"]["updated_at"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    if "contact" not in cv:
+        cv["contact"] = {}
+    if not cv["contact"].get("name"):
+        cv["contact"]["name"] = cv["basic"].get("name", "")
+    return cv

deepdoc/parser/resume/entities/__init__.py ADDED Viewed

@@ -0,0 +1,15 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#

deepdoc/parser/resume/entities/corporations.py ADDED Viewed

@@ -0,0 +1,128 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import logging
+import re
+import json
+import os
+import pandas as pd
+from ....depend import rag_tokenizer
+from . import regions
+current_file_path = os.path.dirname(os.path.abspath(__file__))
+GOODS = pd.read_csv(
+    os.path.join(current_file_path, "res/corp_baike_len.csv"), sep="\t", header=0
+).fillna(0)
+GOODS["cid"] = GOODS["cid"].astype(str)
+GOODS = GOODS.set_index(["cid"])
+CORP_TKS = json.load(
+    open(os.path.join(current_file_path, "res/corp.tks.freq.json"), "r",encoding="utf-8")
+)
+GOOD_CORP = json.load(open(os.path.join(current_file_path, "res/good_corp.json"), "r",encoding="utf-8"))
+CORP_TAG = json.load(open(os.path.join(current_file_path, "res/corp_tag.json"), "r",encoding="utf-8"))
+def baike(cid, default_v=0):
+    global GOODS
+    try:
+        return GOODS.loc[str(cid), "len"]
+    except Exception:
+        pass
+    return default_v
+def corpNorm(nm, add_region=True):
+    global CORP_TKS
+    if not nm or not isinstance(nm, str):
+        return ""
+    nm = rag_tokenizer.tradi2simp(rag_tokenizer.strQ2B(nm)).lower()
+    nm = re.sub(r"&amp;", "&", nm)
+    nm = re.sub(r"[\(\)（）\+'\"\t \*\\【】-]+", " ", nm)
+    nm = re.sub(
+        r"([—-]+.*| +co\..*|corp\..*| +inc\..*| +ltd.*)", "", nm, count=10000, flags=re.IGNORECASE
+    )
+    nm = re.sub(
+        r"(计算机|技术|(技术|科技|网络)*有限公司|公司|有限|研发中心|中国|总部)$",
+        "",
+        nm,
+        count=10000,
+        flags=re.IGNORECASE,
+    )
+    if not nm or (len(nm) < 5 and not regions.isName(nm[0:2])):
+        return nm
+    tks = rag_tokenizer.tokenize(nm).split()
+    reg = [t for i, t in enumerate(tks) if regions.isName(t) and (t != "中国" or i > 0)]
+    nm = ""
+    for t in tks:
+        if regions.isName(t) or t in CORP_TKS:
+            continue
+        if re.match(r"[0-9a-zA-Z\\,.]+", t) and re.match(r".*[0-9a-zA-Z\,.]+$", nm):
+            nm += " "
+        nm += t
+    r = re.search(r"^([^a-z0-9 \(\)&]{2,})[a-z ]{4,}$", nm.strip())
+    if r:
+        nm = r.group(1)
+    r = re.search(r"^([a-z ]{3,})[^a-z0-9 \(\)&]{2,}$", nm.strip())
+    if r:
+        nm = r.group(1)
+    return nm.strip() + (("" if not reg else "(%s)" % reg[0]) if add_region else "")
+def rmNoise(n):
+    n = re.sub(r"[\(（][^()（）]+[)）]", "", n)
+    n = re.sub(r"[,. &（）()]+", "", n)
+    return n
+GOOD_CORP = set([corpNorm(rmNoise(c), False) for c in GOOD_CORP])
+for c, v in CORP_TAG.items():
+    cc = corpNorm(rmNoise(c), False)
+    if not cc:
+        logging.debug(c)
+CORP_TAG = {corpNorm(rmNoise(c), False): v for c, v in CORP_TAG.items()}
+def is_good(nm):
+    global GOOD_CORP
+    if nm.find("外派") >= 0:
+        return False
+    nm = rmNoise(nm)
+    nm = corpNorm(nm, False)
+    for n in GOOD_CORP:
+        if re.match(r"[0-9a-zA-Z]+$", n):
+            if n == nm:
+                return True
+        elif nm.find(n) >= 0:
+            return True
+    return False
+def corp_tag(nm):
+    global CORP_TAG
+    nm = rmNoise(nm)
+    nm = corpNorm(nm, False)
+    for n in CORP_TAG.keys():
+        if re.match(r"[0-9a-zA-Z., ]+$", n):
+            if n == nm:
+                return CORP_TAG[n]
+        elif nm.find(n) >= 0:
+            if len(n) < 3 and len(nm) / len(n) >= 2:
+                continue
+            return CORP_TAG[n]
+    return []

deepdoc/parser/resume/entities/degrees.py ADDED Viewed

@@ -0,0 +1,44 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+# Degree lookup table: degree id (as a string) -> degree name.
+# get_name() converts its argument with str() before looking it up here.
+TBL = {
+    "94": "EMBA",
+    "6": "MBA",
+    "95": "MPA",
+    "92": "专升本",  # associate-to-bachelor upgrade
+    "4": "专科",  # junior college / associate degree
+    "90": "中专",  # secondary vocational school
+    "91": "中技",  # secondary technical school
+    "86": "初中",  # junior high school
+    "3": "博士",  # doctorate
+    "10": "博士后",  # postdoctoral
+    "1": "本科",  # bachelor
+    "2": "硕士",  # master
+    "87": "职高",  # vocational high school
+    "89": "高中",  # senior high school
+}
+# Reverse lookup used by get_id(): degree name -> degree id string.
+TBL_ = {v: k for k, v in TBL.items()}
+def get_name(id):
+    return TBL.get(str(id), "")
+def get_id(nm):
+    if not nm:
+        return ""
+    return TBL_.get(nm.upper().strip(), "")