PyPI - deepdoc-lib - Versions diffs - 0.2.0__py3-none-any.whl - Mend

deepdoc-lib 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (78) hide show

deepdoc/README.md +122 -0
deepdoc/README_zh.md +116 -0
deepdoc/__init__.py +43 -0
deepdoc/_version.py +34 -0
deepdoc/common/__init__.py +52 -0
deepdoc/common/config_utils.py +63 -0
deepdoc/common/connection_utils.py +73 -0
deepdoc/common/file_utils.py +19 -0
deepdoc/common/misc_utils.py +44 -0
deepdoc/common/model_store.py +369 -0
deepdoc/common/settings.py +42 -0
deepdoc/common/tiktoken_cache.py +84 -0
deepdoc/common/token_utils.py +96 -0
deepdoc/config.py +149 -0
deepdoc/depend/find_codec.py +42 -0
deepdoc/depend/nltk_manager.py +114 -0
deepdoc/depend/prompts/vision_llm_describe_prompt.md +23 -0
deepdoc/depend/prompts/vision_llm_figure_describe_prompt.md +24 -0
deepdoc/depend/prompts.py +35 -0
deepdoc/depend/rag_tokenizer.py +578 -0
deepdoc/depend/simple_cv_model.py +469 -0
deepdoc/depend/surname.py +91 -0
deepdoc/depend/timeout.py +73 -0
deepdoc/depend/vision_llm_chunk.py +35 -0
deepdoc/dict/README.md +19 -0
deepdoc/dict/huqie.txt +555629 -0
deepdoc/download_models.py +169 -0
deepdoc/llm_adapter/__init__.py +15 -0
deepdoc/llm_adapter/adapter.py +223 -0
deepdoc/llm_adapter/utils.py +104 -0
deepdoc/llm_adapter/vision.py +163 -0
deepdoc/parser/__init__.py +42 -0
deepdoc/parser/docling_parser.py +889 -0
deepdoc/parser/docx_parser.py +150 -0
deepdoc/parser/excel_parser.py +270 -0
deepdoc/parser/figure_parser.py +182 -0
deepdoc/parser/html_parser.py +221 -0
deepdoc/parser/json_parser.py +179 -0
deepdoc/parser/markdown_parser.py +321 -0
deepdoc/parser/mineru_parser.py +646 -0
deepdoc/parser/pdf_parser.py +1591 -0
deepdoc/parser/ppt_parser.py +96 -0
deepdoc/parser/resume/__init__.py +109 -0
deepdoc/parser/resume/entities/__init__.py +15 -0
deepdoc/parser/resume/entities/corporations.py +128 -0
deepdoc/parser/resume/entities/degrees.py +44 -0
deepdoc/parser/resume/entities/industries.py +712 -0
deepdoc/parser/resume/entities/regions.py +789 -0
deepdoc/parser/resume/entities/res/corp.tks.freq.json +65 -0
deepdoc/parser/resume/entities/res/corp_baike_len.csv +31480 -0
deepdoc/parser/resume/entities/res/corp_tag.json +14939 -0
deepdoc/parser/resume/entities/res/good_corp.json +911 -0
deepdoc/parser/resume/entities/res/good_sch.json +595 -0
deepdoc/parser/resume/entities/res/school.rank.csv +1627 -0
deepdoc/parser/resume/entities/res/schools.csv +5713 -0
deepdoc/parser/resume/entities/schools.py +91 -0
deepdoc/parser/resume/step_one.py +189 -0
deepdoc/parser/resume/step_two.py +692 -0
deepdoc/parser/tcadp_parser.py +538 -0
deepdoc/parser/txt_parser.py +64 -0
deepdoc/parser/utils.py +33 -0
deepdoc/vision/__init__.py +90 -0
deepdoc/vision/layout_recognizer.py +481 -0
deepdoc/vision/ocr.py +757 -0
deepdoc/vision/operators.py +733 -0
deepdoc/vision/postprocess.py +370 -0
deepdoc/vision/recognizer.py +451 -0
deepdoc/vision/seeit.py +87 -0
deepdoc/vision/t_ocr.py +101 -0
deepdoc/vision/t_recognizer.py +186 -0
deepdoc/vision/table_structure_recognizer.py +617 -0
deepdoc_lib-0.2.0.dist-info/METADATA +246 -0
deepdoc_lib-0.2.0.dist-info/RECORD +78 -0
deepdoc_lib-0.2.0.dist-info/WHEEL +5 -0
deepdoc_lib-0.2.0.dist-info/entry_points.txt +2 -0
deepdoc_lib-0.2.0.dist-info/licenses/LICENSE +201 -0
deepdoc_lib-0.2.0.dist-info/top_level.txt +2 -0
scripts/download_models.py +10 -0

deepdoc/config.py ADDED Viewed

@@ -0,0 +1,149 @@
+from __future__ import annotations
+import os
+from dataclasses import dataclass
+from importlib import resources
+from pathlib import Path
+from typing import Literal
+from .common.model_store import (
+    resolve_vision_model_dir,
+    resolve_xgb_model_dir,
+    validate_bundle_dir,
+)
+from .common.misc_utils import offline_mode_or_from_env
+# Canonical model-provider names; _normalize_provider() maps aliases onto these three values.
+ProviderType = Literal["local", "modelscope", "auto"]
+def _normalize_provider(provider: str) -> ProviderType:
+    """Return the canonical provider name for ``provider``.
+
+    Surrounding whitespace and letter case are ignored. The aliases ``ms`` and
+    ``remote`` map to ``modelscope``; ``filesystem`` and ``user`` map to ``local``.
+
+    Raises:
+        ValueError: If the cleaned name is not ``local``, ``modelscope`` or
+            ``auto`` after alias substitution.
+    """
+    alias_to_canonical = {
+        "ms": "modelscope",
+        "remote": "modelscope",
+        "filesystem": "local",
+        "user": "local",
+    }
+    key = provider.strip().lower()
+    canonical = alias_to_canonical[key] if key in alias_to_canonical else key
+    if canonical in ("local", "modelscope", "auto"):
+        return canonical  # type: ignore[return-value]
+    # The message reports the caller's original spelling, not the cleaned one.
+    raise ValueError("Unsupported model provider '{}'. Use one of: local, modelscope, auto.".format(provider))
+def _require_file(path: Path, message: str) -> None:
+    """Raise ``FileNotFoundError(message)`` unless ``path`` exists.
+
+    Only existence is checked (``Path.exists``), so an existing directory also passes.
+    """
+    if path.exists():
+        return
+    raise FileNotFoundError(message)
+def _default_packaged_dict_path() -> str:
+    """Return the path of the ``dict/huqie.txt`` resource inside the ``deepdoc`` package.
+
+    Raises:
+        FileNotFoundError: If that path does not exist on disk.
+    """
+    resource = resources.files("deepdoc").joinpath("dict", "huqie.txt")
+    # The traversable is passed through str() to obtain a plain filesystem path.
+    location = Path(str(resource))
+    not_found = "Packaged tokenizer dictionary not found: {}".format(location)
+    _require_file(location, not_found)
+    return str(location)
+@dataclass(frozen=True)
+class TokenizerConfig:
+    """Immutable tokenizer settings: dictionary location, offline flag and NLTK data directory."""
+    # Dictionary file, or a directory expected to hold "huqie.txt"; a falsy value selects the packaged dictionary.
+    dict_path: str | None = None
+    # Offline flag; from_env() fills it from offline_mode_or_from_env(None).
+    offline: bool = False
+    # NLTK data directory; from_env() fills it from DEEPDOC_NLTK_DATA_DIR.
+    nltk_data_dir: str | None = None
+    def resolve_dict_path(self) -> str:
+        """Return the absolute path of the dictionary file to use.
+
+        With no ``dict_path`` the packaged dictionary is returned. Otherwise the
+        path is expanded and resolved, and a directory is extended with
+        ``huqie.txt``.
+
+        Raises:
+            ValueError: If the resulting path does not end in ``.txt``.
+            FileNotFoundError: If the resulting path does not exist.
+        """
+        if not self.dict_path:
+            return _default_packaged_dict_path()
+        target = Path(self.dict_path).expanduser().resolve()
+        if target.is_dir():
+            target = target / "huqie.txt"
+        # The suffix is validated before existence, so a wrong extension wins over a missing file.
+        if target.suffix != ".txt":
+            raise ValueError("TokenizerConfig.dict_path must point to a '.txt' dictionary file, got: {}".format(target))
+        not_found = "Tokenizer dictionary not found: {}. Provide a valid TokenizerConfig.dict_path.".format(target)
+        _require_file(target, not_found)
+        return str(target)
+    def resolve_dict_prefix(self) -> str:
+        """Return the resolved dictionary path with its final suffix removed."""
+        dictionary = Path(self.resolve_dict_path())
+        return str(dictionary.with_suffix(""))
+    @classmethod
+    def from_env(cls) -> "TokenizerConfig":
+        """Build a config from the environment.
+
+        ``DEEPDOC_TOKENIZER_MODEL_DIR`` (if set and non-empty) is treated as a
+        directory and ``huqie.txt`` is appended to it; ``DEEPDOC_NLTK_DATA_DIR``
+        is passed through unchanged.
+        """
+        env_dir = os.getenv("DEEPDOC_TOKENIZER_MODEL_DIR")
+        if env_dir:
+            dictionary = str(Path(env_dir).expanduser().resolve() / "huqie.txt")
+        else:
+            dictionary = None
+        return cls(
+            dict_path=dictionary,
+            offline=offline_mode_or_from_env(None),
+            nltk_data_dir=os.getenv("DEEPDOC_NLTK_DATA_DIR"),
+        )
+@dataclass(frozen=True)
+class PdfModelConfig:
+    """Immutable locations and provider choice for the PDF model bundles."""
+    # Explicit directory for the "vision" bundle; when set it is validated and returned as-is.
+    vision_model_dir: str | None = None
+    # Explicit directory for the "xgb" bundle; when set it is validated and returned as-is.
+    xgb_model_dir: str | None = None
+    # Optional Ascend model directory; see resolve_ascend_model_dir().
+    ascend_model_dir: str | None = None
+    # Forwarded as model_home to the model_store resolvers when no explicit directory is given.
+    model_home: str | None = None
+    # Provider name or alias; normalized through _normalize_provider() on use.
+    model_provider: ProviderType = "auto"
+    def normalized_provider(self) -> ProviderType:
+        """Return ``model_provider`` in canonical form."""
+        return _normalize_provider(self.model_provider)
+    def _resolve_bundle_dir(self, bundle: str, explicit_dir: str | None) -> str:
+        """Return the directory holding the files of ``bundle``.
+
+        An explicit directory is expanded, resolved and checked with
+        ``validate_bundle_dir``. Without one, resolution is delegated to the
+        matching ``model_store`` resolver.
+
+        Raises:
+            FileNotFoundError: If the explicit directory fails validation.
+            ValueError: If ``bundle`` is neither ``vision`` nor ``xgb``.
+        """
+        if explicit_dir:
+            bundle_dir = Path(explicit_dir).expanduser().resolve()
+            complete, missing_files = validate_bundle_dir(bundle, bundle_dir)
+            if complete:
+                return str(bundle_dir)
+            raise FileNotFoundError("Missing required files for '{}' bundle in {}: {}".format(bundle, bundle_dir, ", ".join(missing_files)))
+        # The provider is normalized before the bundle name is checked, as in the original ordering.
+        provider = self.normalized_provider()
+        if bundle == "vision":
+            resolver = resolve_vision_model_dir
+        elif bundle == "xgb":
+            resolver = resolve_xgb_model_dir
+        else:
+            raise ValueError(f"Unsupported PDF model bundle '{bundle}'")
+        return resolver(
+            model_home=self.model_home,
+            provider=provider,
+            # Only the "local" provider is resolved with offline=True.
+            offline=provider == "local",
+        )
+    def resolve_vision_model_dir(self) -> str:
+        """Return the directory of the ``vision`` bundle."""
+        return self._resolve_bundle_dir("vision", self.vision_model_dir)
+    def resolve_xgb_model_dir(self) -> str:
+        """Return the directory of the ``xgb`` bundle."""
+        return self._resolve_bundle_dir("xgb", self.xgb_model_dir)
+    def resolve_ascend_model_dir(self) -> str | None:
+        """Return the resolved Ascend model directory, or ``None`` when none is configured.
+
+        Raises:
+            FileNotFoundError: If the configured path is missing or is not a directory.
+        """
+        if not self.ascend_model_dir:
+            return None
+        ascend_dir = Path(self.ascend_model_dir).expanduser().resolve()
+        if ascend_dir.exists() and ascend_dir.is_dir():
+            return str(ascend_dir)
+        raise FileNotFoundError(f"Ascend model directory does not exist: {ascend_dir}")
+    @classmethod
+    def from_env(cls) -> "PdfModelConfig":
+        """Build a config from the ``DEEPDOC_*`` environment variables.
+
+        Reads ``DEEPDOC_VISION_MODEL_DIR``, ``DEEPDOC_XGB_MODEL_DIR``,
+        ``DEEPDOC_ASCEND_MODEL_DIR`` and ``DEEPDOC_MODEL_PROVIDER`` (default
+        ``auto``). ``model_home`` is not populated here and keeps its default.
+        """
+        read = os.getenv
+        return cls(
+            vision_model_dir=read("DEEPDOC_VISION_MODEL_DIR"),
+            xgb_model_dir=read("DEEPDOC_XGB_MODEL_DIR"),
+            ascend_model_dir=read("DEEPDOC_ASCEND_MODEL_DIR"),
+            model_provider=_normalize_provider(read("DEEPDOC_MODEL_PROVIDER", "auto")),
+        )
+@dataclass(frozen=True)
+class ParserRuntimeConfig:
+    """Frozen container pairing a tokenizer configuration with a PDF model configuration."""
+    tokenizer: TokenizerConfig
+    pdf_models: PdfModelConfig

deepdoc/depend/find_codec.py ADDED Viewed

@@ -0,0 +1,42 @@
+import chardet
+# Candidate encodings tried by find_codec() in list order; the first one that decodes
+# without error is returned, so the ordering is part of the behaviour.
+# NOTE(review): some single-byte code pages listed early (e.g. cp037) presumably accept
+# every byte value, which would make later entries unreachable — confirm before reordering.
+all_codecs = [
+    'utf-8', 'gb2312', 'gbk', 'utf_16', 'ascii', 'big5', 'big5hkscs',
+    'cp037', 'cp273', 'cp424', 'cp437',
+    'cp500', 'cp720', 'cp737', 'cp775', 'cp850', 'cp852', 'cp855', 'cp856', 'cp857',
+    'cp858', 'cp860', 'cp861', 'cp862', 'cp863', 'cp864', 'cp865', 'cp866', 'cp869',
+    'cp874', 'cp875', 'cp932', 'cp949', 'cp950', 'cp1006', 'cp1026', 'cp1125',
+    'cp1140', 'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255', 'cp1256',
+    'cp1257', 'cp1258', 'euc_jp', 'euc_jis_2004', 'euc_jisx0213', 'euc_kr',
+    'gb18030', 'hz', 'iso2022_jp', 'iso2022_jp_1', 'iso2022_jp_2',
+    'iso2022_jp_2004', 'iso2022_jp_3', 'iso2022_jp_ext', 'iso2022_kr', 'latin_1',
+    'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6', 'iso8859_7',
+    'iso8859_8', 'iso8859_9', 'iso8859_10', 'iso8859_11', 'iso8859_13',
+    'iso8859_14', 'iso8859_15', 'iso8859_16', 'johab', 'koi8_r', 'koi8_t', 'koi8_u',
+    'kz1048', 'mac_cyrillic', 'mac_greek', 'mac_iceland', 'mac_latin2', 'mac_roman',
+    'mac_turkish', 'ptcp154', 'shift_jis', 'shift_jis_2004', 'shift_jisx0213',
+    'utf_32', 'utf_32_be', 'utf_32_le', 'utf_16_be', 'utf_16_le', 'utf_7', 'windows-1250', 'windows-1251',
+    'windows-1252', 'windows-1253', 'windows-1254', 'windows-1255', 'windows-1256',
+    'windows-1257', 'windows-1258', 'latin-2'
+]
+def find_codec(blob):
+    """Guess the name of a codec able to decode ``blob``.
+
+    chardet is run on the first 1024 bytes, and its answer is used only for one
+    shortcut: a confident (> 0.5) ``ascii`` detection returns ``"utf-8"``.
+    Otherwise each entry of ``all_codecs`` is tried in order, first against the
+    1024-byte prefix and then against the whole blob. The first codec that
+    decodes either one without raising is returned. If none does, ``"utf-8"``
+    is the fallback.
+    """
+    head = blob[:1024]
+    guess = chardet.detect(head)
+    if guess['confidence'] > 0.5 and guess['encoding'] == "ascii":
+        return "utf-8"
+    for codec in all_codecs:
+        # A prefix cut can split a multi-byte character, hence the retry on the full blob.
+        for sample in (head, blob):
+            try:
+                sample.decode(codec)
+            except Exception:
+                continue
+            return codec
+    return "utf-8"

deepdoc/depend/nltk_manager.py ADDED Viewed

@@ -0,0 +1,114 @@
+import logging
+import os
+import threading
+from pathlib import Path
+from ..common.misc_utils import offline_mode_or_from_env
+# Module-level logger named after this module.
+logger = logging.getLogger(__name__)
+# Required NLTK packages as (package name passed to nltk.download, candidate resource paths).
+# A package counts as present when any one of its candidate paths is found.
+_RESOURCE_SPECS: tuple[tuple[str, tuple[str, ...]], ...] = (
+    ("punkt", ("tokenizers/punkt", "tokenizers/punkt.zip")),
+    ("punkt_tab", ("tokenizers/punkt_tab", "tokenizers/punkt_tab.zip")),
+    ("wordnet", ("corpora/wordnet", "corpora/wordnet.zip")),
+    (
+        "averaged_perceptron_tagger",
+        (
+            "taggers/averaged_perceptron_tagger",
+            "taggers/averaged_perceptron_tagger.zip",
+            "taggers/averaged_perceptron_tagger_eng",
+            "taggers/averaged_perceptron_tagger_eng.zip",
+        ),
+    ),
+)
+# Guards the check/download section of ensure_nltk_data().
+_lock = threading.Lock()
+# (resolved data dir, offline flag) pairs already verified, so repeat calls return early.
+_ensured_keys: set[tuple[str, bool]] = set()
+def _resolve_nltk_data_dir(data_dir: str | None) -> Path | None:
+    """Pick the directory used for NLTK data.
+
+    Resolution precedence (the first non-blank value wins):
+    1) explicit ``data_dir`` argument
+    2) ``DEEPDOC_NLTK_DATA_DIR``
+    3) ``NLTK_DATA`` (common NLTK env var)
+    4) a stable Deepdoc cache location so parsers can pick it up automatically:
+       ``<DEEPDOC_MODEL_HOME>/nltk_data`` or ``~/.cache/deepdoc/nltk_data``
+
+    Args:
+        data_dir: Explicit directory, or ``None``/blank to consult the environment.
+
+    Returns:
+        The selected directory. Every branch returns a ``Path``; the optional
+        return annotation is kept for interface compatibility.
+    """
+    candidates = (
+        data_dir,
+        os.getenv("DEEPDOC_NLTK_DATA_DIR"),
+        os.getenv("NLTK_DATA"),
+    )
+    for configured in candidates:
+        # A whitespace-only value counts as unset, so it cannot mask a valid
+        # lower-precedence source (previously it forced the cache fallback).
+        if configured and configured.strip():
+            return Path(configured).expanduser().resolve()
+    model_home = os.getenv("DEEPDOC_MODEL_HOME")
+    if model_home and model_home.strip():
+        base = Path(model_home).expanduser().resolve()
+    else:
+        base = Path.home().joinpath(".cache", "deepdoc")
+    return base.joinpath("nltk_data")
+def _ensure_search_path(nltk_module, data_path: Path | None):
+    """Create ``data_path`` and register it as an NLTK search location.
+
+    The directory (and missing parents) is created, its string form is inserted
+    at the front of ``nltk_module.data.path`` unless already listed, and the
+    ``NLTK_DATA`` environment variable is overwritten with it. A falsy
+    ``data_path`` makes the call a no-op.
+    """
+    if data_path:
+        data_path.mkdir(parents=True, exist_ok=True)
+        location = str(data_path)
+        search_paths = nltk_module.data.path
+        if location not in search_paths:
+            search_paths.insert(0, location)
+        os.environ["NLTK_DATA"] = location
+def _resource_exists(nltk_module, candidates: tuple[str, ...]) -> bool:
+    """Return True if ``nltk_module.data.find`` locates any of ``candidates``.
+
+    A ``LookupError`` means "not found under this name" and is skipped silently;
+    any other exception is logged as a warning and the next candidate is tried.
+    """
+    for candidate in candidates:
+        try:
+            nltk_module.data.find(candidate)
+        except LookupError:
+            # Not installed under this name; fall through to the next one.
+            pass
+        except Exception as exc:
+            logger.warning("NLTK resource check failed for %s: %s", candidate, exc)
+        else:
+            return True
+    return False
+def ensure_nltk_data(
+    *,
+    data_dir: str | None = None,
+    offline: bool | None = None,
+) -> None:
+    """Ensure required NLTK resources are available for tokenizer usage.
+
+    The data directory is resolved and registered with NLTK, then every package
+    in ``_RESOURCE_SPECS`` is looked up. Unless offline mode is active, missing
+    packages are downloaded into the resolved directory and checked again. A
+    successful verification is remembered per (directory, offline) pair, so
+    later calls with the same pair return right after the path registration.
+
+    Args:
+        data_dir: Explicit NLTK data directory; passed to ``_resolve_nltk_data_dir``.
+        offline: Passed to ``offline_mode_or_from_env`` to decide whether
+            downloading is allowed.
+
+    Raises:
+        RuntimeError: If any required package is still missing afterwards.
+    """
+    import nltk
+    target_dir = _resolve_nltk_data_dir(data_dir)
+    is_offline = offline_mode_or_from_env(offline)
+    _ensure_search_path(nltk, target_dir)
+    memo_key = (str(target_dir) if target_dir else "", is_offline)
+    # The lock is held across the downloads, so concurrent callers wait for them.
+    with _lock:
+        if memo_key in _ensured_keys:
+            return
+        absent = [name for name, paths in _RESOURCE_SPECS if not _resource_exists(nltk, paths)]
+        if absent and not is_offline:
+            paths_by_package = dict(_RESOURCE_SPECS)
+            destination = str(target_dir) if target_dir else None
+            unresolved: list[str] = []
+            for name in absent:
+                try:
+                    fetched = nltk.download(name, quiet=True, download_dir=destination)
+                except Exception as exc:
+                    logger.warning("Failed to download NLTK package %s: %s", name, exc)
+                    fetched = False
+                # A download only counts once the resource can actually be found.
+                if not (fetched and _resource_exists(nltk, paths_by_package[name])):
+                    unresolved.append(name)
+            absent = unresolved
+        if absent:
+            searched_paths = ", ".join(nltk.data.path)
+            raise RuntimeError(
+                "Missing required NLTK packages: {}. Searched paths: {}. Set DEEPDOC_NLTK_DATA_DIR to a local NLTK data path, or disable offline mode by setting DEEPDOC_OFFLINE=0.".format(
+                    ", ".join(absent),
+                    searched_paths,
+                )
+            )
+        _ensured_keys.add(memo_key)

deepdoc/depend/prompts/vision_llm_describe_prompt.md ADDED Viewed

@@ -0,0 +1,23 @@
+## INSTRUCTION
+Transcribe the content from the provided PDF page image into clean Markdown format.
+- Only output the content transcribed from the image.
+- Do NOT output this instruction or any other explanation.
+- If the content is missing or you do not understand the input, return an empty string.
+## RULES
+1. Do NOT generate examples, demonstrations, or templates.
+2. Do NOT output any extra text such as 'Example', 'Example Output', or similar.
+3. Do NOT generate any tables, headings, or content that is not explicitly present in the image.
+4. Transcribe content word-for-word. Do NOT modify, translate, or omit any content.
+5. Do NOT explain Markdown or mention that you are using Markdown.
+6. Do NOT wrap the output in ```markdown or ``` blocks.
+7. Only apply Markdown structure to headings, paragraphs, lists, and tables, strictly based on the layout of the image. Do NOT create tables unless an actual table exists in the image.
+8. Preserve the original language, information, and order exactly as shown in the image.
+{% if page %}
+At the end of the transcription, add the page divider: `--- Page {{ page }} ---`.
+{% endif %}
+> If you do not detect valid content in the image, return an empty string.

deepdoc/depend/prompts/vision_llm_figure_describe_prompt.md ADDED Viewed

@@ -0,0 +1,24 @@
+## ROLE
+You are an expert visual data analyst.
+## GOAL
+Analyze the image and provide a comprehensive description of its content. Focus on identifying the type of visual data representation (e.g., bar chart, pie chart, line graph, table, flowchart), its structure, and any text captions or labels included in the image.
+## TASKS
+1. Describe the overall structure of the visual representation. Specify if it is a chart, graph, table, or diagram.
+2. Identify and extract any axes, legends, titles, or labels present in the image. Provide the exact text where available.
+3. Extract the data points from the visual elements (e.g., bar heights, line graph coordinates, pie chart segments, table rows and columns).
+4. Analyze and explain any trends, comparisons, or patterns shown in the data.
+5. Capture any annotations, captions, or footnotes, and explain their relevance to the image.
+6. Only include details that are explicitly present in the image. If an element (e.g., axis, legend, or caption) does not exist or is not visible, do not mention it.
+## OUTPUT FORMAT (Include only sections relevant to the image content)
+- Visual Type: [Type]
+- Title: [Title text, if available]
+- Axes / Legends / Labels: [Details, if available]
+- Data Points: [Extracted data]
+- Trends / Insights: [Analysis and interpretation]
+- Captions / Annotations: [Text and relevance, if available]
+> Ensure high accuracy, clarity, and completeness in your analysis, and include only the information present in the image. Avoid unnecessary statements about missing elements.

deepdoc/depend/prompts.py ADDED Viewed

@@ -0,0 +1,35 @@
+import jinja2
+import os
+# Directory containing this module.
+BASE_DIR = os.path.dirname(__file__)
+# Sibling "prompts" directory from which load_prompt() reads "<name>.md" files.
+PROMPT_DIR = os.path.join(BASE_DIR, "prompts")
+# Cache of prompt name -> stripped file content, filled by load_prompt().
+_loaded_prompts = {}
+def load_prompt(name: str) -> str:
+    """Return the stripped text of ``<PROMPT_DIR>/<name>.md``, caching it by name.
+
+    Raises:
+        FileNotFoundError: If no such file exists in the prompts directory.
+    """
+    try:
+        return _loaded_prompts[name]
+    except KeyError:
+        pass
+    prompt_file = os.path.join(PROMPT_DIR, f"{name}.md")
+    if not os.path.isfile(prompt_file):
+        raise FileNotFoundError(f"Prompt file '{name}.md' not found in prompts/ directory at {PROMPT_DIR}.")
+    with open(prompt_file, "r", encoding="utf-8") as handle:
+        text = handle.read().strip()
+    _loaded_prompts[name] = text
+    return text
+# Both prompt templates are read from disk at import time via load_prompt().
+VISION_LLM_DESCRIBE_PROMPT = load_prompt("vision_llm_describe_prompt")
+VISION_LLM_FIGURE_DESCRIBE_PROMPT = load_prompt("vision_llm_figure_describe_prompt")
+# Shared Jinja2 environment for rendering the prompts: autoescape off, trim_blocks and lstrip_blocks on.
+PROMPT_JINJA_ENV = jinja2.Environment(autoescape=False, trim_blocks=True, lstrip_blocks=True)
+def vision_llm_describe_prompt(page=None) -> str:
+    """Render ``VISION_LLM_DESCRIBE_PROMPT`` with ``page`` supplied as the ``page`` template variable."""
+    return PROMPT_JINJA_ENV.from_string(VISION_LLM_DESCRIBE_PROMPT).render(page=page)
+def vision_llm_figure_describe_prompt() -> str:
+    """Render ``VISION_LLM_FIGURE_DESCRIBE_PROMPT`` with no template variables."""
+    return PROMPT_JINJA_ENV.from_string(VISION_LLM_FIGURE_DESCRIBE_PROMPT).render()