deepdoc-lib 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepdoc/README.md +122 -0
- deepdoc/README_zh.md +116 -0
- deepdoc/__init__.py +43 -0
- deepdoc/_version.py +34 -0
- deepdoc/common/__init__.py +52 -0
- deepdoc/common/config_utils.py +63 -0
- deepdoc/common/connection_utils.py +73 -0
- deepdoc/common/file_utils.py +19 -0
- deepdoc/common/misc_utils.py +44 -0
- deepdoc/common/model_store.py +369 -0
- deepdoc/common/settings.py +42 -0
- deepdoc/common/tiktoken_cache.py +84 -0
- deepdoc/common/token_utils.py +96 -0
- deepdoc/config.py +149 -0
- deepdoc/depend/find_codec.py +42 -0
- deepdoc/depend/nltk_manager.py +114 -0
- deepdoc/depend/prompts/vision_llm_describe_prompt.md +23 -0
- deepdoc/depend/prompts/vision_llm_figure_describe_prompt.md +24 -0
- deepdoc/depend/prompts.py +35 -0
- deepdoc/depend/rag_tokenizer.py +578 -0
- deepdoc/depend/simple_cv_model.py +469 -0
- deepdoc/depend/surname.py +91 -0
- deepdoc/depend/timeout.py +73 -0
- deepdoc/depend/vision_llm_chunk.py +35 -0
- deepdoc/dict/README.md +19 -0
- deepdoc/dict/huqie.txt +555629 -0
- deepdoc/download_models.py +169 -0
- deepdoc/llm_adapter/__init__.py +15 -0
- deepdoc/llm_adapter/adapter.py +223 -0
- deepdoc/llm_adapter/utils.py +104 -0
- deepdoc/llm_adapter/vision.py +163 -0
- deepdoc/parser/__init__.py +42 -0
- deepdoc/parser/docling_parser.py +889 -0
- deepdoc/parser/docx_parser.py +150 -0
- deepdoc/parser/excel_parser.py +270 -0
- deepdoc/parser/figure_parser.py +182 -0
- deepdoc/parser/html_parser.py +221 -0
- deepdoc/parser/json_parser.py +179 -0
- deepdoc/parser/markdown_parser.py +321 -0
- deepdoc/parser/mineru_parser.py +646 -0
- deepdoc/parser/pdf_parser.py +1591 -0
- deepdoc/parser/ppt_parser.py +96 -0
- deepdoc/parser/resume/__init__.py +109 -0
- deepdoc/parser/resume/entities/__init__.py +15 -0
- deepdoc/parser/resume/entities/corporations.py +128 -0
- deepdoc/parser/resume/entities/degrees.py +44 -0
- deepdoc/parser/resume/entities/industries.py +712 -0
- deepdoc/parser/resume/entities/regions.py +789 -0
- deepdoc/parser/resume/entities/res/corp.tks.freq.json +65 -0
- deepdoc/parser/resume/entities/res/corp_baike_len.csv +31480 -0
- deepdoc/parser/resume/entities/res/corp_tag.json +14939 -0
- deepdoc/parser/resume/entities/res/good_corp.json +911 -0
- deepdoc/parser/resume/entities/res/good_sch.json +595 -0
- deepdoc/parser/resume/entities/res/school.rank.csv +1627 -0
- deepdoc/parser/resume/entities/res/schools.csv +5713 -0
- deepdoc/parser/resume/entities/schools.py +91 -0
- deepdoc/parser/resume/step_one.py +189 -0
- deepdoc/parser/resume/step_two.py +692 -0
- deepdoc/parser/tcadp_parser.py +538 -0
- deepdoc/parser/txt_parser.py +64 -0
- deepdoc/parser/utils.py +33 -0
- deepdoc/vision/__init__.py +90 -0
- deepdoc/vision/layout_recognizer.py +481 -0
- deepdoc/vision/ocr.py +757 -0
- deepdoc/vision/operators.py +733 -0
- deepdoc/vision/postprocess.py +370 -0
- deepdoc/vision/recognizer.py +451 -0
- deepdoc/vision/seeit.py +87 -0
- deepdoc/vision/t_ocr.py +101 -0
- deepdoc/vision/t_recognizer.py +186 -0
- deepdoc/vision/table_structure_recognizer.py +617 -0
- deepdoc_lib-0.2.0.dist-info/METADATA +246 -0
- deepdoc_lib-0.2.0.dist-info/RECORD +78 -0
- deepdoc_lib-0.2.0.dist-info/WHEEL +5 -0
- deepdoc_lib-0.2.0.dist-info/entry_points.txt +2 -0
- deepdoc_lib-0.2.0.dist-info/licenses/LICENSE +201 -0
- deepdoc_lib-0.2.0.dist-info/top_level.txt +2 -0
- scripts/download_models.py +10 -0
|
@@ -0,0 +1,369 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
#
|
|
16
|
+
|
|
17
|
+
"""Model artifact resolution for local files or ModelScope downloads."""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
import inspect
|
|
22
|
+
import logging
|
|
23
|
+
import os
|
|
24
|
+
from dataclasses import dataclass
|
|
25
|
+
from importlib import resources
|
|
26
|
+
from pathlib import Path
|
|
27
|
+
from ..common.misc_utils import offline_mode_or_from_env
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# Environment variable naming one ModelScope repo shared by all bundles.
GLOBAL_MODELSCOPE_REPO_ENV = "DEEPDOC_MODELSCOPE_REPO"
# Environment variable naming the revision used when no per-bundle revision is set.
GLOBAL_MODELSCOPE_REVISION_ENV = "DEEPDOC_MODELSCOPE_REVISION"
# Environment variable pointing at a directory that contains huqie.txt.
TOKENIZER_MODEL_DIR_ENV = "DEEPDOC_TOKENIZER_MODEL_DIR"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _normalize_provider(provider: str | None) -> str:
    """Normalize a model provider name to ``auto``, ``local`` or ``modelscope``.

    The explicit ``provider`` argument takes precedence; when it is unset the
    ``DEEPDOC_MODEL_PROVIDER`` environment variable is consulted. Blank values
    (``None``, empty or whitespace-only) at either level count as "not set",
    so the final fallback is ``auto``.

    Matching is case-insensitive and accepts the aliases ``ms``/``remote``
    (for ``modelscope``) and ``filesystem``/``user`` (for ``local``).

    Raises:
        ValueError: if the resulting name is not a supported provider.
    """
    requested = (provider or "").strip()
    if not requested:
        # An exported-but-empty variable must not be treated as a provider name.
        requested = (os.getenv("DEEPDOC_MODEL_PROVIDER") or "").strip()
    normalized = (requested or "auto").lower()

    aliases = {
        "ms": "modelscope",
        "remote": "modelscope",
        "filesystem": "local",
        "user": "local",
    }
    normalized = aliases.get(normalized, normalized)
    if normalized not in {"auto", "local", "modelscope"}:
        raise ValueError("Unsupported model provider '{}'. Use one of: auto, local, modelscope.".format(normalized))
    return normalized
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _model_home_path(model_home: str | None) -> Path:
    """Return the root directory under which model bundles are stored.

    An explicit ``model_home`` wins over the ``DEEPDOC_MODEL_HOME`` environment
    variable; either one is user-expanded and resolved. When neither is set,
    ``~/.cache/deepdoc`` is returned without being resolved.
    """
    override = model_home or os.getenv("DEEPDOC_MODEL_HOME")
    if not override:
        return Path.home() / ".cache" / "deepdoc"
    return Path(override).expanduser().resolve()
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _resolve_tokenizer_dict_path() -> Path:
    """Locate the ``huqie.txt`` tokenizer dictionary.

    The directory named by the ``TOKENIZER_MODEL_DIR_ENV`` environment variable
    is used when set; otherwise the copy under the ``deepdoc`` package's
    ``dict`` resource directory is used.

    Raises:
        FileNotFoundError: if the chosen location has no ``huqie.txt``.
    """
    override_dir = os.getenv(TOKENIZER_MODEL_DIR_ENV)
    if override_dir:
        dict_path = Path(override_dir).expanduser().resolve() / "huqie.txt"
    else:
        packaged = resources.files("deepdoc").joinpath("dict", "huqie.txt")
        dict_path = Path(str(packaged)).resolve()

    if dict_path.exists():
        return dict_path
    raise FileNotFoundError("Tokenizer dictionary not found: {}. Set {} to a directory containing huqie.txt.".format(dict_path, TOKENIZER_MODEL_DIR_ENV))
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
@dataclass(frozen=True)
class BundleSpec:
    """Immutable description of one model bundle and how to locate it."""

    # Bundle identifier used in messages and as the registry key.
    name: str
    # Sub-directory of the model home that holds the bundle locally.
    subdir: str
    # File names that must all exist in one directory for the bundle to be valid.
    required_files: tuple[str, ...]
    # Environment variable that overrides the local bundle directory.
    local_dir_env: str
    # Environment variable that overrides the ModelScope repo for this bundle.
    repo_env: str
    # ModelScope repo used when no environment override is present.
    repo_default: str
    # Environment variable that overrides the ModelScope revision for this bundle.
    revision_env: str
    # Revision used when no environment override is present.
    revision_default: str = "master"
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
# Registry of known bundles, keyed by each spec's own ``name``.
BUNDLES: dict[str, BundleSpec] = {
    spec.name: spec
    for spec in (
        BundleSpec(
            name="vision",
            subdir="vision",
            required_files=(
                "det.onnx",
                "layout.onnx",
                "layout.laws.onnx",
                "layout.manual.onnx",
                "layout.paper.onnx",
                "ocr.res",
                "rec.onnx",
                "tsr.onnx",
            ),
            local_dir_env="DEEPDOC_VISION_MODEL_DIR",
            repo_env="DEEPDOC_MODELSCOPE_VISION_REPO",
            repo_default="Xorbits/deepdoc",
            revision_env="DEEPDOC_MODELSCOPE_VISION_REVISION",
        ),
        BundleSpec(
            name="xgb",
            subdir="xgb",
            required_files=("updown_concat_xgb.model",),
            local_dir_env="DEEPDOC_XGB_MODEL_DIR",
            repo_env="DEEPDOC_MODELSCOPE_XGB_REPO",
            repo_default="Xorbits/deepdoc",
            revision_env="DEEPDOC_MODELSCOPE_XGB_REVISION",
        ),
    )
}
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _resolve_modelscope_repo_id(spec: BundleSpec) -> tuple[str, bool]:
    """Pick the ModelScope repo id for ``spec`` and say whether it is a shared repo.

    Lookup order:
      1) the per-bundle environment variable (``spec.repo_env``)
      2) the shared environment variable (``DEEPDOC_MODELSCOPE_REPO``)
      3) the bundle's built-in default (``spec.repo_default``)

    Returns:
        ``(repo_id, use_shared_download_dir)``. The flag is true when the repo
        comes from (or equals) the shared variable, or when every registered
        bundle has the same default repo.
    """
    per_bundle = (os.getenv(spec.repo_env) or "").strip()
    shared = (os.getenv(GLOBAL_MODELSCOPE_REPO_ENV) or "").strip()

    if per_bundle:
        # A per-bundle override only shares the directory when it names the shared repo.
        return per_bundle, shared == per_bundle

    if shared:
        return shared, True

    fallback = spec.repo_default.strip()
    # One distinct default across all bundles means a single combined repo.
    distinct_defaults = {other.repo_default.strip() for other in BUNDLES.values()}
    return fallback, len(distinct_defaults) == 1 and fallback in distinct_defaults
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def _resolve_modelscope_revision(spec: BundleSpec) -> str:
    """Pick the ModelScope revision for ``spec``.

    The per-bundle variable is checked first, then the shared one; blank
    values are ignored. ``spec.revision_default`` is the final fallback.
    """
    for env_name in (spec.revision_env, GLOBAL_MODELSCOPE_REVISION_ENV):
        candidate = (os.getenv(env_name) or "").strip()
        if candidate:
            return candidate
    return spec.revision_default
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def _slugify_repo_path(value: str) -> str:
    """Turn a repo id or revision into a single path component.

    Surrounding whitespace is dropped and each ``/``, ``:`` or ``\\`` is
    replaced with a double underscore.
    """
    separators = str.maketrans({"/": "__", ":": "__", "\\": "__"})
    return value.strip().translate(separators)
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def _modelscope_shared_download_dir(model_home: str | None, repo_id: str, revision: str) -> Path:
    """Return ``<model home>/modelscope/<repo slug>/<revision slug>``.

    Keying the directory on repo and revision keeps snapshots of different
    repos or revisions from colliding.
    """
    repo_slug = _slugify_repo_path(repo_id)
    revision_slug = _slugify_repo_path(revision)
    return _model_home_path(model_home) / "modelscope" / repo_slug / revision_slug
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def _validate_bundle_dir(spec: BundleSpec, base_dir: Path) -> tuple[bool, list[str]]:
    """Check that every required file of ``spec`` exists directly in ``base_dir``.

    Returns:
        ``(ok, missing)`` where ``missing`` lists the absent file names in
        ``spec.required_files`` order and ``ok`` is true when it is empty.
    """
    absent: list[str] = []
    for filename in spec.required_files:
        if not (base_dir / filename).exists():
            absent.append(filename)
    return len(absent) == 0, absent
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def _discover_bundle_dir(spec: BundleSpec, roots: list[Path]) -> Path | None:
    """Find a directory under ``roots`` that holds every required file of ``spec``.

    Each root is tried as-is first. Only if none qualifies are existing roots
    searched recursively: the parent of any file matching a required name is
    tested as a candidate. Returns the first valid directory, or ``None``.
    """
    # Fast path: one of the roots is itself a complete bundle directory.
    for root in roots:
        if _validate_bundle_dir(spec, root)[0]:
            return root

    # Slow path: look for the bundle nested somewhere below an existing root.
    nested_dirs = (
        hit.parent
        for root in roots
        if root.exists()
        for required in spec.required_files
        for hit in root.rglob(required)
    )
    return next((folder for folder in nested_dirs if _validate_bundle_dir(spec, folder)[0]), None)
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def _import_modelscope_snapshot_download():
    """Import and return ModelScope's ``snapshot_download`` callable.

    ``modelscope.hub.snapshot_download`` is tried first, then the top-level
    ``modelscope`` package.

    Raises:
        RuntimeError: if neither import succeeds; chained to the second failure.
    """
    try:
        from modelscope.hub.snapshot_download import snapshot_download as downloader  # type: ignore
    except Exception:
        try:
            from modelscope import snapshot_download as downloader  # type: ignore
        except Exception as import_error:  # pragma: no cover - import behavior depends on runtime env
            raise RuntimeError("ModelScope provider requires the 'modelscope' package. Install it or switch DEEPDOC_MODEL_PROVIDER=local.") from import_error
    return downloader
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def _download_modelscope_repo(*, repo_id: str, revision: str, target_dir: Path, offline: bool) -> Path:
    """Download a ModelScope repo snapshot and return its resolved root.

    Only keyword arguments that the installed ``snapshot_download`` declares
    in its signature are passed, so differing ModelScope signatures are
    tolerated.

    Raises:
        RuntimeError: if ModelScope cannot be imported or ``repo_id`` is empty.
    """
    snapshot_download = _import_modelscope_snapshot_download()

    if not repo_id:
        raise RuntimeError(f"ModelScope repo id is empty. Set {GLOBAL_MODELSCOPE_REPO_ENV} or a bundle-specific env like DEEPDOC_MODELSCOPE_VISION_REPO.")

    target_dir.mkdir(parents=True, exist_ok=True)

    accepted = inspect.signature(snapshot_download).parameters

    # Optional arguments, each forwarded only when the callee declares it.
    optional_args: dict[str, object] = {
        "revision": revision,
        "cache_dir": str(target_dir.parent),
        "local_dir": str(target_dir),
        "local_dir_use_symlinks": False,
        "local_files_only": offline,
    }
    call_kwargs = {name: value for name, value in optional_args.items() if name in accepted}

    # The repo goes by keyword when the callee names it, else positionally.
    id_keyword = next((name for name in ("model_id", "repo_id") if name in accepted), None)
    if id_keyword is None:
        snapshot_root = snapshot_download(repo_id, **call_kwargs)
    else:
        call_kwargs[id_keyword] = repo_id
        snapshot_root = snapshot_download(**call_kwargs)

    resolved_snapshot = Path(snapshot_root).expanduser().resolve()
    logging.info("Downloaded ModelScope repo %s@%s to %s", repo_id, revision, resolved_snapshot)
    return resolved_snapshot
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def resolve_bundle_dir(
    bundle: str,
    *,
    model_home: str | None = None,
    provider: str | None = None,
    offline: bool | None = None,
) -> str:
    """Resolve a model bundle directory from local files or ModelScope.

    Resolution order:
      1) the local bundle directory (``spec.local_dir_env`` or
         ``<model home>/<spec.subdir>``);
      2) for the ``auto``/``modelscope`` providers with a shared repo, the
         ``<model home>/modelscope/<repo>/<revision>`` snapshot directory.
         This is scanned in offline mode too, because scanning is local-only
         and is what lets earlier downloads be reused offline;
      3) a fresh ModelScope download, unless the provider is ``local`` or
         offline mode is on.

    Args:
        bundle: key into ``BUNDLES``.
        model_home: optional override of the model home directory.
        provider: optional provider name; see ``_normalize_provider``.
        offline: optional offline flag, passed to ``offline_mode_or_from_env``.

    Returns:
        The directory containing all of the bundle's required files.

    Raises:
        ValueError: if ``bundle`` is unknown or the provider is unsupported.
        FileNotFoundError: if the bundle is not available locally and cannot
            be downloaded, or the download lacks the required files.
        RuntimeError: propagated from the ModelScope download helper.
    """

    if bundle not in BUNDLES:
        raise ValueError(f"Unknown model bundle '{bundle}'. Expected one of: {', '.join(BUNDLES)}")

    spec = BUNDLES[bundle]
    provider_name = _normalize_provider(provider)
    offline_mode = offline_mode_or_from_env(offline)

    explicit_local = os.getenv(spec.local_dir_env)
    if explicit_local:
        local_bundle_dir = Path(explicit_local).expanduser().resolve()
    else:
        local_bundle_dir = _model_home_path(model_home).joinpath(spec.subdir)

    roots_to_scan = [local_bundle_dir]

    # Resolve the remote coordinates once; both the reuse scan and the
    # download below need them. The "local" provider never looks at them.
    repo_id: str | None = None
    revision: str | None = None
    shared_download_dir: Path | None = None
    if provider_name in {"auto", "modelscope"}:
        repo_id, use_shared_repo_dir = _resolve_modelscope_repo_id(spec)
        revision = _resolve_modelscope_revision(spec)
        if use_shared_repo_dir:
            shared_download_dir = _modelscope_shared_download_dir(model_home, repo_id, revision)
            roots_to_scan.append(shared_download_dir)

    discovered = _discover_bundle_dir(spec, roots_to_scan)
    if discovered:
        return str(discovered)

    if provider_name == "local":
        _, missing = _validate_bundle_dir(spec, local_bundle_dir)
        raise FileNotFoundError(
            "Missing required files for local '{}' bundle under {}: {}. Set {} or DEEPDOC_MODEL_HOME to a directory containing these files.".format(
                spec.name, local_bundle_dir, ", ".join(missing), spec.local_dir_env
            )
        )

    if offline_mode:
        raise FileNotFoundError("Bundle '{}' was not found locally at {} and remote download is disabled. Disable DEEPDOC_OFFLINE or provide local model files.".format(spec.name, local_bundle_dir))

    # Shared repos download into their own snapshot directory; a bundle with
    # its own repo downloads straight into the local bundle directory.
    download_dir = shared_download_dir if shared_download_dir is not None else local_bundle_dir

    snapshot_root = _download_modelscope_repo(
        repo_id=repo_id,
        revision=revision,
        target_dir=download_dir,
        offline=offline_mode,
    )
    discovered = _discover_bundle_dir(spec, [local_bundle_dir, download_dir, snapshot_root])
    if discovered:
        return str(discovered)

    raise FileNotFoundError(
        "Downloaded ModelScope repo '{}@{}' for bundle '{}' but could not locate the required files. "
        "Expected the following files to be colocated under a single directory in the repo (e.g. '{}/'): {}. "
        "Configured via {} / {} and {} / {}. "
        "Searched under: {}, {}.".format(
            repo_id,
            revision,
            spec.name,
            spec.subdir,
            ", ".join(spec.required_files),
            spec.repo_env,
            GLOBAL_MODELSCOPE_REPO_ENV,
            spec.revision_env,
            GLOBAL_MODELSCOPE_REVISION_ENV,
            download_dir,
            snapshot_root,
        )
    )
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
def validate_bundle_dir(bundle: str, directory: str | Path) -> tuple[bool, list[str]]:
    """Check a directory against the required files of a named bundle.

    Returns:
        ``(ok, missing)`` as produced by ``_validate_bundle_dir`` for the
        user-expanded, resolved ``directory``.

    Raises:
        ValueError: if ``bundle`` is not a key of ``BUNDLES``.
    """
    spec = BUNDLES.get(bundle)
    if spec is None:
        raise ValueError(f"Unknown model bundle '{bundle}'. Expected one of: {', '.join(BUNDLES)}")

    resolved_dir = Path(directory).expanduser().resolve()
    return _validate_bundle_dir(spec, resolved_dir)
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
def resolve_vision_model_dir(
    *,
    model_home: str | None = None,
    provider: str | None = None,
    offline: bool | None = None,
) -> str:
    """Resolve the ``vision`` bundle directory; forwards to ``resolve_bundle_dir``."""
    options = {"model_home": model_home, "provider": provider, "offline": offline}
    return resolve_bundle_dir("vision", **options)
|
|
351
|
+
|
|
352
|
+
|
|
353
|
+
def resolve_xgb_model_dir(
    *,
    model_home: str | None = None,
    provider: str | None = None,
    offline: bool | None = None,
) -> str:
    """Resolve the ``xgb`` bundle directory; forwards to ``resolve_bundle_dir``."""
    options = {"model_home": model_home, "provider": provider, "offline": offline}
    return resolve_bundle_dir("xgb", **options)
|
|
360
|
+
|
|
361
|
+
|
|
362
|
+
def resolve_tokenizer_dict_prefix(
    *,
    model_home: str | None = None,
    provider: str | None = None,
    offline: bool | None = None,
) -> str:
    """Return the tokenizer dictionary path with its file suffix removed.

    The keyword arguments mirror those of the other ``resolve_*`` helpers in
    this module but are not used here.
    """
    del model_home, provider, offline
    dictionary_file = _resolve_tokenizer_dict_path()
    return str(dictionary_file.with_suffix(""))
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
#
|
|
16
|
+
|
|
17
|
+
"""
|
|
18
|
+
Simplified settings for DeepDoc independent library.
|
|
19
|
+
Only includes configurations needed by DeepDoc components.
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
import logging
|
|
23
|
+
|
|
24
|
+
# GPU device count detection
|
|
25
|
+
# Number of CUDA devices reported by PyTorch; stays 0 when torch is unavailable.
PARALLEL_DEVICES: int = 0


def check_and_install_torch():
    """Detect GPU devices through PyTorch and store the count in ``PARALLEL_DEVICES``.

    Despite the name, nothing is installed: the function only tries to import
    ``torch.cuda`` and read ``device_count()``. Any failure (torch missing,
    no usable GPU runtime) is logged at INFO level and the count is reset to 0.
    """
    global PARALLEL_DEVICES
    try:
        import torch.cuda

        PARALLEL_DEVICES = torch.cuda.device_count()
        # Lazy %-style arguments: the message is only formatted if it is emitted.
        logging.info("Found %s GPUs", PARALLEL_DEVICES)
    except Exception as e:
        logging.info("Can't import package 'torch' or access GPU: %s", e)
        PARALLEL_DEVICES = 0


# Initialize on import
check_and_install_torch()
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import hashlib
|
|
4
|
+
import os
|
|
5
|
+
import uuid
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from urllib.request import urlopen
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# Environment variable that overrides where the tiktoken cache lives.
DEEPDOC_TIKTOKEN_CACHE_DIR_ENV = "DEEPDOC_TIKTOKEN_CACHE_DIR"
# Download location of the cl100k_base encoding file.
CL100K_BASE_BLOB_URL = "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
# Hex SHA-256 digest the downloaded file is checked against.
CL100K_BASE_EXPECTED_HASH = "223921b76ee99bde995b7ff738513eef100fb51d18c93597a113bcffe865b2a7"
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def resolve_tiktoken_cache_dir(cache_dir: str | None = None, model_home: str | None = None) -> Path:
    """Work out which directory holds the tiktoken cache.

    Precedence:
      1) the ``cache_dir`` argument
      2) ``DEEPDOC_TIKTOKEN_CACHE_DIR``, then ``TIKTOKEN_CACHE_DIR``
      3) ``tiktoken_cache`` under ``model_home`` or ``DEEPDOC_MODEL_HOME``
      4) ``~/.cache/deepdoc/tiktoken_cache``

    The returned path is always user-expanded and resolved.
    """

    def _normalized(raw: str) -> Path:
        return Path(raw).expanduser().resolve()

    override = cache_dir or os.getenv(DEEPDOC_TIKTOKEN_CACHE_DIR_ENV) or os.getenv("TIKTOKEN_CACHE_DIR")
    if override:
        return _normalized(override)

    home_root = model_home or os.getenv("DEEPDOC_MODEL_HOME")
    if home_root:
        return _normalized(home_root) / "tiktoken_cache"

    return (Path.home() / ".cache" / "deepdoc" / "tiktoken_cache").resolve()
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def configure_tiktoken_cache_env(cache_dir: str | None = None, model_home: str | None = None) -> str:
    """Set ``TIKTOKEN_CACHE_DIR`` in the process environment to the resolved cache directory.

    Returns:
        The same directory, as a string.
    """
    cache_path = str(resolve_tiktoken_cache_dir(cache_dir=cache_dir, model_home=model_home))
    os.environ["TIKTOKEN_CACHE_DIR"] = cache_path
    return cache_path
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def cl100k_base_cache_key(blob_url: str = CL100K_BASE_BLOB_URL) -> str:
    """Return the hex SHA-1 digest of ``blob_url``, used as the cache file name."""
    digest = hashlib.sha1()
    digest.update(blob_url.encode())
    return digest.hexdigest()
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _matches_expected_hash(data: bytes, expected_hash: str | None) -> bool:
    """Tell whether the SHA-256 hex digest of ``data`` equals ``expected_hash``.

    A missing or empty ``expected_hash`` disables the check and yields ``True``.
    """
    if expected_hash:
        return hashlib.sha256(data).hexdigest() == expected_hash
    return True
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def download_cl100k_base(
    *,
    cache_dir: str | None = None,
    model_home: str | None = None,
    offline: bool = False,
    blob_url: str = CL100K_BASE_BLOB_URL,
    expected_hash: str = CL100K_BASE_EXPECTED_HASH,
    timeout: int = 60,
) -> Path:
    """Ensure the cl100k_base encoding file is in the tiktoken cache and return its path.

    A cached file whose SHA-256 matches ``expected_hash`` is reused. A cached
    file that fails the check is deleted and, unless ``offline`` is set,
    downloaded again from ``blob_url``.

    Args:
        cache_dir: explicit cache directory; see ``resolve_tiktoken_cache_dir``.
        model_home: model home used when no cache directory is configured.
        offline: when true, never touch the network.
        blob_url: download URL; its SHA-1 digest is the cache file name.
        expected_hash: SHA-256 hex digest required of the file; a falsy value
            disables verification.
        timeout: timeout in seconds passed to ``urlopen``.

    Raises:
        FileNotFoundError: offline and no valid cached file is present.
        ValueError: the downloaded data does not match ``expected_hash``.
    """
    resolved_cache_dir = resolve_tiktoken_cache_dir(cache_dir=cache_dir, model_home=model_home)
    cache_key = cl100k_base_cache_key(blob_url)
    target_path = resolved_cache_dir.joinpath(cache_key)

    if target_path.exists():
        data = target_path.read_bytes()
        if _matches_expected_hash(data, expected_hash):
            return target_path
        # Corrupt or stale entry. missing_ok tolerates another process having
        # removed it between the read and the unlink.
        target_path.unlink(missing_ok=True)

    if offline:
        raise FileNotFoundError(
            "Missing cached tiktoken encoder '{}'. Expected file at {}. Run the download command without --offline first."
            .format(cache_key, target_path)
        )

    with urlopen(blob_url, timeout=timeout) as response:
        data = response.read()

    if not _matches_expected_hash(data, expected_hash):
        raise ValueError(
            "Hash mismatch for tiktoken encoder downloaded from {}. Expected SHA256 {}."
            .format(blob_url, expected_hash)
        )

    resolved_cache_dir.mkdir(parents=True, exist_ok=True)
    # Write to a uniquely named sibling and rename into place, so readers
    # never observe a partially written cache file.
    tmp_path = target_path.with_name("{}.{}.tmp".format(target_path.name, uuid.uuid4().hex))
    try:
        tmp_path.write_bytes(data)
        tmp_path.replace(target_path)
    except BaseException:
        # Do not leave the temp file behind if the write or rename fails.
        tmp_path.unlink(missing_ok=True)
        raise
    return target_path
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
#
|
|
16
|
+
|
|
17
|
+
import tiktoken
|
|
18
|
+
|
|
19
|
+
from .tiktoken_cache import configure_tiktoken_cache_env
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# Module-level cache for the encoder; filled in by _get_encoder on first use.
_encoder = None


def _get_encoder():
    """Return the cached ``cl100k_base`` encoder, creating it on first use.

    The tiktoken cache environment is configured before the encoding is loaded.
    """
    global _encoder
    if _encoder is not None:
        return _encoder
    configure_tiktoken_cache_env()
    _encoder = tiktoken.get_encoding("cl100k_base")
    return _encoder
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def num_tokens_from_string(string: str) -> int:
    """Count the tokens in ``string`` using the shared encoder.

    Best effort: any failure to obtain the encoder or to encode yields 0.
    """
    try:
        return len(_get_encoder().encode(string))
    except Exception:
        return 0
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
# Marks "this response layout is not present", as opposed to a real count.
_NO_TOKEN_COUNT = object()

# Attribute chains that may hold a token count on object-style responses.
_TOKEN_ATTR_LAYOUTS = (
    ("usage", "total_tokens"),
    ("usage_metadata", "total_tokens"),
    ("meta", "billed_units", "input_tokens"),
)

# (key path, fields to add up) for dict-style responses.
_TOKEN_DICT_LAYOUTS = (
    (("usage",), ("total_tokens",)),
    (("usage",), ("input_tokens", "output_tokens")),
    (("meta", "tokens"), ("input_tokens", "output_tokens")),
)


def _token_count_from_attrs(resp, attr_path):
    """Follow ``attr_path`` through nested attributes; sentinel if a hop is missing or raises."""
    node = resp
    try:
        for attr in attr_path:
            if not hasattr(node, attr):
                return _NO_TOKEN_COUNT
            node = getattr(node, attr)
    except Exception:
        return _NO_TOKEN_COUNT
    return node


def _token_count_from_dict(resp, key_path, fields):
    """Follow ``key_path`` through nested containers and add up ``fields``.

    Returns the sentinel when a key is absent, an intermediate value does not
    support lookup (for example ``None``), or the values cannot be added.
    """
    node = resp
    try:
        for key in key_path:
            if key not in node:
                return _NO_TOKEN_COUNT
            node = node[key]
        if any(field not in node for field in fields):
            return _NO_TOKEN_COUNT
        total = node[fields[0]]
        for field in fields[1:]:
            total = total + node[field]
        return total
    except Exception:
        return _NO_TOKEN_COUNT


def total_token_count_from_response(resp):
    """
    Extract token count from LLM response in various formats.

    Object-style responses are checked first, in this order:
    ``resp.usage.total_tokens``, ``resp.usage_metadata.total_tokens``,
    ``resp.meta.billed_units.input_tokens``. Dict responses are then checked
    for ``usage.total_tokens``, ``usage.input_tokens + usage.output_tokens``
    and ``meta.tokens.input_tokens + meta.tokens.output_tokens``.

    Malformed structures (for example ``{"usage": None}``) are skipped rather
    than raising.

    Returns 0 if ``resp`` is None or the token count cannot be determined.
    """
    if resp is None:
        return 0

    for attr_path in _TOKEN_ATTR_LAYOUTS:
        count = _token_count_from_attrs(resp, attr_path)
        if count is not _NO_TOKEN_COUNT:
            return count

    if isinstance(resp, dict):
        for key_path, fields in _TOKEN_DICT_LAYOUTS:
            count = _token_count_from_dict(resp, key_path, fields)
            if count is not _NO_TOKEN_COUNT:
                return count

    return 0
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def truncate(string: str, max_len: int) -> str:
    """Cut ``string`` down to at most ``max_len`` tokens.

    If the encoder cannot be obtained or encoding/decoding fails, the first
    ``max_len`` characters are returned instead.
    """
    try:
        codec = _get_encoder()
        token_ids = codec.encode(string)
        return codec.decode(token_ids[:max_len])
    except Exception:
        return string[:max_len]
|