deepdoc-lib 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. deepdoc/README.md +122 -0
  2. deepdoc/README_zh.md +116 -0
  3. deepdoc/__init__.py +43 -0
  4. deepdoc/_version.py +34 -0
  5. deepdoc/common/__init__.py +52 -0
  6. deepdoc/common/config_utils.py +63 -0
  7. deepdoc/common/connection_utils.py +73 -0
  8. deepdoc/common/file_utils.py +19 -0
  9. deepdoc/common/misc_utils.py +44 -0
  10. deepdoc/common/model_store.py +369 -0
  11. deepdoc/common/settings.py +42 -0
  12. deepdoc/common/tiktoken_cache.py +84 -0
  13. deepdoc/common/token_utils.py +96 -0
  14. deepdoc/config.py +149 -0
  15. deepdoc/depend/find_codec.py +42 -0
  16. deepdoc/depend/nltk_manager.py +114 -0
  17. deepdoc/depend/prompts/vision_llm_describe_prompt.md +23 -0
  18. deepdoc/depend/prompts/vision_llm_figure_describe_prompt.md +24 -0
  19. deepdoc/depend/prompts.py +35 -0
  20. deepdoc/depend/rag_tokenizer.py +578 -0
  21. deepdoc/depend/simple_cv_model.py +469 -0
  22. deepdoc/depend/surname.py +91 -0
  23. deepdoc/depend/timeout.py +73 -0
  24. deepdoc/depend/vision_llm_chunk.py +35 -0
  25. deepdoc/dict/README.md +19 -0
  26. deepdoc/dict/huqie.txt +555629 -0
  27. deepdoc/download_models.py +169 -0
  28. deepdoc/llm_adapter/__init__.py +15 -0
  29. deepdoc/llm_adapter/adapter.py +223 -0
  30. deepdoc/llm_adapter/utils.py +104 -0
  31. deepdoc/llm_adapter/vision.py +163 -0
  32. deepdoc/parser/__init__.py +42 -0
  33. deepdoc/parser/docling_parser.py +889 -0
  34. deepdoc/parser/docx_parser.py +150 -0
  35. deepdoc/parser/excel_parser.py +270 -0
  36. deepdoc/parser/figure_parser.py +182 -0
  37. deepdoc/parser/html_parser.py +221 -0
  38. deepdoc/parser/json_parser.py +179 -0
  39. deepdoc/parser/markdown_parser.py +321 -0
  40. deepdoc/parser/mineru_parser.py +646 -0
  41. deepdoc/parser/pdf_parser.py +1591 -0
  42. deepdoc/parser/ppt_parser.py +96 -0
  43. deepdoc/parser/resume/__init__.py +109 -0
  44. deepdoc/parser/resume/entities/__init__.py +15 -0
  45. deepdoc/parser/resume/entities/corporations.py +128 -0
  46. deepdoc/parser/resume/entities/degrees.py +44 -0
  47. deepdoc/parser/resume/entities/industries.py +712 -0
  48. deepdoc/parser/resume/entities/regions.py +789 -0
  49. deepdoc/parser/resume/entities/res/corp.tks.freq.json +65 -0
  50. deepdoc/parser/resume/entities/res/corp_baike_len.csv +31480 -0
  51. deepdoc/parser/resume/entities/res/corp_tag.json +14939 -0
  52. deepdoc/parser/resume/entities/res/good_corp.json +911 -0
  53. deepdoc/parser/resume/entities/res/good_sch.json +595 -0
  54. deepdoc/parser/resume/entities/res/school.rank.csv +1627 -0
  55. deepdoc/parser/resume/entities/res/schools.csv +5713 -0
  56. deepdoc/parser/resume/entities/schools.py +91 -0
  57. deepdoc/parser/resume/step_one.py +189 -0
  58. deepdoc/parser/resume/step_two.py +692 -0
  59. deepdoc/parser/tcadp_parser.py +538 -0
  60. deepdoc/parser/txt_parser.py +64 -0
  61. deepdoc/parser/utils.py +33 -0
  62. deepdoc/vision/__init__.py +90 -0
  63. deepdoc/vision/layout_recognizer.py +481 -0
  64. deepdoc/vision/ocr.py +757 -0
  65. deepdoc/vision/operators.py +733 -0
  66. deepdoc/vision/postprocess.py +370 -0
  67. deepdoc/vision/recognizer.py +451 -0
  68. deepdoc/vision/seeit.py +87 -0
  69. deepdoc/vision/t_ocr.py +101 -0
  70. deepdoc/vision/t_recognizer.py +186 -0
  71. deepdoc/vision/table_structure_recognizer.py +617 -0
  72. deepdoc_lib-0.2.0.dist-info/METADATA +246 -0
  73. deepdoc_lib-0.2.0.dist-info/RECORD +78 -0
  74. deepdoc_lib-0.2.0.dist-info/WHEEL +5 -0
  75. deepdoc_lib-0.2.0.dist-info/entry_points.txt +2 -0
  76. deepdoc_lib-0.2.0.dist-info/licenses/LICENSE +201 -0
  77. deepdoc_lib-0.2.0.dist-info/top_level.txt +2 -0
  78. scripts/download_models.py +10 -0
@@ -0,0 +1,369 @@
1
+ #
2
+ # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+
17
+ """Model artifact resolution for local files or ModelScope downloads."""
18
+
19
+ from __future__ import annotations
20
+
21
+ import inspect
22
+ import logging
23
+ import os
24
+ from dataclasses import dataclass
25
+ from importlib import resources
26
+ from pathlib import Path
27
+ from ..common.misc_utils import offline_mode_or_from_env
28
+
29
+
30
# Environment variable naming a ModelScope repo shared by all bundles; consulted
# when a bundle has no per-bundle repo override.
GLOBAL_MODELSCOPE_REPO_ENV = "DEEPDOC_MODELSCOPE_REPO"
# Environment variable naming a ModelScope revision shared by all bundles;
# consulted when a bundle has no per-bundle revision override.
GLOBAL_MODELSCOPE_REVISION_ENV = "DEEPDOC_MODELSCOPE_REVISION"
# Environment variable naming a directory that contains the tokenizer
# dictionary file ``huqie.txt``.
TOKENIZER_MODEL_DIR_ENV = "DEEPDOC_TOKENIZER_MODEL_DIR"
33
+
34
+
35
+ def _normalize_provider(provider: str | None) -> str:
36
+ normalized = (provider or os.getenv("DEEPDOC_MODEL_PROVIDER", "auto")).strip().lower()
37
+ aliases = {
38
+ "ms": "modelscope",
39
+ "remote": "modelscope",
40
+ "filesystem": "local",
41
+ "user": "local",
42
+ }
43
+ normalized = aliases.get(normalized, normalized)
44
+ if normalized not in {"auto", "local", "modelscope"}:
45
+ raise ValueError("Unsupported model provider '{}'. Use one of: auto, local, modelscope.".format(normalized))
46
+ return normalized
47
+
48
+
49
+ def _model_home_path(model_home: str | None) -> Path:
50
+ configured = model_home or os.getenv("DEEPDOC_MODEL_HOME")
51
+ if configured:
52
+ return Path(configured).expanduser().resolve()
53
+ return Path.home().joinpath(".cache", "deepdoc")
54
+
55
+
56
def _resolve_tokenizer_dict_path() -> Path:
    """Locate the tokenizer dictionary file ``huqie.txt``.

    Looks in the directory named by the ``TOKENIZER_MODEL_DIR_ENV`` environment
    variable when it is set, otherwise in the ``dict`` directory of the
    installed ``deepdoc`` package.

    Raises:
        FileNotFoundError: If the dictionary does not exist at that location.
    """
    override_dir = os.getenv(TOKENIZER_MODEL_DIR_ENV)
    if not override_dir:
        packaged = resources.files("deepdoc").joinpath("dict", "huqie.txt")
        dict_path = Path(str(packaged)).resolve()
    else:
        dict_path = Path(override_dir).expanduser().resolve() / "huqie.txt"

    if dict_path.exists():
        return dict_path
    raise FileNotFoundError("Tokenizer dictionary not found: {}. Set {} to a directory containing huqie.txt.".format(dict_path, TOKENIZER_MODEL_DIR_ENV))
66
+
67
+
68
@dataclass(frozen=True)
class BundleSpec:
    """Immutable description of one model bundle and how to locate it."""

    # Bundle identifier, used in error messages.
    name: str
    # Sub-directory of the model home that holds this bundle locally.
    subdir: str
    # File names that must all exist in one directory for it to be a valid bundle.
    required_files: tuple[str, ...]
    # Environment variable naming an explicit local directory for the bundle.
    local_dir_env: str
    # Environment variable overriding the ModelScope repo id for this bundle.
    repo_env: str
    # ModelScope repo id used when no environment override applies.
    repo_default: str
    # Environment variable overriding the ModelScope revision for this bundle.
    revision_env: str
    # Revision used when no environment override applies.
    revision_default: str = "master"
78
+
79
+
80
# Registry of known model bundles, keyed by bundle name. Both entries default to
# the same ModelScope repo, which _resolve_modelscope_repo_id treats as a
# combined repo with a shared download directory.
BUNDLES: dict[str, BundleSpec] = {
    "vision": BundleSpec(
        name="vision",
        subdir="vision",
        required_files=(
            "det.onnx",
            "layout.onnx",
            "layout.laws.onnx",
            "layout.manual.onnx",
            "layout.paper.onnx",
            "ocr.res",
            "rec.onnx",
            "tsr.onnx",
        ),
        local_dir_env="DEEPDOC_VISION_MODEL_DIR",
        repo_env="DEEPDOC_MODELSCOPE_VISION_REPO",
        repo_default="Xorbits/deepdoc",
        revision_env="DEEPDOC_MODELSCOPE_VISION_REVISION",
    ),
    "xgb": BundleSpec(
        name="xgb",
        subdir="xgb",
        required_files=("updown_concat_xgb.model",),
        local_dir_env="DEEPDOC_XGB_MODEL_DIR",
        repo_env="DEEPDOC_MODELSCOPE_XGB_REPO",
        repo_default="Xorbits/deepdoc",
        revision_env="DEEPDOC_MODELSCOPE_XGB_REVISION",
    ),
}
109
+
110
+
111
def _resolve_modelscope_repo_id(spec: BundleSpec) -> tuple[str, bool]:
    """Pick the ModelScope repo id for a bundle.

    Lookup order:
      1) the bundle's own env var (``spec.repo_env``)
      2) the shared env var (``DEEPDOC_MODELSCOPE_REPO``)
      3) the bundle's built-in default (``spec.repo_default``)

    Returns:
        ``(repo_id, use_shared_download_dir)``. The flag is true when the chosen
        repo is the shared one, or when every bundle default names the same repo.
    """
    shared_repo = (os.getenv(GLOBAL_MODELSCOPE_REPO_ENV) or "").strip()
    per_bundle = (os.getenv(spec.repo_env) or "").strip()

    if per_bundle:
        # An explicit per-bundle repo only shares the directory when it is the
        # very same repo as the shared one.
        return per_bundle, bool(shared_repo) and shared_repo == per_bundle

    if shared_repo:
        return shared_repo, True

    fallback = spec.repo_default.strip()
    # When all bundle defaults agree on one repo, treat it as a combined repo.
    distinct_defaults = {entry.repo_default.strip() for entry in BUNDLES.values()}
    shares_dir = len(distinct_defaults) == 1 and fallback in distinct_defaults
    return fallback, shares_dir
137
+
138
+
139
+ def _resolve_modelscope_revision(spec: BundleSpec) -> str:
140
+ """Resolve ModelScope revision with an optional shared default."""
141
+ explicit = os.getenv(spec.revision_env)
142
+ if explicit and explicit.strip():
143
+ return explicit.strip()
144
+
145
+ shared = os.getenv(GLOBAL_MODELSCOPE_REVISION_ENV)
146
+ if shared and shared.strip():
147
+ return shared.strip()
148
+
149
+ return spec.revision_default
150
+
151
+
152
+ def _slugify_repo_path(value: str) -> str:
153
+ return value.strip().replace("/", "__").replace(":", "__").replace("\\", "__")
154
+
155
+
156
def _modelscope_shared_download_dir(model_home: str | None, repo_id: str, revision: str) -> Path:
    """Return ``<model home>/modelscope/<repo slug>/<revision slug>``.

    Keying the directory on both repo and revision keeps snapshots of a
    combined repo from colliding with each other.
    """
    home = _model_home_path(model_home)
    repo_part = _slugify_repo_path(repo_id)
    revision_part = _slugify_repo_path(revision)
    return home / "modelscope" / repo_part / revision_part
160
+
161
+
162
+ def _validate_bundle_dir(spec: BundleSpec, base_dir: Path) -> tuple[bool, list[str]]:
163
+ missing = [name for name in spec.required_files if not base_dir.joinpath(name).exists()]
164
+ return not missing, missing
165
+
166
+
167
def _discover_bundle_dir(spec: BundleSpec, roots: list[Path]) -> Path | None:
    """Find a directory that holds every required file of ``spec``.

    Each root is first checked directly. If none qualifies, the existing roots
    are searched recursively for a directory in which all required files sit
    side by side.

    Args:
        spec: Bundle whose ``required_files`` must all be present.
        roots: Candidate directories, in priority order.

    Returns:
        The first qualifying directory, or ``None`` when there is none.
    """
    # Fast path: a root that is itself a complete bundle directory.
    for root in roots:
        complete, _ = _validate_bundle_dir(spec, root)
        if complete:
            return root

    if not spec.required_files:
        return None

    # A complete bundle directory must contain the first required file, so one
    # recursive walk per root for that file visits every possible candidate;
    # walking again for each remaining file would only re-test the same dirs.
    anchor = spec.required_files[0]
    for root in roots:
        if not root.exists():
            continue
        for hit in root.rglob(anchor):
            candidate = hit.parent
            complete, _ = _validate_bundle_dir(spec, candidate)
            if complete:
                return candidate
    return None
183
+
184
+
185
+ def _import_modelscope_snapshot_download():
186
+ try:
187
+ from modelscope.hub.snapshot_download import snapshot_download # type: ignore
188
+
189
+ return snapshot_download
190
+ except Exception:
191
+ try:
192
+ from modelscope import snapshot_download # type: ignore
193
+
194
+ return snapshot_download
195
+ except Exception as exc: # pragma: no cover - import behavior depends on runtime env
196
+ raise RuntimeError("ModelScope provider requires the 'modelscope' package. Install it or switch DEEPDOC_MODEL_PROVIDER=local.") from exc
197
+
198
+
199
def _download_modelscope_repo(*, repo_id: str, revision: str, target_dir: Path, offline: bool) -> Path:
    """Fetch a ModelScope repo snapshot and return its resolved location.

    The signature of ``snapshot_download`` is inspected so that only keyword
    arguments it actually declares are passed to it.

    Raises:
        RuntimeError: If ModelScope cannot be imported or ``repo_id`` is empty.
    """
    downloader = _import_modelscope_snapshot_download()

    if not repo_id:
        raise RuntimeError(f"ModelScope repo id is empty. Set {GLOBAL_MODELSCOPE_REPO_ENV} or a bundle-specific env like DEEPDOC_MODELSCOPE_VISION_REPO.")

    target_dir.mkdir(parents=True, exist_ok=True)

    accepted = inspect.signature(downloader).parameters

    # Optional arguments, each forwarded only if the downloader declares it.
    optional_args: dict[str, object] = {
        "revision": revision,
        "cache_dir": str(target_dir.parent),
        "local_dir": str(target_dir),
        "local_dir_use_symlinks": False,
        "local_files_only": offline,
    }
    call_kwargs: dict[str, object] = {key: value for key, value in optional_args.items() if key in accepted}

    # The repo is passed under whichever id keyword exists, preferring model_id.
    id_keyword = next((key for key in ("model_id", "repo_id") if key in accepted), None)
    if id_keyword is None:
        snapshot_root = downloader(repo_id, **call_kwargs)
    else:
        call_kwargs[id_keyword] = repo_id
        snapshot_root = downloader(**call_kwargs)

    snapshot_path = Path(snapshot_root).expanduser().resolve()
    logging.info("Downloaded ModelScope repo %s@%s to %s", repo_id, revision, snapshot_path)
    return snapshot_path
239
+
240
+
241
def resolve_bundle_dir(
    bundle: str,
    *,
    model_home: str | None = None,
    provider: str | None = None,
    offline: bool | None = None,
) -> str:
    """Resolve a model bundle directory from local files or ModelScope.

    Resolution order:
      1. The bundle's explicit local directory (``spec.local_dir_env``) or
         ``<model home>/<spec.subdir>``.
      2. For the ``auto`` and ``modelscope`` providers with a shared repo, the
         stable ``<model home>/modelscope/<repo>/<revision>`` download
         directory. It is scanned in offline mode too, because reading
         already-downloaded files needs no network.
      3. A fresh ModelScope download, unless the provider is ``local`` or
         offline mode is on.

    Args:
        bundle: Key of the bundle in ``BUNDLES``.
        model_home: Base model directory; see ``_model_home_path``.
        provider: Provider name or alias; see ``_normalize_provider``.
        offline: Explicit offline flag, passed to ``offline_mode_or_from_env``.

    Returns:
        The directory containing all required files of the bundle.

    Raises:
        ValueError: If ``bundle`` or ``provider`` is not recognised.
        FileNotFoundError: If the bundle cannot be found locally and download
            is disabled, or if the downloaded repo lacks the required files.
    """

    if bundle not in BUNDLES:
        raise ValueError(f"Unknown model bundle '{bundle}'. Expected one of: {', '.join(BUNDLES)}")

    spec = BUNDLES[bundle]
    provider_name = _normalize_provider(provider)
    offline_mode = offline_mode_or_from_env(offline)

    explicit_local = os.getenv(spec.local_dir_env)
    if explicit_local:
        local_bundle_dir = Path(explicit_local).expanduser().resolve()
    else:
        local_bundle_dir = _model_home_path(model_home).joinpath(spec.subdir)

    roots_to_scan = [local_bundle_dir]

    # Resolve the remote coordinates once. Without a shared repo, downloads go
    # straight into the local bundle directory.
    repo_id: str | None = None
    revision: str | None = None
    download_dir = local_bundle_dir
    if provider_name in {"auto", "modelscope"}:
        repo_id, use_shared_repo_dir = _resolve_modelscope_repo_id(spec)
        revision = _resolve_modelscope_revision(spec)
        if use_shared_repo_dir:
            download_dir = _modelscope_shared_download_dir(model_home, repo_id, revision)
            # Reuse artifacts from an earlier download, including when offline.
            roots_to_scan.append(download_dir)

    discovered = _discover_bundle_dir(spec, roots_to_scan)
    if discovered:
        return str(discovered)

    if provider_name == "local":
        _, missing = _validate_bundle_dir(spec, local_bundle_dir)
        raise FileNotFoundError(
            "Missing required files for local '{}' bundle under {}: {}. Set {} or DEEPDOC_MODEL_HOME to a directory containing these files.".format(
                spec.name, local_bundle_dir, ", ".join(missing), spec.local_dir_env
            )
        )

    if offline_mode:
        raise FileNotFoundError("Bundle '{}' was not found locally at {} and remote download is disabled. Disable DEEPDOC_OFFLINE or provide local model files.".format(spec.name, local_bundle_dir))

    snapshot_root = _download_modelscope_repo(
        repo_id=repo_id,
        revision=revision,
        target_dir=download_dir,
        offline=offline_mode,
    )
    discovered = _discover_bundle_dir(spec, [local_bundle_dir, download_dir, snapshot_root])
    if discovered:
        return str(discovered)

    raise FileNotFoundError(
        "Downloaded ModelScope repo '{}@{}' for bundle '{}' but could not locate the required files. "
        "Expected the following files to be colocated under a single directory in the repo (e.g. '{}/'): {}. "
        "Configured via {} / {} and {} / {}. "
        "Searched under: {}, {}.".format(
            repo_id,
            revision,
            spec.name,
            spec.subdir,
            ", ".join(spec.required_files),
            spec.repo_env,
            GLOBAL_MODELSCOPE_REPO_ENV,
            spec.revision_env,
            GLOBAL_MODELSCOPE_REVISION_ENV,
            download_dir,
            snapshot_root,
        )
    )
331
+
332
+
333
def validate_bundle_dir(bundle: str, directory: str | Path) -> tuple[bool, list[str]]:
    """Report whether ``directory`` holds every required file of ``bundle``.

    Returns:
        ``(is_complete, missing_names)``.

    Raises:
        ValueError: If ``bundle`` is not a key of ``BUNDLES``.
    """
    if bundle not in BUNDLES:
        raise ValueError(f"Unknown model bundle '{bundle}'. Expected one of: {', '.join(BUNDLES)}")

    resolved_dir = Path(directory).expanduser().resolve()
    return _validate_bundle_dir(BUNDLES[bundle], resolved_dir)
342
+
343
+
344
def resolve_vision_model_dir(
    *,
    model_home: str | None = None,
    provider: str | None = None,
    offline: bool | None = None,
) -> str:
    """Resolve the directory of the ``vision`` bundle via ``resolve_bundle_dir``."""
    options = {"model_home": model_home, "provider": provider, "offline": offline}
    return resolve_bundle_dir("vision", **options)
351
+
352
+
353
def resolve_xgb_model_dir(
    *,
    model_home: str | None = None,
    provider: str | None = None,
    offline: bool | None = None,
) -> str:
    """Resolve the directory of the ``xgb`` bundle via ``resolve_bundle_dir``."""
    options = {"model_home": model_home, "provider": provider, "offline": offline}
    return resolve_bundle_dir("xgb", **options)
360
+
361
+
362
def resolve_tokenizer_dict_prefix(
    *,
    model_home: str | None = None,
    provider: str | None = None,
    offline: bool | None = None,
) -> str:
    """Return the tokenizer dictionary path with its file suffix removed.

    The keyword arguments are accepted but ignored; presumably they exist to
    mirror the signature of the other ``resolve_*`` helpers (confirm with callers).
    """
    del model_home, provider, offline
    dictionary = _resolve_tokenizer_dict_path()
    prefix = dictionary.with_suffix("")
    return str(prefix)
@@ -0,0 +1,42 @@
1
+ #
2
+ # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+
17
+ """
18
+ Simplified settings for DeepDoc independent library.
19
+ Only includes configurations needed by DeepDoc components.
20
+ """
21
+
22
+ import logging
23
+
24
# Number of CUDA devices reported by torch; set by check_and_install_torch().
# Stays 0 when torch cannot be imported or the device query fails.
PARALLEL_DEVICES: int = 0
26
+
27
def check_and_install_torch():
    """Record how many CUDA devices PyTorch reports in ``PARALLEL_DEVICES``.

    Despite its name, this function installs nothing. Any failure to import
    torch or to query the devices is logged and leaves the count at 0.
    """
    global PARALLEL_DEVICES
    try:
        from torch import cuda

        PARALLEL_DEVICES = cuda.device_count()
        logging.info(f"Found {PARALLEL_DEVICES} GPUs")
    except Exception as e:
        # Best effort: a missing or broken torch simply means "no GPUs".
        PARALLEL_DEVICES = 0
        logging.info("Can't import package 'torch' or access GPU: %s", str(e))
40
+
41
# Run the device detection once at import time so PARALLEL_DEVICES is populated
# as soon as this module is loaded.
check_and_install_torch()
@@ -0,0 +1,84 @@
1
+ from __future__ import annotations
2
+
3
+ import hashlib
4
+ import os
5
+ import uuid
6
+ from pathlib import Path
7
+ from urllib.request import urlopen
8
+
9
+
10
# Environment variable naming the tiktoken cache directory; checked before the
# generic TIKTOKEN_CACHE_DIR variable.
DEEPDOC_TIKTOKEN_CACHE_DIR_ENV = "DEEPDOC_TIKTOKEN_CACHE_DIR"
# Download location of the cl100k_base encoder file.
CL100K_BASE_BLOB_URL = "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
# Default SHA-256 hex digest that cached and downloaded encoder bytes must match.
CL100K_BASE_EXPECTED_HASH = "223921b76ee99bde995b7ff738513eef100fb51d18c93597a113bcffe865b2a7"
13
+
14
+
15
def resolve_tiktoken_cache_dir(cache_dir: str | None = None, model_home: str | None = None) -> Path:
    """Decide which directory holds the tiktoken encoder cache.

    Precedence: the ``cache_dir`` argument, then the
    ``DEEPDOC_TIKTOKEN_CACHE_DIR`` / ``TIKTOKEN_CACHE_DIR`` environment
    variables, then ``<model home>/tiktoken_cache`` (from ``model_home`` or
    ``DEEPDOC_MODEL_HOME``), and finally ``~/.cache/deepdoc/tiktoken_cache``.
    The result is always resolved to an absolute path.
    """
    if cache_dir:
        return Path(cache_dir).expanduser().resolve()

    env_cache_dir = os.getenv(DEEPDOC_TIKTOKEN_CACHE_DIR_ENV) or os.getenv("TIKTOKEN_CACHE_DIR")
    if env_cache_dir:
        return Path(env_cache_dir).expanduser().resolve()

    home_override = model_home or os.getenv("DEEPDOC_MODEL_HOME")
    if home_override:
        return Path(home_override).expanduser().resolve() / "tiktoken_cache"

    default_dir = Path.home() / ".cache" / "deepdoc" / "tiktoken_cache"
    return default_dir.resolve()
28
+
29
+
30
def configure_tiktoken_cache_env(cache_dir: str | None = None, model_home: str | None = None) -> str:
    """Export the resolved cache directory as ``TIKTOKEN_CACHE_DIR``.

    Returns:
        The directory that was written to the environment, as a string.
    """
    cache_path = str(resolve_tiktoken_cache_dir(cache_dir=cache_dir, model_home=model_home))
    os.environ["TIKTOKEN_CACHE_DIR"] = cache_path
    return cache_path
34
+
35
+
36
def cl100k_base_cache_key(blob_url: str = CL100K_BASE_BLOB_URL) -> str:
    """Return the cache file name for ``blob_url``: the SHA-1 hex digest of the URL.

    NOTE(review): presumably this mirrors how tiktoken names its cache files;
    confirm against the tiktoken version in use.
    """
    digest = hashlib.sha1()
    digest.update(blob_url.encode())
    return digest.hexdigest()
38
+
39
+
40
+ def _matches_expected_hash(data: bytes, expected_hash: str | None) -> bool:
41
+ if not expected_hash:
42
+ return True
43
+ return hashlib.sha256(data).hexdigest() == expected_hash
44
+
45
+
46
def download_cl100k_base(
    *,
    cache_dir: str | None = None,
    model_home: str | None = None,
    offline: bool = False,
    blob_url: str = CL100K_BASE_BLOB_URL,
    expected_hash: str = CL100K_BASE_EXPECTED_HASH,
    timeout: int = 60,
) -> Path:
    """Ensure the cl100k_base encoder file is in the tiktoken cache.

    A cached file whose hash matches is returned as is. A cached file with a
    wrong hash is deleted. Otherwise the file is downloaded, verified and moved
    into place through a uniquely named temporary file, so readers never see a
    partially written file.

    Args:
        cache_dir: Explicit cache directory; see ``resolve_tiktoken_cache_dir``.
        model_home: Base model directory used to derive the cache directory.
        offline: When true, never download; fail if no valid cached file exists.
        blob_url: URL to download from; also determines the cache file name.
        expected_hash: Expected SHA-256 hex digest; empty disables the check.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        Path of the cached encoder file.

    Raises:
        FileNotFoundError: In offline mode when no valid cached file exists.
        ValueError: If the downloaded bytes do not match ``expected_hash``.
    """
    resolved_cache_dir = resolve_tiktoken_cache_dir(cache_dir=cache_dir, model_home=model_home)
    cache_key = cl100k_base_cache_key(blob_url)
    target_path = resolved_cache_dir.joinpath(cache_key)

    if target_path.exists():
        data = target_path.read_bytes()
        if _matches_expected_hash(data, expected_hash):
            return target_path
        # Another process may already have removed the stale file.
        target_path.unlink(missing_ok=True)

    if offline:
        raise FileNotFoundError(
            "Missing cached tiktoken encoder '{}'. Expected file at {}. Run the download command without --offline first."
            .format(cache_key, target_path)
        )

    with urlopen(blob_url, timeout=timeout) as response:
        data = response.read()

    if not _matches_expected_hash(data, expected_hash):
        raise ValueError(
            "Hash mismatch for tiktoken encoder downloaded from {}. Expected SHA256 {}."
            .format(blob_url, expected_hash)
        )

    resolved_cache_dir.mkdir(parents=True, exist_ok=True)
    tmp_path = target_path.with_name("{}.{}.tmp".format(target_path.name, uuid.uuid4().hex))
    try:
        tmp_path.write_bytes(data)
        tmp_path.replace(target_path)
    except BaseException:
        # Do not leave an orphaned temporary file behind in the cache directory.
        tmp_path.unlink(missing_ok=True)
        raise
    return target_path
@@ -0,0 +1,96 @@
1
+ #
2
+ # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+
17
+ import tiktoken
18
+
19
+ from .tiktoken_cache import configure_tiktoken_cache_env
20
+
21
+
22
# Lazily created cl100k_base encoder shared by the helpers in this module.
_encoder = None


def _get_encoder():
    """Return the shared cl100k_base encoder, creating it on first use.

    The tiktoken cache environment is configured right before the encoder is
    first loaded.
    """
    global _encoder
    if _encoder is not None:
        return _encoder
    configure_tiktoken_cache_env()
    _encoder = tiktoken.get_encoding("cl100k_base")
    return _encoder
31
+
32
+
33
def num_tokens_from_string(string: str) -> int:
    """Count the cl100k_base tokens in ``string``.

    Returns 0 if the encoder cannot be obtained or encoding fails.
    """
    try:
        return len(_get_encoder().encode(string))
    except Exception:
        return 0
40
+
41
+
42
def total_token_count_from_response(resp):
    """Extract a token count from an LLM response in one of several shapes.

    Object-style responses are probed first, in this order:
    ``resp.usage.total_tokens``, ``resp.usage_metadata.total_tokens``,
    ``resp.meta.billed_units.input_tokens``. Dict responses are then probed for
    ``usage.total_tokens``, ``usage.input_tokens + usage.output_tokens`` and
    ``meta.tokens.input_tokens + meta.tokens.output_tokens``.

    Malformed containers such as ``{"usage": None}`` are treated as "no count
    available" rather than raising.

    Args:
        resp: Provider response object or dict, or ``None``.

    Returns:
        The first token count found, or 0 if none can be determined.
    """
    if resp is None:
        return 0

    # NOTE(review): the billed_units path reports input tokens only, unlike the
    # other paths; kept as is for backward compatibility, confirm intent.
    attribute_paths = (
        ("usage", "total_tokens"),
        ("usage_metadata", "total_tokens"),
        ("meta", "billed_units", "input_tokens"),
    )
    for path in attribute_paths:
        try:
            value = resp
            for attr in path:
                value = getattr(value, attr)
        except Exception:
            # Attribute access on provider objects may raise arbitrary errors;
            # a failed probe just means "try the next shape".
            continue
        return value

    if isinstance(resp, dict):
        dict_extractors = (
            lambda r: r["usage"]["total_tokens"],
            lambda r: r["usage"]["input_tokens"] + r["usage"]["output_tokens"],
            lambda r: r["meta"]["tokens"]["input_tokens"] + r["meta"]["tokens"]["output_tokens"],
        )
        for extract in dict_extractors:
            try:
                return extract(resp)
            except (LookupError, TypeError):
                # Missing key, or a non-container / non-numeric value on the path.
                continue

    return 0
88
+
89
+
90
def truncate(string: str, max_len: int) -> str:
    """Cut ``string`` down to at most ``max_len`` cl100k_base tokens.

    If the encoder is unavailable or encoding/decoding fails, the first
    ``max_len`` characters are returned instead.
    """
    try:
        enc = _get_encoder()
        kept_tokens = enc.encode(string)[:max_len]
        return enc.decode(kept_tokens)
    except Exception:
        return string[:max_len]