sirchmunk 0.0.0__py3-none-any.whl → 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. sirchmunk/__init__.py +8 -0
  2. sirchmunk/base.py +17 -0
  3. sirchmunk/insight/__init__.py +4 -0
  4. sirchmunk/insight/text_insights.py +292 -0
  5. sirchmunk/learnings/__init__.py +1 -0
  6. sirchmunk/learnings/evidence_processor.py +525 -0
  7. sirchmunk/learnings/knowledge_base.py +232 -0
  8. sirchmunk/llm/__init__.py +2 -0
  9. sirchmunk/llm/openai_chat.py +247 -0
  10. sirchmunk/llm/prompts.py +216 -0
  11. sirchmunk/retrieve/__init__.py +1 -0
  12. sirchmunk/retrieve/base.py +25 -0
  13. sirchmunk/retrieve/text_retriever.py +1026 -0
  14. sirchmunk/scan/__init__.py +1 -0
  15. sirchmunk/scan/base.py +18 -0
  16. sirchmunk/scan/file_scanner.py +373 -0
  17. sirchmunk/scan/web_scanner.py +18 -0
  18. sirchmunk/scheduler/__init__.py +0 -0
  19. sirchmunk/schema/__init__.py +2 -0
  20. sirchmunk/schema/cognition.py +106 -0
  21. sirchmunk/schema/context.py +25 -0
  22. sirchmunk/schema/knowledge.py +318 -0
  23. sirchmunk/schema/metadata.py +658 -0
  24. sirchmunk/schema/request.py +221 -0
  25. sirchmunk/schema/response.py +20 -0
  26. sirchmunk/schema/snapshot.py +346 -0
  27. sirchmunk/search.py +475 -0
  28. sirchmunk/storage/__init__.py +7 -0
  29. sirchmunk/storage/duckdb.py +676 -0
  30. sirchmunk/storage/knowledge_manager.py +720 -0
  31. sirchmunk/utils/__init__.py +15 -0
  32. sirchmunk/utils/constants.py +15 -0
  33. sirchmunk/utils/deps.py +23 -0
  34. sirchmunk/utils/file_utils.py +70 -0
  35. sirchmunk/utils/install_rga.py +124 -0
  36. sirchmunk/utils/log_utils.py +360 -0
  37. sirchmunk/utils/tokenizer_util.py +55 -0
  38. sirchmunk/utils/utils.py +108 -0
  39. sirchmunk/version.py +1 -1
  40. sirchmunk-0.0.1.dist-info/METADATA +416 -0
  41. sirchmunk-0.0.1.dist-info/RECORD +45 -0
  42. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.dist-info}/WHEEL +1 -1
  43. sirchmunk-0.0.0.dist-info/METADATA +0 -26
  44. sirchmunk-0.0.0.dist-info/RECORD +0 -8
  45. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.dist-info}/entry_points.txt +0 -0
  46. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.dist-info}/licenses/LICENSE +0 -0
  47. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,15 @@
1
# Copyright (c) ModelScope Contributors. All rights reserved.

from sirchmunk.utils.log_utils import (
    AsyncLogger,
    SyncLogger,
    LogCallback,
    create_logger,
)

# Public API of sirchmunk.utils: the logging helpers re-exported from
# log_utils so callers can import them from the package root.
__all__ = [
    "create_logger",
    "AsyncLogger",
    "SyncLogger",
    "LogCallback",
]
@@ -0,0 +1,15 @@
1
# Copyright (c) ModelScope Contributors. All rights reserved.
import os
from pathlib import Path

# Limit for concurrent RGA (ripgrep-all) requests; override via env var.
GREP_CONCURRENT_LIMIT = int(os.getenv("GREP_CONCURRENT_LIMIT", "5"))

# LLM Configuration (OpenAI-compatible endpoint).
# Defaults to DashScope's compatible-mode API; all values env-overridable.
LLM_BASE_URL = os.getenv("LLM_BASE_URL", "https://dashscope.aliyuncs.com/compatible-mode/v1")
LLM_API_KEY = os.getenv("LLM_API_KEY", "")
LLM_MODEL_NAME = os.getenv("LLM_MODEL_NAME", "qwen3-max")

# Search Configuration: working directory used for caches/storage.
# WORK_PATH may be redirected via the WORK_PATH environment variable.
DEFAULT_WORK_PATH = os.path.expanduser("~/sirchmunk")
WORK_PATH = os.getenv("WORK_PATH", DEFAULT_WORK_PATH)
@@ -0,0 +1,23 @@
1
+ import shutil
2
+ import warnings
3
+
4
def check_dependencies() -> bool:
    """
    Check if required dependencies 'rg' (ripgrep) and 'rga' (ripgrep-all) are installed.

    Emits a warning and returns False as soon as the first missing binary is
    found; returns True only when both are on PATH.
    """
    required = (
        ("rg", "⚠️ [Sirchmunk Warning] Missing dependency: 'rg' (ripgrep).\n"),
        ("rga", "⚠️ [Sirchmunk Warning] Missing dependency: 'rga' (ripgrep-all).\n"),
    )

    for binary, notice in required:
        if shutil.which(binary) is None:
            warnings.warn("\n\n" + notice)
            return False

    return True
@@ -0,0 +1,70 @@
1
+ # Copyright (c) ModelScope Contributors. All rights reserved.
2
+ import hashlib
3
+ import os
4
+ from pathlib import Path
5
+ from typing import Union
6
+
7
+ from kreuzberg import ExtractionResult, extract_file
8
+ from loguru import logger
9
+
10
+
11
async def fast_extract(file_path: Union[str, Path]) -> ExtractionResult:
    """
    Extract text content from a document file (docx, pptx, pdf, xlsx, ...).

    Format detection and parsing are delegated entirely to kreuzberg's
    ``extract_file``; this is a thin async pass-through.
    """
    return await extract_file(file_path=file_path)
18
+
19
+
20
def get_fast_hash(file_path: Union[str, Path], sample_size: int = 8192) -> Union[str, None]:
    """
    Compute a fast partial hash (fingerprint) of a file.

    Combines: File Size + MD5 of Head Chunk + Tail Chunk. This avoids reading
    large files in full, making it efficient for large-scale scans. MD5 is
    used as a fingerprint only, not for security.

    Args:
        file_path: Path to the file to fingerprint.
        sample_size: Bytes to read from each of the head and tail (default 8192).

    Returns:
        ``"<md5hex>_<size>"`` string, or ``None`` if the file cannot be read.
    """
    file_path = Path(file_path)
    try:
        # Get metadata first (O(1) operation)
        file_size = file_path.stat().st_size

        # If the file is smaller than the combined sample size, read it entirely
        if file_size <= sample_size * 2:
            with open(file_path, "rb") as f:
                return f"{hashlib.md5(f.read()).hexdigest()}_{file_size}"

        # Large file sampling: Read head and tail to avoid full disk I/O
        hash_content = hashlib.md5()
        with open(file_path, "rb") as f:
            hash_content.update(f.read(sample_size))
            f.seek(-sample_size, os.SEEK_END)
            hash_content.update(f.read(sample_size))

        # Mix the file size into the hash string to minimize collisions
        return f"{hash_content.hexdigest()}_{file_size}"
    except OSError:
        # OSError covers FileNotFoundError/PermissionError plus other
        # filesystem failures (e.g. IsADirectoryError, device errors) that
        # the previous narrow tuple let escape despite the same intent:
        # files deleted mid-scan or inaccessible should yield None.
        logger.warning("File not found or inaccessible: {}", file_path)
        return None
49
+
50
+
51
class StorageStructure:
    """
    Standardized directory and file naming conventions for caching and storage.

    All values are relative names; callers join them onto their own base path.
    """

    # Root cache directory name.
    CACHE_DIR = ".cache"

    # Subdirectory for metadata artifacts.
    METADATA_DIR = "metadata"

    # Subdirectory for ripgrep-all (rga) related artifacts.
    GREP_DIR = "rga"

    # Subdirectory for knowledge artifacts.
    KNOWLEDGE_DIR = "knowledge"

    # Subdirectory for cognition artifacts.
    COGNITION_DIR = "cognition"

    # `.idx` -> Index file for fast lookup of cluster content
    CLUSTER_INDEX_FILE = "cluster.idx"

    # `.mpk` -> MessagePack serialized cluster content
    CLUSTER_CONTENT_FILE = "cluster.mpk"
@@ -0,0 +1,124 @@
1
+ # Copyright (c) ModelScope Contributors. All rights reserved.
2
+ import json
3
+ import os
4
+ import platform
5
+ import shutil
6
+ import subprocess
7
+ import sys
8
+ import tarfile
9
+ import tempfile
10
+ import urllib.request
11
+ import zipfile
12
+ from pathlib import Path
13
+ from typing import Optional, List
14
+
15
+
16
def _download_and_extract(url: str, ext: str, required_bins: List[str], install_dir: Path, bin_label: str):
    """Downloads and extracts specific binaries from an archive.

    Args:
        url: Direct download URL of the release archive.
        ext: Archive extension, either ".zip" or ".tar.gz".
        required_bins: Exact binary filenames to pull out of the archive.
        install_dir: Existing directory the binaries are written into.
        bin_label: Human-readable component name (currently unused in the body).
    """
    try:
        # Stream the archive into a named temp file. delete=False so the file
        # survives the `with` and can be reopened by zipfile/tarfile.
        with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp_file:
            tmp_path = Path(tmp_file.name)
            with urllib.request.urlopen(url, timeout=60) as response:
                shutil.copyfileobj(response, tmp_file)

        temp_extract_dir = Path(tempfile.mkdtemp())
        if ext == ".zip":
            with zipfile.ZipFile(tmp_path, "r") as zf:
                for member in zf.namelist():
                    # Flatten to basename: only the wanted binaries are
                    # written, directly into install_dir.
                    fname = os.path.basename(member)
                    if fname in required_bins:
                        with zf.open(member) as source, open(install_dir / fname, "wb") as f:
                            shutil.copyfileobj(source, f)
                        (install_dir / fname).chmod(0o755)
        else:  # .tar.gz
            with tarfile.open(tmp_path, "r:gz") as tf:
                for member in tf.getmembers():
                    fname = os.path.basename(member.name)
                    if fname in required_bins:
                        # Fix for Python 3.14 DeprecationWarning
                        # (extraction filters, PEP 706).
                        tf.extract(member, temp_extract_dir, filter='data')
                        target = install_dir / fname
                        # Tar members keep their archive path; move the
                        # extracted file into install_dir and mark executable.
                        shutil.move(str(temp_extract_dir / member.name), str(target))
                        target.chmod(0o755)
    finally:
        # Best-effort cleanup. The locals() guards handle failures that occur
        # before the temp names were ever bound (e.g. urlopen raising).
        if 'tmp_path' in locals(): tmp_path.unlink(missing_ok=True)
        if 'temp_extract_dir' in locals(): shutil.rmtree(temp_extract_dir, ignore_errors=True)
46
+
47
+
48
+ def _verify_bin(path: Path, expected_name: str) -> bool:
49
+ """Check if binary exists and responds to --version."""
50
+ if not path.exists(): return False
51
+ try:
52
+ res = subprocess.run([str(path), "--version"], capture_output=True, text=True, timeout=5)
53
+ return res.returncode == 0
54
+ except:
55
+ return False
56
+
57
+
58
def _install_component(repo: str, bin_name: str, required_bins: List[str], install_dir: Path, force: bool) -> str:
    """Generic installer for ripgrep and rga.

    Resolves the latest GitHub release asset matching the host platform,
    downloads it, extracts the required binaries, and verifies the result.

    Args:
        repo: GitHub repo in "owner/name" form.
        bin_name: Primary binary name (e.g. "rg" or "rga").
        required_bins: All binary filenames to extract from the archive.
        install_dir: Directory to install into (must exist).
        force: If True, reinstall even when a working binary is present.

    Returns:
        Absolute path to the installed primary binary.

    Raises:
        RuntimeError: On unsupported architecture or any download/extract/
            verification failure (original cause chained via ``from``).
    """
    system = platform.system().lower()
    machine = platform.machine().lower()

    # Platform mapping
    arch = "x86_64" if machine in ("x86_64", "amd64") else "aarch64" if machine in ("arm64", "aarch64") else None
    if not arch:
        raise RuntimeError(f"Unsupported arch: {machine}")

    if system == "windows":
        os_tag, ext = "pc-windows-msvc", ".zip"
    elif system == "darwin":
        os_tag, ext = "apple-darwin", ".tar.gz"
    else:
        # ripgrep and rga both use musl for static linux binaries
        os_tag, ext = "unknown-linux-musl", ".tar.gz"

    final_bin = install_dir / (bin_name + (".exe" if system == "windows" else ""))

    # Reuse an already-working binary unless a reinstall is forced.
    if not force and _verify_bin(final_bin, bin_name):
        return str(final_bin)

    print(f"Installing {bin_name} from {repo}...", file=sys.stderr)
    try:
        api_url = f"https://api.github.com/repos/{repo}/releases/latest"
        req = urllib.request.Request(api_url, headers={"User-Agent": "Mozilla/5.0"})
        with urllib.request.urlopen(req, timeout=10) as resp:
            assets = json.loads(resp.read())["assets"]

        # Find asset (ripgrep assets often contain 'x86_64-unknown-linux-musl').
        # A missing asset previously surfaced as a bare StopIteration; raise a
        # descriptive error instead.
        asset = next(
            (a for a in assets if arch in a["name"] and os_tag in a["name"] and a["name"].endswith(ext)),
            None,
        )
        if asset is None:
            raise RuntimeError(f"No release asset matches {arch}/{os_tag}{ext} in {repo}")

        _download_and_extract(asset["browser_download_url"], ext, required_bins, install_dir, bin_name)

        if not _verify_bin(final_bin, bin_name):
            raise RuntimeError(f"Verification failed for {bin_name}")
        return str(final_bin)
    except Exception as e:
        # Chain the original exception so the root cause stays debuggable.
        raise RuntimeError(f"Failed to install {bin_name}: {e}") from e
96
+
97
+
98
def install_rga(force_reinstall: bool = False, install_dir: Optional[str] = None) -> str:
    """Main entry: Installs ripgrep (rg) then ripgrep-all (rga).

    Args:
        force_reinstall: Reinstall even if working binaries already exist.
        install_dir: Target directory; defaults to a per-user bin directory.

    Returns:
        Path to the installed ``rga`` binary.
    """
    is_windows = platform.system().lower() == "windows"

    if install_dir is None:
        install_dir = (
            os.path.expandvars(r"%LOCALAPPDATA%\bin")
            if is_windows
            else os.path.expanduser("~/.local/bin")
        )

    target_dir = Path(install_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    # 1. Install ripgrep (rg)
    _install_component(
        "BurntSushi/ripgrep",
        "rg",
        ["rg.exe" if is_windows else "rg"],
        target_dir,
        force_reinstall,
    )

    # 2. Install ripgrep-all (rga) and its preprocessor
    rga_bins = ["rga.exe", "rga-preproc.exe"] if is_windows else ["rga", "rga-preproc"]
    return _install_component("phiresky/ripgrep-all", "rga", rga_bins, target_dir, force_reinstall)
116
+
117
+
118
# CLI entry point: install both tools and report the install directory.
if __name__ == "__main__":
    try:
        path = install_rga()
        print(f"SUCCESS: ripgrep and ripgrep-all are ready at: {os.path.dirname(path)}")
    except Exception as e:
        # Surface the failure on stderr and exit non-zero for scripting use.
        print(f"ERROR: {e}", file=sys.stderr)
        sys.exit(1)
@@ -0,0 +1,360 @@
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+ """
3
+ Unified logging utilities for Sirchmunk
4
+ Provides flexible logging with optional callbacks and fallback to loguru
5
+ Supports both synchronous and asynchronous logging
6
+ """
7
+ import asyncio
8
+ from typing import Any, Awaitable, Callable, Optional, Union
9
+
10
+ from loguru import logger as default_logger
11
+
12
+
13
+ # Type alias for log callback function (can be sync or async)
14
+ # Signature: (level: str, message: str, end: str, flush: bool) -> None or Awaitable[None]
15
+ LogCallback = Optional[Callable[[str, str, str, bool], Union[None, Awaitable[None]]]]
16
+
17
+
18
async def log_with_callback_async(
    level: str,
    message: str,
    log_callback: LogCallback = None,
    flush: bool = False,
    end: str = "\n",
) -> None:
    """
    Route a log message through *log_callback* when given, else through loguru.

    Supports both synchronous and asynchronous callbacks. Without a callback,
    the message is formatted locally and emitted via loguru's default logger.

    Args:
        level: Log level (e.g., "info", "debug", "error", "warning", "success")
        message: Message content to log
        log_callback: Optional sync or async callable invoked as
            (level, message, end, flush). If None, loguru is used.
        flush: If True, force immediate output and use raw mode (no
            timestamp/level prefix) — handy for progress indicators.
        end: String appended after the message (default: "\n")

    Examples:
        # Default loguru output (with prefix)
        await log_with_callback_async("info", "Processing started")

        # Progress indicator without prefix
        await log_with_callback_async("info", "Processing...", flush=True, end="")
        await log_with_callback_async("info", " Done!", flush=True, end="\n")

        # Custom async callback
        async def my_callback(level: str, msg: str, end: str, flush: bool):
            await websocket.send_text(f"[{level}] {msg}")
        await log_with_callback_async("debug", "Custom log", log_callback=my_callback)
    """
    if log_callback is None:
        # Local fallback: format here and hand off to loguru.
        full_message = (message + end) if end else message
        if flush:
            # Raw mode: no timestamp/level prefix.
            default_logger.opt(raw=True).log(level.upper(), full_message)
        else:
            # Normal formatted output with prefix.
            getattr(default_logger, level.lower())(full_message.rstrip("\n"))
        return

    # Callback path: forward raw pieces and let the callback format them.
    callback_is_async = asyncio.iscoroutinefunction(log_callback)
    if callback_is_async:
        await log_callback(level, message, end, flush)
    else:
        log_callback(level, message, end, flush)

    # For async callbacks with flush requested, yield control so the
    # consumer can process the message immediately.
    if flush and callback_is_async:
        await asyncio.sleep(0)
76
+
77
+
78
# Strong references to fire-and-forget logging tasks. The event loop keeps
# only weak references to tasks, so without this set a task scheduled by
# log_with_callback could be garbage-collected before it ever runs.
_background_log_tasks = set()


def log_with_callback(
    level: str,
    message: str,
    log_callback: LogCallback = None,
    flush: bool = False,
    end: str = "\n",
) -> None:
    """
    Synchronous version of log_with_callback_async.

    Args:
        level: Log level (e.g., "info", "debug", "error", "warning", "success")
        message: Message content to log
        log_callback: Optional callback. A sync callback is invoked directly;
            an async callback is scheduled on the running loop (or run via
            asyncio.run() when no loop is active).
        flush: If True, force immediate output and use raw mode (no
            timestamp/level prefix). Useful for progress indicators.
        end: String appended after the message (default: "\n")

    Examples:
        # Normal logging (with prefix)
        log_with_callback("info", "Processing started")

        # Progress indicator without prefix (flush=True removes formatting)
        log_with_callback("info", "Loading", flush=True, end="")
        log_with_callback("info", "...", flush=True, end="")
        log_with_callback("info", " Done!", flush=True)
        # Output: Loading... Done!
    """
    if log_callback is not None:
        # Pass original message, end, and flush to callback;
        # let the callback handle message formatting.
        if not asyncio.iscoroutinefunction(log_callback):
            log_callback(level, message, end, flush)
        else:
            # Async callback provided in sync mode: schedule safely.
            # Avoid asyncio.run() when already inside a running event loop.
            try:
                running_loop = asyncio.get_running_loop()
            except RuntimeError:
                asyncio.run(log_callback(level, message, end, flush))
            else:
                task = running_loop.create_task(log_callback(level, message, end, flush))
                # Fix: keep a strong reference until completion so the task
                # cannot be garbage-collected mid-flight (see asyncio
                # create_task documentation).
                _background_log_tasks.add(task)
                task.add_done_callback(_background_log_tasks.discard)
    else:
        # Fallback to loguru logger (process message locally)
        full_message = message + end if end else message
        if flush:
            # Use raw mode (no prefix) for flush=True
            default_logger.opt(raw=True).log(level.upper(), full_message)
        else:
            # Normal formatted output with prefix
            getattr(default_logger, level.lower())(full_message.rstrip("\n"))
130
+
131
+
132
def create_logger(log_callback: LogCallback = None, enable_async: bool = True) -> Union["AsyncLogger", "SyncLogger"]:
    """
    Factory: build a logger instance bound to *log_callback*.

    Provides loguru-compatible level methods (info, warning, ...) wired to the
    given callback; without a callback, output falls back to loguru.

    Args:
        log_callback: Optional sink invoked as (level, message, end, flush).
        enable_async: True -> AsyncLogger (awaitable methods);
            False -> SyncLogger (blocking methods).

    Returns:
        AsyncLogger or SyncLogger instance, depending on enable_async.

    Example:
        # Async logger with a custom callback
        async def my_callback(level: str, msg: str, end: str, flush: bool):
            print(f"[{level}] {msg}")

        logger = create_logger(log_callback=my_callback, enable_async=True)
        await logger.info("Starting process")

        # Sync logger without a callback (uses default loguru)
        sync_logger = create_logger(enable_async=False)
        sync_logger.info("Starting process")
    """
    logger_cls = AsyncLogger if enable_async else SyncLogger
    return logger_cls(log_callback=log_callback)
172
+
173
+
174
class SyncLogger:
    """
    Blocking logger with an optional callback sink.

    Every level method forwards to log_with_callback. With no callback
    configured, messages fall through to loguru with its normal formatting.
    Setting flush=True switches to raw mode (no timestamp/level prefix), which
    together with end="" supports print-style progress output.

    Example:
        # With a custom sync callback
        def my_callback(level: str, msg: str, end: str, flush: bool):
            print(f"[{level}] {msg}", end="")

        logger = SyncLogger(log_callback=my_callback)
        logger.info("Starting process")
        logger.error("Failed to connect")

        # Progress indicator (flush=True removes the prefix)
        logger.info("Processing", flush=True, end="")
        logger.info("...", flush=True, end="")
        logger.info(" Done!", flush=True)
        # Output: Processing... Done!

        # Without a callback (plain loguru output)
        SyncLogger().info("Using default logger")
    """

    def __init__(self, log_callback: LogCallback = None):
        """Store the (preferably synchronous) callback used as the log sink."""
        self.log_callback = log_callback

    def log(self, level: str, message: str, flush: bool = False, end: str = "\n"):
        """Dispatch *message* at *level* through the configured sink."""
        log_with_callback(level, message, log_callback=self.log_callback, flush=flush, end=end)

    def debug(self, message: str, flush: bool = False, end: str = "\n"):
        """Shortcut for log("debug", ...)."""
        self.log("debug", message, flush=flush, end=end)

    def info(self, message: str, flush: bool = False, end: str = "\n"):
        """Shortcut for log("info", ...)."""
        self.log("info", message, flush=flush, end=end)

    def warning(self, message: str, flush: bool = False, end: str = "\n"):
        """Shortcut for log("warning", ...)."""
        self.log("warning", message, flush=flush, end=end)

    def error(self, message: str, flush: bool = False, end: str = "\n"):
        """Shortcut for log("error", ...)."""
        self.log("error", message, flush=flush, end=end)

    def success(self, message: str, flush: bool = False, end: str = "\n"):
        """Shortcut for log("success", ...)."""
        self.log("success", message, flush=flush, end=end)

    def critical(self, message: str, flush: bool = False, end: str = "\n"):
        """Shortcut for log("critical", ...)."""
        self.log("critical", message, flush=flush, end=end)
241
+
242
+
243
class AsyncLogger:
    """
    Awaitable logger with an optional callback sink.

    Every level method is a coroutine that forwards to
    log_with_callback_async; both sync and async callbacks are supported.
    With no callback configured, messages fall through to loguru with its
    normal formatting. Setting flush=True switches to raw mode (no
    timestamp/level prefix), which together with end="" supports print-style
    progress output.

    Example:
        # With a custom async callback
        async def my_callback(level: str, msg: str, end: str, flush: bool):
            await websocket.send(f"{level}: {msg}")

        logger = AsyncLogger(log_callback=my_callback)
        await logger.info("Starting process")
        await logger.error("Failed to connect")

        # Progress indicator (flush=True removes the prefix)
        await logger.info("Processing", flush=True, end="")
        await logger.info("...", flush=True, end="")
        await logger.info(" Done!", flush=True)
        # Output: Processing... Done!

        # Without a callback (plain loguru output)
        await AsyncLogger().info("Using default logger")
    """

    def __init__(self, log_callback: LogCallback = None):
        """Store the callback (sync or async) used as the log sink."""
        self.log_callback = log_callback

    async def log(self, level: str, message: str, flush: bool = False, end: str = "\n"):
        """
        Dispatch *message* at *level* through the configured sink.

        Args:
            level: Log level
            message: Message to log
            flush: If True, force immediate output (raw mode, no prefix)
            end: String appended after message (default: "\n")
        """
        await log_with_callback_async(level, message, log_callback=self.log_callback, flush=flush, end=end)

    async def debug(self, message: str, flush: bool = False, end: str = "\n"):
        """Shortcut for log("debug", ...)."""
        await self.log("debug", message, flush=flush, end=end)

    async def info(self, message: str, flush: bool = False, end: str = "\n"):
        """Shortcut for log("info", ...)."""
        await self.log("info", message, flush=flush, end=end)

    async def warning(self, message: str, flush: bool = False, end: str = "\n"):
        """Shortcut for log("warning", ...)."""
        await self.log("warning", message, flush=flush, end=end)

    async def error(self, message: str, flush: bool = False, end: str = "\n"):
        """Shortcut for log("error", ...)."""
        await self.log("error", message, flush=flush, end=end)

    async def success(self, message: str, flush: bool = False, end: str = "\n"):
        """Shortcut for log("success", ...)."""
        await self.log("success", message, flush=flush, end=end)

    async def critical(self, message: str, flush: bool = False, end: str = "\n"):
        """Shortcut for log("critical", ...)."""
        await self.log("critical", message, flush=flush, end=end)
@@ -0,0 +1,55 @@
1
+ # Copyright (c) ModelScope Contributors. All rights reserved.
2
+ from typing import List, Optional, Union
3
+
4
+
5
class TokenizerUtil:
    """Fast tokenizer utility backed by modelscope's AutoTokenizer."""

    def __init__(self, model_id: Optional[str] = None):
        """
        Tokenizer encoding and counting utility.

        Args:
            model_id: Model ID for loading the tokenizer. Defaults to "Qwen/Qwen3-8B".
        """
        from modelscope import AutoTokenizer

        self.tokenizer = AutoTokenizer.from_pretrained(model_id or "Qwen/Qwen3-8B")

    def encode(self, content: str) -> List[int]:
        """Encode text into token IDs.

        Args:
            content: Input text string.

        Returns:
            List of token IDs; empty/whitespace-only input yields [].
        """
        stripped = content.strip()
        if not stripped:
            return []
        return self.tokenizer.encode(stripped)

    def count_tokens(self, contents: Union[str, List[str]]) -> Union[int, List[int]]:
        """
        Count tokens for one or more texts.

        Args:
            contents: A single string or a list of strings.

        Returns:
            A single int when exactly one text is provided (either as a bare
            string or a one-element list), otherwise a list of counts.
        """
        batch = [contents] if isinstance(contents, str) else contents
        counts = [len(self.encode(text)) for text in batch]
        return counts[0] if len(counts) == 1 else counts
+ return counts