sirchmunk 0.0.0__py3-none-any.whl → 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. sirchmunk/__init__.py +8 -0
  2. sirchmunk/base.py +17 -0
  3. sirchmunk/insight/__init__.py +4 -0
  4. sirchmunk/insight/text_insights.py +292 -0
  5. sirchmunk/learnings/__init__.py +1 -0
  6. sirchmunk/learnings/evidence_processor.py +525 -0
  7. sirchmunk/learnings/knowledge_base.py +232 -0
  8. sirchmunk/llm/__init__.py +2 -0
  9. sirchmunk/llm/openai_chat.py +247 -0
  10. sirchmunk/llm/prompts.py +216 -0
  11. sirchmunk/retrieve/__init__.py +1 -0
  12. sirchmunk/retrieve/base.py +25 -0
  13. sirchmunk/retrieve/text_retriever.py +1026 -0
  14. sirchmunk/scan/__init__.py +1 -0
  15. sirchmunk/scan/base.py +18 -0
  16. sirchmunk/scan/file_scanner.py +373 -0
  17. sirchmunk/scan/web_scanner.py +18 -0
  18. sirchmunk/scheduler/__init__.py +0 -0
  19. sirchmunk/schema/__init__.py +2 -0
  20. sirchmunk/schema/cognition.py +106 -0
  21. sirchmunk/schema/context.py +25 -0
  22. sirchmunk/schema/knowledge.py +318 -0
  23. sirchmunk/schema/metadata.py +658 -0
  24. sirchmunk/schema/request.py +221 -0
  25. sirchmunk/schema/response.py +20 -0
  26. sirchmunk/schema/snapshot.py +346 -0
  27. sirchmunk/search.py +475 -0
  28. sirchmunk/storage/__init__.py +7 -0
  29. sirchmunk/storage/duckdb.py +676 -0
  30. sirchmunk/storage/knowledge_manager.py +720 -0
  31. sirchmunk/utils/__init__.py +15 -0
  32. sirchmunk/utils/constants.py +15 -0
  33. sirchmunk/utils/deps.py +23 -0
  34. sirchmunk/utils/file_utils.py +70 -0
  35. sirchmunk/utils/install_rga.py +124 -0
  36. sirchmunk/utils/log_utils.py +360 -0
  37. sirchmunk/utils/tokenizer_util.py +55 -0
  38. sirchmunk/utils/utils.py +108 -0
  39. sirchmunk/version.py +1 -1
  40. sirchmunk-0.0.1.dist-info/METADATA +416 -0
  41. sirchmunk-0.0.1.dist-info/RECORD +45 -0
  42. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.dist-info}/WHEEL +1 -1
  43. sirchmunk-0.0.0.dist-info/METADATA +0 -26
  44. sirchmunk-0.0.0.dist-info/RECORD +0 -8
  45. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.dist-info}/entry_points.txt +0 -0
  46. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.dist-info}/licenses/LICENSE +0 -0
  47. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,15 @@
1
# Copyright (c) ModelScope Contributors. All rights reserved.

from sirchmunk.utils.log_utils import (
    AsyncLogger,
    SyncLogger,
    LogCallback,
    create_logger,
)

# Public API of sirchmunk.utils: the logging helpers re-exported from
# log_utils so callers can import them from the package root.
__all__ = [
    "create_logger",
    "AsyncLogger",
    "SyncLogger",
    "LogCallback",
]
@@ -0,0 +1,15 @@
1
# Copyright (c) ModelScope Contributors. All rights reserved.
import os
from pathlib import Path

# Limit for concurrent RGA (ripgrep-all) requests; override via env var.
GREP_CONCURRENT_LIMIT = int(os.getenv("GREP_CONCURRENT_LIMIT", "5"))

# LLM Configuration (OpenAI-compatible endpoint).
# Defaults to DashScope's compatible-mode API; all values env-overridable.
LLM_BASE_URL = os.getenv("LLM_BASE_URL", "https://dashscope.aliyuncs.com/compatible-mode/v1")
LLM_API_KEY = os.getenv("LLM_API_KEY", "")
LLM_MODEL_NAME = os.getenv("LLM_MODEL_NAME", "qwen3-max")

# Search Configuration: working directory used for caches/storage.
# WORK_PATH may be redirected via the WORK_PATH environment variable.
DEFAULT_WORK_PATH = os.path.expanduser("~/sirchmunk")
WORK_PATH = os.getenv("WORK_PATH", DEFAULT_WORK_PATH)
@@ -0,0 +1,23 @@
1
+ import shutil
2
+ import warnings
3
+
4
def check_dependencies() -> bool:
    """
    Check if required dependencies 'rg' (ripgrep) and 'rga' (ripgrep-all) are installed.

    Emits a warning and returns False as soon as the first missing binary is
    found; returns True only when both are on PATH.
    """
    required = (
        ("rg", "⚠️ [Sirchmunk Warning] Missing dependency: 'rg' (ripgrep).\n"),
        ("rga", "⚠️ [Sirchmunk Warning] Missing dependency: 'rga' (ripgrep-all).\n"),
    )

    for binary, notice in required:
        if shutil.which(binary) is None:
            warnings.warn("\n\n" + notice)
            return False

    return True
@@ -0,0 +1,70 @@
1
+ # Copyright (c) ModelScope Contributors. All rights reserved.
2
+ import hashlib
3
+ import os
4
+ from pathlib import Path
5
+ from typing import Union
6
+
7
+ from kreuzberg import ExtractionResult, extract_file
8
+ from loguru import logger
9
+
10
+
11
async def fast_extract(file_path: Union[str, Path]) -> ExtractionResult:
    """
    Extract text content from a document file (docx, pptx, pdf, xlsx, ...).

    Format detection and parsing are delegated entirely to kreuzberg's
    ``extract_file``; this is a thin async pass-through.
    """
    return await extract_file(file_path=file_path)
18
+
19
+
20
def get_fast_hash(file_path: Union[str, Path], sample_size: int = 8192) -> Union[str, None]:
    """
    Compute a fast partial hash (fingerprint) of a file.

    Combines: File Size + MD5 of Head Chunk + Tail Chunk. This avoids reading
    large files in full, making it efficient for large-scale scans. MD5 is
    used as a fingerprint only, not for security.

    Args:
        file_path: Path to the file to fingerprint.
        sample_size: Bytes to read from each of the head and tail (default 8192).

    Returns:
        ``"<md5hex>_<size>"`` string, or ``None`` if the file cannot be read.
    """
    file_path = Path(file_path)
    try:
        # Get metadata first (O(1) operation)
        file_size = file_path.stat().st_size

        # If the file is smaller than the combined sample size, read it entirely
        if file_size <= sample_size * 2:
            with open(file_path, "rb") as f:
                return f"{hashlib.md5(f.read()).hexdigest()}_{file_size}"

        # Large file sampling: Read head and tail to avoid full disk I/O
        hash_content = hashlib.md5()
        with open(file_path, "rb") as f:
            hash_content.update(f.read(sample_size))
            f.seek(-sample_size, os.SEEK_END)
            hash_content.update(f.read(sample_size))

        # Mix the file size into the hash string to minimize collisions
        return f"{hash_content.hexdigest()}_{file_size}"
    except OSError:
        # OSError covers FileNotFoundError/PermissionError plus other
        # filesystem failures (e.g. IsADirectoryError, device errors) that
        # the previous narrow tuple let escape despite the same intent:
        # files deleted mid-scan or inaccessible should yield None.
        logger.warning("File not found or inaccessible: {}", file_path)
        return None
49
+
50
+
51
class StorageStructure:
    """
    Standardized directory and file naming conventions for caching and storage.

    All values are relative names; callers join them onto their own base path.
    """

    # Root cache directory name.
    CACHE_DIR = ".cache"

    # Subdirectory for metadata artifacts.
    METADATA_DIR = "metadata"

    # Subdirectory for ripgrep-all (rga) related artifacts.
    GREP_DIR = "rga"

    # Subdirectory for knowledge artifacts.
    KNOWLEDGE_DIR = "knowledge"

    # Subdirectory for cognition artifacts.
    COGNITION_DIR = "cognition"

    # `.idx` -> Index file for fast lookup of cluster content
    CLUSTER_INDEX_FILE = "cluster.idx"

    # `.mpk` -> MessagePack serialized cluster content
    CLUSTER_CONTENT_FILE = "cluster.mpk"
@@ -0,0 +1,124 @@
1
+ # Copyright (c) ModelScope Contributors. All rights reserved.
2
+ import json
3
+ import os
4
+ import platform
5
+ import shutil
6
+ import subprocess
7
+ import sys
8
+ import tarfile
9
+ import tempfile
10
+ import urllib.request
11
+ import zipfile
12
+ from pathlib import Path
13
+ from typing import Optional, List
14
+
15
+
16
def _download_and_extract(url: str, ext: str, required_bins: List[str], install_dir: Path, bin_label: str):
    """Downloads and extracts specific binaries from an archive.

    Args:
        url: Direct download URL of the release archive.
        ext: Archive extension, either ".zip" or ".tar.gz".
        required_bins: Exact binary filenames to pull out of the archive.
        install_dir: Existing directory the binaries are written into.
        bin_label: Human-readable component name (currently unused in the body).
    """
    try:
        # Stream the archive into a named temp file. delete=False so the file
        # survives the `with` and can be reopened by zipfile/tarfile.
        with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp_file:
            tmp_path = Path(tmp_file.name)
            with urllib.request.urlopen(url, timeout=60) as response:
                shutil.copyfileobj(response, tmp_file)

        temp_extract_dir = Path(tempfile.mkdtemp())
        if ext == ".zip":
            with zipfile.ZipFile(tmp_path, "r") as zf:
                for member in zf.namelist():
                    # Flatten to basename: only the wanted binaries are
                    # written, directly into install_dir.
                    fname = os.path.basename(member)
                    if fname in required_bins:
                        with zf.open(member) as source, open(install_dir / fname, "wb") as f:
                            shutil.copyfileobj(source, f)
                        (install_dir / fname).chmod(0o755)
        else:  # .tar.gz
            with tarfile.open(tmp_path, "r:gz") as tf:
                for member in tf.getmembers():
                    fname = os.path.basename(member.name)
                    if fname in required_bins:
                        # Fix for Python 3.14 DeprecationWarning
                        # (extraction filters, PEP 706).
                        tf.extract(member, temp_extract_dir, filter='data')
                        target = install_dir / fname
                        # Tar members keep their archive path; move the
                        # extracted file into install_dir and mark executable.
                        shutil.move(str(temp_extract_dir / member.name), str(target))
                        target.chmod(0o755)
    finally:
        # Best-effort cleanup. The locals() guards handle failures that occur
        # before the temp names were ever bound (e.g. urlopen raising).
        if 'tmp_path' in locals(): tmp_path.unlink(missing_ok=True)
        if 'temp_extract_dir' in locals(): shutil.rmtree(temp_extract_dir, ignore_errors=True)
46
+
47
+
48
+ def _verify_bin(path: Path, expected_name: str) -> bool:
49
+ """Check if binary exists and responds to --version."""
50
+ if not path.exists(): return False
51
+ try:
52
+ res = subprocess.run([str(path), "--version"], capture_output=True, text=True, timeout=5)
53
+ return res.returncode == 0
54
+ except:
55
+ return False
56
+
57
+
58
def _install_component(repo: str, bin_name: str, required_bins: List[str], install_dir: Path, force: bool) -> str:
    """Generic installer for ripgrep and rga.

    Resolves the latest GitHub release asset matching the host platform,
    downloads it, extracts the required binaries, and verifies the result.

    Args:
        repo: GitHub repo in "owner/name" form.
        bin_name: Primary binary name (e.g. "rg" or "rga").
        required_bins: All binary filenames to extract from the archive.
        install_dir: Directory to install into (must exist).
        force: If True, reinstall even when a working binary is present.

    Returns:
        Absolute path to the installed primary binary.

    Raises:
        RuntimeError: On unsupported architecture or any download/extract/
            verification failure (original cause chained via ``from``).
    """
    system = platform.system().lower()
    machine = platform.machine().lower()

    # Platform mapping
    arch = "x86_64" if machine in ("x86_64", "amd64") else "aarch64" if machine in ("arm64", "aarch64") else None
    if not arch:
        raise RuntimeError(f"Unsupported arch: {machine}")

    if system == "windows":
        os_tag, ext = "pc-windows-msvc", ".zip"
    elif system == "darwin":
        os_tag, ext = "apple-darwin", ".tar.gz"
    else:
        # ripgrep and rga both use musl for static linux binaries
        os_tag, ext = "unknown-linux-musl", ".tar.gz"

    final_bin = install_dir / (bin_name + (".exe" if system == "windows" else ""))

    # Reuse an already-working binary unless a reinstall is forced.
    if not force and _verify_bin(final_bin, bin_name):
        return str(final_bin)

    print(f"Installing {bin_name} from {repo}...", file=sys.stderr)
    try:
        api_url = f"https://api.github.com/repos/{repo}/releases/latest"
        req = urllib.request.Request(api_url, headers={"User-Agent": "Mozilla/5.0"})
        with urllib.request.urlopen(req, timeout=10) as resp:
            assets = json.loads(resp.read())["assets"]

        # Find asset (ripgrep assets often contain 'x86_64-unknown-linux-musl').
        # A missing asset previously surfaced as a bare StopIteration; raise a
        # descriptive error instead.
        asset = next(
            (a for a in assets if arch in a["name"] and os_tag in a["name"] and a["name"].endswith(ext)),
            None,
        )
        if asset is None:
            raise RuntimeError(f"No release asset matches {arch}/{os_tag}{ext} in {repo}")

        _download_and_extract(asset["browser_download_url"], ext, required_bins, install_dir, bin_name)

        if not _verify_bin(final_bin, bin_name):
            raise RuntimeError(f"Verification failed for {bin_name}")
        return str(final_bin)
    except Exception as e:
        # Chain the original exception so the root cause stays debuggable.
        raise RuntimeError(f"Failed to install {bin_name}: {e}") from e
96
+
97
+
98
def install_rga(force_reinstall: bool = False, install_dir: Optional[str] = None) -> str:
    """Main entry: Installs ripgrep (rg) then ripgrep-all (rga).

    Args:
        force_reinstall: Reinstall even if working binaries already exist.
        install_dir: Target directory; defaults to a per-user bin directory.

    Returns:
        Path to the installed ``rga`` binary.
    """
    is_windows = platform.system().lower() == "windows"

    if install_dir is None:
        install_dir = (
            os.path.expandvars(r"%LOCALAPPDATA%\bin")
            if is_windows
            else os.path.expanduser("~/.local/bin")
        )

    target_dir = Path(install_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    # 1. Install ripgrep (rg)
    _install_component(
        "BurntSushi/ripgrep",
        "rg",
        ["rg.exe" if is_windows else "rg"],
        target_dir,
        force_reinstall,
    )

    # 2. Install ripgrep-all (rga) and its preprocessor
    rga_bins = ["rga.exe", "rga-preproc.exe"] if is_windows else ["rga", "rga-preproc"]
    return _install_component("phiresky/ripgrep-all", "rga", rga_bins, target_dir, force_reinstall)
116
+
117
+
118
# CLI entry point: install both tools and report the install directory.
if __name__ == "__main__":
    try:
        path = install_rga()
        print(f"SUCCESS: ripgrep and ripgrep-all are ready at: {os.path.dirname(path)}")
    except Exception as e:
        # Surface the failure on stderr and exit non-zero for scripting use.
        print(f"ERROR: {e}", file=sys.stderr)
        sys.exit(1)
@@ -0,0 +1,360 @@
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+ """
3
+ Unified logging utilities for Sirchmunk
4
+ Provides flexible logging with optional callbacks and fallback to loguru
5
+ Supports both synchronous and asynchronous logging
6
+ """
7
+ import asyncio
8
+ from typing import Any, Awaitable, Callable, Optional, Union
9
+
10
+ from loguru import logger as default_logger
11
+
12
+
13
+ # Type alias for log callback function (can be sync or async)
14
+ # Signature: (level: str, message: str, end: str, flush: bool) -> None or Awaitable[None]
15
+ LogCallback = Optional[Callable[[str, str, str, bool], Union[None, Awaitable[None]]]]
16
+
17
+
18
async def log_with_callback_async(
    level: str,
    message: str,
    log_callback: LogCallback = None,
    flush: bool = False,
    end: str = "\n",
) -> None:
    """
    Route a log message through *log_callback* when given, else through loguru.

    Supports both synchronous and asynchronous callbacks. Without a callback,
    the message is formatted locally and emitted via loguru's default logger.

    Args:
        level: Log level (e.g., "info", "debug", "error", "warning", "success")
        message: Message content to log
        log_callback: Optional sync or async callable invoked as
            (level, message, end, flush). If None, loguru is used.
        flush: If True, force immediate output and use raw mode (no
            timestamp/level prefix) — handy for progress indicators.
        end: String appended after the message (default: "\n")

    Examples:
        # Default loguru output (with prefix)
        await log_with_callback_async("info", "Processing started")

        # Progress indicator without prefix
        await log_with_callback_async("info", "Processing...", flush=True, end="")
        await log_with_callback_async("info", " Done!", flush=True, end="\n")

        # Custom async callback
        async def my_callback(level: str, msg: str, end: str, flush: bool):
            await websocket.send_text(f"[{level}] {msg}")
        await log_with_callback_async("debug", "Custom log", log_callback=my_callback)
    """
    if log_callback is None:
        # Local fallback: format here and hand off to loguru.
        full_message = (message + end) if end else message
        if flush:
            # Raw mode: no timestamp/level prefix.
            default_logger.opt(raw=True).log(level.upper(), full_message)
        else:
            # Normal formatted output with prefix.
            getattr(default_logger, level.lower())(full_message.rstrip("\n"))
        return

    # Callback path: forward raw pieces and let the callback format them.
    callback_is_async = asyncio.iscoroutinefunction(log_callback)
    if callback_is_async:
        await log_callback(level, message, end, flush)
    else:
        log_callback(level, message, end, flush)

    # For async callbacks with flush requested, yield control so the
    # consumer can process the message immediately.
    if flush and callback_is_async:
        await asyncio.sleep(0)
76
+
77
+
78
# Strong references to fire-and-forget logging tasks. The event loop keeps
# only weak references to tasks, so without this set a task scheduled by
# log_with_callback could be garbage-collected before it ever runs.
_background_log_tasks = set()


def log_with_callback(
    level: str,
    message: str,
    log_callback: LogCallback = None,
    flush: bool = False,
    end: str = "\n",
) -> None:
    """
    Synchronous version of log_with_callback_async.

    Args:
        level: Log level (e.g., "info", "debug", "error", "warning", "success")
        message: Message content to log
        log_callback: Optional callback. A sync callback is invoked directly;
            an async callback is scheduled on the running loop (or run via
            asyncio.run() when no loop is active).
        flush: If True, force immediate output and use raw mode (no
            timestamp/level prefix). Useful for progress indicators.
        end: String appended after the message (default: "\n")

    Examples:
        # Normal logging (with prefix)
        log_with_callback("info", "Processing started")

        # Progress indicator without prefix (flush=True removes formatting)
        log_with_callback("info", "Loading", flush=True, end="")
        log_with_callback("info", "...", flush=True, end="")
        log_with_callback("info", " Done!", flush=True)
        # Output: Loading... Done!
    """
    if log_callback is not None:
        # Pass original message, end, and flush to callback;
        # let the callback handle message formatting.
        if not asyncio.iscoroutinefunction(log_callback):
            log_callback(level, message, end, flush)
        else:
            # Async callback provided in sync mode: schedule safely.
            # Avoid asyncio.run() when already inside a running event loop.
            try:
                running_loop = asyncio.get_running_loop()
            except RuntimeError:
                asyncio.run(log_callback(level, message, end, flush))
            else:
                task = running_loop.create_task(log_callback(level, message, end, flush))
                # Fix: keep a strong reference until completion so the task
                # cannot be garbage-collected mid-flight (see asyncio
                # create_task documentation).
                _background_log_tasks.add(task)
                task.add_done_callback(_background_log_tasks.discard)
    else:
        # Fallback to loguru logger (process message locally)
        full_message = message + end if end else message
        if flush:
            # Use raw mode (no prefix) for flush=True
            default_logger.opt(raw=True).log(level.upper(), full_message)
        else:
            # Normal formatted output with prefix
            getattr(default_logger, level.lower())(full_message.rstrip("\n"))
130
+
131
+
132
def create_logger(log_callback: LogCallback = None, enable_async: bool = True) -> Union["AsyncLogger", "SyncLogger"]:
    """
    Factory: build a logger instance bound to *log_callback*.

    Provides loguru-compatible level methods (info, warning, ...) wired to the
    given callback; without a callback, output falls back to loguru.

    Args:
        log_callback: Optional sink invoked as (level, message, end, flush).
        enable_async: True -> AsyncLogger (awaitable methods);
            False -> SyncLogger (blocking methods).

    Returns:
        AsyncLogger or SyncLogger instance, depending on enable_async.

    Example:
        # Async logger with a custom callback
        async def my_callback(level: str, msg: str, end: str, flush: bool):
            print(f"[{level}] {msg}")

        logger = create_logger(log_callback=my_callback, enable_async=True)
        await logger.info("Starting process")

        # Sync logger without a callback (uses default loguru)
        sync_logger = create_logger(enable_async=False)
        sync_logger.info("Starting process")
    """
    logger_cls = AsyncLogger if enable_async else SyncLogger
    return logger_cls(log_callback=log_callback)
172
+
173
+
174
class SyncLogger:
    """
    Blocking logger with an optional callback sink.

    Every level method forwards to log_with_callback. With no callback
    configured, messages fall through to loguru with its normal formatting.
    Setting flush=True switches to raw mode (no timestamp/level prefix), which
    together with end="" supports print-style progress output.

    Example:
        # With a custom sync callback
        def my_callback(level: str, msg: str, end: str, flush: bool):
            print(f"[{level}] {msg}", end="")

        logger = SyncLogger(log_callback=my_callback)
        logger.info("Starting process")
        logger.error("Failed to connect")

        # Progress indicator (flush=True removes the prefix)
        logger.info("Processing", flush=True, end="")
        logger.info("...", flush=True, end="")
        logger.info(" Done!", flush=True)
        # Output: Processing... Done!

        # Without a callback (plain loguru output)
        SyncLogger().info("Using default logger")
    """

    def __init__(self, log_callback: LogCallback = None):
        """Store the (preferably synchronous) callback used as the log sink."""
        self.log_callback = log_callback

    def log(self, level: str, message: str, flush: bool = False, end: str = "\n"):
        """Dispatch *message* at *level* through the configured sink."""
        log_with_callback(level, message, log_callback=self.log_callback, flush=flush, end=end)

    def debug(self, message: str, flush: bool = False, end: str = "\n"):
        """Shortcut for log("debug", ...)."""
        self.log("debug", message, flush=flush, end=end)

    def info(self, message: str, flush: bool = False, end: str = "\n"):
        """Shortcut for log("info", ...)."""
        self.log("info", message, flush=flush, end=end)

    def warning(self, message: str, flush: bool = False, end: str = "\n"):
        """Shortcut for log("warning", ...)."""
        self.log("warning", message, flush=flush, end=end)

    def error(self, message: str, flush: bool = False, end: str = "\n"):
        """Shortcut for log("error", ...)."""
        self.log("error", message, flush=flush, end=end)

    def success(self, message: str, flush: bool = False, end: str = "\n"):
        """Shortcut for log("success", ...)."""
        self.log("success", message, flush=flush, end=end)

    def critical(self, message: str, flush: bool = False, end: str = "\n"):
        """Shortcut for log("critical", ...)."""
        self.log("critical", message, flush=flush, end=end)
241
+
242
+
243
class AsyncLogger:
    """
    Awaitable logger with an optional callback sink.

    Every level method is a coroutine that forwards to
    log_with_callback_async; both sync and async callbacks are supported.
    With no callback configured, messages fall through to loguru with its
    normal formatting. Setting flush=True switches to raw mode (no
    timestamp/level prefix), which together with end="" supports print-style
    progress output.

    Example:
        # With a custom async callback
        async def my_callback(level: str, msg: str, end: str, flush: bool):
            await websocket.send(f"{level}: {msg}")

        logger = AsyncLogger(log_callback=my_callback)
        await logger.info("Starting process")
        await logger.error("Failed to connect")

        # Progress indicator (flush=True removes the prefix)
        await logger.info("Processing", flush=True, end="")
        await logger.info("...", flush=True, end="")
        await logger.info(" Done!", flush=True)
        # Output: Processing... Done!

        # Without a callback (plain loguru output)
        await AsyncLogger().info("Using default logger")
    """

    def __init__(self, log_callback: LogCallback = None):
        """Store the callback (sync or async) used as the log sink."""
        self.log_callback = log_callback

    async def log(self, level: str, message: str, flush: bool = False, end: str = "\n"):
        """
        Dispatch *message* at *level* through the configured sink.

        Args:
            level: Log level
            message: Message to log
            flush: If True, force immediate output (raw mode, no prefix)
            end: String appended after message (default: "\n")
        """
        await log_with_callback_async(level, message, log_callback=self.log_callback, flush=flush, end=end)

    async def debug(self, message: str, flush: bool = False, end: str = "\n"):
        """Shortcut for log("debug", ...)."""
        await self.log("debug", message, flush=flush, end=end)

    async def info(self, message: str, flush: bool = False, end: str = "\n"):
        """Shortcut for log("info", ...)."""
        await self.log("info", message, flush=flush, end=end)

    async def warning(self, message: str, flush: bool = False, end: str = "\n"):
        """Shortcut for log("warning", ...)."""
        await self.log("warning", message, flush=flush, end=end)

    async def error(self, message: str, flush: bool = False, end: str = "\n"):
        """Shortcut for log("error", ...)."""
        await self.log("error", message, flush=flush, end=end)

    async def success(self, message: str, flush: bool = False, end: str = "\n"):
        """Shortcut for log("success", ...)."""
        await self.log("success", message, flush=flush, end=end)

    async def critical(self, message: str, flush: bool = False, end: str = "\n"):
        """Shortcut for log("critical", ...)."""
        await self.log("critical", message, flush=flush, end=end)
@@ -0,0 +1,55 @@
1
+ # Copyright (c) ModelScope Contributors. All rights reserved.
2
+ from typing import List, Optional, Union
3
+
4
+
5
class TokenizerUtil:
    """Fast tokenizer utility backed by modelscope's AutoTokenizer."""

    def __init__(self, model_id: Optional[str] = None):
        """
        Tokenizer encoding and counting utility.

        Args:
            model_id: Model ID for loading the tokenizer. Defaults to "Qwen/Qwen3-8B".
        """
        from modelscope import AutoTokenizer

        self.tokenizer = AutoTokenizer.from_pretrained(model_id or "Qwen/Qwen3-8B")

    def encode(self, content: str) -> List[int]:
        """Encode text into token IDs.

        Args:
            content: Input text string.

        Returns:
            List of token IDs; empty/whitespace-only input yields [].
        """
        stripped = content.strip()
        if not stripped:
            return []
        return self.tokenizer.encode(stripped)

    def count_tokens(self, contents: Union[str, List[str]]) -> Union[int, List[int]]:
        """
        Count tokens for one or more texts.

        Args:
            contents: A single string or a list of strings.

        Returns:
            A single int when exactly one text is provided (either as a bare
            string or a one-element list), otherwise a list of counts.
        """
        batch = [contents] if isinstance(contents, str) else contents
        counts = [len(self.encode(text)) for text in batch]
        return counts[0] if len(counts) == 1 else counts
+ return counts