lionagi 0.9.13__py3-none-any.whl → 0.9.15__py3-none-any.whl

This diff shows the changes between publicly available package versions as released to a supported registry. It is provided for informational purposes only.
Files changed (33)
  1. lionagi/libs/file/chunk.py +3 -3
  2. lionagi/libs/file/concat_files.py +83 -0
  3. lionagi/libs/file/process.py +49 -36
  4. lionagi/libs/token_transform/base.py +52 -0
  5. lionagi/libs/token_transform/perplexity.py +41 -29
  6. lionagi/libs/token_transform/symbolic_compress_context.py +147 -0
  7. lionagi/libs/token_transform/synthlang.py +9 -415
  8. lionagi/libs/token_transform/synthlang_/base.py +130 -0
  9. lionagi/libs/token_transform/synthlang_/resources/frameworks/abstract_algebra.toml +11 -0
  10. lionagi/libs/token_transform/synthlang_/resources/frameworks/category_theory.toml +11 -0
  11. lionagi/libs/token_transform/synthlang_/resources/frameworks/complex_analysis.toml +11 -0
  12. lionagi/libs/token_transform/synthlang_/resources/frameworks/framework_options.json +52 -0
  13. lionagi/libs/token_transform/synthlang_/resources/frameworks/group_theory.toml +11 -0
  14. lionagi/libs/token_transform/synthlang_/resources/frameworks/math_logic.toml +11 -0
  15. lionagi/libs/token_transform/synthlang_/resources/frameworks/reflective_patterns.toml +11 -0
  16. lionagi/libs/token_transform/synthlang_/resources/frameworks/set_theory.toml +11 -0
  17. lionagi/libs/token_transform/synthlang_/resources/frameworks/topology_fundamentals.toml +11 -0
  18. lionagi/libs/token_transform/synthlang_/resources/mapping/rust_chinese_mapping.toml +60 -0
  19. lionagi/libs/token_transform/synthlang_/resources/utility/base_synthlang_system_prompt.toml +11 -0
  20. lionagi/libs/token_transform/synthlang_/translate_to_synthlang.py +140 -0
  21. lionagi/libs/token_transform/types.py +15 -0
  22. lionagi/protocols/adapters/toml_adapter.py +204 -0
  23. lionagi/protocols/adapters/types.py +3 -0
  24. lionagi/protocols/graph/node.py +3 -0
  25. lionagi/service/endpoints/token_calculator.py +8 -0
  26. lionagi/service/imodel.py +14 -13
  27. lionagi/session/branch.py +6 -6
  28. lionagi/tools/base.py +62 -0
  29. lionagi/version.py +1 -1
  30. {lionagi-0.9.13.dist-info → lionagi-0.9.15.dist-info}/METADATA +2 -1
  31. {lionagi-0.9.13.dist-info → lionagi-0.9.15.dist-info}/RECORD +33 -15
  32. {lionagi-0.9.13.dist-info → lionagi-0.9.15.dist-info}/WHEEL +0 -0
  33. {lionagi-0.9.13.dist-info → lionagi-0.9.15.dist-info}/licenses/LICENSE +0 -0
--- a/lionagi/libs/file/chunk.py
+++ b/lionagi/libs/file/chunk.py
@@ -216,7 +216,7 @@ def chunk_content(
     chunk_size: int = 1024,
     overlap: float = 0,
     threshold: int = 256,
-    metadata: dict[str, Any] = {},
+    metadata: dict[str, Any] = None,
     return_tokens: bool = False,
     as_node: bool = False,
     **kwargs: Any,
@@ -268,7 +268,7 @@ def chunk_content(
                     "chunk_id": i + 1,
                     "total_chunks": len(chunks),
                     "chunk_size": len(chunk),
-                    **metadata,
+                    **(metadata or {}),
                 },
             )
             for i, chunk in enumerate(chunks)
@@ -280,7 +280,7 @@ def chunk_content(
                 "chunk_id": i + 1,
                 "total_chunks": len(chunks),
                 "chunk_size": len(chunk),
-                **metadata,
+                **(metadata or {}),
             }
             for i, chunk in enumerate(chunks)
         ]
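
The `metadata` change above swaps a mutable default argument for `None`, unpacked later as `**(metadata or {})`. A minimal standalone sketch of the pitfall this avoids (the function names here are illustrative, not from lionagi):

```python
def tag_bad(item, meta={}):  # one dict object is shared across ALL calls
    meta["last"] = item
    return meta

def tag_good(item, meta=None):  # fresh dict per call unless one is passed
    meta = meta or {}
    meta["last"] = item
    return meta

a, b = tag_bad("a"), tag_bad("b")
print(a is b, a)   # True {'last': 'b'} -- state leaked between calls
a, b = tag_good("a"), tag_good("b")
print(a is b, a)   # False {'last': 'a'}
```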
--- /dev/null
+++ b/lionagi/libs/file/concat_files.py
@@ -0,0 +1,83 @@
+from pathlib import Path
+
+from lionagi.utils import create_path
+
+from .process import dir_to_files
+
+
+def concat_files(
+    data_path: str | Path | list,
+    file_types: list[str],
+    output_dir: str | Path = None,
+    output_filename: str = None,
+    file_exist_ok: bool = True,
+    recursive: bool = True,
+    verbose: bool = True,
+    threshold: int = 0,
+    return_fps: bool = False,
+    return_files: bool = False,
+    **kwargs,
+) -> list[str] | str | tuple[list[str], list[Path]] | tuple[str, list[Path]]:
+    """
+    data_path: str or Path or list of str or Path, the directory or file paths to concatenate.
+    file_types: list of str, the file types to concatenate. [e.g. ['.txt', '.md']]
+    output_dir: str or Path, the directory to save the concatenated file. If provided, will save the file.
+    output_filename: str, the filename to save the concatenated file.
+    file_exist_ok: bool, if True, overwrite the existing file. Default is True.
+    recursive: bool, if True, search files recursively. Default is True.
+    verbose: bool, if True, print the output path. Default is True.
+    threshold: int, the minimum number of chars for the file to be considered valid to concatenate.
+    kwargs: additional keyword arguments to pass to create_path.
+    """
+    persist_path = None
+    if output_dir:
+        if not output_filename:
+            output_filename = "concatenated_text.txt"
+            kwargs["timestamp"] = kwargs.get("timestamp", True)
+            kwargs["random_hash_digits"] = kwargs.get("random_hash_digits", 6)
+        output_filename = output_filename or "concatenated_text.txt"
+        persist_path = create_path(
+            output_dir, output_filename, file_exist_ok=file_exist_ok, **kwargs
+        )
+
+    texts = []
+    data_path = (
+        [str(data_path)] if not isinstance(data_path, list) else data_path
+    )
+    data_path = sorted(data_path)
+    data_path = [Path(dp) for dp in data_path if Path(dp).exists()]
+
+    for dp in data_path:
+        fps = dir_to_files(dp, recursive=recursive, file_types=file_types)
+
+        data_path = sorted([str(i) for i in fps])
+        data_path: list[Path] = [
+            Path(dp) for dp in data_path if Path(dp).exists()
+        ]
+
+        for fp in data_path:
+            text = fp.read_text(encoding="utf-8")
+            if len(text) >= threshold:
+                fp_text = (
+                    "\n----------------------------------------------------\n"
+                    f"{str(fp)}"
+                    "\n----------------------------------------------------\n"
+                )
+                text = fp_text + text
+                texts.append(text)
+
+    text = "\n".join(texts)
+    if persist_path:
+        persist_path.write_text(text, encoding="utf-8")
+        if verbose:
+            print(f"Concatenated {len(fps)} files to {persist_path}")
+            print(f"The file contains {len(text)} characters.")
+
+    if return_files:
+        if return_fps:
+            return texts, fps
+        return texts
+
+    if return_fps:
+        return text, fps
+    return text
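
A usage sketch for the new `concat_files` helper, assuming it is importable from `lionagi.libs.file.concat_files` as listed above; the directory paths are placeholders:

```python
from lionagi.libs.file.concat_files import concat_files

# Concatenate all .md and .py files under ./src into one string, skipping
# files shorter than 100 characters, and persist the result under ./out.
text = concat_files(
    "./src",
    file_types=[".md", ".py"],
    output_dir="./out",
    threshold=100,
)

# Or keep the per-file texts and their paths instead of one joined string.
texts, fps = concat_files(
    "./src",
    file_types=[".md"],
    return_files=True,
    return_fps=True,
)
```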
--- a/lionagi/libs/file/process.py
+++ b/lionagi/libs/file/process.py
@@ -164,10 +164,12 @@ def file_to_chunks(
 
 
 def chunk(
-    url_or_path: str | Path,
     *,
+    text: str | None = None,
+    url_or_path: str | Path = None,
     file_types: list[str] | None = None,  # only local files
     recursive: bool = False,  # only local files
+    tokenizer: Callable[[str], list[str]] = None,
     chunk_by: Literal["chars", "tokens"] = "chars",
     chunk_size: int = 1500,
     overlap: float = 0.1,
@@ -175,45 +177,52 @@ def chunk(
     output_file: str | Path | None = None,
     metadata: dict[str, Any] | None = None,
     reader_tool: Callable = None,
-):
-    if isinstance(url_or_path, str):
-        url_or_path = Path(url_or_path)
-
-    chunks = None
-    files = None
-    if url_or_path.exists():
-        if url_or_path.is_dir():
-            files = dir_to_files(
-                directory=url_or_path,
-                file_types=file_types,
-                recursive=recursive,
+    as_node: bool = False,
+) -> list:
+    texts = []
+    if not text:
+        if isinstance(url_or_path, str):
+            url_or_path = Path(url_or_path)
+
+        chunks = None
+        files = None
+        if url_or_path.exists():
+            if url_or_path.is_dir():
+                files = dir_to_files(
+                    directory=url_or_path,
+                    file_types=file_types,
+                    recursive=recursive,
+                )
+            elif url_or_path.is_file():
+                files = [url_or_path]
+        else:
+            files = (
+                [str(url_or_path)]
+                if not isinstance(url_or_path, list)
+                else url_or_path
             )
-    elif url_or_path.is_file():
-        files = [url_or_path]
-    else:
-        files = (
-            [str(url_or_path)]
-            if not isinstance(url_or_path, list)
-            else url_or_path
-        )
 
-    if reader_tool is None:
-        reader_tool = lambda x: x.read_text(encoding="utf-8")
+        if reader_tool is None:
+            reader_tool = lambda x: x.read_text(encoding="utf-8")
 
-    if reader_tool == "docling":
-        from lionagi.libs.package.imports import check_import
+        if reader_tool == "docling":
+            from lionagi.libs.package.imports import check_import
 
-        DocumentConverter = check_import(
-            "docling",
-            module_name="document_converter",
-            import_name="DocumentConverter",
-        )
-        converter = DocumentConverter()
-        reader_tool = lambda x: converter.convert(
-            x
-        ).document.export_to_markdown()
+            DocumentConverter = check_import(
+                "docling",
+                module_name="document_converter",
+                import_name="DocumentConverter",
+            )
+            converter = DocumentConverter()
+            reader_tool = lambda x: converter.convert(
+                x
+            ).document.export_to_markdown()
+
+        texts = lcall(files, reader_tool)
+
+    else:
+        texts = [text]
 
-    texts = lcall(files, reader_tool)
     chunks = lcall(
         texts,
         chunk_content,
@@ -224,6 +233,7 @@ def chunk(
         metadata=metadata,
         as_node=True,
         flatten=True,
+        tokenizer=tokenizer or str.split,
     )
     if threshold:
         chunks = [c for c in chunks if len(c.content) > threshold]
@@ -247,4 +257,7 @@ def chunk(
     else:
         raise ValueError(f"Unsupported output file format: {output_file}")
 
-    return chunks
+    if as_node:
+        return chunks
+
+    return [c.content for c in chunks]
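
A sketch of the reworked `chunk()` entry point, assuming it is importable from `lionagi.libs.file.process`: raw text can now be chunked directly via the new keyword-only `text` parameter, and plain strings come back unless `as_node=True`:

```python
from lionagi.libs.file.process import chunk

# Chunk an in-memory string by tokens; returns list[str] by default.
pieces = chunk(
    text="some long document ...",
    chunk_by="tokens",
    chunk_size=1500,
    overlap=0.1,
    tokenizer=str.split,  # the fallback used when no tokenizer is given
)

# Chunk files on disk and keep the Node objects instead of plain strings.
nodes = chunk(
    url_or_path="./docs",
    file_types=[".md"],
    recursive=True,
    as_node=True,
)
```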
--- /dev/null
+++ b/lionagi/libs/token_transform/base.py
@@ -0,0 +1,52 @@
+from __future__ import annotations
+
+from enum import Enum
+from pathlib import Path
+
+from pydantic import Field
+
+from lionagi.tools.base import Resource, ResourceCategory
+
+here = Path(__file__).parent.resolve()
+MAPPING_PATH = "synthlang_/resources/mapping"
+
+
+class TokenMappingTemplate(str, Enum):
+    RUST_CHINESE = "rust_chinese"
+
+    @property
+    def fp(self) -> Path:
+        return here / MAPPING_PATH / f"{self.value}_mapping.toml"
+
+
+class TokenMapping(Resource):
+    category: ResourceCategory = Field(
+        default=ResourceCategory.UTILITY, frozen=True
+    )
+    content: dict
+
+    @classmethod
+    def load_from_template(
+        cls, template: TokenMappingTemplate | str
+    ) -> TokenMapping:
+        if isinstance(template, str):
+            template = template.lower().strip()
+            template = (
+                template.replace(".toml", "")
+                .replace(" ", "_")
+                .replace("-", "_")
+                .strip()
+            )
+            if template.endswith("_mapping"):
+                template = template[:-8]
+            if "/" in template:
+                template = template.split("/")[-1]
+            template = TokenMappingTemplate(template)
+
+        if isinstance(template, TokenMappingTemplate):
+            template = template.fp
+            return cls.adapt_from(template, ".toml", many=False)
+
+        raise ValueError(
+            f"Invalid template: {template}. Must be a TokenMappingTemplate or a valid path."
+        )
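
A sketch of loading the bundled token mapping: the string form is normalized (case, spaces/hyphens, a trailing `_mapping`/`.toml` suffix, any path prefix) before resolving to the enum, so the calls below are equivalent. Assumes the classes are imported from `lionagi.libs.token_transform.base`:

```python
from lionagi.libs.token_transform.base import TokenMapping, TokenMappingTemplate

tm = TokenMapping.load_from_template(TokenMappingTemplate.RUST_CHINESE)
tm = TokenMapping.load_from_template("rust_chinese")
tm = TokenMapping.load_from_template("Rust-Chinese_mapping.toml")

print(tm.category)       # ResourceCategory.UTILITY (frozen)
print(type(tm.content))  # dict parsed from the bundled TOML resource
```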
--- a/lionagi/libs/token_transform/perplexity.py
+++ b/lionagi/libs/token_transform/perplexity.py
@@ -101,7 +101,7 @@ async def compute_perplexity(
     chat_model: iModel,
     initial_context: str = None,
     tokens: list[str] = None,
-    system_msg: str = None,
+    system: str = None,
     n_samples: int = 1,
     use_residue: bool = True,
     **kwargs,
@@ -142,9 +142,9 @@ async def compute_perplexity(
     api_calls = []
     for sample_txt in sampless:
         messages = []
-        if system_msg:
+        if system:
             if not chat_model.sequential_exchange:
-                messages.append({"role": "system", "content": system_msg})
+                messages.append({"role": "system", "content": system})
                 messages.append({"role": "user", "content": sample_txt})
             else:
                 messages.append({"role": "user", "content": sample_txt})
@@ -171,10 +171,10 @@ class LLMCompressor:
     def __init__(
         self,
        chat_model: iModel,
-        system_msg=None,
+        system=None,
        tokenizer=None,
        splitter=None,
-        target_ratio=0.2,
+        compression_ratio=0.2,
        n_samples=5,
        chunk_size=64,
        max_tokens_per_sample=80,
@@ -193,10 +193,8 @@ class LLMCompressor:
         self.chat_model = chat_model
         self.tokenizer = tokenizer
         self.splitter = splitter
-        self.system_msg = (
-            system_msg or "Concisely summarize content for storage:"
-        )
-        self.target_ratio = target_ratio
+        self.system = system or "Concisely summarize content for storage:"
+        self.compression_ratio = compression_ratio
         self.n_samples = n_samples
         self.chunk_size = chunk_size
         self.max_tokens_per_sample = max_tokens_per_sample
@@ -281,7 +279,7 @@ class LLMCompressor:
             initial_context=initial_text,
             tokens=item_toks,
             n_samples=n_samples or self.n_samples,
-            system_msg=self.system_msg,
+            system=self.system,
             use_residue=use_residue,
             **kwargs,
         )
@@ -347,6 +345,7 @@ class LLMCompressor:
         # Tokenize once to get total length
         all_tokens = self.tokenize(text)
         original_len = len(all_tokens)
+        ttl_chars = len(text)
 
         # Split text
         items = self.split(text, **split_kwargs)
@@ -363,26 +362,26 @@ class LLMCompressor:
         # Select
         selected = self.select_by_pplex(
             ranked_items=ranked,
-            target_compression_ratio=compression_ratio or self.target_ratio,
+            target_compression_ratio=compression_ratio
+            or self.compression_ratio,
             original_length=original_len,
             min_pplx=min_pplx or self.min_pplx,
         )
 
-        if self.verbose:
-            compressed_len = sum(
-                len(to_list(self.tokenize(x), dropna=True, flatten=True))
-                for x in selected
-            )
-            ratio = compressed_len / original_len if original_len else 1
-            print(
-                f"Original tokens: {original_len}\n"
-                f"Selected tokens: {compressed_len}\n"
-                f"Compression ratio: {ratio:.3f}\n"
-                f"Time: {timer() - start:.3f}s\n"
-            )
-
         # Join final
         out_str = " ".join(selected)
+
+        if self.verbose:
+            compressed_chars = len(out_str)
+            ratio = compressed_chars / ttl_chars if original_len else 1
+            msg = "------------------------------------------\n"
+            msg += f"Compression Method: Perplexity\n"
+            msg += f"Compressed Characters number: {compressed_chars}\n"
+            msg += f"Character Compression Ratio: {ratio:.1%}\n"
+            msg += f"Compression Time: {timer() - start:.3f}s\n"
+            msg += f"Compression Model: {self.chat_model.model_name}\n"
+            print(msg)
+
 
         return out_str.strip()
 
     def select_by_pplex(
@@ -419,21 +418,34 @@ class LLMCompressor:
 async def compress_text(
     text: str,
     chat_model: iModel,
-    system_msg: str = None,
-    target_ratio: float = 0.2,
+    system: str = None,
+    compression_ratio: float = 0.2,
     n_samples: int = 5,
     max_tokens_per_sample=80,
     verbose=True,
+    initial_text=None,
+    cumulative=False,
+    split_kwargs=None,
+    min_pplx=None,
+    **kwargs,
 ) -> str:
     """
     Convenience function that instantiates LLMCompressor and compresses text.
     """
     compressor = LLMCompressor(
         chat_model=chat_model,
-        system_msg=system_msg,
-        target_ratio=target_ratio,
+        system=system,
+        compression_ratio=compression_ratio,
         n_samples=n_samples,
         max_tokens_per_sample=max_tokens_per_sample,
         verbose=verbose,
     )
-    return await compressor.compress(text)
+    return await compressor.compress(
+        text,
+        compression_ratio=compression_ratio,
+        initial_text=initial_text,
+        cumulative=cumulative,
+        split_kwargs=split_kwargs,
+        min_pplx=min_pplx,
+        **kwargs,
+    )
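
A sketch of the renamed convenience API (`system_msg` → `system`, `target_ratio` → `compression_ratio`), which now also forwards `initial_text`, `cumulative`, `split_kwargs`, and `min_pplx` through to `LLMCompressor.compress`. The model configuration below is a placeholder assumption:

```python
from lionagi.libs.token_transform.perplexity import compress_text
from lionagi.service.imodel import iModel

async def summarize(long_text: str) -> str:
    model = iModel(provider="openai", model="gpt-4o-mini")  # placeholder config
    return await compress_text(
        long_text,
        chat_model=model,
        system="Concisely summarize content for storage:",  # the default prompt
        compression_ratio=0.2,  # aim to keep ~20% of the original content
        n_samples=5,
    )
```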
--- /dev/null
+++ b/lionagi/libs/token_transform/symbolic_compress_context.py
@@ -0,0 +1,147 @@
+from collections.abc import Callable
+from pathlib import Path
+from typing import Literal
+
+from lionagi.service.imodel import iModel
+from lionagi.session.branch import Branch
+from lionagi.utils import alcall, get_bins
+
+from .base import TokenMapping, TokenMappingTemplate
+from .synthlang_.base import SynthlangFramework, SynthlangTemplate
+
+FRAMEWORK_OPTIONS = SynthlangFramework.load_framework_options()
+FRAMEWORK_CHOICES = Literal["math", "optim", "custom_algebra"]
+
+
+async def symbolic_compress_context(
+    *,
+    text: str = None,
+    url_or_path: str | Path = None,
+    chunk_by="tokens",
+    chunk_size: int = 1000,
+    chunk_tokenizer: Callable = None,
+    threshold=50,
+    output_path: Path | str = None,
+    overlap=0.025,
+    system: str = None,
+    chat_model: iModel = None,
+    use_lion_system_message: bool = True,
+    max_concurrent=10,
+    throttle_period=1,
+    framework: Literal["synthlang"] = "synthlang",
+    framework_template: (
+        SynthlangTemplate | SynthlangFramework
+    ) = SynthlangTemplate.REFLECTIVE_PATTERNS,
+    framework_options: list[FRAMEWORK_CHOICES] = None,
+    compress: bool = False,
+    compress_model: iModel = None,
+    compression_ratio: float = 0.2,
+    compress_initial_text=None,
+    compress_cumulative=False,
+    compress_split_kwargs=None,
+    compress_min_pplx=None,
+    encode_token_map: TokenMappingTemplate | dict | TokenMapping = None,
+    num_encodings: int = 3,
+    encode_output: bool = True,
+    num_output_encodings: int = 1,
+    verbose: bool = True,
+    branch: Branch = None,
+    additional_text: str = "",
+    **kwargs,
+):
+    if framework != "synthlang":
+        raise ValueError(f"Unsupported framework: {framework}")
+
+    if not text and not url_or_path:
+        raise ValueError("Either text or url_or_path must be provided.")
+
+    if text and url_or_path:
+        raise ValueError("Only one of text or url_or_path should be provided.")
+
+    from .synthlang_.translate_to_synthlang import translate_to_synthlang
+
+    async def _inner(text: str):
+        b_ = None
+        if branch:
+            b_ = await branch.aclone()
+        else:
+            b_ = Branch(
+                system=system,
+                use_lion_system_message=use_lion_system_message,
+                chat_model=chat_model,
+            )
+
+        return await translate_to_synthlang(
+            text,
+            branch=b_,
+            framework_template=framework_template,
+            framework_options=framework_options,
+            compress=compress,
+            compress_model=compress_model,
+            compression_ratio=compression_ratio,
+            compress_kwargs={
+                "initial_text": compress_initial_text,
+                "cumulative": compress_cumulative,
+                "split_kwargs": compress_split_kwargs,
+                "min_pplx": compress_min_pplx,
+            },
+            encode_token_map=encode_token_map,
+            num_encodings=num_encodings,
+            encode_output=encode_output,
+            num_output_encodings=num_output_encodings,
+            verbose=verbose,
+            additional_text=additional_text,
+            **kwargs,
+        )
+
+    from lionagi.libs.file.process import chunk, chunk_content
+
+    chunks = []
+    if url_or_path:
+        chunks = chunk(
+            url_or_path=url_or_path,
+            chunk_by=chunk_by,
+            chunk_size=chunk_size,
+            overlap=overlap,
+            threshold=threshold,
+        )
+
+    elif text:
+        chunks = chunk_content(
+            text=text,
+            chunk_by=chunk_by,
+            chunk_size=chunk_size,
+            overlap=overlap,
+            threshold=threshold,
+            tokenizer=chunk_tokenizer or str.split,
+        )
+
+    texts = [str(i).strip() for i in chunks if str(i).strip()]
+    bins = get_bins(texts, upper=chunk_size)
+    textss = []
+    for i in bins:
+        textss.append("\n".join([texts[j] for j in i]))
+
+    results = await alcall(
+        textss,
+        _inner,
+        max_concurrent=max_concurrent,
+        retry_default=None,
+        num_retries=2,
+        throttle_period=throttle_period,
+        retry_delay=1,
+        backoff_factor=2,
+        flatten=True,
+        dropna=True,
+        unique_output=True,
+    )
+    text = "\n".join(results)
+
+    if output_path:
+        fp = Path(output_path)
+        fp.write_text(text)
+        if verbose:
+            print(f"Results of {len(text)} characters saved to: {fp}")
+
+        return fp
+    return text
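
A sketch of the new end-to-end pipeline: the input is chunked, binned with `get_bins`, each bin is translated concurrently through `translate_to_synthlang` (with retries via `alcall`), and the joined result is optionally written to `output_path`. The paths below are placeholders:

```python
import asyncio

from lionagi.libs.token_transform.symbolic_compress_context import (
    symbolic_compress_context,
)

async def main():
    # Exactly one of text / url_or_path may be given; returns the output
    # Path when output_path is set, otherwise the compressed text itself.
    result = await symbolic_compress_context(
        url_or_path="./docs/spec.md",
        chunk_size=1000,
        overlap=0.025,
        threshold=50,
        num_encodings=3,
        output_path="./docs/spec.synthlang.md",
    )
    print(result)

asyncio.run(main())
```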