lionagi 0.9.12__py3-none-any.whl → 0.9.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lionagi/libs/file/chunk.py +3 -3
- lionagi/libs/token_transform/base.py +52 -0
- lionagi/libs/token_transform/perplexity.py +41 -29
- lionagi/libs/token_transform/symbolic_compress_context.py +138 -0
- lionagi/libs/token_transform/synthlang.py +9 -415
- lionagi/libs/token_transform/synthlang_/base.py +130 -0
- lionagi/libs/token_transform/synthlang_/resources/frameworks/abstract_algebra.toml +11 -0
- lionagi/libs/token_transform/synthlang_/resources/frameworks/category_theory.toml +11 -0
- lionagi/libs/token_transform/synthlang_/resources/frameworks/complex_analysis.toml +11 -0
- lionagi/libs/token_transform/synthlang_/resources/frameworks/framework_options.json +52 -0
- lionagi/libs/token_transform/synthlang_/resources/frameworks/group_theory.toml +11 -0
- lionagi/libs/token_transform/synthlang_/resources/frameworks/math_logic.toml +11 -0
- lionagi/libs/token_transform/synthlang_/resources/frameworks/reflective_patterns.toml +11 -0
- lionagi/libs/token_transform/synthlang_/resources/frameworks/set_theory.toml +11 -0
- lionagi/libs/token_transform/synthlang_/resources/frameworks/topology_fundamentals.toml +11 -0
- lionagi/libs/token_transform/synthlang_/resources/mapping/rust_chinese_mapping.toml +37 -0
- lionagi/libs/token_transform/synthlang_/resources/utility/base_synthlang_system_prompt.toml +11 -0
- lionagi/libs/token_transform/synthlang_/translate_to_synthlang.py +136 -0
- lionagi/libs/token_transform/types.py +15 -0
- lionagi/protocols/adapters/toml_adapter.py +204 -0
- lionagi/protocols/adapters/types.py +3 -0
- lionagi/protocols/generic/element.py +9 -0
- lionagi/protocols/graph/node.py +3 -0
- lionagi/service/endpoints/token_calculator.py +8 -0
- lionagi/service/imodel.py +14 -13
- lionagi/session/branch.py +6 -6
- lionagi/tools/base.py +62 -0
- lionagi/version.py +1 -1
- {lionagi-0.9.12.dist-info → lionagi-0.9.14.dist-info}/METADATA +2 -1
- {lionagi-0.9.12.dist-info → lionagi-0.9.14.dist-info}/RECORD +32 -15
- {lionagi-0.9.12.dist-info → lionagi-0.9.14.dist-info}/WHEEL +0 -0
- {lionagi-0.9.12.dist-info → lionagi-0.9.14.dist-info}/licenses/LICENSE +0 -0
lionagi/libs/file/chunk.py
CHANGED
@@ -216,7 +216,7 @@ def chunk_content(
     chunk_size: int = 1024,
     overlap: float = 0,
     threshold: int = 256,
-    metadata: dict[str, Any] =
+    metadata: dict[str, Any] = None,
     return_tokens: bool = False,
     as_node: bool = False,
     **kwargs: Any,
@@ -268,7 +268,7 @@ def chunk_content(
                     "chunk_id": i + 1,
                     "total_chunks": len(chunks),
                     "chunk_size": len(chunk),
-                    **metadata,
+                    **(metadata or {}),
                 },
             )
             for i, chunk in enumerate(chunks)
@@ -280,7 +280,7 @@ def chunk_content(
             "chunk_id": i + 1,
             "total_chunks": len(chunks),
             "chunk_size": len(chunk),
-            **metadata,
+            **(metadata or {}),
         }
         for i, chunk in enumerate(chunks)
     ]
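With this change, callers can omit metadata entirely: **(metadata or {}) unpacks an empty dict instead of raising a TypeError when metadata is None. A minimal sketch of the now-safe call, assuming the content argument is named `content` (the hunks above only show the parameters that follow it):

from lionagi.libs.file.chunk import chunk_content

# Previously, omitting `metadata` meant `**metadata` tried to unpack None
# inside the per-chunk dicts; `**(metadata or {})` now falls back to an
# empty mapping.
chunks = chunk_content(
    content="some long document text ...",  # parameter name assumed
    chunk_size=1024,
    overlap=0,
    threshold=256,
    # metadata deliberately omitted -- safe after this change
)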
lionagi/libs/token_transform/base.py
ADDED
@@ -0,0 +1,52 @@
+from __future__ import annotations
+
+from enum import Enum
+from pathlib import Path
+
+from pydantic import Field
+
+from lionagi.tools.base import Resource, ResourceCategory
+
+here = Path(__file__).parent.resolve()
+MAPPING_PATH = "synthlang_/resources/mapping"
+
+
+class TokenMappingTemplate(str, Enum):
+    RUST_CHINESE = "rust_chinese"
+
+    @property
+    def fp(self) -> Path:
+        return here / MAPPING_PATH / f"{self.value}_mapping.toml"
+
+
+class TokenMapping(Resource):
+    category: ResourceCategory = Field(
+        default=ResourceCategory.UTILITY, frozen=True
+    )
+    content: dict
+
+    @classmethod
+    def load_from_template(
+        cls, template: TokenMappingTemplate | str
+    ) -> TokenMapping:
+        if isinstance(template, str):
+            template = template.lower().strip()
+            template = (
+                template.replace(".toml", "")
+                .replace(" ", "_")
+                .replace("-", "_")
+                .strip()
+            )
+            if template.endswith("_mapping"):
+                template = template[:-8]
+            if "/" in template:
+                template = template.split("/")[-1]
+            template = TokenMappingTemplate(template)
+
+        if isinstance(template, TokenMappingTemplate):
+            template = template.fp
+            return cls.adapt_from(template, ".toml", many=False)
+
+        raise ValueError(
+            f"Invalid template: {template}. Must be a TokenMappingTemplate or a valid path."
+        )
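A short usage sketch for the new TokenMapping loader. Given the normalization in load_from_template, several spellings should resolve to the same bundled resource; the equality check is an inference from the code above, not output from the diff:

from lionagi.libs.token_transform.base import TokenMapping, TokenMappingTemplate

# Each call normalizes to TokenMappingTemplate.RUST_CHINESE and loads
# synthlang_/resources/mapping/rust_chinese_mapping.toml via the TOML adapter.
m1 = TokenMapping.load_from_template(TokenMappingTemplate.RUST_CHINESE)
m2 = TokenMapping.load_from_template("rust_chinese")
m3 = TokenMapping.load_from_template("Rust-Chinese_mapping.toml")

assert m1.content == m2.content == m3.content  # `content` holds the parsed TOML dict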
lionagi/libs/token_transform/perplexity.py
CHANGED
@@ -101,7 +101,7 @@ async def compute_perplexity(
     chat_model: iModel,
     initial_context: str = None,
     tokens: list[str] = None,
-
+    system: str = None,
     n_samples: int = 1,
     use_residue: bool = True,
     **kwargs,
@@ -142,9 +142,9 @@ async def compute_perplexity(
     api_calls = []
     for sample_txt in sampless:
         messages = []
-        if
+        if system:
             if not chat_model.sequential_exchange:
-                messages.append({"role": "system", "content":
+                messages.append({"role": "system", "content": system})
                 messages.append({"role": "user", "content": sample_txt})
             else:
                 messages.append({"role": "user", "content": sample_txt})
@@ -171,10 +171,10 @@ class LLMCompressor:
     def __init__(
         self,
         chat_model: iModel,
-
+        system=None,
         tokenizer=None,
         splitter=None,
-
+        compression_ratio=0.2,
         n_samples=5,
         chunk_size=64,
         max_tokens_per_sample=80,
@@ -193,10 +193,8 @@ class LLMCompressor:
         self.chat_model = chat_model
         self.tokenizer = tokenizer
         self.splitter = splitter
-        self.
-
-        )
-        self.target_ratio = target_ratio
+        self.system = system or "Concisely summarize content for storage:"
+        self.compression_ratio = compression_ratio
         self.n_samples = n_samples
         self.chunk_size = chunk_size
         self.max_tokens_per_sample = max_tokens_per_sample
@@ -281,7 +279,7 @@ class LLMCompressor:
             initial_context=initial_text,
             tokens=item_toks,
             n_samples=n_samples or self.n_samples,
-
+            system=self.system,
             use_residue=use_residue,
             **kwargs,
         )
@@ -347,6 +345,7 @@ class LLMCompressor:
         # Tokenize once to get total length
         all_tokens = self.tokenize(text)
         original_len = len(all_tokens)
+        ttl_chars = len(text)
 
         # Split text
         items = self.split(text, **split_kwargs)
@@ -363,26 +362,26 @@ class LLMCompressor:
         # Select
         selected = self.select_by_pplex(
             ranked_items=ranked,
-            target_compression_ratio=compression_ratio
+            target_compression_ratio=compression_ratio
+            or self.compression_ratio,
             original_length=original_len,
             min_pplx=min_pplx or self.min_pplx,
         )
 
-        if self.verbose:
-            compressed_len = sum(
-                len(to_list(self.tokenize(x), dropna=True, flatten=True))
-                for x in selected
-            )
-            ratio = compressed_len / original_len if original_len else 1
-            print(
-                f"Original tokens: {original_len}\n"
-                f"Selected tokens: {compressed_len}\n"
-                f"Compression ratio: {ratio:.3f}\n"
-                f"Time: {timer() - start:.3f}s\n"
-            )
-
         # Join final
         out_str = " ".join(selected)
+
+        if self.verbose:
+            compressed_chars = len(out_str)
+            ratio = compressed_chars / ttl_chars if original_len else 1
+            msg = "------------------------------------------\n"
+            msg += f"Compression Method: Perplexity\n"
+            msg += f"Compressed Characters number: {compressed_chars}\n"
+            msg += f"Character Compression Ratio: {ratio:.1%}\n"
+            msg += f"Compression Time: {timer() - start:.3f}s\n"
+            msg += f"Compression Model: {self.chat_model.model_name}\n"
+            print(msg)
 
         return out_str.strip()
 
     def select_by_pplex(
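Taken together, these hunks give LLMCompressor a `system` parameter with a storage-oriented default prompt, replace the old `target_ratio` attribute with `compression_ratio`, and switch the verbose report from token counts to character counts. A hedged sketch of driving the class directly; the iModel configuration is an illustrative assumption, not part of the diff:

from lionagi.service.imodel import iModel
from lionagi.libs.token_transform.perplexity import LLMCompressor

compressor = LLMCompressor(
    chat_model=iModel(provider="openai", model="gpt-4o-mini"),  # assumed config
    system="Concisely summarize content for storage:",  # matches the new default
    compression_ratio=0.2,
    n_samples=5,
    chunk_size=64,
)
# In an async context; prints the character-based report shown above when
# verbose is enabled:
#     compressed = await compressor.compress(long_text)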
@@ -419,21 +418,34 @@
 async def compress_text(
     text: str,
     chat_model: iModel,
-
-
+    system: str = None,
+    compression_ratio: float = 0.2,
     n_samples: int = 5,
     max_tokens_per_sample=80,
     verbose=True,
+    initial_text=None,
+    cumulative=False,
+    split_kwargs=None,
+    min_pplx=None,
+    **kwargs,
 ) -> str:
     """
     Convenience function that instantiates LLMCompressor and compresses text.
     """
     compressor = LLMCompressor(
         chat_model=chat_model,
-
-
+        system=system,
+        compression_ratio=compression_ratio,
         n_samples=n_samples,
         max_tokens_per_sample=max_tokens_per_sample,
         verbose=verbose,
     )
-    return await compressor.compress(
+    return await compressor.compress(
+        text,
+        compression_ratio=compression_ratio,
+        initial_text=initial_text,
+        cumulative=cumulative,
+        split_kwargs=split_kwargs,
+        min_pplx=min_pplx,
+        **kwargs,
+    )
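compress_text now accepts and forwards the full set of compression knobs (initial_text, cumulative, split_kwargs, min_pplx, **kwargs) to LLMCompressor.compress instead of dropping them. A minimal usage sketch; the model settings and sample text are placeholders:

import asyncio

from lionagi.service.imodel import iModel
from lionagi.libs.token_transform.perplexity import compress_text

async def main() -> None:
    compressed = await compress_text(
        "a long passage to shrink ...",
        chat_model=iModel(provider="openai", model="gpt-4o-mini"),  # assumed config
        compression_ratio=0.2,  # keep roughly 20% of the original text
        n_samples=5,
    )
    print(compressed)

asyncio.run(main())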
lionagi/libs/token_transform/symbolic_compress_context.py
ADDED
@@ -0,0 +1,138 @@
+from collections.abc import Callable
+from pathlib import Path
+from typing import Literal
+
+from lionagi.service.imodel import iModel
+from lionagi.session.branch import Branch
+from lionagi.utils import alcall
+
+from .base import TokenMapping, TokenMappingTemplate
+from .synthlang_.base import SynthlangFramework, SynthlangTemplate
+
+FRAMEWORK_OPTIONS = SynthlangFramework.load_framework_options()
+FRAMEWORK_CHOICES = Literal["math", "optim", "custom_algebra"]
+
+
+async def symbolic_compress_context(
+    *,
+    text: str = None,
+    url_or_path: str | Path = None,
+    chunk_by="tokens",
+    chunk_size: int = 1000,
+    chunk_tokenizer: Callable = str.split,
+    threshold=50,
+    output_path: Path | str = None,
+    overlap=0.05,
+    system: str = None,
+    chat_model: iModel = None,
+    use_lion_system_message: bool = True,
+    max_concurrent=10,
+    throttle_period=1,
+    framework: Literal["synthlang"] = "synthlang",
+    framework_template: (
+        SynthlangTemplate | SynthlangFramework
+    ) = SynthlangTemplate.REFLECTIVE_PATTERNS,
+    framework_options: list[FRAMEWORK_CHOICES] = None,
+    compress: bool = False,
+    compress_model: iModel = None,
+    compression_ratio: float = 0.2,
+    compress_initial_text=None,
+    compress_cumulative=False,
+    compress_split_kwargs=None,
+    compress_min_pplx=None,
+    encode_token_map: TokenMappingTemplate | dict | TokenMapping = None,
+    num_encodings: int = 3,
+    encode_output: bool = False,
+    num_output_encodings: int = None,
+    verbose: bool = True,
+    branch: Branch = None,
+    additional_text: str = "",
+    **kwargs,
+):
+    if framework != "synthlang":
+        raise ValueError(f"Unsupported framework: {framework}")
+
+    if not text and not url_or_path:
+        raise ValueError("Either text or url_or_path must be provided.")
+
+    if text and url_or_path:
+        raise ValueError("Only one of text or url_or_path should be provided.")
+
+    from .synthlang_.translate_to_synthlang import translate_to_synthlang
+
+    async def _inner(text: str):
+        b_ = None
+        if branch:
+            b_ = await branch.aclone()
+        else:
+            b_ = Branch(
+                system=system,
+                use_lion_system_message=use_lion_system_message,
+                chat_model=chat_model,
+            )
+
+        return await translate_to_synthlang(
+            text,
+            branch=b_,
+            framework_template=framework_template,
+            framework_options=framework_options,
+            compress=compress,
+            compress_model=compress_model,
+            compression_ratio=compression_ratio,
+            compress_kwargs={
+                "initial_text": compress_initial_text,
+                "cumulative": compress_cumulative,
+                "split_kwargs": compress_split_kwargs,
+                "min_pplx": compress_min_pplx,
+            },
+            encode_token_map=encode_token_map,
+            num_encodings=num_encodings,
+            encode_output=encode_output,
+            num_output_encodings=num_output_encodings,
+            verbose=verbose,
+            additional_text=additional_text,
+            **kwargs,
+        )
+
+    from lionagi.libs.file.process import chunk, chunk_content
+
+    texts = []
+    if url_or_path:
+        chunks = chunk(
+            url_or_path=url_or_path,
+            chunk_by=chunk_by,
+            chunk_size=chunk_size,
+            overlap=overlap,
+            threshold=threshold,
+        )
+        texts = [i.content for i in chunks if i.content]
+
+    elif text:
+        texts = chunk_content(
+            text=text,
+            chunk_by=chunk_by,
+            chunk_size=chunk_size,
+            overlap=overlap,
+            threshold=threshold,
+            tokenizer=chunk_tokenizer,
+        )
+
+    results = await alcall(
+        texts,
+        _inner,
+        max_concurrent=max_concurrent,
+        retry_default=None,
+        throttle_period=throttle_period,
+        flatten=True,
+        dropna=True,
+    )
+    text = "\n".join(results)
+
+    if output_path:
+        fp = Path(output_path)
+        fp.write_text(text)
+        if verbose:
+            print(f"Results of {len(text)} characters saved to: {fp}")
+
+        return fp
+    return text
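An end-to-end sketch of the new entry point: it chunks the input, translates each chunk to SynthLang concurrently via alcall, joins the results, and returns either the joined text or, when output_path is given, the Path it wrote. The file paths and model below are illustrative assumptions:

import asyncio

from lionagi.service.imodel import iModel
from lionagi.libs.token_transform.symbolic_compress_context import (
    symbolic_compress_context,
)

async def main() -> None:
    out_fp = await symbolic_compress_context(
        url_or_path="docs/design_notes.md",  # assumed input file
        chunk_size=1000,
        overlap=0.05,
        chat_model=iModel(provider="openai", model="gpt-4o-mini"),  # assumed config
        output_path="design_notes.synthlang.md",  # a Path is returned when set
    )
    print(out_fp)

asyncio.run(main())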