lionagi 0.9.12__py3-none-any.whl → 0.9.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. lionagi/libs/file/chunk.py +3 -3
  2. lionagi/libs/token_transform/base.py +52 -0
  3. lionagi/libs/token_transform/perplexity.py +41 -29
  4. lionagi/libs/token_transform/symbolic_compress_context.py +138 -0
  5. lionagi/libs/token_transform/synthlang.py +9 -415
  6. lionagi/libs/token_transform/synthlang_/base.py +130 -0
  7. lionagi/libs/token_transform/synthlang_/resources/frameworks/abstract_algebra.toml +11 -0
  8. lionagi/libs/token_transform/synthlang_/resources/frameworks/category_theory.toml +11 -0
  9. lionagi/libs/token_transform/synthlang_/resources/frameworks/complex_analysis.toml +11 -0
  10. lionagi/libs/token_transform/synthlang_/resources/frameworks/framework_options.json +52 -0
  11. lionagi/libs/token_transform/synthlang_/resources/frameworks/group_theory.toml +11 -0
  12. lionagi/libs/token_transform/synthlang_/resources/frameworks/math_logic.toml +11 -0
  13. lionagi/libs/token_transform/synthlang_/resources/frameworks/reflective_patterns.toml +11 -0
  14. lionagi/libs/token_transform/synthlang_/resources/frameworks/set_theory.toml +11 -0
  15. lionagi/libs/token_transform/synthlang_/resources/frameworks/topology_fundamentals.toml +11 -0
  16. lionagi/libs/token_transform/synthlang_/resources/mapping/rust_chinese_mapping.toml +37 -0
  17. lionagi/libs/token_transform/synthlang_/resources/utility/base_synthlang_system_prompt.toml +11 -0
  18. lionagi/libs/token_transform/synthlang_/translate_to_synthlang.py +136 -0
  19. lionagi/libs/token_transform/types.py +15 -0
  20. lionagi/protocols/adapters/toml_adapter.py +204 -0
  21. lionagi/protocols/adapters/types.py +3 -0
  22. lionagi/protocols/generic/element.py +9 -0
  23. lionagi/protocols/graph/node.py +3 -0
  24. lionagi/service/endpoints/token_calculator.py +8 -0
  25. lionagi/service/imodel.py +14 -13
  26. lionagi/session/branch.py +6 -6
  27. lionagi/tools/base.py +62 -0
  28. lionagi/version.py +1 -1
  29. {lionagi-0.9.12.dist-info → lionagi-0.9.14.dist-info}/METADATA +2 -1
  30. {lionagi-0.9.12.dist-info → lionagi-0.9.14.dist-info}/RECORD +32 -15
  31. {lionagi-0.9.12.dist-info → lionagi-0.9.14.dist-info}/WHEEL +0 -0
  32. {lionagi-0.9.12.dist-info → lionagi-0.9.14.dist-info}/licenses/LICENSE +0 -0
lionagi/libs/file/chunk.py
@@ -216,7 +216,7 @@ def chunk_content(
      chunk_size: int = 1024,
      overlap: float = 0,
      threshold: int = 256,
-     metadata: dict[str, Any] = {},
+     metadata: dict[str, Any] = None,
      return_tokens: bool = False,
      as_node: bool = False,
      **kwargs: Any,
@@ -268,7 +268,7 @@ def chunk_content(
                  "chunk_id": i + 1,
                  "total_chunks": len(chunks),
                  "chunk_size": len(chunk),
-                 **metadata,
+                 **(metadata or {}),
              },
          )
          for i, chunk in enumerate(chunks)
@@ -280,7 +280,7 @@ def chunk_content(
              "chunk_id": i + 1,
              "total_chunks": len(chunks),
              "chunk_size": len(chunk),
-             **metadata,
+             **(metadata or {}),
          }
          for i, chunk in enumerate(chunks)
      ]
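
The chunk.py change swaps a mutable default argument for `None` plus a guarded expansion. A minimal, self-contained sketch of the pitfall being fixed (the `tag_old`/`tag_new` helpers are hypothetical, not lionagi code):

```python
from typing import Any


def tag_old(name: str, metadata: dict[str, Any] = {}) -> dict[str, Any]:
    # Pitfall: the default dict is created once at definition time,
    # so every call without `metadata` shares and mutates the same object.
    metadata["seen"] = metadata.get("seen", 0) + 1
    return {"name": name, **metadata}


def tag_new(name: str, metadata: dict[str, Any] = None) -> dict[str, Any]:
    # The fix mirrored in the diff: default to None, expand a fresh dict per call.
    return {"name": name, **(metadata or {})}


print(tag_old("a"))  # {'name': 'a', 'seen': 1}
print(tag_old("b"))  # {'name': 'b', 'seen': 2}  <- state leaked across calls
print(tag_new("b"))  # {'name': 'b'}             <- no shared state
```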
lionagi/libs/token_transform/base.py
@@ -0,0 +1,52 @@
+ from __future__ import annotations
+
+ from enum import Enum
+ from pathlib import Path
+
+ from pydantic import Field
+
+ from lionagi.tools.base import Resource, ResourceCategory
+
+ here = Path(__file__).parent.resolve()
+ MAPPING_PATH = "synthlang_/resources/mapping"
+
+
+ class TokenMappingTemplate(str, Enum):
+     RUST_CHINESE = "rust_chinese"
+
+     @property
+     def fp(self) -> Path:
+         return here / MAPPING_PATH / f"{self.value}_mapping.toml"
+
+
+ class TokenMapping(Resource):
+     category: ResourceCategory = Field(
+         default=ResourceCategory.UTILITY, frozen=True
+     )
+     content: dict
+
+     @classmethod
+     def load_from_template(
+         cls, template: TokenMappingTemplate | str
+     ) -> TokenMapping:
+         if isinstance(template, str):
+             template = template.lower().strip()
+             template = (
+                 template.replace(".toml", "")
+                 .replace(" ", "_")
+                 .replace("-", "_")
+                 .strip()
+             )
+             if template.endswith("_mapping"):
+                 template = template[:-8]
+             if "/" in template:
+                 template = template.split("/")[-1]
+             template = TokenMappingTemplate(template)
+
+         if isinstance(template, TokenMappingTemplate):
+             template = template.fp
+             return cls.adapt_from(template, ".toml", many=False)
+
+         raise ValueError(
+             f"Invalid template: {template}. Must be a TokenMappingTemplate or a valid path."
+         )
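
Read off the normalization chain above, several spellings of a template name should resolve to the same enum member before the bundled TOML is loaded. A usage sketch (import path inferred from the file location; requires the installed package and its bundled resources):

```python
from lionagi.libs.token_transform.base import TokenMapping, TokenMappingTemplate

# Each of these strings normalizes to TokenMappingTemplate.RUST_CHINESE:
#   "RUST_CHINESE", "rust-chinese", "rust_chinese_mapping.toml",
#   "resources/mapping/rust_chinese"
mapping = TokenMapping.load_from_template("rust_chinese_mapping.toml")

print(mapping.category)          # ResourceCategory.UTILITY (frozen on the model)
print(sorted(mapping.content)[:5])  # first few keys of the loaded TOML mapping
```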
lionagi/libs/token_transform/perplexity.py
@@ -101,7 +101,7 @@ async def compute_perplexity(
      chat_model: iModel,
      initial_context: str = None,
      tokens: list[str] = None,
-     system_msg: str = None,
+     system: str = None,
      n_samples: int = 1,
      use_residue: bool = True,
      **kwargs,
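
For reference, the quantity `compute_perplexity` is named after is the exponential of the negative mean token log-probability. A self-contained sketch of the standard formula (not lionagi's internal implementation):

```python
import math


def perplexity(token_logprobs: list[float]) -> float:
    """exp of the negative mean log-probability; lower = more predictable text."""
    return math.exp(-sum(token_logprobs) / len(token_logprobs))


print(perplexity([-0.1, -0.2, -0.3]))  # ~1.22
```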
@@ -142,9 +142,9 @@ async def compute_perplexity(
      api_calls = []
      for sample_txt in sampless:
          messages = []
-         if system_msg:
+         if system:
              if not chat_model.sequential_exchange:
-                 messages.append({"role": "system", "content": system_msg})
+                 messages.append({"role": "system", "content": system})
                  messages.append({"role": "user", "content": sample_txt})
              else:
                  messages.append({"role": "user", "content": sample_txt})
@@ -171,10 +171,10 @@ class LLMCompressor:
      def __init__(
          self,
          chat_model: iModel,
-         system_msg=None,
+         system=None,
          tokenizer=None,
          splitter=None,
-         target_ratio=0.2,
+         compression_ratio=0.2,
          n_samples=5,
          chunk_size=64,
          max_tokens_per_sample=80,
@@ -193,10 +193,8 @@ class LLMCompressor:
          self.chat_model = chat_model
          self.tokenizer = tokenizer
          self.splitter = splitter
-         self.system_msg = (
-             system_msg or "Concisely summarize content for storage:"
-         )
-         self.target_ratio = target_ratio
+         self.system = system or "Concisely summarize content for storage:"
+         self.compression_ratio = compression_ratio
          self.n_samples = n_samples
          self.chunk_size = chunk_size
          self.max_tokens_per_sample = max_tokens_per_sample
@@ -281,7 +279,7 @@ class LLMCompressor:
              initial_context=initial_text,
              tokens=item_toks,
              n_samples=n_samples or self.n_samples,
-             system_msg=self.system_msg,
+             system=self.system,
              use_residue=use_residue,
              **kwargs,
          )
@@ -347,6 +345,7 @@ class LLMCompressor:
          # Tokenize once to get total length
          all_tokens = self.tokenize(text)
          original_len = len(all_tokens)
+         ttl_chars = len(text)

          # Split text
          items = self.split(text, **split_kwargs)
@@ -363,26 +362,26 @@ class LLMCompressor:
          # Select
          selected = self.select_by_pplex(
              ranked_items=ranked,
-             target_compression_ratio=compression_ratio or self.target_ratio,
+             target_compression_ratio=compression_ratio
+             or self.compression_ratio,
              original_length=original_len,
              min_pplx=min_pplx or self.min_pplx,
          )

-         if self.verbose:
-             compressed_len = sum(
-                 len(to_list(self.tokenize(x), dropna=True, flatten=True))
-                 for x in selected
-             )
-             ratio = compressed_len / original_len if original_len else 1
-             print(
-                 f"Original tokens: {original_len}\n"
-                 f"Selected tokens: {compressed_len}\n"
-                 f"Compression ratio: {ratio:.3f}\n"
-                 f"Time: {timer() - start:.3f}s\n"
-             )
-
          # Join final
          out_str = " ".join(selected)
+
+         if self.verbose:
+             compressed_chars = len(out_str)
+             ratio = compressed_chars / ttl_chars if original_len else 1
+             msg = "------------------------------------------\n"
+             msg += f"Compression Method: Perplexity\n"
+             msg += f"Compressed Characters number: {compressed_chars}\n"
+             msg += f"Character Compression Ratio: {ratio:.1%}\n"
+             msg += f"Compression Time: {timer() - start:.3f}s\n"
+             msg += f"Compression Model: {self.chat_model.model_name}\n"
+             print(msg)
+
          return out_str.strip()

      def select_by_pplex(
@@ -419,21 +418,34 @@
  async def compress_text(
      text: str,
      chat_model: iModel,
-     system_msg: str = None,
-     target_ratio: float = 0.2,
+     system: str = None,
+     compression_ratio: float = 0.2,
      n_samples: int = 5,
      max_tokens_per_sample=80,
      verbose=True,
+     initial_text=None,
+     cumulative=False,
+     split_kwargs=None,
+     min_pplx=None,
+     **kwargs,
  ) -> str:
      """
      Convenience function that instantiates LLMCompressor and compresses text.
      """
      compressor = LLMCompressor(
          chat_model=chat_model,
-         system_msg=system_msg,
-         target_ratio=target_ratio,
+         system=system,
+         compression_ratio=compression_ratio,
          n_samples=n_samples,
          max_tokens_per_sample=max_tokens_per_sample,
          verbose=verbose,
      )
-     return await compressor.compress(text)
+     return await compressor.compress(
+         text,
+         compression_ratio=compression_ratio,
+         initial_text=initial_text,
+         cumulative=cumulative,
+         split_kwargs=split_kwargs,
+         min_pplx=min_pplx,
+         **kwargs,
+     )
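
With the renames (`system_msg` → `system`, `target_ratio` → `compression_ratio`) the convenience wrapper now forwards every tuning knob to `LLMCompressor.compress`. A minimal call sketch; the `iModel` constructor arguments are assumptions for illustration, and an API key is assumed to be configured:

```python
import asyncio

from lionagi.libs.token_transform.perplexity import compress_text
from lionagi.service.imodel import iModel


async def main() -> None:
    chat_model = iModel(provider="openai", model="gpt-4o-mini")  # assumed ctor args
    long_text = " ".join(["token"] * 500)  # stand-in for a long document

    compressed = await compress_text(
        long_text,
        chat_model=chat_model,
        system="Concisely summarize content for storage:",  # was system_msg
        compression_ratio=0.2,  # was target_ratio in 0.9.12
        n_samples=5,
    )
    print(compressed)


asyncio.run(main())
```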
lionagi/libs/token_transform/symbolic_compress_context.py
@@ -0,0 +1,138 @@
+ from collections.abc import Callable
+ from pathlib import Path
+ from typing import Literal
+
+ from lionagi.service.imodel import iModel
+ from lionagi.session.branch import Branch
+ from lionagi.utils import alcall
+
+ from .base import TokenMapping, TokenMappingTemplate
+ from .synthlang_.base import SynthlangFramework, SynthlangTemplate
+
+ FRAMEWORK_OPTIONS = SynthlangFramework.load_framework_options()
+ FRAMEWORK_CHOICES = Literal["math", "optim", "custom_algebra"]
+
+
+ async def symbolic_compress_context(
+     *,
+     text: str = None,
+     url_or_path: str | Path = None,
+     chunk_by="tokens",
+     chunk_size: int = 1000,
+     chunk_tokenizer: Callable = str.split,
+     threshold=50,
+     output_path: Path | str = None,
+     overlap=0.05,
+     system: str = None,
+     chat_model: iModel = None,
+     use_lion_system_message: bool = True,
+     max_concurrent=10,
+     throttle_period=1,
+     framework: Literal["synthlang"] = "synthlang",
+     framework_template: (
+         SynthlangTemplate | SynthlangFramework
+     ) = SynthlangTemplate.REFLECTIVE_PATTERNS,
+     framework_options: list[FRAMEWORK_CHOICES] = None,
+     compress: bool = False,
+     compress_model: iModel = None,
+     compression_ratio: float = 0.2,
+     compress_initial_text=None,
+     compress_cumulative=False,
+     compress_split_kwargs=None,
+     compress_min_pplx=None,
+     encode_token_map: TokenMappingTemplate | dict | TokenMapping = None,
+     num_encodings: int = 3,
+     encode_output: bool = False,
+     num_output_encodings: int = None,
+     verbose: bool = True,
+     branch: Branch = None,
+     additional_text: str = "",
+     **kwargs,
+ ):
+     if framework != "synthlang":
+         raise ValueError(f"Unsupported framework: {framework}")
+
+     if not text and not url_or_path:
+         raise ValueError("Either text or url_or_path must be provided.")
+
+     if text and url_or_path:
+         raise ValueError("Only one of text or url_or_path should be provided.")
+
+     from .synthlang_.translate_to_synthlang import translate_to_synthlang
+
+     async def _inner(text: str):
+         b_ = None
+         if branch:
+             b_ = await branch.aclone()
+         else:
+             b_ = Branch(
+                 system=system,
+                 use_lion_system_message=use_lion_system_message,
+                 chat_model=chat_model,
+             )
+
+         return await translate_to_synthlang(
+             text,
+             branch=b_,
+             framework_template=framework_template,
+             framework_options=framework_options,
+             compress=compress,
+             compress_model=compress_model,
+             compression_ratio=compression_ratio,
+             compress_kwargs={
+                 "initial_text": compress_initial_text,
+                 "cumulative": compress_cumulative,
+                 "split_kwargs": compress_split_kwargs,
+                 "min_pplx": compress_min_pplx,
+             },
+             encode_token_map=encode_token_map,
+             num_encodings=num_encodings,
+             encode_output=encode_output,
+             num_output_encodings=num_output_encodings,
+             verbose=verbose,
+             additional_text=additional_text,
+             **kwargs,
+         )
+
+     from lionagi.libs.file.process import chunk, chunk_content
+
+     texts = []
+     if url_or_path:
+         chunks = chunk(
+             url_or_path=url_or_path,
+             chunk_by=chunk_by,
+             chunk_size=chunk_size,
+             overlap=overlap,
+             threshold=threshold,
+         )
+         texts = [i.content for i in chunks if i.content]
+
+     elif text:
+         texts = chunk_content(
+             text=text,
+             chunk_by=chunk_by,
+             chunk_size=chunk_size,
+             overlap=overlap,
+             threshold=threshold,
+             tokenizer=chunk_tokenizer,
+         )
+
+     results = await alcall(
+         texts,
+         _inner,
+         max_concurrent=max_concurrent,
+         retry_default=None,
+         throttle_period=throttle_period,
+         flatten=True,
+         dropna=True,
+     )
+     text = "\n".join(results)
+
+     if output_path:
+         fp = Path(output_path)
+         fp.write_text(text)
+         if verbose:
+             print(f"Results of {len(text)} characters saved to: {fp}")
+
+         return fp
+     return text
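
A usage sketch of the new entry point, based only on the signature above: it chunks the input, translates each chunk concurrently via `translate_to_synthlang`, and joins the results. The `iModel` constructor arguments and the sample file path are assumptions, not part of the diff:

```python
import asyncio

from lionagi.libs.token_transform.base import TokenMappingTemplate
from lionagi.libs.token_transform.symbolic_compress_context import (
    symbolic_compress_context,
)
from lionagi.service.imodel import iModel


async def main() -> None:
    # Chunk a source document, translate each chunk to SynthLang form,
    # and write the joined result to disk.
    out = await symbolic_compress_context(
        url_or_path="docs/large_spec.md",  # or pass text=... instead
        chat_model=iModel(provider="openai", model="gpt-4o-mini"),  # assumed ctor args
        chunk_size=1000,
        compress=True,  # enable perplexity pre-compression
        compression_ratio=0.2,
        encode_token_map=TokenMappingTemplate.RUST_CHINESE,
        output_path="compressed_context.txt",
    )
    print(out)  # Path to the output file when output_path is given


asyncio.run(main())
```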