lionagi 0.9.13__py3-none-any.whl → 0.9.15__py3-none-any.whl

This diff shows the changes between publicly available package versions as released to a supported registry. It is provided for informational purposes only.
Files changed (33)
  1. lionagi/libs/file/chunk.py +3 -3
  2. lionagi/libs/file/concat_files.py +83 -0
  3. lionagi/libs/file/process.py +49 -36
  4. lionagi/libs/token_transform/base.py +52 -0
  5. lionagi/libs/token_transform/perplexity.py +41 -29
  6. lionagi/libs/token_transform/symbolic_compress_context.py +147 -0
  7. lionagi/libs/token_transform/synthlang.py +9 -415
  8. lionagi/libs/token_transform/synthlang_/base.py +130 -0
  9. lionagi/libs/token_transform/synthlang_/resources/frameworks/abstract_algebra.toml +11 -0
  10. lionagi/libs/token_transform/synthlang_/resources/frameworks/category_theory.toml +11 -0
  11. lionagi/libs/token_transform/synthlang_/resources/frameworks/complex_analysis.toml +11 -0
  12. lionagi/libs/token_transform/synthlang_/resources/frameworks/framework_options.json +52 -0
  13. lionagi/libs/token_transform/synthlang_/resources/frameworks/group_theory.toml +11 -0
  14. lionagi/libs/token_transform/synthlang_/resources/frameworks/math_logic.toml +11 -0
  15. lionagi/libs/token_transform/synthlang_/resources/frameworks/reflective_patterns.toml +11 -0
  16. lionagi/libs/token_transform/synthlang_/resources/frameworks/set_theory.toml +11 -0
  17. lionagi/libs/token_transform/synthlang_/resources/frameworks/topology_fundamentals.toml +11 -0
  18. lionagi/libs/token_transform/synthlang_/resources/mapping/rust_chinese_mapping.toml +60 -0
  19. lionagi/libs/token_transform/synthlang_/resources/utility/base_synthlang_system_prompt.toml +11 -0
  20. lionagi/libs/token_transform/synthlang_/translate_to_synthlang.py +140 -0
  21. lionagi/libs/token_transform/types.py +15 -0
  22. lionagi/protocols/adapters/toml_adapter.py +204 -0
  23. lionagi/protocols/adapters/types.py +3 -0
  24. lionagi/protocols/graph/node.py +3 -0
  25. lionagi/service/endpoints/token_calculator.py +8 -0
  26. lionagi/service/imodel.py +14 -13
  27. lionagi/session/branch.py +6 -6
  28. lionagi/tools/base.py +62 -0
  29. lionagi/version.py +1 -1
  30. {lionagi-0.9.13.dist-info → lionagi-0.9.15.dist-info}/METADATA +2 -1
  31. {lionagi-0.9.13.dist-info → lionagi-0.9.15.dist-info}/RECORD +33 -15
  32. {lionagi-0.9.13.dist-info → lionagi-0.9.15.dist-info}/WHEEL +0 -0
  33. {lionagi-0.9.13.dist-info → lionagi-0.9.15.dist-info}/licenses/LICENSE +0 -0
--- a/lionagi/libs/file/chunk.py
+++ b/lionagi/libs/file/chunk.py
@@ -216,7 +216,7 @@ def chunk_content(
     chunk_size: int = 1024,
     overlap: float = 0,
     threshold: int = 256,
-    metadata: dict[str, Any] = {},
+    metadata: dict[str, Any] = None,
     return_tokens: bool = False,
     as_node: bool = False,
     **kwargs: Any,
@@ -268,7 +268,7 @@ def chunk_content(
                     "chunk_id": i + 1,
                     "total_chunks": len(chunks),
                     "chunk_size": len(chunk),
-                    **metadata,
+                    **(metadata or {}),
                 },
             )
             for i, chunk in enumerate(chunks)
@@ -280,7 +280,7 @@ def chunk_content(
                 "chunk_id": i + 1,
                 "total_chunks": len(chunks),
                 "chunk_size": len(chunk),
-                **metadata,
+                **(metadata or {}),
             }
             for i, chunk in enumerate(chunks)
         ]
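
The `metadata` change above swaps a mutable default argument for `None`, unpacked later as `**(metadata or {})`. A minimal standalone sketch of the pitfall this avoids (the function names here are illustrative, not from lionagi):

```python
def tag_bad(item, meta={}):  # one dict object is shared across ALL calls
    meta["last"] = item
    return meta

def tag_good(item, meta=None):  # fresh dict per call unless one is passed
    meta = meta or {}
    meta["last"] = item
    return meta

a, b = tag_bad("a"), tag_bad("b")
print(a is b, a)   # True {'last': 'b'} -- state leaked between calls
a, b = tag_good("a"), tag_good("b")
print(a is b, a)   # False {'last': 'a'}
```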
--- /dev/null
+++ b/lionagi/libs/file/concat_files.py
@@ -0,0 +1,83 @@
+from pathlib import Path
+
+from lionagi.utils import create_path
+
+from .process import dir_to_files
+
+
+def concat_files(
+    data_path: str | Path | list,
+    file_types: list[str],
+    output_dir: str | Path = None,
+    output_filename: str = None,
+    file_exist_ok: bool = True,
+    recursive: bool = True,
+    verbose: bool = True,
+    threshold: int = 0,
+    return_fps: bool = False,
+    return_files: bool = False,
+    **kwargs,
+) -> list[str] | str | tuple[list[str], list[Path]] | tuple[str, list[Path]]:
+    """
+    data_path: str or Path or list of str or Path, the directory or file paths to concatenate.
+    file_types: list of str, the file types to concatenate. [e.g. ['.txt', '.md']]
+    output_dir: str or Path, the directory to save the concatenated file. If provided, will save the file.
+    output_filename: str, the filename to save the concatenated file.
+    file_exist_ok: bool, if True, overwrite the existing file. Default is True.
+    recursive: bool, if True, search files recursively. Default is True.
+    verbose: bool, if True, print the output path. Default is True.
+    threshold: int, the minimum number of chars for the file to be considered valid to concatenate.
+    kwargs: additional keyword arguments to pass to create_path.
+    """
+    persist_path = None
+    if output_dir:
+        if not output_filename:
+            output_filename = "concatenated_text.txt"
+            kwargs["timestamp"] = kwargs.get("timestamp", True)
+            kwargs["random_hash_digits"] = kwargs.get("random_hash_digits", 6)
+        output_filename = output_filename or "concatenated_text.txt"
+        persist_path = create_path(
+            output_dir, output_filename, file_exist_ok=file_exist_ok, **kwargs
+        )
+
+    texts = []
+    data_path = (
+        [str(data_path)] if not isinstance(data_path, list) else data_path
+    )
+    data_path = sorted(data_path)
+    data_path = [Path(dp) for dp in data_path if Path(dp).exists()]
+
+    for dp in data_path:
+        fps = dir_to_files(dp, recursive=recursive, file_types=file_types)
+
+        data_path = sorted([str(i) for i in fps])
+        data_path: list[Path] = [
+            Path(dp) for dp in data_path if Path(dp).exists()
+        ]
+
+        for fp in data_path:
+            text = fp.read_text(encoding="utf-8")
+            if len(text) >= threshold:
+                fp_text = (
+                    "\n----------------------------------------------------\n"
+                    f"{str(fp)}"
+                    "\n----------------------------------------------------\n"
+                )
+                text = fp_text + text
+                texts.append(text)
+
+    text = "\n".join(texts)
+    if persist_path:
+        persist_path.write_text(text, encoding="utf-8")
+        if verbose:
+            print(f"Concatenated {len(fps)} files to {persist_path}")
+            print(f"The file contains {len(text)} characters.")
+
+    if return_files:
+        if return_fps:
+            return texts, fps
+        return texts
+
+    if return_fps:
+        return text, fps
+    return text
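
A usage sketch for the new `concat_files` helper, assuming it is importable from `lionagi.libs.file.concat_files` as listed above; the directory paths are placeholders:

```python
from lionagi.libs.file.concat_files import concat_files

# Concatenate all .md and .py files under ./src into one string, skipping
# files shorter than 100 characters, and persist the result under ./out.
text = concat_files(
    "./src",
    file_types=[".md", ".py"],
    output_dir="./out",
    threshold=100,
)

# Or keep the per-file texts and their paths instead of one joined string.
texts, fps = concat_files(
    "./src",
    file_types=[".md"],
    return_files=True,
    return_fps=True,
)
```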
--- a/lionagi/libs/file/process.py
+++ b/lionagi/libs/file/process.py
@@ -164,10 +164,12 @@ def file_to_chunks(
 
 
 def chunk(
-    url_or_path: str | Path,
     *,
+    text: str | None = None,
+    url_or_path: str | Path = None,
     file_types: list[str] | None = None,  # only local files
     recursive: bool = False,  # only local files
+    tokenizer: Callable[[str], list[str]] = None,
     chunk_by: Literal["chars", "tokens"] = "chars",
     chunk_size: int = 1500,
     overlap: float = 0.1,
@@ -175,45 +177,52 @@ def chunk(
     output_file: str | Path | None = None,
     metadata: dict[str, Any] | None = None,
     reader_tool: Callable = None,
-):
-    if isinstance(url_or_path, str):
-        url_or_path = Path(url_or_path)
-
-    chunks = None
-    files = None
-    if url_or_path.exists():
-        if url_or_path.is_dir():
-            files = dir_to_files(
-                directory=url_or_path,
-                file_types=file_types,
-                recursive=recursive,
+    as_node: bool = False,
+) -> list:
+    texts = []
+    if not text:
+        if isinstance(url_or_path, str):
+            url_or_path = Path(url_or_path)
+
+        chunks = None
+        files = None
+        if url_or_path.exists():
+            if url_or_path.is_dir():
+                files = dir_to_files(
+                    directory=url_or_path,
+                    file_types=file_types,
+                    recursive=recursive,
+                )
+            elif url_or_path.is_file():
+                files = [url_or_path]
+        else:
+            files = (
+                [str(url_or_path)]
+                if not isinstance(url_or_path, list)
+                else url_or_path
             )
-    elif url_or_path.is_file():
-        files = [url_or_path]
-    else:
-        files = (
-            [str(url_or_path)]
-            if not isinstance(url_or_path, list)
-            else url_or_path
-        )
 
-    if reader_tool is None:
-        reader_tool = lambda x: x.read_text(encoding="utf-8")
+        if reader_tool is None:
+            reader_tool = lambda x: x.read_text(encoding="utf-8")
 
-    if reader_tool == "docling":
-        from lionagi.libs.package.imports import check_import
+        if reader_tool == "docling":
+            from lionagi.libs.package.imports import check_import
 
-        DocumentConverter = check_import(
-            "docling",
-            module_name="document_converter",
-            import_name="DocumentConverter",
-        )
-        converter = DocumentConverter()
-        reader_tool = lambda x: converter.convert(
-            x
-        ).document.export_to_markdown()
+            DocumentConverter = check_import(
+                "docling",
+                module_name="document_converter",
+                import_name="DocumentConverter",
+            )
+            converter = DocumentConverter()
+            reader_tool = lambda x: converter.convert(
+                x
+            ).document.export_to_markdown()
+
+        texts = lcall(files, reader_tool)
+
+    else:
+        texts = [text]
 
-    texts = lcall(files, reader_tool)
     chunks = lcall(
         texts,
         chunk_content,
@@ -224,6 +233,7 @@ def chunk(
         metadata=metadata,
         as_node=True,
         flatten=True,
+        tokenizer=tokenizer or str.split,
     )
     if threshold:
         chunks = [c for c in chunks if len(c.content) > threshold]
@@ -247,4 +257,7 @@ def chunk(
     else:
         raise ValueError(f"Unsupported output file format: {output_file}")
 
-    return chunks
+    if as_node:
+        return chunks
+
+    return [c.content for c in chunks]
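
A sketch of the reworked `chunk()` entry point, assuming it is importable from `lionagi.libs.file.process`: raw text can now be chunked directly via the new keyword-only `text` parameter, and plain strings come back unless `as_node=True`:

```python
from lionagi.libs.file.process import chunk

# Chunk an in-memory string by tokens; returns list[str] by default.
pieces = chunk(
    text="some long document ...",
    chunk_by="tokens",
    chunk_size=1500,
    overlap=0.1,
    tokenizer=str.split,  # the fallback used when no tokenizer is given
)

# Chunk files on disk and keep the Node objects instead of plain strings.
nodes = chunk(
    url_or_path="./docs",
    file_types=[".md"],
    recursive=True,
    as_node=True,
)
```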
--- /dev/null
+++ b/lionagi/libs/token_transform/base.py
@@ -0,0 +1,52 @@
+from __future__ import annotations
+
+from enum import Enum
+from pathlib import Path
+
+from pydantic import Field
+
+from lionagi.tools.base import Resource, ResourceCategory
+
+here = Path(__file__).parent.resolve()
+MAPPING_PATH = "synthlang_/resources/mapping"
+
+
+class TokenMappingTemplate(str, Enum):
+    RUST_CHINESE = "rust_chinese"
+
+    @property
+    def fp(self) -> Path:
+        return here / MAPPING_PATH / f"{self.value}_mapping.toml"
+
+
+class TokenMapping(Resource):
+    category: ResourceCategory = Field(
+        default=ResourceCategory.UTILITY, frozen=True
+    )
+    content: dict
+
+    @classmethod
+    def load_from_template(
+        cls, template: TokenMappingTemplate | str
+    ) -> TokenMapping:
+        if isinstance(template, str):
+            template = template.lower().strip()
+            template = (
+                template.replace(".toml", "")
+                .replace(" ", "_")
+                .replace("-", "_")
+                .strip()
+            )
+            if template.endswith("_mapping"):
+                template = template[:-8]
+            if "/" in template:
+                template = template.split("/")[-1]
+            template = TokenMappingTemplate(template)
+
+        if isinstance(template, TokenMappingTemplate):
+            template = template.fp
+            return cls.adapt_from(template, ".toml", many=False)
+
+        raise ValueError(
+            f"Invalid template: {template}. Must be a TokenMappingTemplate or a valid path."
+        )
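
A sketch of loading the bundled token mapping: the string form is normalized (case, spaces/hyphens, a trailing `_mapping`/`.toml` suffix, any path prefix) before resolving to the enum, so the calls below are equivalent. Assumes the classes are imported from `lionagi.libs.token_transform.base`:

```python
from lionagi.libs.token_transform.base import TokenMapping, TokenMappingTemplate

tm = TokenMapping.load_from_template(TokenMappingTemplate.RUST_CHINESE)
tm = TokenMapping.load_from_template("rust_chinese")
tm = TokenMapping.load_from_template("Rust-Chinese_mapping.toml")

print(tm.category)       # ResourceCategory.UTILITY (frozen)
print(type(tm.content))  # dict parsed from the bundled TOML resource
```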
--- a/lionagi/libs/token_transform/perplexity.py
+++ b/lionagi/libs/token_transform/perplexity.py
@@ -101,7 +101,7 @@ async def compute_perplexity(
     chat_model: iModel,
     initial_context: str = None,
     tokens: list[str] = None,
-    system_msg: str = None,
+    system: str = None,
     n_samples: int = 1,
     use_residue: bool = True,
     **kwargs,
@@ -142,9 +142,9 @@ async def compute_perplexity(
     api_calls = []
     for sample_txt in sampless:
         messages = []
-        if system_msg:
+        if system:
             if not chat_model.sequential_exchange:
-                messages.append({"role": "system", "content": system_msg})
+                messages.append({"role": "system", "content": system})
                 messages.append({"role": "user", "content": sample_txt})
             else:
                 messages.append({"role": "user", "content": sample_txt})
@@ -171,10 +171,10 @@ class LLMCompressor:
     def __init__(
         self,
        chat_model: iModel,
-        system_msg=None,
+        system=None,
        tokenizer=None,
        splitter=None,
-        target_ratio=0.2,
+        compression_ratio=0.2,
        n_samples=5,
        chunk_size=64,
        max_tokens_per_sample=80,
@@ -193,10 +193,8 @@ class LLMCompressor:
         self.chat_model = chat_model
         self.tokenizer = tokenizer
         self.splitter = splitter
-        self.system_msg = (
-            system_msg or "Concisely summarize content for storage:"
-        )
-        self.target_ratio = target_ratio
+        self.system = system or "Concisely summarize content for storage:"
+        self.compression_ratio = compression_ratio
         self.n_samples = n_samples
         self.chunk_size = chunk_size
         self.max_tokens_per_sample = max_tokens_per_sample
@@ -281,7 +279,7 @@ class LLMCompressor:
             initial_context=initial_text,
             tokens=item_toks,
             n_samples=n_samples or self.n_samples,
-            system_msg=self.system_msg,
+            system=self.system,
             use_residue=use_residue,
             **kwargs,
         )
@@ -347,6 +345,7 @@ class LLMCompressor:
         # Tokenize once to get total length
         all_tokens = self.tokenize(text)
         original_len = len(all_tokens)
+        ttl_chars = len(text)
 
         # Split text
         items = self.split(text, **split_kwargs)
@@ -363,26 +362,26 @@ class LLMCompressor:
         # Select
         selected = self.select_by_pplex(
             ranked_items=ranked,
-            target_compression_ratio=compression_ratio or self.target_ratio,
+            target_compression_ratio=compression_ratio
+            or self.compression_ratio,
             original_length=original_len,
             min_pplx=min_pplx or self.min_pplx,
         )
 
-        if self.verbose:
-            compressed_len = sum(
-                len(to_list(self.tokenize(x), dropna=True, flatten=True))
-                for x in selected
-            )
-            ratio = compressed_len / original_len if original_len else 1
-            print(
-                f"Original tokens: {original_len}\n"
-                f"Selected tokens: {compressed_len}\n"
-                f"Compression ratio: {ratio:.3f}\n"
-                f"Time: {timer() - start:.3f}s\n"
-            )
-
         # Join final
         out_str = " ".join(selected)
+
+        if self.verbose:
+            compressed_chars = len(out_str)
+            ratio = compressed_chars / ttl_chars if original_len else 1
+            msg = "------------------------------------------\n"
+            msg += f"Compression Method: Perplexity\n"
+            msg += f"Compressed Characters number: {compressed_chars}\n"
+            msg += f"Character Compression Ratio: {ratio:.1%}\n"
+            msg += f"Compression Time: {timer() - start:.3f}s\n"
+            msg += f"Compression Model: {self.chat_model.model_name}\n"
+            print(msg)
+
 
         return out_str.strip()
 
     def select_by_pplex(
@@ -419,21 +418,34 @@ class LLMCompressor:
 async def compress_text(
     text: str,
     chat_model: iModel,
-    system_msg: str = None,
-    target_ratio: float = 0.2,
+    system: str = None,
+    compression_ratio: float = 0.2,
     n_samples: int = 5,
     max_tokens_per_sample=80,
     verbose=True,
+    initial_text=None,
+    cumulative=False,
+    split_kwargs=None,
+    min_pplx=None,
+    **kwargs,
 ) -> str:
     """
     Convenience function that instantiates LLMCompressor and compresses text.
     """
     compressor = LLMCompressor(
         chat_model=chat_model,
-        system_msg=system_msg,
-        target_ratio=target_ratio,
+        system=system,
+        compression_ratio=compression_ratio,
         n_samples=n_samples,
         max_tokens_per_sample=max_tokens_per_sample,
         verbose=verbose,
     )
-    return await compressor.compress(text)
+    return await compressor.compress(
+        text,
+        compression_ratio=compression_ratio,
+        initial_text=initial_text,
+        cumulative=cumulative,
+        split_kwargs=split_kwargs,
+        min_pplx=min_pplx,
+        **kwargs,
+    )
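
A sketch of the renamed convenience API (`system_msg` → `system`, `target_ratio` → `compression_ratio`), which now also forwards `initial_text`, `cumulative`, `split_kwargs`, and `min_pplx` through to `LLMCompressor.compress`. The model configuration below is a placeholder assumption:

```python
from lionagi.libs.token_transform.perplexity import compress_text
from lionagi.service.imodel import iModel

async def summarize(long_text: str) -> str:
    model = iModel(provider="openai", model="gpt-4o-mini")  # placeholder config
    return await compress_text(
        long_text,
        chat_model=model,
        system="Concisely summarize content for storage:",  # the default prompt
        compression_ratio=0.2,  # aim to keep ~20% of the original content
        n_samples=5,
    )
```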
--- /dev/null
+++ b/lionagi/libs/token_transform/symbolic_compress_context.py
@@ -0,0 +1,147 @@
+from collections.abc import Callable
+from pathlib import Path
+from typing import Literal
+
+from lionagi.service.imodel import iModel
+from lionagi.session.branch import Branch
+from lionagi.utils import alcall, get_bins
+
+from .base import TokenMapping, TokenMappingTemplate
+from .synthlang_.base import SynthlangFramework, SynthlangTemplate
+
+FRAMEWORK_OPTIONS = SynthlangFramework.load_framework_options()
+FRAMEWORK_CHOICES = Literal["math", "optim", "custom_algebra"]
+
+
+async def symbolic_compress_context(
+    *,
+    text: str = None,
+    url_or_path: str | Path = None,
+    chunk_by="tokens",
+    chunk_size: int = 1000,
+    chunk_tokenizer: Callable = None,
+    threshold=50,
+    output_path: Path | str = None,
+    overlap=0.025,
+    system: str = None,
+    chat_model: iModel = None,
+    use_lion_system_message: bool = True,
+    max_concurrent=10,
+    throttle_period=1,
+    framework: Literal["synthlang"] = "synthlang",
+    framework_template: (
+        SynthlangTemplate | SynthlangFramework
+    ) = SynthlangTemplate.REFLECTIVE_PATTERNS,
+    framework_options: list[FRAMEWORK_CHOICES] = None,
+    compress: bool = False,
+    compress_model: iModel = None,
+    compression_ratio: float = 0.2,
+    compress_initial_text=None,
+    compress_cumulative=False,
+    compress_split_kwargs=None,
+    compress_min_pplx=None,
+    encode_token_map: TokenMappingTemplate | dict | TokenMapping = None,
+    num_encodings: int = 3,
+    encode_output: bool = True,
+    num_output_encodings: int = 1,
+    verbose: bool = True,
+    branch: Branch = None,
+    additional_text: str = "",
+    **kwargs,
+):
+    if framework != "synthlang":
+        raise ValueError(f"Unsupported framework: {framework}")
+
+    if not text and not url_or_path:
+        raise ValueError("Either text or url_or_path must be provided.")
+
+    if text and url_or_path:
+        raise ValueError("Only one of text or url_or_path should be provided.")
+
+    from .synthlang_.translate_to_synthlang import translate_to_synthlang
+
+    async def _inner(text: str):
+        b_ = None
+        if branch:
+            b_ = await branch.aclone()
+        else:
+            b_ = Branch(
+                system=system,
+                use_lion_system_message=use_lion_system_message,
+                chat_model=chat_model,
+            )
+
+        return await translate_to_synthlang(
+            text,
+            branch=b_,
+            framework_template=framework_template,
+            framework_options=framework_options,
+            compress=compress,
+            compress_model=compress_model,
+            compression_ratio=compression_ratio,
+            compress_kwargs={
+                "initial_text": compress_initial_text,
+                "cumulative": compress_cumulative,
+                "split_kwargs": compress_split_kwargs,
+                "min_pplx": compress_min_pplx,
+            },
+            encode_token_map=encode_token_map,
+            num_encodings=num_encodings,
+            encode_output=encode_output,
+            num_output_encodings=num_output_encodings,
+            verbose=verbose,
+            additional_text=additional_text,
+            **kwargs,
+        )
+
+    from lionagi.libs.file.process import chunk, chunk_content
+
+    chunks = []
+    if url_or_path:
+        chunks = chunk(
+            url_or_path=url_or_path,
+            chunk_by=chunk_by,
+            chunk_size=chunk_size,
+            overlap=overlap,
+            threshold=threshold,
+        )
+
+    elif text:
+        chunks = chunk_content(
+            text=text,
+            chunk_by=chunk_by,
+            chunk_size=chunk_size,
+            overlap=overlap,
+            threshold=threshold,
+            tokenizer=chunk_tokenizer or str.split,
+        )
+
+    texts = [str(i).strip() for i in chunks if str(i).strip()]
+    bins = get_bins(texts, upper=chunk_size)
+    textss = []
+    for i in bins:
+        textss.append("\n".join([texts[j] for j in i]))
+
+    results = await alcall(
+        textss,
+        _inner,
+        max_concurrent=max_concurrent,
+        retry_default=None,
+        num_retries=2,
+        throttle_period=throttle_period,
+        retry_delay=1,
+        backoff_factor=2,
+        flatten=True,
+        dropna=True,
+        unique_output=True,
+    )
+    text = "\n".join(results)
+
+    if output_path:
+        fp = Path(output_path)
+        fp.write_text(text)
+        if verbose:
+            print(f"Results of {len(text)} characters saved to: {fp}")
+
+        return fp
+    return text
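
A sketch of the new end-to-end pipeline: the input is chunked, binned with `get_bins`, each bin is translated concurrently through `translate_to_synthlang` (with retries via `alcall`), and the joined result is optionally written to `output_path`. The paths below are placeholders:

```python
import asyncio

from lionagi.libs.token_transform.symbolic_compress_context import (
    symbolic_compress_context,
)

async def main():
    # Exactly one of text / url_or_path may be given; returns the output
    # Path when output_path is set, otherwise the compressed text itself.
    result = await symbolic_compress_context(
        url_or_path="./docs/spec.md",
        chunk_size=1000,
        overlap=0.025,
        threshold=50,
        num_encodings=3,
        output_path="./docs/spec.synthlang.md",
    )
    print(result)

asyncio.run(main())
```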