lionagi 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- lionagi/__init__.py +60 -5
- lionagi/core/__init__.py +0 -25
- lionagi/core/_setting/_setting.py +59 -0
- lionagi/core/action/__init__.py +14 -0
- lionagi/core/action/function_calling.py +136 -0
- lionagi/core/action/manual.py +1 -0
- lionagi/core/action/node.py +109 -0
- lionagi/core/action/tool.py +114 -0
- lionagi/core/action/tool_manager.py +356 -0
- lionagi/core/agent/base_agent.py +27 -13
- lionagi/core/agent/eval/evaluator.py +1 -0
- lionagi/core/agent/eval/vote.py +40 -0
- lionagi/core/agent/learn/learner.py +59 -0
- lionagi/core/agent/plan/unit_template.py +1 -0
- lionagi/core/collections/__init__.py +17 -0
- lionagi/core/{generic/data_logger.py → collections/_logger.py} +69 -55
- lionagi/core/collections/abc/__init__.py +53 -0
- lionagi/core/collections/abc/component.py +615 -0
- lionagi/core/collections/abc/concepts.py +297 -0
- lionagi/core/collections/abc/exceptions.py +150 -0
- lionagi/core/collections/abc/util.py +45 -0
- lionagi/core/collections/exchange.py +161 -0
- lionagi/core/collections/flow.py +426 -0
- lionagi/core/collections/model.py +419 -0
- lionagi/core/collections/pile.py +913 -0
- lionagi/core/collections/progression.py +236 -0
- lionagi/core/collections/util.py +64 -0
- lionagi/core/director/direct.py +314 -0
- lionagi/core/director/director.py +2 -0
- lionagi/core/{execute/branch_executor.py → engine/branch_engine.py} +134 -97
- lionagi/core/{execute/instruction_map_executor.py → engine/instruction_map_engine.py} +80 -55
- lionagi/{experimental/directive/evaluator → core/engine}/script_engine.py +17 -1
- lionagi/core/executor/base_executor.py +90 -0
- lionagi/core/{execute/structure_executor.py → executor/graph_executor.py} +83 -67
- lionagi/core/{execute → executor}/neo4j_executor.py +70 -67
- lionagi/core/generic/__init__.py +3 -33
- lionagi/core/generic/edge.py +42 -92
- lionagi/core/generic/edge_condition.py +16 -0
- lionagi/core/generic/graph.py +236 -0
- lionagi/core/generic/hyperedge.py +1 -0
- lionagi/core/generic/node.py +156 -221
- lionagi/core/generic/tree.py +48 -0
- lionagi/core/generic/tree_node.py +79 -0
- lionagi/core/mail/__init__.py +12 -0
- lionagi/core/mail/mail.py +25 -0
- lionagi/core/mail/mail_manager.py +139 -58
- lionagi/core/mail/package.py +45 -0
- lionagi/core/mail/start_mail.py +36 -0
- lionagi/core/message/__init__.py +19 -0
- lionagi/core/message/action_request.py +133 -0
- lionagi/core/message/action_response.py +135 -0
- lionagi/core/message/assistant_response.py +95 -0
- lionagi/core/message/instruction.py +234 -0
- lionagi/core/message/message.py +101 -0
- lionagi/core/message/system.py +86 -0
- lionagi/core/message/util.py +283 -0
- lionagi/core/report/__init__.py +4 -0
- lionagi/core/report/base.py +217 -0
- lionagi/core/report/form.py +231 -0
- lionagi/core/report/report.py +166 -0
- lionagi/core/report/util.py +28 -0
- lionagi/core/rule/_default.py +16 -0
- lionagi/core/rule/action.py +99 -0
- lionagi/core/rule/base.py +238 -0
- lionagi/core/rule/boolean.py +56 -0
- lionagi/core/rule/choice.py +47 -0
- lionagi/core/rule/mapping.py +96 -0
- lionagi/core/rule/number.py +71 -0
- lionagi/core/rule/rulebook.py +109 -0
- lionagi/core/rule/string.py +52 -0
- lionagi/core/rule/util.py +35 -0
- lionagi/core/session/branch.py +431 -0
- lionagi/core/session/directive_mixin.py +287 -0
- lionagi/core/session/session.py +229 -903
- lionagi/core/structure/__init__.py +1 -0
- lionagi/core/structure/chain.py +1 -0
- lionagi/core/structure/forest.py +1 -0
- lionagi/core/structure/graph.py +1 -0
- lionagi/core/structure/tree.py +1 -0
- lionagi/core/unit/__init__.py +5 -0
- lionagi/core/unit/parallel_unit.py +245 -0
- lionagi/core/unit/template/action.py +81 -0
- lionagi/core/unit/template/base.py +51 -0
- lionagi/core/unit/template/plan.py +84 -0
- lionagi/core/unit/template/predict.py +109 -0
- lionagi/core/unit/template/score.py +124 -0
- lionagi/core/unit/template/select.py +104 -0
- lionagi/core/unit/unit.py +362 -0
- lionagi/core/unit/unit_form.py +305 -0
- lionagi/core/unit/unit_mixin.py +1168 -0
- lionagi/core/unit/util.py +71 -0
- lionagi/core/validator/validator.py +364 -0
- lionagi/core/work/work.py +74 -0
- lionagi/core/work/work_function.py +92 -0
- lionagi/core/work/work_queue.py +81 -0
- lionagi/core/work/worker.py +195 -0
- lionagi/core/work/worklog.py +124 -0
- lionagi/experimental/compressor/base.py +46 -0
- lionagi/experimental/compressor/llm_compressor.py +247 -0
- lionagi/experimental/compressor/llm_summarizer.py +61 -0
- lionagi/experimental/compressor/util.py +70 -0
- lionagi/experimental/directive/__init__.py +19 -0
- lionagi/experimental/directive/parser/base_parser.py +69 -2
- lionagi/experimental/directive/{template_ → template}/base_template.py +17 -1
- lionagi/{libs/ln_tokenizer.py → experimental/directive/tokenizer.py} +16 -0
- lionagi/experimental/{directive/evaluator → evaluator}/ast_evaluator.py +16 -0
- lionagi/experimental/{directive/evaluator → evaluator}/base_evaluator.py +16 -0
- lionagi/experimental/knowledge/__init__.py +0 -0
- lionagi/experimental/knowledge/base.py +10 -0
- lionagi/experimental/knowledge/graph.py +0 -0
- lionagi/experimental/memory/__init__.py +0 -0
- lionagi/experimental/strategies/__init__.py +0 -0
- lionagi/experimental/strategies/base.py +1 -0
- lionagi/integrations/bridge/langchain_/documents.py +4 -0
- lionagi/integrations/bridge/llamaindex_/index.py +30 -0
- lionagi/integrations/bridge/llamaindex_/llama_index_bridge.py +6 -0
- lionagi/integrations/chunker/chunk.py +161 -24
- lionagi/integrations/config/oai_configs.py +34 -3
- lionagi/integrations/config/openrouter_configs.py +14 -2
- lionagi/integrations/loader/load.py +122 -21
- lionagi/integrations/loader/load_util.py +6 -77
- lionagi/integrations/provider/_mapping.py +46 -0
- lionagi/integrations/provider/litellm.py +2 -1
- lionagi/integrations/provider/mlx_service.py +16 -9
- lionagi/integrations/provider/oai.py +91 -4
- lionagi/integrations/provider/ollama.py +6 -5
- lionagi/integrations/provider/openrouter.py +115 -8
- lionagi/integrations/provider/services.py +2 -2
- lionagi/integrations/provider/transformers.py +18 -22
- lionagi/integrations/storage/__init__.py +3 -3
- lionagi/integrations/storage/neo4j.py +52 -60
- lionagi/integrations/storage/storage_util.py +45 -47
- lionagi/integrations/storage/structure_excel.py +285 -0
- lionagi/integrations/storage/to_excel.py +23 -7
- lionagi/libs/__init__.py +26 -1
- lionagi/libs/ln_api.py +75 -20
- lionagi/libs/ln_context.py +37 -0
- lionagi/libs/ln_convert.py +21 -9
- lionagi/libs/ln_func_call.py +69 -28
- lionagi/libs/ln_image.py +107 -0
- lionagi/libs/ln_nested.py +26 -11
- lionagi/libs/ln_parse.py +82 -23
- lionagi/libs/ln_queue.py +16 -0
- lionagi/libs/ln_tokenize.py +164 -0
- lionagi/libs/ln_validate.py +16 -0
- lionagi/libs/special_tokens.py +172 -0
- lionagi/libs/sys_util.py +95 -24
- lionagi/lions/coder/code_form.py +13 -0
- lionagi/lions/coder/coder.py +50 -3
- lionagi/lions/coder/util.py +30 -25
- lionagi/tests/libs/test_func_call.py +23 -21
- lionagi/tests/libs/test_nested.py +36 -21
- lionagi/tests/libs/test_parse.py +1 -1
- lionagi/tests/test_core/collections/__init__.py +0 -0
- lionagi/tests/test_core/collections/test_component.py +206 -0
- lionagi/tests/test_core/collections/test_exchange.py +138 -0
- lionagi/tests/test_core/collections/test_flow.py +145 -0
- lionagi/tests/test_core/collections/test_pile.py +171 -0
- lionagi/tests/test_core/collections/test_progression.py +129 -0
- lionagi/tests/test_core/generic/__init__.py +0 -0
- lionagi/tests/test_core/generic/test_edge.py +67 -0
- lionagi/tests/test_core/generic/test_graph.py +96 -0
- lionagi/tests/test_core/generic/test_node.py +106 -0
- lionagi/tests/test_core/generic/test_tree_node.py +73 -0
- lionagi/tests/test_core/test_branch.py +115 -294
- lionagi/tests/test_core/test_form.py +46 -0
- lionagi/tests/test_core/test_report.py +105 -0
- lionagi/tests/test_core/test_validator.py +111 -0
- lionagi/version.py +1 -1
- lionagi-0.2.0.dist-info/LICENSE +202 -0
- lionagi-0.2.0.dist-info/METADATA +272 -0
- lionagi-0.2.0.dist-info/RECORD +240 -0
- lionagi/core/branch/base.py +0 -653
- lionagi/core/branch/branch.py +0 -474
- lionagi/core/branch/flow_mixin.py +0 -96
- lionagi/core/branch/util.py +0 -323
- lionagi/core/direct/__init__.py +0 -19
- lionagi/core/direct/cot.py +0 -123
- lionagi/core/direct/plan.py +0 -164
- lionagi/core/direct/predict.py +0 -166
- lionagi/core/direct/react.py +0 -171
- lionagi/core/direct/score.py +0 -279
- lionagi/core/direct/select.py +0 -170
- lionagi/core/direct/sentiment.py +0 -1
- lionagi/core/direct/utils.py +0 -110
- lionagi/core/direct/vote.py +0 -64
- lionagi/core/execute/base_executor.py +0 -47
- lionagi/core/flow/baseflow.py +0 -23
- lionagi/core/flow/monoflow/ReAct.py +0 -238
- lionagi/core/flow/monoflow/__init__.py +0 -9
- lionagi/core/flow/monoflow/chat.py +0 -95
- lionagi/core/flow/monoflow/chat_mixin.py +0 -253
- lionagi/core/flow/monoflow/followup.py +0 -213
- lionagi/core/flow/polyflow/__init__.py +0 -1
- lionagi/core/flow/polyflow/chat.py +0 -251
- lionagi/core/form/action_form.py +0 -26
- lionagi/core/form/field_validator.py +0 -287
- lionagi/core/form/form.py +0 -302
- lionagi/core/form/mixin.py +0 -214
- lionagi/core/form/scored_form.py +0 -13
- lionagi/core/generic/action.py +0 -26
- lionagi/core/generic/component.py +0 -455
- lionagi/core/generic/condition.py +0 -44
- lionagi/core/generic/mail.py +0 -90
- lionagi/core/generic/mailbox.py +0 -36
- lionagi/core/generic/relation.py +0 -70
- lionagi/core/generic/signal.py +0 -22
- lionagi/core/generic/structure.py +0 -362
- lionagi/core/generic/transfer.py +0 -20
- lionagi/core/generic/work.py +0 -40
- lionagi/core/graph/graph.py +0 -126
- lionagi/core/graph/tree.py +0 -190
- lionagi/core/mail/schema.py +0 -63
- lionagi/core/messages/schema.py +0 -325
- lionagi/core/tool/__init__.py +0 -5
- lionagi/core/tool/tool.py +0 -28
- lionagi/core/tool/tool_manager.py +0 -282
- lionagi/experimental/tool/function_calling.py +0 -43
- lionagi/experimental/tool/manual.py +0 -66
- lionagi/experimental/tool/schema.py +0 -59
- lionagi/experimental/tool/tool_manager.py +0 -138
- lionagi/experimental/tool/util.py +0 -16
- lionagi/experimental/work/_logger.py +0 -25
- lionagi/experimental/work/schema.py +0 -30
- lionagi/experimental/work/tests.py +0 -72
- lionagi/experimental/work/work_function.py +0 -89
- lionagi/experimental/work/worker.py +0 -12
- lionagi/integrations/bridge/llamaindex_/get_index.py +0 -294
- lionagi/tests/test_core/test_base_branch.py +0 -426
- lionagi/tests/test_core/test_chat_flow.py +0 -63
- lionagi/tests/test_core/test_mail_manager.py +0 -75
- lionagi/tests/test_core/test_prompts.py +0 -51
- lionagi/tests/test_core/test_session.py +0 -254
- lionagi/tests/test_core/test_session_base_util.py +0 -313
- lionagi/tests/test_core/test_tool_manager.py +0 -95
- lionagi-0.1.1.dist-info/LICENSE +0 -9
- lionagi-0.1.1.dist-info/METADATA +0 -174
- lionagi-0.1.1.dist-info/RECORD +0 -190
- /lionagi/core/{branch → _setting}/__init__.py +0 -0
- /lionagi/core/{execute → agent/eval}/__init__.py +0 -0
- /lionagi/core/{flow → agent/learn}/__init__.py +0 -0
- /lionagi/core/{form → agent/plan}/__init__.py +0 -0
- /lionagi/core/{branch/executable_branch.py → agent/plan/plan.py} +0 -0
- /lionagi/core/{graph → director}/__init__.py +0 -0
- /lionagi/core/{messages → engine}/__init__.py +0 -0
- /lionagi/{experimental/directive/evaluator → core/engine}/sandbox_.py +0 -0
- /lionagi/{experimental/directive/evaluator → core/executor}/__init__.py +0 -0
- /lionagi/{experimental/directive/template_ → core/rule}/__init__.py +0 -0
- /lionagi/{experimental/tool → core/unit/template}/__init__.py +0 -0
- /lionagi/{experimental/work → core/validator}/__init__.py +0 -0
- /lionagi/core/{flow/mono_chat_mixin.py → work/__init__.py} +0 -0
- /lionagi/experimental/{work/exchange.py → compressor/__init__.py} +0 -0
- /lionagi/experimental/{work/util.py → directive/template/__init__.py} +0 -0
- /lionagi/experimental/directive/{schema.py → template/schema.py} +0 -0
- /lionagi/{tests/libs/test_async.py → experimental/evaluator/__init__.py} +0 -0
- {lionagi-0.1.1.dist-info → lionagi-0.2.0.dist-info}/WHEEL +0 -0
- {lionagi-0.1.1.dist-info → lionagi-0.2.0.dist-info}/top_level.txt +0 -0
lionagi/libs/ln_parse.py
CHANGED
@@ -1,9 +1,23 @@
|
|
1
|
+
"""
|
2
|
+
Copyright 2024 HaiyangLi
|
3
|
+
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
5
|
+
you may not use this file except in compliance with the License.
|
6
|
+
You may obtain a copy of the License at
|
7
|
+
|
8
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13
|
+
See the License for the specific language governing permissions and
|
14
|
+
limitations under the License.
|
15
|
+
"""
|
16
|
+
|
1
17
|
from collections.abc import Callable
|
2
18
|
import re
|
3
19
|
import inspect
|
4
20
|
import itertools
|
5
|
-
import contextlib
|
6
|
-
from functools import singledispatchmethod
|
7
21
|
from typing import Any
|
8
22
|
import numpy as np
|
9
23
|
import lionagi.libs.ln_convert as convert
|
@@ -103,12 +117,12 @@ class ParseUtil:
|
|
103
117
|
# inspired by langchain_core.output_parsers.json (MIT License)
|
104
118
|
# https://github.com/langchain-ai/langchain/blob/master/libs/core/langchain_core/output_parsers/json.py
|
105
119
|
@staticmethod
|
106
|
-
def
|
120
|
+
def extract_json_block(
|
107
121
|
str_to_parse: str,
|
108
122
|
language: str | None = None,
|
109
123
|
regex_pattern: str | None = None,
|
110
124
|
*,
|
111
|
-
parser: Callable[[str], Any],
|
125
|
+
parser: Callable[[str], Any] = None,
|
112
126
|
) -> Any:
|
113
127
|
"""
|
114
128
|
Extracts and parses a code block from Markdown content.
|
@@ -148,9 +162,37 @@ class ParseUtil:
|
|
148
162
|
raise ValueError(
|
149
163
|
f"No {language or 'specified'} code block found in the Markdown content."
|
150
164
|
)
|
165
|
+
if not match:
|
166
|
+
str_to_parse = str_to_parse.strip()
|
167
|
+
if str_to_parse.startswith("```json\n") and str_to_parse.endswith("\n```"):
|
168
|
+
str_to_parse = str_to_parse[8:-4].strip()
|
151
169
|
|
170
|
+
parser = parser or ParseUtil.fuzzy_parse_json
|
152
171
|
return parser(code_str)
|
153
172
|
|
173
|
+
@staticmethod
|
174
|
+
def extract_code_blocks(code):
    """
    Gather the contents of every triple-backtick fenced block in a string.

    A line that starts with three backticks toggles the "inside a fence"
    state; the fence lines themselves (including any language tag) are
    never part of the output.

    Args:
        code (str): Text that may contain fenced code blocks.

    Returns:
        str: The body of each fenced block, joined by one blank line.
            An empty string when no fence is found.
    """
    collected = []
    buffer = []
    in_fence = False

    for row in code.split("\n"):
        if not row.startswith("```"):
            # Ordinary line: keep it only while a fence is open.
            if in_fence:
                buffer.append(row)
            continue

        # Fence marker: a closing fence flushes what was buffered.
        if in_fence:
            collected.append("\n".join(buffer))
            buffer = []
        in_fence = not in_fence

    # A fence that was opened but never closed still contributes its lines.
    if buffer:
        collected.append("\n".join(buffer))

    return "\n\n".join(collected)
|
195
|
+
|
154
196
|
@staticmethod
|
155
197
|
def md_to_json(
|
156
198
|
str_to_parse: str,
|
@@ -181,7 +223,7 @@ class ParseUtil:
|
|
181
223
|
>>> md_to_json('```json\\n{"key": "value"}\\n```', expected_keys=['key'])
|
182
224
|
{'key': 'value'}
|
183
225
|
"""
|
184
|
-
json_obj = ParseUtil.
|
226
|
+
json_obj = ParseUtil.extract_json_block(
|
185
227
|
str_to_parse, language="json", parser=parser or ParseUtil.fuzzy_parse_json
|
186
228
|
)
|
187
229
|
|
@@ -385,7 +427,9 @@ class ParseUtil:
|
|
385
427
|
return type_mapping.get(py_type, "object")
|
386
428
|
|
387
429
|
@staticmethod
|
388
|
-
def _func_to_schema(
|
430
|
+
def _func_to_schema(
|
431
|
+
func, style="google", func_description=None, params_description=None
|
432
|
+
):
|
389
433
|
"""
|
390
434
|
Generates a schema description for a given function, using typing hints and
|
391
435
|
docstrings. The schema includes the function's name, description, and parameters.
|
@@ -412,9 +456,11 @@ class ParseUtil:
|
|
412
456
|
"""
|
413
457
|
# Extracting function name and docstring details
|
414
458
|
func_name = func.__name__
|
415
|
-
|
416
|
-
|
417
|
-
|
459
|
+
|
460
|
+
if not func_description:
|
461
|
+
func_description, _ = ParseUtil._extract_docstring_details(func, style)
|
462
|
+
if not params_description:
|
463
|
+
_, params_description = ParseUtil._extract_docstring_details(func, style)
|
418
464
|
|
419
465
|
# Extracting parameters with typing hints
|
420
466
|
sig = inspect.signature(func)
|
@@ -634,7 +680,7 @@ class StringMatch:
|
|
634
680
|
# Calculate Jaro-Winkler similarity scores for each potential match
|
635
681
|
scores = np.array(
|
636
682
|
[
|
637
|
-
score_func(
|
683
|
+
score_func(str(word), str(correct_word))
|
638
684
|
for correct_word in correct_words_list
|
639
685
|
]
|
640
686
|
)
|
@@ -648,26 +694,39 @@ class StringMatch:
|
|
648
694
|
|
649
695
|
if isinstance(out_, str):
|
650
696
|
# first try to parse it straight as a fuzzy json
|
697
|
+
|
651
698
|
try:
|
652
699
|
out_ = ParseUtil.fuzzy_parse_json(out_)
|
653
|
-
|
700
|
+
return StringMatch.correct_dict_keys(keys, out_)
|
701
|
+
|
702
|
+
except:
|
654
703
|
try:
|
655
|
-
# if failed we try to extract the json block and parse it
|
656
704
|
out_ = ParseUtil.md_to_json(out_)
|
705
|
+
return StringMatch.correct_dict_keys(keys, out_)
|
706
|
+
|
657
707
|
except Exception:
|
658
|
-
|
659
|
-
|
660
|
-
|
661
|
-
|
662
|
-
|
663
|
-
|
664
|
-
|
708
|
+
try:
|
709
|
+
# if failed we try to extract the json block and parse it
|
710
|
+
out_ = ParseUtil.md_to_json(out_)
|
711
|
+
return StringMatch.correct_dict_keys(keys, out_)
|
712
|
+
|
713
|
+
except Exception:
|
714
|
+
# if still failed we try to extract the json block using re and parse it again
|
715
|
+
match = re.search(r"```json\n({.*?})\n```", out_, re.DOTALL)
|
716
|
+
if match:
|
717
|
+
out_ = match.group(1)
|
665
718
|
try:
|
666
|
-
out_ = ParseUtil.fuzzy_parse_json(
|
667
|
-
|
668
|
-
|
719
|
+
out_ = ParseUtil.fuzzy_parse_json(out_)
|
720
|
+
return StringMatch.correct_dict_keys(keys, out_)
|
721
|
+
|
669
722
|
except:
|
670
|
-
|
723
|
+
try:
|
724
|
+
out_ = ParseUtil.fuzzy_parse_json(
|
725
|
+
out_.replace("'", '"')
|
726
|
+
)
|
727
|
+
return StringMatch.correct_dict_keys(keys, out_)
|
728
|
+
except:
|
729
|
+
pass
|
671
730
|
|
672
731
|
if isinstance(out_, dict):
|
673
732
|
try:
|
lionagi/libs/ln_queue.py
CHANGED
@@ -1,3 +1,19 @@
|
|
1
|
+
"""
|
2
|
+
Copyright 2024 HaiyangLi
|
3
|
+
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
5
|
+
you may not use this file except in compliance with the License.
|
6
|
+
You may obtain a copy of the License at
|
7
|
+
|
8
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13
|
+
See the License for the specific language governing permissions and
|
14
|
+
limitations under the License.
|
15
|
+
"""
|
16
|
+
|
1
17
|
"""
|
2
18
|
A class that manages asynchronous task processing with controlled concurrency.
|
3
19
|
"""
|
@@ -0,0 +1,164 @@
|
|
1
|
+
import tiktoken
|
2
|
+
import math
|
3
|
+
from .ln_convert import to_str
|
4
|
+
from .special_tokens import disallowed_tokens
|
5
|
+
|
6
|
+
|
7
|
+
class TokenizeUtil:
|
8
|
+
|
9
|
+
@staticmethod
|
10
|
+
def tokenize(
    text,
    encoding_model=None,
    encoding_name=None,
    return_byte=False,
    disallowed_tokens=disallowed_tokens,
):
    """
    Split text into tiktoken tokens, dropping disallowed single tokens.

    Args:
        text (str): The text to tokenize.
        encoding_model (str, optional): Model name whose encoding should be
            used. When tiktoken can map it, it takes precedence over
            ``encoding_name``; otherwise ``encoding_name`` (or
            "cl100k_base") is used.
        encoding_name (str, optional): Name of a tiktoken encoding.
            Defaults to "cl100k_base" when not given.
        return_byte (bool): If True, return the token ids; otherwise return
            each id decoded on its own into a string.
        disallowed_tokens (list[str], optional): Strings to filter out of
            the result. A falsy value disables filtering.

    Returns:
        list: Token ids when ``return_byte`` is True, else decoded strings.

    Raises:
        ValueError: If the resolved encoding name is not one tiktoken knows.
    """
    if encoding_model:
        try:
            encoding_name = tiktoken.encoding_name_for_model(encoding_model)
        except Exception:
            # Best-effort lookup: unknown models fall back to the requested
            # (or default) encoding instead of failing.
            encoding_name = encoding_name or "cl100k_base"

    encoding_name = encoding_name or "cl100k_base"
    if encoding_name not in tiktoken.list_encoding_names():
        # Previously this path left the encoding unset and crashed later
        # with an AttributeError on None.
        raise ValueError(f"Unknown tiktoken encoding: {encoding_name!r}")
    encoding = tiktoken.get_encoding(encoding_name)

    # Only disallowed strings that encode to exactly one token id can be
    # filtered id-by-id. Dropping the constituent ids of multi-token strings
    # would also strip those ids from unrelated text, so they are skipped.
    # (The earlier code compared an int id against lists of ids, which never
    # matched, so nothing was ever filtered.)
    banned_ids = set()
    for token in disallowed_tokens or ():
        token_ids = encoding.encode(token)
        if len(token_ids) == 1:
            banned_ids.add(token_ids[0])

    codes = encoding.encode(text)
    if banned_ids:
        codes = [code for code in codes if code not in banned_ids]

    if return_byte:
        return codes

    return [encoding.decode([code]) for code in codes]
|
42
|
+
|
43
|
+
@staticmethod
|
44
|
+
def chunk_by_chars(
|
45
|
+
text: str, chunk_size: int, overlap: float, threshold: int
|
46
|
+
) -> list[str | None]:
|
47
|
+
"""
|
48
|
+
Chunks the input text into smaller parts, with optional overlap and threshold for final chunk.
|
49
|
+
|
50
|
+
Parameters:
|
51
|
+
text (str): The input text to chunk.
|
52
|
+
|
53
|
+
chunk_size (int): The size of each chunk.
|
54
|
+
|
55
|
+
overlap (float): The amount of overlap between chunks.
|
56
|
+
|
57
|
+
threshold (int): The minimum size of the final chunk.
|
58
|
+
|
59
|
+
Returns:
|
60
|
+
List[Union[str, None]]: A list of text chunks.
|
61
|
+
|
62
|
+
Raises:
|
63
|
+
ValueError: If an error occurs during chunking.
|
64
|
+
"""
|
65
|
+
|
66
|
+
def _chunk_n1():
|
67
|
+
return [text]
|
68
|
+
|
69
|
+
def _chunk_n2():
|
70
|
+
chunks = []
|
71
|
+
chunks.append(text[: chunk_size + overlap_size])
|
72
|
+
|
73
|
+
if len(text) - chunk_size > threshold:
|
74
|
+
chunks.append(text[chunk_size - overlap_size :])
|
75
|
+
else:
|
76
|
+
return _chunk_n1()
|
77
|
+
|
78
|
+
return chunks
|
79
|
+
|
80
|
+
def _chunk_n3():
|
81
|
+
chunks = []
|
82
|
+
chunks.append(text[: chunk_size + overlap_size])
|
83
|
+
for i in range(1, n_chunks - 1):
|
84
|
+
start_idx = chunk_size * i - overlap_size
|
85
|
+
end_idx = chunk_size * (i + 1) + overlap_size
|
86
|
+
chunks.append(text[start_idx:end_idx])
|
87
|
+
|
88
|
+
if len(text) - chunk_size * (n_chunks - 1) > threshold:
|
89
|
+
chunks.append(text[chunk_size * (n_chunks - 1) - overlap_size :])
|
90
|
+
else:
|
91
|
+
chunks[-1] += text[chunk_size * (n_chunks - 1) + overlap_size :]
|
92
|
+
|
93
|
+
return chunks
|
94
|
+
|
95
|
+
try:
|
96
|
+
if not isinstance(text, str):
|
97
|
+
text = to_str(text)
|
98
|
+
|
99
|
+
n_chunks = math.ceil(len(text) / chunk_size)
|
100
|
+
overlap_size = int(overlap / 2)
|
101
|
+
|
102
|
+
if n_chunks == 1:
|
103
|
+
return _chunk_n1()
|
104
|
+
|
105
|
+
elif n_chunks == 2:
|
106
|
+
return _chunk_n2()
|
107
|
+
|
108
|
+
elif n_chunks > 2:
|
109
|
+
return _chunk_n3()
|
110
|
+
|
111
|
+
except Exception as e:
|
112
|
+
raise ValueError(f"An error occurred while chunking the text. {e}")
|
113
|
+
|
114
|
+
@staticmethod
|
115
|
+
def chunk_by_tokens(
    text: str,
    chunk_size: int,
    overlap: float,
    threshold: int,  # minimum size of the final chunk in number of tokens
    encoding_model=None,
    encoding_name=None,
    return_tokens=False,
    return_byte=False,
) -> list[str | None]:
    """
    Chunk text by token count, with overlap and a minimum final-chunk size.

    The text is tokenized with ``TokenizeUtil.tokenize``. Every chunk is
    extended by ``int(overlap * chunk_size / 2)`` tokens on each side that
    has a neighbour. A trailing piece that is not longer than ``threshold``
    tokens is merged into the previous chunk.

    Args:
        text (str): The text to chunk.
        chunk_size (int): Number of tokens per chunk.
        overlap (float): Overlap as a fraction of ``chunk_size``.
        threshold (int): Minimum size of the final chunk, in tokens.
        encoding_model (str, optional): Forwarded to ``tokenize``.
        encoding_name (str, optional): Forwarded to ``tokenize``.
        return_tokens (bool): If True, return each chunk as a list of
            tokens instead of a joined string.
        return_byte (bool): Forwarded to ``tokenize``.
            NOTE(review): with ``return_byte=True`` and
            ``return_tokens=False`` the multi-chunk join presumably fails
            because the tokens are not strings -- confirm intended usage.

    Returns:
        list: One entry per chunk. When everything fits in a single chunk
        the result is ``[text]`` (or ``[tokens]`` with ``return_tokens``),
        always a list so callers can iterate uniformly.
    """
    tokens = TokenizeUtil.tokenize(
        text, encoding_model, encoding_name, return_byte=return_byte
    )

    n_chunks = math.ceil(len(tokens) / chunk_size)
    overlap_size = int(overlap * chunk_size / 2)

    # Measure the real tail length. ``len(tokens) % chunk_size`` is 0 when
    # the length is an exact multiple, which made a full final chunk look
    # too short and made ``tokens[-0:]`` re-append the entire token list.
    last_start = chunk_size * (n_chunks - 1)
    tail_len = len(tokens) - last_start

    # Single-chunk cases (including no tokens at all) return a list rather
    # than the bare string, so iteration yields chunks and not characters.
    if n_chunks <= 1 or (n_chunks == 2 and tail_len <= threshold):
        return [tokens] if return_tokens else [text]

    chunks = [tokens[: chunk_size + overlap_size]]
    for i in range(1, n_chunks - 1):
        # Clamp at 0 so a large overlap cannot wrap to the end of the list.
        start_idx = max(0, chunk_size * i - overlap_size)
        end_idx = chunk_size * (i + 1) + overlap_size
        chunks.append(tokens[start_idx:end_idx])

    if tail_len > threshold:
        chunks.append(tokens[max(0, last_start - overlap_size) :])
    else:
        # The previous chunk already ends at last_start + overlap_size;
        # append only what lies beyond it to avoid duplicated tokens.
        chunks[-1] += tokens[last_start + overlap_size :]

    if return_tokens:
        return chunks

    if n_chunks == 2:
        # Kept from the earlier behaviour: only the two-chunk case strips
        # surrounding whitespace from the joined text.
        return [" ".join(chunk).strip() for chunk in chunks]

    return [" ".join(chunk) for chunk in chunks]
|
lionagi/libs/ln_validate.py
CHANGED
@@ -1,3 +1,19 @@
|
|
1
|
+
"""
|
2
|
+
Copyright 2024 HaiyangLi
|
3
|
+
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
5
|
+
you may not use this file except in compliance with the License.
|
6
|
+
You may obtain a copy of the License at
|
7
|
+
|
8
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13
|
+
See the License for the specific language governing permissions and
|
14
|
+
limitations under the License.
|
15
|
+
"""
|
16
|
+
|
1
17
|
"""
|
2
18
|
This module provides functions for validating and fixing field values based on their data types.
|
3
19
|
|
@@ -0,0 +1,172 @@
|
|
1
|
+
# disallowed special tokens
|
2
|
+
|
3
|
+
disallowed_tokens = [
|
4
|
+
"▄",
|
5
|
+
"▅",
|
6
|
+
"▆",
|
7
|
+
"▇",
|
8
|
+
"█",
|
9
|
+
"▏",
|
10
|
+
"▎",
|
11
|
+
"▍",
|
12
|
+
"▌",
|
13
|
+
"▋",
|
14
|
+
"▊",
|
15
|
+
"▉",
|
16
|
+
"▔",
|
17
|
+
"▕",
|
18
|
+
"▁▁",
|
19
|
+
"▁▂",
|
20
|
+
"▁▃",
|
21
|
+
"▁▄",
|
22
|
+
"▁▅",
|
23
|
+
"▁▆",
|
24
|
+
"▁▇",
|
25
|
+
"▁█",
|
26
|
+
"▁▏",
|
27
|
+
"▁▎",
|
28
|
+
"▁▍",
|
29
|
+
"▁▌",
|
30
|
+
"▁▋",
|
31
|
+
"▁▊",
|
32
|
+
"▁▉",
|
33
|
+
"▁▔",
|
34
|
+
"▁▕",
|
35
|
+
"▄▄",
|
36
|
+
"▄▅",
|
37
|
+
"▄▆",
|
38
|
+
"▄▇",
|
39
|
+
"▄█",
|
40
|
+
"▄▏",
|
41
|
+
"▄▎",
|
42
|
+
"▄▍",
|
43
|
+
"▄▌",
|
44
|
+
"▄▋",
|
45
|
+
"▄▊",
|
46
|
+
"▄▉",
|
47
|
+
"▄▔",
|
48
|
+
"▄▕",
|
49
|
+
"▅▅",
|
50
|
+
"▅▆",
|
51
|
+
"▅▇",
|
52
|
+
"▅█",
|
53
|
+
"▅▏",
|
54
|
+
"▅▎",
|
55
|
+
"▅▍",
|
56
|
+
"▅▌",
|
57
|
+
"▅▋",
|
58
|
+
"▅▊",
|
59
|
+
"▅▉",
|
60
|
+
"▅▔",
|
61
|
+
"▅▕",
|
62
|
+
"▆▆",
|
63
|
+
"▆▇",
|
64
|
+
"▆█",
|
65
|
+
"▆▏",
|
66
|
+
"▆▎",
|
67
|
+
"▆▍",
|
68
|
+
"▆▌",
|
69
|
+
"▆▋",
|
70
|
+
"▆▊",
|
71
|
+
"▆▉",
|
72
|
+
"▆▔",
|
73
|
+
"▆▕",
|
74
|
+
"▇▇",
|
75
|
+
"▇█",
|
76
|
+
"▇▏",
|
77
|
+
"▇▎",
|
78
|
+
"▇▍",
|
79
|
+
"▇▌",
|
80
|
+
"▇▋",
|
81
|
+
"▇▊",
|
82
|
+
"▇▉",
|
83
|
+
"▇▔",
|
84
|
+
"▇▕",
|
85
|
+
"██",
|
86
|
+
"█▏",
|
87
|
+
"█▎",
|
88
|
+
"█▍",
|
89
|
+
"█▌",
|
90
|
+
"█▋",
|
91
|
+
"█▊",
|
92
|
+
"█▉",
|
93
|
+
"█",
|
94
|
+
"█▔",
|
95
|
+
"█▕",
|
96
|
+
"▏▏",
|
97
|
+
"▏▎",
|
98
|
+
"▏▍",
|
99
|
+
"▏▌",
|
100
|
+
"▏▋",
|
101
|
+
"▏▊",
|
102
|
+
"▏▉",
|
103
|
+
"▏",
|
104
|
+
"▏▔",
|
105
|
+
"▏▕",
|
106
|
+
"▎▎",
|
107
|
+
"▎▍",
|
108
|
+
"▎▌",
|
109
|
+
"▎▋",
|
110
|
+
"▎▊",
|
111
|
+
"▎▉",
|
112
|
+
"▎",
|
113
|
+
"▎▔",
|
114
|
+
"▎▕",
|
115
|
+
"▍▍",
|
116
|
+
"▍▌",
|
117
|
+
"▍▋",
|
118
|
+
"▍▊",
|
119
|
+
"▍▉",
|
120
|
+
"▍",
|
121
|
+
"▍▔",
|
122
|
+
"▍▕",
|
123
|
+
"▌▌",
|
124
|
+
"▌▋",
|
125
|
+
"▌▊",
|
126
|
+
"▌▉",
|
127
|
+
"▌",
|
128
|
+
"▌▔",
|
129
|
+
"▌▕",
|
130
|
+
"▋▋",
|
131
|
+
"▋▊",
|
132
|
+
"▋▉",
|
133
|
+
"▋",
|
134
|
+
"▋▔",
|
135
|
+
"▋▕",
|
136
|
+
"▊▊",
|
137
|
+
"▊▉",
|
138
|
+
"▊",
|
139
|
+
"▊▔",
|
140
|
+
"▊▕",
|
141
|
+
"▉▉",
|
142
|
+
"▉",
|
143
|
+
"▉▔",
|
144
|
+
"▉▕",
|
145
|
+
"▔▔",
|
146
|
+
"▔▕",
|
147
|
+
"▕▕",
|
148
|
+
"▁▁▁",
|
149
|
+
"▁▁▂",
|
150
|
+
"▁▁▃",
|
151
|
+
"▁▁▄",
|
152
|
+
"▁▁▅",
|
153
|
+
"▁▁▆",
|
154
|
+
"▁▁▇",
|
155
|
+
"▁▁█",
|
156
|
+
"▁▁▏",
|
157
|
+
"▁▁▎",
|
158
|
+
"▁▁▍",
|
159
|
+
"▁▁▌",
|
160
|
+
"▁▁▋",
|
161
|
+
"▁▁▊",
|
162
|
+
"▁▁▉",
|
163
|
+
"▁▁▔",
|
164
|
+
"▁▁▕",
|
165
|
+
"▁▂▂",
|
166
|
+
"▁▂▃",
|
167
|
+
"▁▂▄",
|
168
|
+
"▁▂▅",
|
169
|
+
"▁▂▆",
|
170
|
+
"▁▂▇",
|
171
|
+
"▁▂█",
|
172
|
+
]
|
lionagi/libs/sys_util.py
CHANGED
@@ -1,28 +1,17 @@
|
|
1
1
|
"""
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
The above copyright notice and this permission notice shall be
|
16
|
-
included in all copies or substantial portions of the Software.
|
17
|
-
|
18
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
19
|
-
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
20
|
-
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
21
|
-
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
22
|
-
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
23
|
-
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
24
|
-
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
25
|
-
SOFTWARE.
|
2
|
+
Copyright 2024 HaiyangLi
|
3
|
+
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
5
|
+
you may not use this file except in compliance with the License.
|
6
|
+
You may obtain a copy of the License at
|
7
|
+
|
8
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13
|
+
See the License for the specific language governing permissions and
|
14
|
+
limitations under the License.
|
26
15
|
"""
|
27
16
|
|
28
17
|
import copy
|
@@ -430,6 +419,19 @@ class SysUtil:
|
|
430
419
|
|
431
420
|
@staticmethod
|
432
421
|
def list_files(dir_path: Path | str, extension: str = None) -> list[Path]:
|
422
|
+
"""
|
423
|
+
Lists all files in a specified directory with an optional filter for file extensions.
|
424
|
+
|
425
|
+
Args:
|
426
|
+
dir_path (Path | str): The directory path where files are listed.
|
427
|
+
extension (str, optional): Filter files by extension. Default is None, which lists all files.
|
428
|
+
|
429
|
+
Returns:
|
430
|
+
list[Path]: A list of Path objects representing files in the directory.
|
431
|
+
|
432
|
+
Raises:
|
433
|
+
NotADirectoryError: If the provided dir_path is not a directory.
|
434
|
+
"""
|
433
435
|
dir_path = Path(dir_path)
|
434
436
|
if not dir_path.is_dir():
|
435
437
|
raise NotADirectoryError(f"{dir_path} is not a directory.")
|
@@ -440,6 +442,16 @@ class SysUtil:
|
|
440
442
|
|
441
443
|
@staticmethod
|
442
444
|
def copy_file(src: Path | str, dest: Path | str) -> None:
|
445
|
+
"""
|
446
|
+
Copies a file from a source path to a destination path.
|
447
|
+
|
448
|
+
Args:
|
449
|
+
src (Path | str): The source file path.
|
450
|
+
dest (Path | str): The destination file path.
|
451
|
+
|
452
|
+
Raises:
|
453
|
+
FileNotFoundError: If the source file does not exist or is not a file.
|
454
|
+
"""
|
443
455
|
from shutil import copy2
|
444
456
|
|
445
457
|
src, dest = Path(src), Path(dest)
|
@@ -450,6 +462,18 @@ class SysUtil:
|
|
450
462
|
|
451
463
|
@staticmethod
|
452
464
|
def get_size(path: Path | str) -> int:
|
465
|
+
"""
|
466
|
+
Gets the size of a file or total size of files in a directory.
|
467
|
+
|
468
|
+
Args:
|
469
|
+
path (Path | str): The file or directory path.
|
470
|
+
|
471
|
+
Returns:
|
472
|
+
int: The size in bytes.
|
473
|
+
|
474
|
+
Raises:
|
475
|
+
FileNotFoundError: If the path does not exist.
|
476
|
+
"""
|
453
477
|
path = Path(path)
|
454
478
|
if path.is_file():
|
455
479
|
return path.stat().st_size
|
@@ -457,3 +481,50 @@ class SysUtil:
|
|
457
481
|
return sum(f.stat().st_size for f in path.glob("**/*") if f.is_file())
|
458
482
|
else:
|
459
483
|
raise FileNotFoundError(f"{path} does not exist.")
|
484
|
+
|
485
|
+
@staticmethod
|
486
|
+
def save_to_file(
    text,
    directory: Path | str,
    filename: str,
    timestamp: bool = True,
    dir_exist_ok: bool = True,
    time_prefix: bool = False,
    custom_timestamp_format: str | None = None,
    random_hash_digits=0,
    verbose=True,
):
    """
    Saves text to a file within a specified directory, optionally adding a timestamp, hash, and verbose logging.

    The target path is built by ``SysUtil.create_path``; all naming options
    below are forwarded to it unchanged.

    Args:
        text (str): The text to save.
        directory (Path | str): The directory path to save the file.
        filename (str): The filename for the saved text.
        timestamp (bool): If True, append a timestamp to the filename. Default is True.
        dir_exist_ok (bool): Forwarded to ``SysUtil.create_path``. Default is True.
        time_prefix (bool): If True, prepend the timestamp instead of appending. Default is False.
        custom_timestamp_format (str | None): A custom format for the timestamp, if None uses default format. Default is None.
        random_hash_digits (int): Number of random hash digits to append to filename. Default is 0.
        verbose (bool): If True, prints the file path after saving. Default is True.

    Returns:
        bool: True if the text was successfully saved.
    """
    file_path = SysUtil.create_path(
        directory=directory,
        filename=filename,
        timestamp=timestamp,
        dir_exist_ok=dir_exist_ok,
        time_prefix=time_prefix,
        custom_timestamp_format=custom_timestamp_format,
        random_hash_digits=random_hash_digits,
    )

    # Write as UTF-8 explicitly: the platform default encoding can fail on
    # (or mis-encode) non-ASCII text on non-UTF-8 locales.
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(text)

    if verbose:
        print(f"Text saved to: {file_path}")

    return True
|