lionagi 0.0.314__py3-none-any.whl → 0.0.315__py3-none-any.whl
- lionagi/__init__.py +3 -0
- lionagi/core/direct/cot.py +1 -0
- lionagi/core/direct/plan.py +0 -0
- lionagi/core/direct/react.py +1 -1
- lionagi/core/direct/select.py +1 -3
- lionagi/core/direct/sentiment.py +1 -0
- lionagi/integrations/bridge/llamaindex_/node_parser.py +6 -9
- lionagi/integrations/chunker/__init__.py +0 -0
- lionagi/integrations/chunker/chunk.py +175 -0
- lionagi/integrations/loader/__init__.py +0 -0
- lionagi/integrations/loader/load.py +152 -0
- lionagi/integrations/loader/load_util.py +266 -0
- lionagi/version.py +1 -1
- {lionagi-0.0.314.dist-info → lionagi-0.0.315.dist-info}/METADATA +1 -1
- {lionagi-0.0.314.dist-info → lionagi-0.0.315.dist-info}/RECORD +18 -10
- {lionagi-0.0.314.dist-info → lionagi-0.0.315.dist-info}/LICENSE +0 -0
- {lionagi-0.0.314.dist-info → lionagi-0.0.315.dist-info}/WHEEL +0 -0
- {lionagi-0.0.314.dist-info → lionagi-0.0.315.dist-info}/top_level.txt +0 -0
lionagi/__init__.py
CHANGED
@@ -8,6 +8,9 @@ from dotenv import load_dotenv
 
 from .core import direct, Branch, Session, func_to_tool
 from .integrations.provider.services import Services
+from .integrations.chunker.chunk import chunk
+from .integrations.loader.load import load
+
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
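The three added lines export chunk and load at the package root. A minimal usage sketch of the new top-level API (the directory path and extensions are illustrative; the default loader path assumes llama-index is installed):

    from lionagi import chunk, load

    # load() defaults to ReaderType.LLAMAINDEX with "SimpleDirectoryReader",
    # returning DataNode objects because to_datanode defaults to True.
    docs = load(input_dir="data/docs", required_exts=[".txt"])

    # chunk() defaults to ChunkerType.PLAIN; "text_chunker" selects the
    # built-in character chunker defined in chunk.py below.
    nodes = chunk(docs, "text_chunker")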
lionagi/core/direct/cot.py ADDED
@@ -0,0 +1 @@
+# TODO: chain of thoughts
lionagi/core/direct/plan.py ADDED
File without changes
lionagi/core/direct/react.py
CHANGED
lionagi/core/direct/select.py
CHANGED
@@ -39,9 +39,7 @@ class SelectTemplate(ScoredTemplate):
     answer: Enum | str = Field(
         default_factory=str, description="selection from given choices"
     )
-    choices: list = Field(
-        default_factory=list, description="the given choices"
-    )
+    choices: list = Field(default_factory=list, description="the given choices")
 
     signature: str = "sentence -> answer"
 
lionagi/core/direct/sentiment.py ADDED
@@ -0,0 +1 @@
+# TODO: sentiment analysis
lionagi/integrations/bridge/llamaindex_/node_parser.py CHANGED
@@ -29,19 +29,18 @@ def get_llama_index_node_parser(node_parser: Any):
     import llama_index.core.node_parser
 
     if not isinstance(node_parser, str) and not issubclass(node_parser, NodeParser):
-        raise TypeError(
+        raise TypeError("node_parser must be a string or NodeParser.")
 
     if isinstance(node_parser, str):
         if node_parser == "CodeSplitter":
             SysUtil.check_import("tree_sitter_languages")
 
         try:
-            parser = getattr(llama_index.core.node_parser, node_parser)
-            return parser
+            return getattr(llama_index.core.node_parser, node_parser)
         except Exception as e:
             raise AttributeError(
                 f"llama_index_core has no such attribute:" f" {node_parser}, Error: {e}"
-            )
+            ) from e
 
     elif isinstance(node_parser, NodeParser):
         return node_parser
@@ -75,10 +74,8 @@ def llama_index_parse_node(
         parser = get_llama_index_node_parser(node_parser)
         try:
             parser = parser(*parser_args, **parser_kwargs)
-        except:
+        except Exception:
             parser = parser.from_defaults(*parser_args, **parser_kwargs)
-        nodes = parser.get_nodes_from_documents(documents)
-        return nodes
-
+        return parser.get_nodes_from_documents(documents)
     except Exception as e:
-        raise ValueError(f"Failed to parse. Error: {e}")
+        raise ValueError(f"Failed to parse. Error: {e}") from e
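The rewrite resolves string names with a single getattr and chains the raised exceptions. A sketch of a direct call, assuming llama-index is installed, with SentenceSplitter as the resolved parser (the sample document and chunk size are illustrative):

    from llama_index.core import Document
    from lionagi.integrations.bridge.llamaindex_.node_parser import (
        llama_index_parse_node,
    )

    docs = [Document(text="one sentence. another sentence. " * 100)]
    # "SentenceSplitter" is looked up on llama_index.core.node_parser via
    # getattr, instantiated with the kwargs, then applied to the documents.
    nodes = llama_index_parse_node(docs, "SentenceSplitter", [], {"chunk_size": 512})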
lionagi/integrations/chunker/__init__.py ADDED
File without changes
lionagi/integrations/chunker/chunk.py ADDED
@@ -0,0 +1,175 @@
+from typing import Union, Callable
+
+from lionagi.libs import func_call
+from lionagi.core.schema import DataNode
+from ..bridge.langchain_.langchain_bridge import LangchainBridge
+from ..bridge.llamaindex_.llama_index_bridge import LlamaIndexBridge
+
+
+from ..loader.load_util import ChunkerType, file_to_chunks, _datanode_parser
+
+
+def datanodes_convert(documents, chunker_type):
+
+    for i in range(len(documents)):
+        if type(documents[i]) == DataNode:
+            if chunker_type == ChunkerType.LLAMAINDEX:
+                documents[i] = documents[i].to_llama_index()
+            elif chunker_type == ChunkerType.LANGCHAIN:
+                documents[i] = documents[i].to_langchain()
+    return documents
+
+
+def text_chunker(documents, args, kwargs):
+
+    def chunk_node(node):
+        chunks = file_to_chunks(node.to_dict(), *args, **kwargs)
+        func_call.lcall(chunks, lambda chunk: chunk.pop("node_id"))
+        return [DataNode.from_obj({**chunk}) for chunk in chunks]
+
+    return [chunk_node(doc) for doc in documents]
+
+
+def chunk(
+    documents,
+    chunker,
+    chunker_type=ChunkerType.PLAIN,
+    chunker_args=None,
+    chunker_kwargs=None,
+    chunking_kwargs=None,
+    documents_convert_func=None,
+    to_datanode: bool | Callable = True,
+):
+
+    if chunker_args is None:
+        chunker_args = []
+    if chunker_kwargs is None:
+        chunker_kwargs = {}
+    if chunking_kwargs is None:
+        chunking_kwargs = {}
+
+    if chunker_type == ChunkerType.PLAIN:
+        return chunk_funcs[ChunkerType.PLAIN](
+            documents, chunker, chunker_args, chunker_kwargs
+        )
+
+    elif chunker_type == ChunkerType.LANGCHAIN:
+        return chunk_funcs[ChunkerType.LANGCHAIN](
+            documents,
+            documents_convert_func,
+            chunker,
+            chunker_args,
+            chunker_kwargs,
+            to_datanode,
+        )
+
+    elif chunker_type == ChunkerType.LLAMAINDEX:
+        return chunk_funcs[ChunkerType.LLAMAINDEX](
+            documents,
+            documents_convert_func,
+            chunker,
+            chunker_args,
+            chunker_kwargs,
+            to_datanode,
+        )
+
+    elif chunker_type == ChunkerType.SELFDEFINED:
+        return chunk_funcs[ChunkerType.SELFDEFINED](
+            documents,
+            chunker,
+            chunker_args,
+            chunker_kwargs,
+            chunking_kwargs,
+            to_datanode,
+        )
+
+    else:
+        raise ValueError(
+            f"{chunker_type} is not supported. Please choose from {list(ChunkerType)}"
+        )
+
+
+def _self_defined_chunker(
+    documents,
+    chunker,
+    chunker_args,
+    chunker_kwargs,
+    chunking_kwargs,
+    to_datanode: bool | Callable,
+):
+    try:
+        splitter = chunker(*chunker_args, **chunker_kwargs)
+        nodes = splitter.split(documents, **chunking_kwargs)
+    except Exception as e:
+        raise ValueError(
+            f"Self defined chunker {chunker} is not valid. Error: {e}"
+        ) from e
+
+    if isinstance(to_datanode, bool) and to_datanode is True:
+        raise ValueError("Please define a valid parser to DataNode.")
+    elif isinstance(to_datanode, Callable):
+        nodes = _datanode_parser(nodes, to_datanode)
+    return nodes
+
+
+def _llama_index_chunker(
+    documents,
+    documents_convert_func,
+    chunker,
+    chunker_args,
+    chunker_kwargs,
+    to_datanode: bool | Callable,
+):
+    if documents_convert_func:
+        documents = documents_convert_func(documents, "llama_index")
+    nodes = LlamaIndexBridge.llama_index_parse_node(
+        documents, chunker, chunker_args, chunker_kwargs
+    )
+
+    if isinstance(to_datanode, bool) and to_datanode is True:
+        nodes = [DataNode.from_llama_index(i) for i in nodes]
+    elif isinstance(to_datanode, Callable):
+        nodes = _datanode_parser(nodes, to_datanode)
+    return nodes
+
+
+def _langchain_chunker(
+    documents,
+    documents_convert_func,
+    chunker,
+    chunker_args,
+    chunker_kwargs,
+    to_datanode: bool | Callable,
+):
+    if documents_convert_func:
+        documents = documents_convert_func(documents, "langchain")
+    nodes = LangchainBridge.langchain_text_splitter(
+        documents, chunker, chunker_args, chunker_kwargs
+    )
+    if isinstance(to_datanode, bool) and to_datanode is True:
+        if isinstance(documents, str):
+            nodes = [DataNode(content=i) for i in nodes]
+        else:
+            nodes = [DataNode.from_langchain(i) for i in nodes]
+    elif isinstance(to_datanode, Callable):
+        nodes = _datanode_parser(nodes, to_datanode)
+    return nodes
+
+
+def _plain_chunker(documents, chunker, chunker_args, chunker_kwargs):
+    try:
+        if chunker == "text_chunker":
+            chunker = text_chunker
+        return chunker(documents, chunker_args, chunker_kwargs)
+    except Exception as e:
+        raise ValueError(
+            f"Reader {chunker} is currently not supported. Error: {e}"
+        ) from e
+
+
+chunk_funcs = {
+    ChunkerType.PLAIN: _plain_chunker,
+    ChunkerType.LANGCHAIN: _langchain_chunker,
+    ChunkerType.LLAMAINDEX: _llama_index_chunker,
+    ChunkerType.SELFDEFINED: _self_defined_chunker,
+}
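The new chunk() entry point is a thin dispatcher: chunker_type selects one of the four implementations in the chunk_funcs table, and to_datanode controls whether results are normalized back to DataNode objects. A hedged sketch of the two built-in paths (the sample text and chunk size are illustrative; the llama_index path assumes llama-index is installed):

    from lionagi.core.schema import DataNode
    from lionagi.integrations.chunker.chunk import chunk, datanodes_convert
    from lionagi.integrations.loader.load_util import ChunkerType

    docs = [DataNode(content="lorem ipsum dolor sit amet " * 200)]

    # Plain path: the string "text_chunker" maps to the built-in text_chunker.
    plain_nodes = chunk(docs, "text_chunker")

    # LlamaIndex path: DataNodes are converted first, then split via the
    # bridge; "SentenceSplitter" is resolved by node_parser.py above.
    li_nodes = chunk(
        docs,
        "SentenceSplitter",
        chunker_type=ChunkerType.LLAMAINDEX,
        chunker_kwargs={"chunk_size": 512},
        documents_convert_func=datanodes_convert,
    )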
lionagi/integrations/loader/__init__.py ADDED
File without changes
lionagi/integrations/loader/load.py ADDED
@@ -0,0 +1,152 @@
+from typing import Callable
+
+from lionagi.core.schema import DataNode
+from ..bridge.langchain_.langchain_bridge import LangchainBridge
+from ..bridge.llamaindex_.llama_index_bridge import LlamaIndexBridge
+
+from .load_util import dir_to_nodes, ReaderType, _datanode_parser
+
+
+def text_reader(args, kwargs):
+    """
+    Reads text files from a directory and converts them to DataNode instances.
+
+    Args:
+        args: Positional arguments for the dir_to_nodes function.
+        kwargs: Keyword arguments for the dir_to_nodes function.
+
+    Returns:
+        A list of DataNode instances.
+
+    Example usage:
+        >>> args = ['path/to/text/files']
+        >>> kwargs = {'file_extension': 'txt'}
+        >>> nodes = text_reader(args, kwargs)
+    """
+    return dir_to_nodes(*args, **kwargs)
+
+
+def load(
+    reader: str | Callable = "SimpleDirectoryReader",
+    input_dir=None,
+    input_files=None,
+    recursive: bool = False,
+    required_exts: list[str] = None,
+    reader_type=ReaderType.LLAMAINDEX,
+    reader_args=None,
+    reader_kwargs=None,
+    load_args=None,
+    load_kwargs=None,
+    to_datanode: bool | Callable = True,
+):
+
+    if reader_args is None:
+        reader_args = []
+    if reader_kwargs is None:
+        reader_kwargs = {}
+    if load_args is None:
+        load_args = []
+    if load_kwargs is None:
+        load_kwargs = {}
+
+    if reader_type == ReaderType.PLAIN:
+        return read_funcs[ReaderType.PLAIN](reader, reader_args, reader_kwargs)
+
+    if reader_type == ReaderType.LANGCHAIN:
+        return read_funcs[ReaderType.LANGCHAIN](
+            reader, reader_args, reader_kwargs, to_datanode
+        )
+
+    elif reader_type == ReaderType.LLAMAINDEX:
+        if input_dir is not None:
+            reader_kwargs["input_dir"] = input_dir
+        if input_files is not None:
+            reader_kwargs["input_files"] = input_files
+        if recursive:
+            reader_kwargs["recursive"] = True
+        if required_exts is not None:
+            reader_kwargs["required_exts"] = required_exts
+
+        return read_funcs[ReaderType.LLAMAINDEX](
+            reader, reader_args, reader_kwargs, load_args, load_kwargs, to_datanode
+        )
+
+    elif reader_type == ReaderType.SELFDEFINED:
+        return read_funcs[ReaderType.SELFDEFINED](
+            reader, reader_args, reader_kwargs, load_args, load_kwargs, to_datanode
+        )
+
+    else:
+        raise ValueError(
+            f"{reader_type} is not supported. Please choose from {list(ReaderType)}"
+        )
+
+
+def _plain_reader(reader, reader_args, reader_kwargs):
+    try:
+        if reader == "text_reader":
+            reader = text_reader
+        return reader(reader_args, reader_kwargs)
+    except Exception as e:
+        raise ValueError(
+            f"Reader {reader} is currently not supported. Error: {e}"
+        ) from e
+
+
+def _langchain_reader(reader, reader_args, reader_kwargs, to_datanode: bool | Callable):
+    nodes = LangchainBridge.langchain_loader(reader, reader_args, reader_kwargs)
+    if isinstance(to_datanode, bool) and to_datanode is True:
+        nodes = [DataNode.from_langchain(i) for i in nodes]
+
+    elif isinstance(to_datanode, Callable):
+        nodes = _datanode_parser(nodes, to_datanode)
+    return nodes
+
+
+def _llama_index_reader(
+    reader,
+    reader_args,
+    reader_kwargs,
+    load_args,
+    load_kwargs,
+    to_datanode: bool | Callable,
+):
+    nodes = LlamaIndexBridge.llama_index_read_data(
+        reader, reader_args, reader_kwargs, load_args, load_kwargs
+    )
+    if isinstance(to_datanode, bool) and to_datanode is True:
+        nodes = [DataNode.from_llama_index(i) for i in nodes]
+    elif isinstance(to_datanode, Callable):
+        nodes = _datanode_parser(nodes, to_datanode)
+    return nodes
+
+
+def _self_defined_reader(
+    reader,
+    reader_args,
+    reader_kwargs,
+    load_args,
+    load_kwargs,
+    to_datanode: bool | Callable,
+):
+    try:
+        loader = reader(*reader_args, **reader_kwargs)
+        nodes = loader.load(*load_args, **load_kwargs)
+    except Exception as e:
+        raise ValueError(
+            f"Self defined reader {reader} is not valid. Error: {e}"
+        ) from e
+
+    if isinstance(to_datanode, bool) and to_datanode is True:
+        raise ValueError("Please define a valid parser to DataNode.")
+    elif isinstance(to_datanode, Callable):
+        nodes = _datanode_parser(nodes, to_datanode)
+    return nodes
+
+
+read_funcs = {
+    ReaderType.PLAIN: _plain_reader,
+    ReaderType.LANGCHAIN: _langchain_reader,
+    ReaderType.LLAMAINDEX: _llama_index_reader,
+    ReaderType.SELFDEFINED: _self_defined_reader,
+}
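load() mirrors the chunk() dispatcher: reader_type picks an entry from read_funcs, and for the default ReaderType.LLAMAINDEX the input_dir, input_files, recursive, and required_exts arguments are folded into reader_kwargs before the bridge call. A sketch of both built-in paths (directory paths are illustrative; the default path requires llama-index):

    from lionagi.integrations.loader.load import load
    from lionagi.integrations.loader.load_util import ReaderType

    # Default: "SimpleDirectoryReader" through the llama_index bridge;
    # results come back as DataNode objects (to_datanode defaults to True).
    nodes = load(input_dir="data/docs", recursive=True, required_exts=[".md"])

    # Plain: "text_reader" wraps dir_to_nodes from load_util.py below.
    plain_nodes = load(
        reader="text_reader",
        reader_type=ReaderType.PLAIN,
        reader_args=["data/docs", ".txt"],
    )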
lionagi/integrations/loader/load_util.py ADDED
@@ -0,0 +1,266 @@
+# use utils and schema
+import math
+from enum import Enum
+from pathlib import Path
+from typing import List, Union, Dict, Any, Tuple
+
+from lionagi.libs import convert, func_call
+from lionagi.core.schema import DataNode
+
+
+class ReaderType(str, Enum):
+    PLAIN = "plain"
+    LANGCHAIN = "langchain"
+    LLAMAINDEX = "llama_index"
+    SELFDEFINED = "self_defined"
+
+
+class ChunkerType(str, Enum):
+    PLAIN = "plain"  # default
+    LANGCHAIN = "langchain"  # using langchain functions
+    LLAMAINDEX = "llama_index"  # using llamaindex functions
+    SELFDEFINED = "self_defined"  # create custom functions
+
+
+def dir_to_path(
+    dir: str, ext: str, recursive: bool = False, flatten: bool = True
+) -> List[Path]:
+    """
+    Generates a list of file paths from a directory with the given file extension.
+
+    Parameters:
+        dir (str): The directory to search for files.
+
+        ext (str): The file extension to filter by.
+
+        recursive (bool): Whether to search subdirectories recursively. Defaults to False.
+
+        flatten (bool): Whether to flatten the list. Defaults to True.
+
+    Returns:
+        List[Path]: A list of Paths to the files.
+
+    Raises:
+        ValueError: If the directory or extension is invalid.
+    """
+
+    def _dir_to_path(ext):
+        tem = "**/*" if recursive else "*"
+        return list(Path(dir).glob(tem + ext))
+
+    try:
+        return convert.to_list(
+            func_call.lcall(ext, _dir_to_path, flatten=True), flatten=flatten
+        )
+    except:
+        raise ValueError("Invalid directory or extension, please check the path")
+
+
+def dir_to_nodes(
+    dir: str,
+    ext: Union[List[str], str],
+    recursive: bool = False,
+    flatten: bool = True,
+    clean_text: bool = True,
+) -> List[DataNode]:
+    """
+    Converts directory contents into DataNode objects based on specified file extensions.
+
+    This function first retrieves a list of file paths from the specified directory, matching the given file extension. It then reads the content of these files, optionally cleaning the text, and converts each file's content into a DataNode object.
+
+    Parameters:
+        dir (str): The directory path from which to read files.
+        ext: The file extension(s) to include. Can be a single string or a list/tuple of strings.
+        recursive (bool, optional): If True, the function searches for files recursively in subdirectories. Defaults to False.
+        flatten (bool, optional): If True, flattens the directory structure in the returned paths. Defaults to True.
+        clean_text (bool, optional): If True, cleans the text read from files. Defaults to True.
+
+    Returns:
+        list: A list of DataNode objects created from the files in the specified directory.
+
+    Example:
+        nodes = dir_to_nodes("/path/to/dir", ".txt", recursive=True)
+        # This would read all .txt files in /path/to/dir and its subdirectories,
+        # converting them into DataNode objects.
+    """
+
+    path_list = dir_to_path(dir, ext, recursive, flatten)
+    files_info = func_call.lcall(path_list, read_text, clean=clean_text)
+    return func_call.lcall(files_info, lambda x: DataNode(content=x[0], metadata=x[1]))
+
+
+def chunk_text(
+    input: str, chunk_size: int, overlap: float, threshold: int
+) -> List[Union[str, None]]:
+    """
+    Chunks the input text into smaller parts, with optional overlap and threshold for final chunk.
+
+    Parameters:
+        input (str): The input text to chunk.
+
+        chunk_size (int): The size of each chunk.
+
+        overlap (float): The amount of overlap between chunks.
+
+        threshold (int): The minimum size of the final chunk.
+
+    Returns:
+        List[Union[str, None]]: A list of text chunks.
+
+    Raises:
+        ValueError: If an error occurs during chunking.
+    """
+
+    def _chunk_n1():
+        return [input]
+
+    def _chunk_n2():
+        chunks = []
+        chunks.append(input[: chunk_size + overlap_size])
+
+        if len(input) - chunk_size > threshold:
+            chunks.append(input[chunk_size - overlap_size :])
+        else:
+            return _chunk_n1()
+
+        return chunks
+
+    def _chunk_n3():
+        chunks = []
+        chunks.append(input[: chunk_size + overlap_size])
+        for i in range(1, n_chunks - 1):
+            start_idx = chunk_size * i - overlap_size
+            end_idx = chunk_size * (i + 1) + overlap_size
+            chunks.append(input[start_idx:end_idx])
+
+        if len(input) - chunk_size * (n_chunks - 1) > threshold:
+            chunks.append(input[chunk_size * (n_chunks - 1) - overlap_size :])
+        else:
+            chunks[-1] += input[chunk_size * (n_chunks - 1) + overlap_size :]
+
+        return chunks
+
+    try:
+        if not isinstance(input, str):
+            input = convert.to_str(input)
+
+        n_chunks = math.ceil(len(input) / chunk_size)
+        overlap_size = int(overlap / 2)
+
+        if n_chunks == 1:
+            return _chunk_n1()
+
+        elif n_chunks == 2:
+            return _chunk_n2()
+
+        elif n_chunks > 2:
+            return _chunk_n3()
+
+    except Exception as e:
+        raise ValueError(f"An error occurred while chunking the text. {e}")
+
+
+def read_text(filepath: str, clean: bool = True) -> Tuple[str, dict]:
+    """
+    Reads text from a file and optionally cleans it, returning the content and metadata.
+
+    Parameters:
+        filepath (str): The path to the file to read.
+
+        clean (bool): Whether to clean the text by replacing certain characters. Defaults to True.
+
+    Returns:
+        Tuple[str, dict]: A tuple containing the content and metadata of the file.
+
+    Raises:
+        FileNotFoundError: If the file cannot be found.
+
+        PermissionError: If there are permissions issues.
+
+        OSError: For other OS-related errors.
+    """
+
+    def _get_metadata():
+        import os
+        from datetime import datetime
+
+        file = filepath
+        size = os.path.getsize(filepath)
+        creation_date = datetime.fromtimestamp(os.path.getctime(filepath)).date()
+        modified_date = datetime.fromtimestamp(os.path.getmtime(filepath)).date()
+        last_accessed_date = datetime.fromtimestamp(os.path.getatime(filepath)).date()
+        return {
+            "file": convert.to_str(file),
+            "size": size,
+            "creation_date": str(creation_date),
+            "modified_date": str(modified_date),
+            "last_accessed_date": str(last_accessed_date),
+        }
+
+    try:
+        with open(filepath, "r") as f:
+            content = f.read()
+            if clean:
+                # Define characters to replace and their replacements
+                replacements = {"\\": " ", "\n": " ", "\t": " ", "  ": " ", "'": " "}
+                for old, new in replacements.items():
+                    content = content.replace(old, new)
+            metadata = _get_metadata()
+            return content, metadata
+    except Exception as e:
+        raise e
+
+
+def _file_to_chunks(
+    input: Dict[str, Any],
+    field: str = "content",
+    chunk_size: int = 1500,
+    overlap: float = 0.1,
+    threshold: int = 200,
+) -> List[Dict[str, Any]]:
+    try:
+        out = {key: value for key, value in input.items() if key != field} | {
+            "chunk_overlap": overlap,
+            "chunk_threshold": threshold,
+        }
+        chunks = chunk_text(
+            input[field], chunk_size=chunk_size, overlap=overlap, threshold=threshold
+        )
+        logs = []
+        for i, chunk in enumerate(chunks):
+            chunk_dict = out | {
+                "file_chunks": len(chunks),
+                "chunk_id": i + 1,
+                "chunk_size": len(chunk),
+                f"chunk_{field}": chunk,
+            }
+            logs.append(chunk_dict)
+
+        return logs
+
+    except Exception as e:
+        raise ValueError(f"An error occurred while chunking the file. {e}") from e
+
+
+# needs doing TODO
+def file_to_chunks(
+    input,
+    # project='project',
+    # output_dir='data/logs/sources/',
+    chunk_func=_file_to_chunks,
+    **kwargs,
+):
+    # out_to_csv=False,
+    # filename=None,
+    # verbose=True,
+    # timestamp=True,
+    # logger=None,
+    return convert.to_list(func_call.lcall(input, chunk_func, **kwargs), flatten=True)
+
+
+def _datanode_parser(nodes, parser):
+
+    try:
+        return parser(nodes)
+    except Exception as e:
+        raise ValueError(f"DataNode parser {parser} failed. Error:{e}") from e
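Note how chunk_text distributes overlap: the overlap value is halved into overlap_size, each interior chunk extends overlap_size characters into both neighbors, and a short remainder (at most threshold characters) is merged into the previous chunk. A small worked example of the three-chunk case:

    from lionagi.integrations.loader.load_util import chunk_text

    text = "abcdefghij" * 5  # 50 characters
    # n_chunks = ceil(50 / 20) = 3; overlap_size = int(10 / 2) = 5
    chunks = chunk_text(text, chunk_size=20, overlap=10, threshold=5)
    # chunk 1: text[:25]   -> chunk_size + overlap_size characters
    # chunk 2: text[15:45] -> extends overlap_size beyond the 20..40 window
    # chunk 3: text[35:]   -> remainder 50 - 40 = 10 > threshold, kept separate
    print([len(c) for c in chunks])  # [25, 30, 15]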
lionagi/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.0.314"
+__version__ = "0.0.315"
{lionagi-0.0.314.dist-info → lionagi-0.0.315.dist-info}/RECORD CHANGED
@@ -1,5 +1,5 @@
-lionagi/__init__.py,sha256=
-lionagi/version.py,sha256=
+lionagi/__init__.py,sha256=i6Ci7FebU2s4EVVnBFj1Dsi5RvP80JqeSqW-iripRPg,418
+lionagi/version.py,sha256=Zazlk4sxt5cxFTrUeqVNVrVkGcIAkFTm-b9a6VLDqkw,24
 lionagi/core/__init__.py,sha256=M5YXmJJiLcR5QB1VRmYvec14cHT6pKvxZOEs737BmP8,322
 lionagi/core/agent/__init__.py,sha256=IVcw9yn_QMBJGBou1Atck98Us9uwPGFs-gERTv0RWew,59
 lionagi/core/agent/base_agent.py,sha256=CRUpl7Zc5d2H9uCa17nMiFAnhKM_UH5Ujo1NHo3JAxg,3371
@@ -11,10 +11,13 @@ lionagi/core/branch/executable_branch.py,sha256=Yi0t4fDNMa5UaHo15sX-zBchr5auvXOt
 lionagi/core/branch/util.py,sha256=os7Qp7HpDfyyCvdkbBTyIQ3AYHfzUP0M684W4XMDHN4,11813
 lionagi/core/branch/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 lionagi/core/direct/__init__.py,sha256=P17UfY3tLFgu0ncxMy4FRoVDlvOGUc7jzeowN41akBk,188
+lionagi/core/direct/cot.py,sha256=3hz0CjFN2Bw5IW1tOh26fzd1UVrV_41KKIS7pzCd6ok,26
+lionagi/core/direct/plan.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 lionagi/core/direct/predict.py,sha256=tkxvN9m_XOf3SW8xTi5yanXylV8xVNRn9a8DeGd9xgs,6355
-lionagi/core/direct/react.py,sha256=
+lionagi/core/direct/react.py,sha256=IJ6sKgajCjhQ_UpJHf-j71tnVehEtIXFnyeB6bNlZwk,4196
 lionagi/core/direct/score.py,sha256=QHO11WtAUfMEdfa1K-SRyn5uqf6_N0UmyCbEJsiqcQw,10328
-lionagi/core/direct/select.py,sha256=
+lionagi/core/direct/select.py,sha256=pPwesq29C3JZ5J3piwjBHqjOCsEM4uChPKMGBRxtSTE,6127
+lionagi/core/direct/sentiment.py,sha256=rNwBs-I2XICOwsXxFvfM1Tlc_afsVcRCNCXCxfxm_2k,27
 lionagi/core/direct/utils.py,sha256=yqu4qv9aaU4qzUD9QovtN2m21QySzdMLmcBp5recWC0,2333
 lionagi/core/direct/vote.py,sha256=tjs-EYDGlGB3J6d_nSl1oIuJYHtxncjustBbU_pXDqQ,2449
 lionagi/core/flow/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -59,16 +62,21 @@ lionagi/integrations/bridge/langchain_/langchain_bridge.py,sha256=-lnJtyf4iJEwHK
 lionagi/integrations/bridge/llamaindex_/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 lionagi/integrations/bridge/llamaindex_/index.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 lionagi/integrations/bridge/llamaindex_/llama_index_bridge.py,sha256=SSnLSIu06xVsghTios01QKGrc2xCq3oC8fVNGO4QkF8,5064
-lionagi/integrations/bridge/llamaindex_/node_parser.py,sha256=
+lionagi/integrations/bridge/llamaindex_/node_parser.py,sha256=d8SPD6EMf9bZ6824jjeZOWmwm7BHBZQ0qGq1JnsKh9k,3458
 lionagi/integrations/bridge/llamaindex_/reader.py,sha256=VxdTk5h3a3_5RQzN15q75XGli52umhz9gLUrKk1Sg90,8235
 lionagi/integrations/bridge/llamaindex_/textnode.py,sha256=OszGitHZ36zbG4DCGWUnSV6EO7wChEH2VA5M50iBojs,2322
 lionagi/integrations/bridge/pydantic_/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 lionagi/integrations/bridge/pydantic_/pydantic_bridge.py,sha256=TVh7sQX_LKERUvv1nxsA2JICY1S6ptPr3qFqzgHfGCY,87
+lionagi/integrations/chunker/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+lionagi/integrations/chunker/chunk.py,sha256=huLtaLVzs2Py6F9tFHdU0o67JncOiX4WfmmmFccbcgI,5107
 lionagi/integrations/config/__init__.py,sha256=zzQGZe3H5vofcNWSjjoqe_gqHpCO8Yl7FefmrUpLqnw,133
 lionagi/integrations/config/mlx_configs.py,sha256=xbostqjnk3aAN-qKyC54YBprHPA38C8YDevXMMEHXWY,44
 lionagi/integrations/config/oai_configs.py,sha256=aoKx91Nv5eQU2F8v8EsALXQCEEfy3sfCgUYjCYEGJPU,2754
 lionagi/integrations/config/ollama_configs.py,sha256=Np73p86bTJtxYwAj3lr5l8V9IMu7rHJPdyzHEqyzI2Q,17
 lionagi/integrations/config/openrouter_configs.py,sha256=Sz4IHrriXoB8RQ0Pj23Q13Ps4AnZ0BWrh5DhL18NLwQ,1379
+lionagi/integrations/loader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+lionagi/integrations/loader/load.py,sha256=XYG93HzxKvnWIrPq-MotY6JTiuB_11rSADjOq_OdeiM,4582
+lionagi/integrations/loader/load_util.py,sha256=7VyoQ3z4kYscjqOGFYrhPq1U0yPIR9NHWCESfv2gmp4,8369
 lionagi/integrations/provider/__init__.py,sha256=MJhnq2tkBRcMH-3utc0G-Co20MmsxLBbp3fUwHrJGQ8,198
 lionagi/integrations/provider/litellm.py,sha256=l3sTtIPDeM_9soTLj9gpVfFWWDzFfIZ7rbVcuzeql2w,1181
 lionagi/integrations/provider/mistralai.py,sha256=G-StbfrnUcWZvl0eRby6CZYXxmJf6BRMFzDaix-brmU,7
@@ -106,8 +114,8 @@ lionagi/tests/test_libs/test_func_call.py,sha256=xvs19YBNxqh3RbWLjQXY19L06b1_uZY
 lionagi/tests/test_libs/test_nested.py,sha256=eEcE4BXJEkjoPZsd9-0rUxOJHjmu8W2hgVClUTwXEFY,13106
 lionagi/tests/test_libs/test_parse.py,sha256=aa74kfOoJwDU7L7-59EcgBGYc5-OtafPIP2oGTI3Zrk,6814
 lionagi/tests/test_libs/test_sys_util.py,sha256=Y-9jxLGxgbFNp78Z0PJyGUjRROMuRAG3Vo3i5LAH8Hs,7849
-lionagi-0.0.314.dist-info/LICENSE,sha256=
-lionagi-0.0.314.dist-info/METADATA,sha256=
-lionagi-0.0.314.dist-info/WHEEL,sha256=
-lionagi-0.0.314.dist-info/top_level.txt,sha256=
-lionagi-0.0.314.dist-info/RECORD,,
+lionagi-0.0.315.dist-info/LICENSE,sha256=vfczrx-xFNkybZ7Ef-lGUnA1Vorky6wL4kwb1Fd5o3I,1089
+lionagi-0.0.315.dist-info/METADATA,sha256=FVnSivifINUlYoYjEh7s01WKZ3h1Hn1AW_uKW3KfdLg,7934
+lionagi-0.0.315.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+lionagi-0.0.315.dist-info/top_level.txt,sha256=szvch_d2jE1Lu9ZIKsl26Ll6BGfYfbOgt5lm-UpFSo4,8
+lionagi-0.0.315.dist-info/RECORD,,
{lionagi-0.0.314.dist-info → lionagi-0.0.315.dist-info}/LICENSE
File without changes
{lionagi-0.0.314.dist-info → lionagi-0.0.315.dist-info}/WHEEL
File without changes
{lionagi-0.0.314.dist-info → lionagi-0.0.315.dist-info}/top_level.txt
File without changes