lionagi 0.0.103__py3-none-any.whl → 0.0.105__py3-none-any.whl
- lionagi/__init__.py +0 -1
- lionagi/api/__init__.py +3 -2
- lionagi/{config/llmconfig.py → api/oai_config.py} +2 -2
- lionagi/api/{OAIService.py → oai_service.py} +23 -13
- lionagi/session/__init__.py +0 -4
- lionagi/session/conversation.py +47 -43
- lionagi/session/message.py +102 -36
- lionagi/session/session.py +214 -72
- lionagi/tools/__init__.py +0 -0
- lionagi/utils/__init__.py +4 -5
- lionagi/utils/api_util.py +12 -20
- lionagi/utils/doc_util.py +38 -38
- lionagi/utils/sys_util.py +6 -3
- lionagi/utils/tool_util.py +194 -0
- lionagi/version.py +1 -1
- lionagi-0.0.105.dist-info/METADATA +311 -0
- lionagi-0.0.105.dist-info/RECORD +21 -0
- lionagi/config/__init__.py +0 -4
- lionagi/config/oaiconfig.py +0 -19
- lionagi-0.0.103.dist-info/METADATA +0 -97
- lionagi-0.0.103.dist-info/RECORD +0 -21
- {lionagi-0.0.103.dist-info → lionagi-0.0.105.dist-info}/LICENSE +0 -0
- {lionagi-0.0.103.dist-info → lionagi-0.0.105.dist-info}/WHEEL +0 -0
- {lionagi-0.0.103.dist-info → lionagi-0.0.105.dist-info}/top_level.txt +0 -0
lionagi/session/session.py
CHANGED
@@ -3,111 +3,226 @@ import asyncio
 from typing import Any
 
 from .conversation import Conversation
-from ..
+from ..utils.sys_util import to_list
 from ..utils.log_util import DataLogger
 from ..utils.api_util import StatusTracker
-from ..
+from ..utils.tool_util import ToolManager
+from ..api.oai_service import OpenAIService
+
+from ..api.oai_config import oai_llmconfig
+
 
 status_tracker = StatusTracker()
+OAIService = OpenAIService()
 
 class Session():
     """
-    A class representing a conversation session with
+    A class representing a conversation session with a conversational AI system.
 
-    This class manages
-    and logs the interactions using a DataLogger.
+    This class manages the flow of conversation, system settings, and interactions with external tools.
 
     Attributes:
-        conversation: An instance of the Conversation class
-        system: The system
-        llmconfig: Configuration
-
-        api_service: An instance of the API service for making
+        conversation (Conversation): An instance of the Conversation class to manage messages.
+        system (str): The current system setting for the conversation.
+        llmconfig (dict): Configuration settings for the language model.
+        _logger (DataLogger): An instance of the DataLogger class for logging conversation details.
+        api_service: An instance of the API service for making calls to the conversational AI model.
+        toolmanager (ToolManager): An instance of the ToolManager class for managing external tools.
 
     Methods:
-
-
-
-
+        set_dir(dir):
+            Set the directory for logging.
+
+        set_system(system):
+            Set the system for the conversation.
+
+        set_llmconfig(llmconfig):
+            Set the language model configuration.
+
+        set_api_service(api_service):
+            Set the API service for making model calls.
+
+        _output(output, invoke=True, out=True) -> Any:
+            Process the output, invoke tools if needed, and optionally return the output.
+
+        register_tools(tools, funcs, update=False, new=False, prefix=None, postfix=None):
+            Register tools and their corresponding functions.
+
+        initiate(instruction, system=None, context=None, out=True, name=None, invoke=True, **kwargs) -> Any:
+            Start a new conversation session with the provided instruction.
+
+        followup(instruction, system=None, context=None, out=True, name=None, invoke=True, **kwargs) -> Any:
+            Continue the conversation with the provided instruction.
+
+        create_payload_chatcompletion(**kwargs) -> dict:
+            Create a payload for chat completion based on the conversation state and configuration.
+
+        call_chatcompletion(sleep=0.1, **kwargs) -> None:
+            Make a call to the chat completion API and process the response.
+
+        messages_to_csv(dir=None, filename="_messages.csv", **kwargs) -> None:
+            Save conversation messages to a CSV file.
+
+        log_to_csv(dir=None, filename="_llmlog.csv", **kwargs) -> None:
+            Save conversation logs to a CSV file.
     """
 
-    def __init__(self, system, dir=None, llmconfig=
+    def __init__(self, system, dir=None, llmconfig=oai_llmconfig, api_service=OAIService):
         """
-        Initialize a Session object.
+        Initialize a Session object with default or provided settings.
 
-
-            system: The system
-            dir: The directory for logging
-            llmconfig: Configuration
-            api_service: An instance of the API service for making
+        Parameters:
+            system (str): The initial system setting for the conversation.
+            dir (Optional[str]): The directory for logging. Default is None.
+            llmconfig (Optional[dict]): Configuration settings for the language model. Default is oai_llmconfig.
+            api_service: An instance of the API service for making calls to the conversational AI model.
         """
         self.conversation = Conversation()
         self.system = system
         self.llmconfig = llmconfig
-        self.
+        self._logger = DataLogger(dir=dir)
+        self.api_service = api_service
+        self.toolmanager = ToolManager()
+
+    def set_dir(self, dir):
+        """
+        Set the directory for logging.
+
+        Parameters:
+            dir (str): The directory path.
+        """
+        self._logger.dir = dir
+
+    def set_system(self, system):
+        """
+        Set the system for the conversation.
+
+        Parameters:
+            system (str): The system setting.
+        """
+        self.conversation.change_system(system)
+
+    def set_llmconfig(self, llmconfig):
+        """
+        Set the language model configuration.
+
+        Parameters:
+            llmconfig (dict): Configuration settings for the language model.
+        """
+        self.llmconfig = llmconfig
+
+    def set_api_service(self, api_service):
+        """
+        Set the API service for making model calls.
+
+        Parameters:
+            api_service: An instance of the API service.
+        """
         self.api_service = api_service
 
-    async def
+    async def _output(self, output, invoke=True, out=True):
         """
-
+        Process the output, invoke tools if needed, and optionally return the output.
 
-
-
-
-
-            out: Whether to return the output content.
+        Parameters:
+            output: The output to process.
+            invoke (bool): Whether to invoke tools based on the output. Default is True.
+            out (bool): Whether to return the output. Default is True.
 
         Returns:
-            Any: The output
+            Any: The processed output.
+        """
+        if invoke:
+            try:
+                func, args = self.toolmanager._get_function_call(output)
+                outs = await self.toolmanager.ainvoke(func, args)
+                self.conversation.add_messages(tool=outs)
+            except:
+                pass
+        if out:
+            return output
+
+    def register_tools(self, tools, funcs, update=False, new=False, prefix=None, postfix=None):
+        """
+        Register tools and their corresponding functions.
+
+        Parameters:
+            tools (list): The list of tool information dictionaries.
+            funcs (list): The list of corresponding functions.
+            update (bool): Whether to update existing functions.
+            new (bool): Whether to create new registries for existing functions.
+            prefix (Optional[str]): A prefix to add to the function names.
+            postfix (Optional[str]): A postfix to add to the function names.
+        """
+        funcs = to_list(funcs)
+        self.toolmanager.register_tools(tools, funcs, update, new, prefix, postfix)
+
+    async def initiate(self, instruction, system=None, context=None, out=True, name=None, invoke=True, **kwargs) -> Any:
+        """
+        Start a new conversation session with the provided instruction.
+
+        Parameters:
+            instruction (str): The instruction to initiate the conversation.
+            system (Optional[str]): The system setting for the conversation. Default is None.
+            context (Optional[dict]): Additional context for the instruction. Default is None.
+            out (bool): Whether to return the output. Default is True.
+            name (Optional[str]): The name associated with the instruction. Default is None.
+            invoke (bool): Whether to invoke tools based on the output. Default is True.
+            kwargs: Additional keyword arguments for configuration.
+
+        Returns:
+            Any: The processed output.
         """
         config = {**self.llmconfig, **kwargs}
         system = system or self.system
-        self.conversation.initiate_conversation(system=system, instruction=instruction, context=context)
-
+        self.conversation.initiate_conversation(system=system, instruction=instruction, context=context, name=name)
         await self.call_chatcompletion(**config)
-
-
+        output = self.conversation.responses[-1]['content']
+
+        return await self._output(output, invoke, out)
 
-    async def followup(self, instruction, system=None, context=None, out=True, **kwargs) -> Any:
+    async def followup(self, instruction, system=None, context=None, out=True, name=None, invoke=True, **kwargs) -> Any:
         """
-        Continue the conversation
+        Continue the conversation with the provided instruction.
 
-
-            instruction: The
-            system: The
-            context: Additional context for the
-            out: Whether to return the output
+        Parameters:
+            instruction (str): The instruction to continue the conversation.
+            system (Optional[str]): The system setting for the conversation. Default is None.
+            context (Optional[dict]): Additional context for the instruction. Default is None.
+            out (bool): Whether to return the output. Default is True.
+            name (Optional[str]): The name associated with the instruction. Default is None.
+            invoke (bool): Whether to invoke tools based on the output. Default is True.
+            kwargs: Additional keyword arguments for configuration.
 
         Returns:
-            Any: The output
+            Any: The processed output.
         """
-        self.conversation.append_last_response()
         if system:
             self.conversation.change_system(system)
-        self.conversation.add_messages(instruction=instruction, context=context)
-
+        self.conversation.add_messages(instruction=instruction, context=context, name=name)
         config = {**self.llmconfig, **kwargs}
         await self.call_chatcompletion(**config)
-
-
+        output = self.conversation.responses[-1]['content']
+
+        return await self._output(output, invoke, out)
 
     def create_payload_chatcompletion(self, **kwargs):
         """
-        Create a payload for chat completion
+        Create a payload for chat completion based on the conversation state and configuration.
 
-
-            kwargs: Additional keyword arguments for
+        Parameters:
+            kwargs: Additional keyword arguments for configuration.
+
+        Returns:
+            dict: The payload for chat completion.
         """
         # currently only openai chat completions are supported
         messages = self.conversation.messages
-        request_url = f"https://api.openai.com/v1/chat/completions"
         config = {**self.llmconfig, **kwargs}
-
         payload = {
             "messages": messages,
             "model": config.get('model'),
             "frequency_penalty": config.get('frequency_penalty'),
-            "max_tokens": config.get('max_tokens'),
             "n": config.get('n'),
             "presence_penalty": config.get('presence_penalty'),
             "response_format": config.get('response_format'),
@@ -115,35 +230,62 @@ class Session():
             "top_p": config.get('top_p'),
         }
 
-        for key in ["seed", "stop", "stream", "tools", "tool_choice", "user"]:
-            if config[key] is True:
+        for key in ["seed", "stop", "stream", "tools", "tool_choice", "user", "max_tokens"]:
+            if bool(config[key]) is True and str(config[key]) != "none":
                 payload.update({key: config[key]})
-
-
-
-    async def call_chatcompletion(self, delay=1, **kwargs):
+        return payload
+
+    async def call_chatcompletion(self, sleep=0.1, **kwargs):
         """
-        Make
+        Make a call to the chat completion API and process the response.
 
-
-
-            kwargs: Additional keyword arguments for
+        Parameters:
+            sleep (float): The sleep duration after making the API call. Default is 0.1.
+            kwargs: Additional keyword arguments for configuration.
         """
-
-        payload, request_url = self.create_payload_chatcompletion(**kwargs)
+        endpoint = f"chat/completions"
         try:
             async with aiohttp.ClientSession() as session:
-
+                payload = self.create_payload_chatcompletion(**kwargs)
+                completion = await self.api_service.call_api(
+                    session, endpoint, payload)
                 if "choices" in completion:
-
-                    self.
-
-                    self.conversation.responses.append(response)
+                    self._logger({"input":payload, "output": completion})
+                    self.conversation.add_messages(response=completion['choices'][0])
+                    self.conversation.responses.append(self.conversation.messages[-1])
                     self.conversation.response_counts += 1
-                    await asyncio.sleep(
+                    await asyncio.sleep(sleep)
                     status_tracker.num_tasks_succeeded += 1
                 else:
                     status_tracker.num_tasks_failed += 1
         except Exception as e:
             status_tracker.num_tasks_failed += 1
-            raise e
+            raise e
+
+    def messages_to_csv(self, dir=None, filename="_messages.csv", **kwags):
+        """
+        Save conversation messages to a CSV file.
+
+        Parameters:
+            dir (Optional[str]): The directory path for saving the CSV file. Default is None.
+            filename (Optional[str]): The filename for the CSV file. Default is "_messages.csv".
+            kwargs: Additional keyword arguments for CSV file settings.
+        """
+        dir = dir or self._logger.dir
+        if dir is None:
+            raise ValueError("No directory specified.")
+        self.conversation.msg.to_csv(dir=dir, filename=filename, **kwags)
+
+    def log_to_csv(self, dir=None, filename="_llmlog.csv", **kwags):
+        """
+        Save conversation logs to a CSV file.
+
+        Parameters:
+            dir (Optional[str]): The directory path for saving the CSV file. Default is None.
+            filename (Optional[str]): The filename for the CSV file. Default is "_llmlog.csv".
+            kwargs: Additional keyword arguments for CSV file settings.
+        """
+        dir = dir or self._logger.dir
+        if dir is None:
+            raise ValueError("No directory specified.")
+        self._logger.to_csv(dir=dir, filename=filename, **kwags)
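Taken together, the session.py changes swap the hard-coded request URL for an injected OpenAIService and route every model reply through _output, which executes any tool call via ToolManager before returning the content. Below is a minimal usage sketch inferred from the signatures in this diff; the import path, the tool schema, and the multiply function are illustrative assumptions rather than documented API, and the default OAIService is assumed to pick up an OpenAI key from the environment.

import asyncio
from lionagi.session.session import Session  # path assumed from the file layout above

# Hypothetical tool: a plain function plus an OpenAI-style schema for it.
def multiply(a, b):
    return a * b

tool = {
    "type": "function",
    "function": {
        "name": "multiply",
        "description": "Multiply two numbers.",
        "parameters": {
            "type": "object",
            "properties": {"a": {"type": "number"}, "b": {"type": "number"}},
            "required": ["a", "b"],
        },
    },
}

async def main():
    session = Session(system="You are a helpful assistant.")
    session.register_tools([tool], [multiply])  # funcs are wrapped with to_list internally
    # Extra kwargs flow into create_payload_chatcompletion, so "tools" lands in the payload.
    answer = await session.initiate("What is 3 times 7?", tools=[tool])
    print(answer)
    answer = await session.followup("Now add 5 to that.")
    print(answer)

asyncio.run(main())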
lionagi/utils/__init__.py
CHANGED
@@ -1,11 +1,10 @@
-from .sys_util import to_flat_dict, append_to_jsonl, to_list, str_to_num, make_copy, to_temp, to_csv, hold_call, ahold_call, l_call, al_call, m_call, am_call, e_call, ae_call, get_timestamp,
-from .api_util import StatusTracker, RateLimiter, BaseAPIService, AsyncQueue
+from .sys_util import to_flat_dict, append_to_jsonl, to_list, str_to_num, make_copy, to_temp, to_csv, hold_call, ahold_call, l_call, al_call, m_call, am_call, e_call, ae_call, get_timestamp, create_path
 from .doc_util import dir_to_path, read_text, dir_to_files, chunk_text, file_to_chunks, file_to_chunks, get_bins
 from .log_util import DataLogger
+from .tool_util import ToolManager
 
 __all__ = [
-    "to_list", "str_to_num", "make_copy", "to_temp", "to_csv", "hold_call", "ahold_call", "l_call", "al_call", "m_call", "am_call", "e_call", "ae_call", "get_timestamp", "
-    "StatusTracker", "RateLimiter", "BaseAPIService", "AsyncQueue",
+    "to_list", "str_to_num", "make_copy", "to_temp", "to_csv", "hold_call", "ahold_call", "l_call", "al_call", "m_call", "am_call", "e_call", "ae_call", "get_timestamp", "create_path", "to_flat_dict", "append_to_jsonl",
     "dir_to_path", "read_text", "dir_to_files", "chunk_text", "file_to_chunks", "file_to_chunks", "get_bins",
-    "DataLogger"
+    "DataLogger", "ToolManager"
 ]
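The net effect on the package's public surface: create_path, to_flat_dict, and append_to_jsonl are now named in __all__, ToolManager is re-exported, and the api_util names no longer ride along with lionagi.utils. A quick import check, assuming 0.0.105 is installed:

# Re-exported from lionagi.utils as of 0.0.105, per the diff above.
from lionagi.utils import ToolManager, create_path, to_flat_dict, append_to_jsonl

# StatusTracker, RateLimiter, BaseAPIService, and AsyncQueue must now be
# imported from their home module instead of the utils namespace.
from lionagi.utils.api_util import StatusTracker, RateLimiter, BaseAPIService, AsyncQueue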
lionagi/utils/api_util.py
CHANGED
@@ -215,7 +215,8 @@ class RateLimiter(ABC):
         ...
         >>> limiter = MyRateLimiter(100, 200)
         """
-
+
+        ...
 
     @abstractmethod
     def calculate_num_token(self, payload: Dict[str, Any], api_endpoint: str) -> int:
@@ -241,7 +242,8 @@ class RateLimiter(ABC):
         >>> limiter.calculate_num_token({'data': '12345'}, 'api/send')
         0
         """
-
+
+        ...
 
 class BaseAPIService(ABC):
     """
@@ -272,9 +274,11 @@ class BaseAPIService(ABC):
         api_key: str,
         token_encoding_name: str,
         max_attempts: int,
-
-
-
+        max_requests_per_minute: int,
+        max_tokens_per_minute: int,
+        ratelimiter,
+        status_tracker: Optional[StatusTracker] = None,
+        queue: Optional[AsyncQueue] = None,
     ) -> None:
         """
         Initializes the BaseAPIService with necessary configuration.
@@ -299,8 +303,8 @@ class BaseAPIService(ABC):
         self.token_encoding_name = token_encoding_name
         self.max_attempts = max_attempts
         self.status_tracker = status_tracker or StatusTracker()
-        self.rate_limiter = rate_limiter
         self.queue = queue or AsyncQueue()
+        self.rate_limiter = ratelimiter(max_requests_per_minute, max_tokens_per_minute)
 
     @abstractmethod
     async def call_api(self) -> Any:
@@ -316,7 +320,8 @@ class BaseAPIService(ABC):
         ... # Implementation details here
         ...
         """
-
+
+        ...
 
     def handle_error(
         self,
@@ -346,19 +351,6 @@ class BaseAPIService(ABC):
         self.append_to_jsonl(data, save_filepath)
         logging.error(f"Request failed after all attempts. Saving errors: {data}")
 
-    @staticmethod
-    def append_to_jsonl(data: Any, filename: str) -> None:
-        """
-        Appends the given data to the specified JSONL file.
-
-        Args:
-            data (Any): The data to be appended in JSON Lines format.
-            filename (str): The file path to the JSONL file.
-        """
-        json_string = json.dumps(data)
-        with open(filename, "a") as f:
-            f.write(json_string + "\n")
-
     @staticmethod
     def api_endpoint_from_url(request_url: str) -> str:
         """
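The constructor is the substantive change here: BaseAPIService now takes a rate-limiter class (the ratelimiter argument) along with the two per-minute limits and instantiates the limiter itself, where 0.0.103 expected a pre-built rate_limiter instance. A standalone sketch of that pattern, using illustrative names rather than lionagi's actual classes:

# Sketch of the injection pattern adopted by BaseAPIService: the caller
# supplies a limiter *class* plus limits, and the service owns construction.
class DemoRateLimiter:
    def __init__(self, max_requests_per_minute: int, max_tokens_per_minute: int):
        self.max_requests_per_minute = max_requests_per_minute
        self.max_tokens_per_minute = max_tokens_per_minute

class DemoService:
    def __init__(self, max_requests_per_minute, max_tokens_per_minute, ratelimiter):
        # mirrors: self.rate_limiter = ratelimiter(max_requests_per_minute, max_tokens_per_minute)
        self.rate_limiter = ratelimiter(max_requests_per_minute, max_tokens_per_minute)

svc = DemoService(500, 90_000, ratelimiter=DemoRateLimiter)
print(svc.rate_limiter.max_tokens_per_minute)  # 90000

Handing over the class keeps the limiter's lifetime tied to the service and spares callers from constructing a limiter whose only inputs are the two limits they already pass.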
lionagi/utils/doc_util.py
CHANGED
@@ -26,7 +26,7 @@ def dir_to_path(dir: str, ext, recursive: bool = False, flat: bool = True):
     def _dir_to_path(ext, recursive=recursive):
         tem = '**/*' if recursive else '*'
         return list(Path(dir).glob(tem + ext))
-    
+
     return to_list(l_call(ext, _dir_to_path, flat=True), flat=flat)
 
 def read_text(filepath: str, clean: bool = True) -> str:
@@ -48,15 +48,15 @@ def read_text(filepath: str, clean: bool = True) -> str:
         content = f.read()
         if clean:
             # Define characters to replace and their replacements
-            replacements = {'\\': ' ', '
+            replacements = {'\\': ' ', '\n': ' ', '\t': ' ', ' ': ' ', '\'': ' '}
             for old, new in replacements.items():
                 content = content.replace(old, new)
         return content
 
-def dir_to_files(dir: str, ext: str, recursive: bool = False, 
-                 reader: Callable = read_text, clean: bool = True, 
+def dir_to_files(dir: str, ext: str, recursive: bool = False,
+                 reader: Callable = read_text, clean: bool = True,
                  to_csv: bool = False, project: str = 'project',
-                 output_dir: str = 'data/logs/sources/', filename: Optional[str] = None, 
+                 output_dir: str = 'data/logs/sources/', filename: Optional[str] = None,
                  verbose: bool = True, timestamp: bool = True, logger: Optional[DataLogger] = None):
     """
     Reads and processes files in a specified directory with the given extension.
@@ -81,9 +81,9 @@ def dir_to_files(dir: str, ext: str, recursive: bool = False,
     Examples:
         >>> logs = dir_to_files(dir='my_directory', ext='.txt', to_csv=True)
     """
-    
+
     sources = dir_to_path(dir, ext, recursive)
-    
+
     def split_path(path: Path) -> tuple:
         folder_name = path.parent.name
         file_name = path.name
@@ -99,9 +99,9 @@ def dir_to_files(dir: str, ext: str, recursive: bool = False,
             "file_size": len(str(content)),
             'content': content
         } if content else None
-    
+
     logs = to_list(l_call(sources, to_dict, flat=True), dropna=True)
-    
+
     if to_csv:
         filename = filename or f"{project}_sources.csv"
         logger = DataLogger(dir=output_dir, log=logs) if not logger else logger
@@ -109,7 +109,7 @@ def dir_to_files(dir: str, ext: str, recursive: bool = False,
 
     return logs
 
-def chunk_text(input: str, chunk_size: int, overlap: float, 
+def chunk_text(input: str, chunk_size: int, overlap: float,
                threshold: int) -> List[Union[str, None]]:
     """
     Splits a string into chunks of a specified size, allowing for optional overlap between chunks.
@@ -127,19 +127,19 @@ def chunk_text(input: str, chunk_size: int, overlap: float,
     Returns:
         List[Union[str, None]]: List of text chunks.
     """
-    
+
     try:
         # Ensure text is a string
         if not isinstance(input, str):
             input = str(input)
-        
+
         chunks = []
         n_chunks = math.ceil(len(input) / chunk_size)
         overlap_size = int(chunk_size * overlap / 2)
-        
+
         if n_chunks == 1:
             return [input]
-        
+
         elif n_chunks == 2:
             chunks.append(input[:chunk_size + overlap_size])
             if len(input) - chunk_size > threshold:
@@ -147,28 +147,28 @@ def chunk_text(input: str, chunk_size: int, overlap: float,
             else:
                 return [input]
             return chunks
-        
+
         elif n_chunks > 2:
             chunks.append(input[:chunk_size + overlap_size])
             for i in range(1, n_chunks - 1):
                 start_idx = chunk_size * i - overlap_size
                 end_idx = chunk_size * (i + 1) + overlap_size
                 chunks.append(input[start_idx:end_idx])
-            
+
             if len(input) - chunk_size * (n_chunks - 1) > threshold:
                 chunks.append(input[chunk_size * (n_chunks - 1) - overlap_size:])
             else:
-                chunks[-1] += input[chunk_size * (n_chunks - 1):]
-
+                chunks[-1] += input[chunk_size * (n_chunks - 1) + overlap_size:]
+
             return chunks
-        
+
     except Exception as e:
         raise ValueError(f"An error occurred while chunking the text. {e}")
 
-def _file_to_chunks(input: Dict[str, Any],
-                   field: str = 'content',
-                   chunk_size: int = 1500,
-                   overlap: float = 0.2,
+def _file_to_chunks(input: Dict[str, Any],
+                    field: str = 'content',
+                    chunk_size: int = 1500,
+                    overlap: float = 0.2,
                     threshold: int = 200) -> List[Dict[str, Any]]:
     """
     Splits text from a specified dictionary field into chunks and returns a list of dictionaries.
@@ -195,7 +195,7 @@ def _file_to_chunks(input: Dict[str, Any],
     try:
         out = {key: value for key, value in input.items() if key != field}
         out.update({"chunk_overlap": overlap, "chunk_threshold": threshold})
-        
+
         chunks = chunk_text(input[field], chunk_size=chunk_size, overlap=overlap, threshold=threshold)
         logs = []
         for i, chunk in enumerate(chunks):
@@ -209,22 +209,22 @@ def _file_to_chunks(input: Dict[str, Any],
             logs.append(chunk_dict)
 
         return logs
-    
+
     except Exception as e:
         raise ValueError(f"An error occurred while chunking the file. {e}")
-    
-def file_to_chunks(input,
-                   field: str = 'content',
-                   chunk_size: int = 1500,
-                   overlap: float = 0.2,
-                   threshold: int = 200,
-                   to_csv=False,
+
+def file_to_chunks(input,
+                   field: str = 'content',
+                   chunk_size: int = 1500,
+                   overlap: float = 0.2,
+                   threshold: int = 200,
+                   to_csv=False,
                    project='project',
-                   output_dir='data/logs/sources/',
+                   output_dir='data/logs/sources/',
                    chunk_func = _file_to_chunks,
-                   filename=None,
-                   verbose=True,
-                   timestamp=True,
+                   filename=None,
+                   verbose=True,
+                   timestamp=True,
                    logger=None):
     """
     Splits text from a specified dictionary field into chunks and returns a list of dictionaries.
@@ -243,10 +243,10 @@ def file_to_chunks(input,
         timestamp: If True, include a timestamp in the exported file name.
        logger: An optional DataLogger instance for logging.
     """
-    
+
    f = lambda x: chunk_func(x, field=field, chunk_size=chunk_size, overlap=overlap, threshold=threshold)
    logs = to_list(l_call(input, f), flat=True)
-    
+
    if to_csv:
        filename = filename if filename else f"{project}_sources.csv"
        logger = DataLogger(log=logs) if not logger else logger
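The one behavioral fix in chunk_text sits in the undersized-tail branch: when the leftover text is too short to stand alone, it is now appended starting at chunk_size * (n_chunks - 1) + overlap_size rather than chunk_size * (n_chunks - 1), so the merged final chunk no longer repeats the overlap it already ends with. A small worked trace of that arithmetic, with values chosen purely for illustration:

import math

# Trace of chunk_text's n_chunks > 2 path (chunk_size=10, overlap=0.2, threshold=5).
text = "abcdefghijklmnopqrstuvwx"             # 24 characters
chunk_size, overlap, threshold = 10, 0.2, 5
overlap_size = int(chunk_size * overlap / 2)  # 1
n_chunks = math.ceil(len(text) / chunk_size)  # 3

chunks = [text[:chunk_size + overlap_size]]   # 'abcdefghijk'
for i in range(1, n_chunks - 1):
    start_idx = chunk_size * i - overlap_size
    end_idx = chunk_size * (i + 1) + overlap_size
    chunks.append(text[start_idx:end_idx])    # 'jklmnopqrstu'

tail = len(text) - chunk_size * (n_chunks - 1)  # 4, not above threshold
if tail > threshold:
    chunks.append(text[chunk_size * (n_chunks - 1) - overlap_size:])
else:
    # 0.0.103 appended text[20:] ('uvwx'), duplicating the 'u' that
    # chunks[-1] already ends with; 0.0.105 starts one overlap_size later.
    chunks[-1] += text[chunk_size * (n_chunks - 1) + overlap_size:]

print(chunks)  # ['abcdefghijk', 'jklmnopqrstuvwx']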