lionagi 0.0.103__py3-none-any.whl → 0.0.105__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- lionagi/__init__.py +0 -1
- lionagi/api/__init__.py +3 -2
- lionagi/{config/llmconfig.py → api/oai_config.py} +2 -2
- lionagi/api/{OAIService.py → oai_service.py} +23 -13
- lionagi/session/__init__.py +0 -4
- lionagi/session/conversation.py +47 -43
- lionagi/session/message.py +102 -36
- lionagi/session/session.py +214 -72
- lionagi/tools/__init__.py +0 -0
- lionagi/utils/__init__.py +4 -5
- lionagi/utils/api_util.py +12 -20
- lionagi/utils/doc_util.py +38 -38
- lionagi/utils/sys_util.py +6 -3
- lionagi/utils/tool_util.py +194 -0
- lionagi/version.py +1 -1
- lionagi-0.0.105.dist-info/METADATA +311 -0
- lionagi-0.0.105.dist-info/RECORD +21 -0
- lionagi/config/__init__.py +0 -4
- lionagi/config/oaiconfig.py +0 -19
- lionagi-0.0.103.dist-info/METADATA +0 -97
- lionagi-0.0.103.dist-info/RECORD +0 -21
- {lionagi-0.0.103.dist-info → lionagi-0.0.105.dist-info}/LICENSE +0 -0
- {lionagi-0.0.103.dist-info → lionagi-0.0.105.dist-info}/WHEEL +0 -0
- {lionagi-0.0.103.dist-info → lionagi-0.0.105.dist-info}/top_level.txt +0 -0
lionagi/session/session.py
CHANGED
@@ -3,111 +3,226 @@ import asyncio
 from typing import Any
 
 from .conversation import Conversation
-from ..
+from ..utils.sys_util import to_list
 from ..utils.log_util import DataLogger
 from ..utils.api_util import StatusTracker
-from ..
+from ..utils.tool_util import ToolManager
+from ..api.oai_service import OpenAIService
+
+from ..api.oai_config import oai_llmconfig
+
 
 status_tracker = StatusTracker()
+OAIService = OpenAIService()
 
 class Session():
     """
-    A class representing a conversation session with
+    A class representing a conversation session with a conversational AI system.
 
-    This class manages
-    and logs the interactions using a DataLogger.
+    This class manages the flow of conversation, system settings, and interactions with external tools.
 
     Attributes:
-        conversation: An instance of the Conversation class
-        system: The system
-        llmconfig: Configuration
-
-        api_service: An instance of the API service for making
+        conversation (Conversation): An instance of the Conversation class to manage messages.
+        system (str): The current system setting for the conversation.
+        llmconfig (dict): Configuration settings for the language model.
+        _logger (DataLogger): An instance of the DataLogger class for logging conversation details.
+        api_service: An instance of the API service for making calls to the conversational AI model.
+        toolmanager (ToolManager): An instance of the ToolManager class for managing external tools.
 
     Methods:
-
-
-
-
+        set_dir(dir):
+            Set the directory for logging.
+
+        set_system(system):
+            Set the system for the conversation.
+
+        set_llmconfig(llmconfig):
+            Set the language model configuration.
+
+        set_api_service(api_service):
+            Set the API service for making model calls.
+
+        _output(output, invoke=True, out=True) -> Any:
+            Process the output, invoke tools if needed, and optionally return the output.
+
+        register_tools(tools, funcs, update=False, new=False, prefix=None, postfix=None):
+            Register tools and their corresponding functions.
+
+        initiate(instruction, system=None, context=None, out=True, name=None, invoke=True, **kwargs) -> Any:
+            Start a new conversation session with the provided instruction.
+
+        followup(instruction, system=None, context=None, out=True, name=None, invoke=True, **kwargs) -> Any:
+            Continue the conversation with the provided instruction.
+
+        create_payload_chatcompletion(**kwargs) -> dict:
+            Create a payload for chat completion based on the conversation state and configuration.
+
+        call_chatcompletion(sleep=0.1, **kwargs) -> None:
+            Make a call to the chat completion API and process the response.
+
+        messages_to_csv(dir=None, filename="_messages.csv", **kwargs) -> None:
+            Save conversation messages to a CSV file.
+
+        log_to_csv(dir=None, filename="_llmlog.csv", **kwargs) -> None:
+            Save conversation logs to a CSV file.
     """
 
-    def __init__(self, system, dir=None, llmconfig=
+    def __init__(self, system, dir=None, llmconfig=oai_llmconfig, api_service=OAIService):
         """
-        Initialize a Session object.
+        Initialize a Session object with default or provided settings.
 
-
-        system: The system
-        dir: The directory for logging
-        llmconfig: Configuration
-        api_service: An instance of the API service for making
+        Parameters:
+            system (str): The initial system setting for the conversation.
+            dir (Optional[str]): The directory for logging. Default is None.
+            llmconfig (Optional[dict]): Configuration settings for the language model. Default is oai_llmconfig.
+            api_service: An instance of the API service for making calls to the conversational AI model.
         """
         self.conversation = Conversation()
         self.system = system
         self.llmconfig = llmconfig
-        self.
+        self._logger = DataLogger(dir=dir)
+        self.api_service = api_service
+        self.toolmanager = ToolManager()
+
+    def set_dir(self, dir):
+        """
+        Set the directory for logging.
+
+        Parameters:
+            dir (str): The directory path.
+        """
+        self._logger.dir = dir
+
+    def set_system(self, system):
+        """
+        Set the system for the conversation.
+
+        Parameters:
+            system (str): The system setting.
+        """
+        self.conversation.change_system(system)
+
+    def set_llmconfig(self, llmconfig):
+        """
+        Set the language model configuration.
+
+        Parameters:
+            llmconfig (dict): Configuration settings for the language model.
+        """
+        self.llmconfig = llmconfig
+
+    def set_api_service(self, api_service):
+        """
+        Set the API service for making model calls.
+
+        Parameters:
+            api_service: An instance of the API service.
+        """
         self.api_service = api_service
 
-    async def
+    async def _output(self, output, invoke=True, out=True):
         """
-
+        Process the output, invoke tools if needed, and optionally return the output.
 
-
-
-
-
-        out: Whether to return the output content.
+        Parameters:
+            output: The output to process.
+            invoke (bool): Whether to invoke tools based on the output. Default is True.
+            out (bool): Whether to return the output. Default is True.
 
         Returns:
-            Any: The output
+            Any: The processed output.
+        """
+        if invoke:
+            try:
+                func, args = self.toolmanager._get_function_call(output)
+                outs = await self.toolmanager.ainvoke(func, args)
+                self.conversation.add_messages(tool=outs)
+            except:
+                pass
+        if out:
+            return output
+
+    def register_tools(self, tools, funcs, update=False, new=False, prefix=None, postfix=None):
+        """
+        Register tools and their corresponding functions.
+
+        Parameters:
+            tools (list): The list of tool information dictionaries.
+            funcs (list): The list of corresponding functions.
+            update (bool): Whether to update existing functions.
+            new (bool): Whether to create new registries for existing functions.
+            prefix (Optional[str]): A prefix to add to the function names.
+            postfix (Optional[str]): A postfix to add to the function names.
+        """
+        funcs = to_list(funcs)
+        self.toolmanager.register_tools(tools, funcs, update, new, prefix, postfix)
+
+    async def initiate(self, instruction, system=None, context=None, out=True, name=None, invoke=True, **kwargs) -> Any:
+        """
+        Start a new conversation session with the provided instruction.
+
+        Parameters:
+            instruction (str): The instruction to initiate the conversation.
+            system (Optional[str]): The system setting for the conversation. Default is None.
+            context (Optional[dict]): Additional context for the instruction. Default is None.
+            out (bool): Whether to return the output. Default is True.
+            name (Optional[str]): The name associated with the instruction. Default is None.
+            invoke (bool): Whether to invoke tools based on the output. Default is True.
+            kwargs: Additional keyword arguments for configuration.
+
+        Returns:
+            Any: The processed output.
         """
         config = {**self.llmconfig, **kwargs}
         system = system or self.system
-        self.conversation.initiate_conversation(system=system, instruction=instruction, context=context)
-
+        self.conversation.initiate_conversation(system=system, instruction=instruction, context=context, name=name)
         await self.call_chatcompletion(**config)
-
-
+        output = self.conversation.responses[-1]['content']
+
+        return await self._output(output, invoke, out)
 
-    async def followup(self, instruction, system=None, context=None, out=True, **kwargs) -> Any:
+    async def followup(self, instruction, system=None, context=None, out=True, name=None, invoke=True, **kwargs) -> Any:
         """
-        Continue the conversation
+        Continue the conversation with the provided instruction.
 
-
-        instruction: The
-        system: The
-        context: Additional context for the
-        out: Whether to return the output
+        Parameters:
+            instruction (str): The instruction to continue the conversation.
+            system (Optional[str]): The system setting for the conversation. Default is None.
+            context (Optional[dict]): Additional context for the instruction. Default is None.
+            out (bool): Whether to return the output. Default is True.
+            name (Optional[str]): The name associated with the instruction. Default is None.
+            invoke (bool): Whether to invoke tools based on the output. Default is True.
+            kwargs: Additional keyword arguments for configuration.
 
         Returns:
-            Any: The output
+            Any: The processed output.
         """
-        self.conversation.append_last_response()
         if system:
             self.conversation.change_system(system)
-        self.conversation.add_messages(instruction=instruction, context=context)
-
+        self.conversation.add_messages(instruction=instruction, context=context, name=name)
         config = {**self.llmconfig, **kwargs}
         await self.call_chatcompletion(**config)
-
-
+        output = self.conversation.responses[-1]['content']
+
+        return await self._output(output, invoke, out)
 
     def create_payload_chatcompletion(self, **kwargs):
         """
-        Create a payload for chat completion
+        Create a payload for chat completion based on the conversation state and configuration.
 
-
-        kwargs: Additional keyword arguments for
+        Parameters:
+            kwargs: Additional keyword arguments for configuration.
+
+        Returns:
+            dict: The payload for chat completion.
         """
         # currently only openai chat completions are supported
         messages = self.conversation.messages
-        request_url = f"https://api.openai.com/v1/chat/completions"
         config = {**self.llmconfig, **kwargs}
-
         payload = {
             "messages": messages,
             "model": config.get('model'),
             "frequency_penalty": config.get('frequency_penalty'),
-            "max_tokens": config.get('max_tokens'),
             "n": config.get('n'),
             "presence_penalty": config.get('presence_penalty'),
             "response_format": config.get('response_format'),
@@ -115,35 +230,62 @@ class Session():
             "top_p": config.get('top_p'),
         }
 
-        for key in ["seed", "stop", "stream", "tools", "tool_choice", "user"]:
-            if config[key] is True:
+        for key in ["seed", "stop", "stream", "tools", "tool_choice", "user", "max_tokens"]:
+            if bool(config[key]) is True and str(config[key]) != "none":
                 payload.update({key: config[key]})
-
-
-
-    async def call_chatcompletion(self, delay=1, **kwargs):
+        return payload
+
+    async def call_chatcompletion(self, sleep=0.1, **kwargs):
         """
-        Make
+        Make a call to the chat completion API and process the response.
 
-
-
-        kwargs: Additional keyword arguments for
+        Parameters:
+            sleep (float): The sleep duration after making the API call. Default is 0.1.
+            kwargs: Additional keyword arguments for configuration.
         """
-
-        payload, request_url = self.create_payload_chatcompletion(**kwargs)
+        endpoint = f"chat/completions"
         try:
             async with aiohttp.ClientSession() as session:
-
+                payload = self.create_payload_chatcompletion(**kwargs)
+                completion = await self.api_service.call_api(
+                    session, endpoint, payload)
                 if "choices" in completion:
-
-                    self.
-
-                    self.conversation.responses.append(response)
+                    self._logger({"input":payload, "output": completion})
+                    self.conversation.add_messages(response=completion['choices'][0])
+                    self.conversation.responses.append(self.conversation.messages[-1])
                     self.conversation.response_counts += 1
-                    await asyncio.sleep(
+                    await asyncio.sleep(sleep)
                     status_tracker.num_tasks_succeeded += 1
                 else:
                     status_tracker.num_tasks_failed += 1
         except Exception as e:
             status_tracker.num_tasks_failed += 1
-            raise e
+            raise e
+
+    def messages_to_csv(self, dir=None, filename="_messages.csv", **kwags):
+        """
+        Save conversation messages to a CSV file.
+
+        Parameters:
+            dir (Optional[str]): The directory path for saving the CSV file. Default is None.
+            filename (Optional[str]): The filename for the CSV file. Default is "_messages.csv".
+            kwargs: Additional keyword arguments for CSV file settings.
+        """
+        dir = dir or self._logger.dir
+        if dir is None:
+            raise ValueError("No directory specified.")
+        self.conversation.msg.to_csv(dir=dir, filename=filename, **kwags)
+
+    def log_to_csv(self, dir=None, filename="_llmlog.csv", **kwags):
+        """
+        Save conversation logs to a CSV file.
+
+        Parameters:
+            dir (Optional[str]): The directory path for saving the CSV file. Default is None.
+            filename (Optional[str]): The filename for the CSV file. Default is "_llmlog.csv".
+            kwargs: Additional keyword arguments for CSV file settings.
+        """
+        dir = dir or self._logger.dir
+        if dir is None:
            raise ValueError("No directory specified.")
+        self._logger.to_csv(dir=dir, filename=filename, **kwags)
lionagi/utils/__init__.py
CHANGED
@@ -1,11 +1,10 @@
-from .sys_util import to_flat_dict, append_to_jsonl, to_list, str_to_num, make_copy, to_temp, to_csv, hold_call, ahold_call, l_call, al_call, m_call, am_call, e_call, ae_call, get_timestamp,
-from .api_util import StatusTracker, RateLimiter, BaseAPIService, AsyncQueue
+from .sys_util import to_flat_dict, append_to_jsonl, to_list, str_to_num, make_copy, to_temp, to_csv, hold_call, ahold_call, l_call, al_call, m_call, am_call, e_call, ae_call, get_timestamp, create_path
 from .doc_util import dir_to_path, read_text, dir_to_files, chunk_text, file_to_chunks, file_to_chunks, get_bins
 from .log_util import DataLogger
+from .tool_util import ToolManager
 
 __all__ = [
-    "to_list", "str_to_num", "make_copy", "to_temp", "to_csv", "hold_call", "ahold_call", "l_call", "al_call", "m_call", "am_call", "e_call", "ae_call", "get_timestamp", "
-    "StatusTracker", "RateLimiter", "BaseAPIService", "AsyncQueue",
+    "to_list", "str_to_num", "make_copy", "to_temp", "to_csv", "hold_call", "ahold_call", "l_call", "al_call", "m_call", "am_call", "e_call", "ae_call", "get_timestamp", "create_path", "to_flat_dict", "append_to_jsonl",
     "dir_to_path", "read_text", "dir_to_files", "chunk_text", "file_to_chunks", "file_to_chunks", "get_bins",
-    "DataLogger"
+    "DataLogger", "ToolManager"
 ]
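The net effect on the package's import surface: the api_util names (StatusTracker, RateLimiter, BaseAPIService, AsyncQueue) are no longer re-exported from lionagi.utils, while ToolManager, create_path, to_flat_dict, and append_to_jsonl now are. A hedged sketch of imports under 0.0.105, assuming the submodules keep exposing their own names as the diffs here suggest:

# Re-exported from the utils namespace as of 0.0.105:
from lionagi.utils import ToolManager, DataLogger, to_list, create_path

# No longer re-exported at this level, but api_util.py is changed rather than
# removed in this release, so the submodule import should still work:
from lionagi.utils.api_util import StatusTracker, RateLimiter, BaseAPIService, AsyncQueue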
lionagi/utils/api_util.py
CHANGED
@@ -215,7 +215,8 @@ class RateLimiter(ABC):
         ...
         >>> limiter = MyRateLimiter(100, 200)
         """
-
+
+        ...
 
     @abstractmethod
     def calculate_num_token(self, payload: Dict[str, Any], api_endpoint: str) -> int:
@@ -241,7 +242,8 @@ class RateLimiter(ABC):
         >>> limiter.calculate_num_token({'data': '12345'}, 'api/send')
         0
         """
-
+
+        ...
 
 class BaseAPIService(ABC):
     """
@@ -272,9 +274,11 @@ class BaseAPIService(ABC):
         api_key: str,
         token_encoding_name: str,
         max_attempts: int,
-
-
-
+        max_requests_per_minute: int,
+        max_tokens_per_minute: int,
+        ratelimiter,
+        status_tracker: Optional[StatusTracker] = None,
+        queue: Optional[AsyncQueue] = None,
     ) -> None:
         """
         Initializes the BaseAPIService with necessary configuration.
@@ -299,8 +303,8 @@ class BaseAPIService(ABC):
         self.token_encoding_name = token_encoding_name
         self.max_attempts = max_attempts
         self.status_tracker = status_tracker or StatusTracker()
-        self.rate_limiter = rate_limiter
         self.queue = queue or AsyncQueue()
+        self.rate_limiter = ratelimiter(max_requests_per_minute, max_tokens_per_minute)
 
     @abstractmethod
     async def call_api(self) -> Any:
@@ -316,7 +320,8 @@ class BaseAPIService(ABC):
         ... # Implementation details here
         ...
         """
-
+
+        ...
 
     def handle_error(
         self,
@@ -346,19 +351,6 @@ class BaseAPIService(ABC):
         self.append_to_jsonl(data, save_filepath)
         logging.error(f"Request failed after all attempts. Saving errors: {data}")
 
-    @staticmethod
-    def append_to_jsonl(data: Any, filename: str) -> None:
-        """
-        Appends the given data to the specified JSONL file.
-
-        Args:
-            data (Any): The data to be appended in JSON Lines format.
-            filename (str): The file path to the JSONL file.
-        """
-        json_string = json.dumps(data)
-        with open(filename, "a") as f:
-            f.write(json_string + "\n")
-
     @staticmethod
     def api_endpoint_from_url(request_url: str) -> str:
         """
lionagi/utils/doc_util.py
CHANGED
@@ -26,7 +26,7 @@ def dir_to_path(dir: str, ext, recursive: bool = False, flat: bool = True):
     def _dir_to_path(ext, recursive=recursive):
         tem = '**/*' if recursive else '*'
         return list(Path(dir).glob(tem + ext))
-
+
     return to_list(l_call(ext, _dir_to_path, flat=True), flat=flat)
 
 def read_text(filepath: str, clean: bool = True) -> str:
@@ -48,15 +48,15 @@ def read_text(filepath: str, clean: bool = True) -> str:
         content = f.read()
         if clean:
             # Define characters to replace and their replacements
-            replacements = {'\\': ' ', '
+            replacements = {'\\': ' ', '\n': ' ', '\t': ' ', '  ': ' ', '\'': ' '}
             for old, new in replacements.items():
                 content = content.replace(old, new)
         return content
 
-def dir_to_files(dir: str, ext: str, recursive: bool = False,
-                 reader: Callable = read_text, clean: bool = True,
+def dir_to_files(dir: str, ext: str, recursive: bool = False,
+                 reader: Callable = read_text, clean: bool = True,
                  to_csv: bool = False, project: str = 'project',
-                 output_dir: str = 'data/logs/sources/', filename: Optional[str] = None,
+                 output_dir: str = 'data/logs/sources/', filename: Optional[str] = None,
                  verbose: bool = True, timestamp: bool = True, logger: Optional[DataLogger] = None):
     """
     Reads and processes files in a specified directory with the given extension.
@@ -81,9 +81,9 @@ def dir_to_files(dir: str, ext: str, recursive: bool = False,
     Examples:
         >>> logs = dir_to_files(dir='my_directory', ext='.txt', to_csv=True)
     """
-
+
     sources = dir_to_path(dir, ext, recursive)
-
+
     def split_path(path: Path) -> tuple:
         folder_name = path.parent.name
         file_name = path.name
@@ -99,9 +99,9 @@ def dir_to_files(dir: str, ext: str, recursive: bool = False,
             "file_size": len(str(content)),
             'content': content
         } if content else None
-
+
     logs = to_list(l_call(sources, to_dict, flat=True), dropna=True)
-
+
     if to_csv:
         filename = filename or f"{project}_sources.csv"
         logger = DataLogger(dir=output_dir, log=logs) if not logger else logger
@@ -109,7 +109,7 @@ def dir_to_files(dir: str, ext: str, recursive: bool = False,
 
     return logs
 
-def chunk_text(input: str, chunk_size: int, overlap: float,
+def chunk_text(input: str, chunk_size: int, overlap: float,
                threshold: int) -> List[Union[str, None]]:
     """
     Splits a string into chunks of a specified size, allowing for optional overlap between chunks.
@@ -127,19 +127,19 @@ def chunk_text(input: str, chunk_size: int, overlap: float,
     Returns:
         List[Union[str, None]]: List of text chunks.
     """
-
+
     try:
         # Ensure text is a string
         if not isinstance(input, str):
             input = str(input)
-
+
         chunks = []
         n_chunks = math.ceil(len(input) / chunk_size)
        overlap_size = int(chunk_size * overlap / 2)
-
+
         if n_chunks == 1:
             return [input]
-
+
         elif n_chunks == 2:
             chunks.append(input[:chunk_size + overlap_size])
             if len(input) - chunk_size > threshold:
@@ -147,28 +147,28 @@ def chunk_text(input: str, chunk_size: int, overlap: float,
             else:
                 return [input]
             return chunks
-
+
         elif n_chunks > 2:
             chunks.append(input[:chunk_size + overlap_size])
             for i in range(1, n_chunks - 1):
                 start_idx = chunk_size * i - overlap_size
                 end_idx = chunk_size * (i + 1) + overlap_size
                 chunks.append(input[start_idx:end_idx])
-
+
             if len(input) - chunk_size * (n_chunks - 1) > threshold:
                 chunks.append(input[chunk_size * (n_chunks - 1) - overlap_size:])
             else:
-                chunks[-1] += input[chunk_size * (n_chunks - 1):]
-
+                chunks[-1] += input[chunk_size * (n_chunks - 1) + overlap_size:]
+
             return chunks
-
+
     except Exception as e:
         raise ValueError(f"An error occurred while chunking the text. {e}")
 
-def _file_to_chunks(input: Dict[str, Any],
-                    field: str = 'content',
-                    chunk_size: int = 1500,
-                    overlap: float = 0.2,
+def _file_to_chunks(input: Dict[str, Any],
+                    field: str = 'content',
+                    chunk_size: int = 1500,
+                    overlap: float = 0.2,
                     threshold: int = 200) -> List[Dict[str, Any]]:
     """
     Splits text from a specified dictionary field into chunks and returns a list of dictionaries.
@@ -195,7 +195,7 @@ def _file_to_chunks(input: Dict[str, Any],
     try:
         out = {key: value for key, value in input.items() if key != field}
         out.update({"chunk_overlap": overlap, "chunk_threshold": threshold})
-
+
         chunks = chunk_text(input[field], chunk_size=chunk_size, overlap=overlap, threshold=threshold)
         logs = []
         for i, chunk in enumerate(chunks):
@@ -209,22 +209,22 @@ def _file_to_chunks(input: Dict[str, Any],
             logs.append(chunk_dict)
 
         return logs
-
+
     except Exception as e:
         raise ValueError(f"An error occurred while chunking the file. {e}")
-
-def file_to_chunks(input,
-                   field: str = 'content',
-                   chunk_size: int = 1500,
-                   overlap: float = 0.2,
-                   threshold: int = 200,
-                   to_csv=False,
+
+def file_to_chunks(input,
+                   field: str = 'content',
+                   chunk_size: int = 1500,
+                   overlap: float = 0.2,
+                   threshold: int = 200,
+                   to_csv=False,
                    project='project',
-                   output_dir='data/logs/sources/',
+                   output_dir='data/logs/sources/',
                    chunk_func = _file_to_chunks,
-                   filename=None,
-                   verbose=True,
-                   timestamp=True,
+                   filename=None,
+                   verbose=True,
+                   timestamp=True,
                    logger=None):
     """
     Splits text from a specified dictionary field into chunks and returns a list of dictionaries.
@@ -243,10 +243,10 @@ def file_to_chunks(input,
         timestamp: If True, include a timestamp in the exported file name.
         logger: An optional DataLogger instance for logging.
     """
-
+
     f = lambda x: chunk_func(x, field=field, chunk_size=chunk_size, overlap=overlap, threshold=threshold)
     logs = to_list(l_call(input, f), flat=True)
-
+
     if to_csv:
         filename = filename if filename else f"{project}_sources.csv"
         logger = DataLogger(log=logs) if not logger else logger