lionagi 0.0.103__py3-none-any.whl → 0.0.105__py3-none-any.whl

@@ -3,111 +3,226 @@ import asyncio
 from typing import Any
 
 from .conversation import Conversation
-from ..config.llmconfig import llmconfig
+from ..utils.sys_util import to_list
 from ..utils.log_util import DataLogger
 from ..utils.api_util import StatusTracker
-from ..config.oaiconfig import OAIService
+from ..utils.tool_util import ToolManager
+from ..api.oai_service import OpenAIService
+
+from ..api.oai_config import oai_llmconfig
+
 
 status_tracker = StatusTracker()
+OAIService = OpenAIService()
 
 class Session():
     """
-    A class representing a conversation session with chat completion capabilities.
+    A class representing a conversation session with a conversational AI system.
 
-    This class manages conversations, interacts with chat completion services (currently OpenAI),
-    and logs the interactions using a DataLogger.
+    This class manages the flow of conversation, system settings, and interactions with external tools.
 
     Attributes:
-        conversation: An instance of the Conversation class for managing messages.
-        system: The system identifier for the conversation session.
-        llmconfig: Configuration parameters for language models.
-        logger: An instance of DataLogger for logging conversation interactions.
-        api_service: An instance of the API service for making asynchronous API calls.
+        conversation (Conversation): An instance of the Conversation class to manage messages.
+        system (str): The current system setting for the conversation.
+        llmconfig (dict): Configuration settings for the language model.
+        _logger (DataLogger): An instance of the DataLogger class for logging conversation details.
+        api_service: An instance of the API service for making calls to the conversational AI model.
+        toolmanager (ToolManager): An instance of the ToolManager class for managing external tools.
 
     Methods:
-        initiate: Initiate a conversation session with the given instruction.
-        followup: Continue the conversation session with a follow-up instruction.
-        create_payload_chatcompletion: Create a payload for chat completion API calls.
-        call_chatcompletion: Make an asynchronous call to the chat completion API.
+        set_dir(dir):
+            Set the directory for logging.
+
+        set_system(system):
+            Set the system for the conversation.
+
+        set_llmconfig(llmconfig):
+            Set the language model configuration.
+
+        set_api_service(api_service):
+            Set the API service for making model calls.
+
+        _output(output, invoke=True, out=True) -> Any:
+            Process the output, invoke tools if needed, and optionally return the output.
+
+        register_tools(tools, funcs, update=False, new=False, prefix=None, postfix=None):
+            Register tools and their corresponding functions.
+
+        initiate(instruction, system=None, context=None, out=True, name=None, invoke=True, **kwargs) -> Any:
+            Start a new conversation session with the provided instruction.
+
+        followup(instruction, system=None, context=None, out=True, name=None, invoke=True, **kwargs) -> Any:
+            Continue the conversation with the provided instruction.
+
+        create_payload_chatcompletion(**kwargs) -> dict:
+            Create a payload for chat completion based on the conversation state and configuration.
+
+        call_chatcompletion(sleep=0.1, **kwargs) -> None:
+            Make a call to the chat completion API and process the response.
+
+        messages_to_csv(dir=None, filename="_messages.csv", **kwargs) -> None:
+            Save conversation messages to a CSV file.
+
+        log_to_csv(dir=None, filename="_llmlog.csv", **kwargs) -> None:
+            Save conversation logs to a CSV file.
     """
 
-    def __init__(self, system, dir=None, llmconfig=llmconfig, api_service=OAIService):
+    def __init__(self, system, dir=None, llmconfig=oai_llmconfig, api_service=OAIService):
         """
-        Initialize a Session object.
+        Initialize a Session object with default or provided settings.
 
-        Args:
-            system: The system identifier for the conversation session.
-            dir: The directory for logging interactions.
-            llmconfig: Configuration parameters for language models.
-            api_service: An instance of the API service for making asynchronous API calls.
+        Parameters:
+            system (str): The initial system setting for the conversation.
+            dir (Optional[str]): The directory for logging. Default is None.
+            llmconfig (Optional[dict]): Configuration settings for the language model. Default is oai_llmconfig.
+            api_service: An instance of the API service for making calls to the conversational AI model.
         """
         self.conversation = Conversation()
         self.system = system
         self.llmconfig = llmconfig
-        self.logger = DataLogger(dir=dir)
+        self._logger = DataLogger(dir=dir)
+        self.api_service = api_service
+        self.toolmanager = ToolManager()
+
+    def set_dir(self, dir):
+        """
+        Set the directory for logging.
+
+        Parameters:
+            dir (str): The directory path.
+        """
+        self._logger.dir = dir
+
+    def set_system(self, system):
+        """
+        Set the system for the conversation.
+
+        Parameters:
+            system (str): The system setting.
+        """
+        self.conversation.change_system(system)
+
+    def set_llmconfig(self, llmconfig):
+        """
+        Set the language model configuration.
+
+        Parameters:
+            llmconfig (dict): Configuration settings for the language model.
+        """
+        self.llmconfig = llmconfig
+
+    def set_api_service(self, api_service):
+        """
+        Set the API service for making model calls.
+
+        Parameters:
+            api_service: An instance of the API service.
+        """
         self.api_service = api_service
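For orientation, here is a minimal construction sketch against the new surface. The top-level import path is an assumption; `Session` may need to be imported from its defining module instead.

```python
# Minimal sketch of building a Session on 0.0.105. The `from lionagi import
# Session` path is an assumption; adjust to wherever the class is exported.
from lionagi import Session

session = Session(
    system="You are a helpful assistant.",
    dir="data/logs/",  # optional; used by _logger and the *_to_csv helpers
)
session.set_llmconfig({**session.llmconfig, "model": "gpt-4"})  # tweak defaults
```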
 
-    async def initiate(self, instruction, system=None, context=None, out=True, **kwargs) -> Any:
+    async def _output(self, output, invoke=True, out=True):
         """
-        Initiate a conversation session with the given instruction.
+        Process the output, invoke tools if needed, and optionally return the output.
 
-        Args:
-            instruction: The user's instruction to initiate the conversation.
-            system: The content of the system message.
-            context: Additional context for the user instruction.
-            out: Whether to return the output content.
+        Parameters:
+            output: The output to process.
+            invoke (bool): Whether to invoke tools based on the output. Default is True.
+            out (bool): Whether to return the output. Default is True.
 
         Returns:
-            Any: The output content if 'out' is True, otherwise None.
+            Any: The processed output.
+        """
+        if invoke:
+            try:
+                func, args = self.toolmanager._get_function_call(output)
+                outs = await self.toolmanager.ainvoke(func, args)
+                self.conversation.add_messages(tool=outs)
+            except:
+                pass
+        if out:
+            return output
+
+    def register_tools(self, tools, funcs, update=False, new=False, prefix=None, postfix=None):
+        """
+        Register tools and their corresponding functions.
+
+        Parameters:
+            tools (list): The list of tool information dictionaries.
+            funcs (list): The list of corresponding functions.
+            update (bool): Whether to update existing functions.
+            new (bool): Whether to create new registries for existing functions.
+            prefix (Optional[str]): A prefix to add to the function names.
+            postfix (Optional[str]): A postfix to add to the function names.
+        """
+        funcs = to_list(funcs)
+        self.toolmanager.register_tools(tools, funcs, update, new, prefix, postfix)
+
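A sketch of the new tool-registration flow. The exact dictionary layout `ToolManager` expects is not shown in this diff; an OpenAI-style function schema is assumed here, and `add`/`tool_schema` are illustrative names.

```python
# Hypothetical tool registration; schema layout assumed OpenAI-style.
def add(a: int, b: int) -> int:
    """Add two integers."""
    return a + b

tool_schema = {
    "type": "function",
    "function": {
        "name": "add",
        "description": "Add two integers.",
        "parameters": {
            "type": "object",
            "properties": {"a": {"type": "integer"}, "b": {"type": "integer"}},
            "required": ["a", "b"],
        },
    },
}

session.register_tools(tools=[tool_schema], funcs=[add])
```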
+    async def initiate(self, instruction, system=None, context=None, out=True, name=None, invoke=True, **kwargs) -> Any:
+        """
+        Start a new conversation session with the provided instruction.
+
+        Parameters:
+            instruction (str): The instruction to initiate the conversation.
+            system (Optional[str]): The system setting for the conversation. Default is None.
+            context (Optional[dict]): Additional context for the instruction. Default is None.
+            out (bool): Whether to return the output. Default is True.
+            name (Optional[str]): The name associated with the instruction. Default is None.
+            invoke (bool): Whether to invoke tools based on the output. Default is True.
+            kwargs: Additional keyword arguments for configuration.
+
+        Returns:
+            Any: The processed output.
         """
         config = {**self.llmconfig, **kwargs}
         system = system or self.system
-        self.conversation.initiate_conversation(system=system, instruction=instruction, context=context)
-
+        self.conversation.initiate_conversation(system=system, instruction=instruction, context=context, name=name)
         await self.call_chatcompletion(**config)
-        if out:
-            return self.conversation.responses[-1]['content']
+        output = self.conversation.responses[-1]['content']
+
+        return await self._output(output, invoke, out)
 
-    async def followup(self, instruction, system=None, context=None, out=True, **kwargs) -> Any:
+    async def followup(self, instruction, system=None, context=None, out=True, name=None, invoke=True, **kwargs) -> Any:
         """
-        Continue the conversation session with a follow-up instruction.
+        Continue the conversation with the provided instruction.
 
-        Args:
-            instruction: The user's follow-up instruction.
-            system: The content of the system message.
-            context: Additional context for the user instruction.
-            out: Whether to return the output content.
+        Parameters:
+            instruction (str): The instruction to continue the conversation.
+            system (Optional[str]): The system setting for the conversation. Default is None.
+            context (Optional[dict]): Additional context for the instruction. Default is None.
+            out (bool): Whether to return the output. Default is True.
+            name (Optional[str]): The name associated with the instruction. Default is None.
+            invoke (bool): Whether to invoke tools based on the output. Default is True.
+            kwargs: Additional keyword arguments for configuration.
 
         Returns:
-            Any: The output content if 'out' is True, otherwise None.
+            Any: The processed output.
         """
-        self.conversation.append_last_response()
         if system:
             self.conversation.change_system(system)
-        self.conversation.add_messages(instruction=instruction, context=context)
-
+        self.conversation.add_messages(instruction=instruction, context=context, name=name)
         config = {**self.llmconfig, **kwargs}
         await self.call_chatcompletion(**config)
-        if out:
-            return self.conversation.responses[-1]['content']
+        output = self.conversation.responses[-1]['content']
+
+        return await self._output(output, invoke, out)
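With tools registered, the reworked `initiate`/`followup` pair drives a session end to end; under the default `invoke=True`, `_output` executes a tool call if the model returns one. A sketch continuing the example above:

```python
import asyncio

async def demo():
    # Opens the conversation; may trigger the registered `add` tool.
    first = await session.initiate("Add 2 and 3 using the add tool.")
    # Continues the same conversation; `name` tags the speaker.
    second = await session.followup("Now explain what you did.", name="reviewer")
    return first, second

print(asyncio.run(demo()))
```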
 
     def create_payload_chatcompletion(self, **kwargs):
         """
-        Create a payload for chat completion API calls.
+        Create a payload for chat completion based on the conversation state and configuration.
 
-        Args:
-            kwargs: Additional keyword arguments for customization.
+        Parameters:
+            kwargs: Additional keyword arguments for configuration.
+
+        Returns:
+            dict: The payload for chat completion.
         """
         # currently only openai chat completions are supported
         messages = self.conversation.messages
-        request_url = f"https://api.openai.com/v1/chat/completions"
         config = {**self.llmconfig, **kwargs}
-
         payload = {
             "messages": messages,
             "model": config.get('model'),
             "frequency_penalty": config.get('frequency_penalty'),
-            "max_tokens": config.get('max_tokens'),
             "n": config.get('n'),
             "presence_penalty": config.get('presence_penalty'),
             "response_format": config.get('response_format'),
@@ -115,35 +230,62 @@ class Session():
             "top_p": config.get('top_p'),
         }
 
-        for key in ["seed", "stop", "stream", "tools", "tool_choice", "user"]:
-            if config[key] is True:
+        for key in ["seed", "stop", "stream", "tools", "tool_choice", "user", "max_tokens"]:
+            if bool(config[key]) is True and str(config[key]) != "none":
                 payload.update({key: config[key]})
-
-        return (payload, request_url)
-
-    async def call_chatcompletion(self, delay=1, **kwargs):
+        return payload
+
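The new optional-key filter deserves a second look: the old check only forwarded keys whose value was literally `True`, whereas the new one forwards any truthy value whose string form is not `"none"`, and `max_tokens` has moved into this optional group. A standalone sketch of the behavior:

```python
# Standalone illustration of the new filter (config values are made up).
config = {"seed": None, "stop": "", "stream": False, "tools": None,
          "tool_choice": "none", "user": "", "max_tokens": 256}
payload = {}
for key in ["seed", "stop", "stream", "tools", "tool_choice", "user", "max_tokens"]:
    if bool(config[key]) is True and str(config[key]) != "none":
        payload.update({key: config[key]})
print(payload)  # {'max_tokens': 256} -- only explicitly set options are sent
```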
+    async def call_chatcompletion(self, sleep=0.1, **kwargs):
         """
-        Make an asynchronous call to the chat completion API.
+        Make a call to the chat completion API and process the response.
 
-        Args:
-            delay: The delay (in seconds) between API calls.
-            kwargs: Additional keyword arguments for customization.
+        Parameters:
+            sleep (float): The sleep duration after making the API call. Default is 0.1.
+            kwargs: Additional keyword arguments for configuration.
         """
-        # currently only openai chat completions are supported
-        payload, request_url = self.create_payload_chatcompletion(**kwargs)
+        endpoint = f"chat/completions"
         try:
             async with aiohttp.ClientSession() as session:
-                completion = await self.api_service.call_api(session, request_url, payload)
+                payload = self.create_payload_chatcompletion(**kwargs)
+                completion = await self.api_service.call_api(
+                    session, endpoint, payload)
                 if "choices" in completion:
-                    completion = completion['choices'][0]  # currently can only call one completion at a time, n has to be 1
-                    self.logger({"input":self.conversation.messages, "output": completion})
-                    response = {"role": "assistant", "content": completion['message']["content"]}
-                    self.conversation.responses.append(response)
+                    self._logger({"input":payload, "output": completion})
+                    self.conversation.add_messages(response=completion['choices'][0])
+                    self.conversation.responses.append(self.conversation.messages[-1])
                     self.conversation.response_counts += 1
-                    await asyncio.sleep(delay=delay)
+                    await asyncio.sleep(sleep)
                     status_tracker.num_tasks_succeeded += 1
                 else:
                     status_tracker.num_tasks_failed += 1
         except Exception as e:
             status_tracker.num_tasks_failed += 1
-            raise e
+            raise e
+
+    def messages_to_csv(self, dir=None, filename="_messages.csv", **kwags):
+        """
+        Save conversation messages to a CSV file.
+
+        Parameters:
+            dir (Optional[str]): The directory path for saving the CSV file. Default is None.
+            filename (Optional[str]): The filename for the CSV file. Default is "_messages.csv".
+            kwargs: Additional keyword arguments for CSV file settings.
+        """
+        dir = dir or self._logger.dir
+        if dir is None:
+            raise ValueError("No directory specified.")
+        self.conversation.msg.to_csv(dir=dir, filename=filename, **kwags)
+
+    def log_to_csv(self, dir=None, filename="_llmlog.csv", **kwags):
+        """
+        Save conversation logs to a CSV file.
+
+        Parameters:
+            dir (Optional[str]): The directory path for saving the CSV file. Default is None.
+            filename (Optional[str]): The filename for the CSV file. Default is "_llmlog.csv".
+            kwargs: Additional keyword arguments for CSV file settings.
+        """
+        dir = dir or self._logger.dir
+        if dir is None:
+            raise ValueError("No directory specified.")
+        self._logger.to_csv(dir=dir, filename=filename, **kwags)
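Both exporters fall back to the `dir` given at construction time and raise a `ValueError` when no directory is known:

```python
# Persist the transcript and the raw request/response log after a run.
session.messages_to_csv(filename="demo_messages.csv")
session.log_to_csv(filename="demo_llmlog.csv")
```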
File without changes
lionagi/utils/__init__.py CHANGED
@@ -1,11 +1,10 @@
-from .sys_util import to_flat_dict, append_to_jsonl, to_list, str_to_num, make_copy, to_temp, to_csv, hold_call, ahold_call, l_call, al_call, m_call, am_call, e_call, ae_call, get_timestamp, _flatten_dict, _flatten_list, create_id, create_path
-from .api_util import StatusTracker, RateLimiter, BaseAPIService, AsyncQueue
+from .sys_util import to_flat_dict, append_to_jsonl, to_list, str_to_num, make_copy, to_temp, to_csv, hold_call, ahold_call, l_call, al_call, m_call, am_call, e_call, ae_call, get_timestamp, create_path
 from .doc_util import dir_to_path, read_text, dir_to_files, chunk_text, file_to_chunks, file_to_chunks, get_bins
 from .log_util import DataLogger
+from .tool_util import ToolManager
 
 __all__ = [
-    "to_list", "str_to_num", "make_copy", "to_temp", "to_csv", "hold_call", "ahold_call", "l_call", "al_call", "m_call", "am_call", "e_call", "ae_call", "get_timestamp", "_flatten_dict", "_flatten_list", "create_id", "create_path", "to_flat_dict", "append_to_jsonl",
-    "StatusTracker", "RateLimiter", "BaseAPIService", "AsyncQueue",
+    "to_list", "str_to_num", "make_copy", "to_temp", "to_csv", "hold_call", "ahold_call", "l_call", "al_call", "m_call", "am_call", "e_call", "ae_call", "get_timestamp", "create_path", "to_flat_dict", "append_to_jsonl",
     "dir_to_path", "read_text", "dir_to_files", "chunk_text", "file_to_chunks", "file_to_chunks", "get_bins",
-    "DataLogger"
+    "DataLogger", "ToolManager"
 ]
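The practical effect on imports: `ToolManager` is now re-exported from `lionagi.utils`, while the API-layer names (`StatusTracker`, `RateLimiter`, `BaseAPIService`, `AsyncQueue`) and the private helpers (`_flatten_dict`, `_flatten_list`, `create_id`) no longer are:

```python
# Valid on 0.0.105:
from lionagi.utils import ToolManager, DataLogger, to_list

# No longer importable from this namespace; reach into the defining module
# (e.g. lionagi.utils.api_util) instead, assuming the classes remain there.
```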
lionagi/utils/api_util.py CHANGED
@@ -215,7 +215,8 @@ class RateLimiter(ABC):
         ...
         >>> limiter = MyRateLimiter(100, 200)
         """
-        pass
+
+        ...
 
     @abstractmethod
     def calculate_num_token(self, payload: Dict[str, Any], api_endpoint: str) -> int:
@@ -241,7 +242,8 @@ class RateLimiter(ABC):
         >>> limiter.calculate_num_token({'data': '12345'}, 'api/send')
         0
         """
-        pass
+
+        ...
 
 class BaseAPIService(ABC):
     """
@@ -272,9 +274,11 @@ class BaseAPIService(ABC):
         api_key: str,
         token_encoding_name: str,
         max_attempts: int,
-        status_tracker: Optional[StatusTracker],
-        rate_limiter: RateLimiter,
-        queue: Optional[AsyncQueue]
+        max_requests_per_minute: int,
+        max_tokens_per_minute: int,
+        ratelimiter,
+        status_tracker: Optional[StatusTracker] = None,
+        queue: Optional[AsyncQueue] = None,
     ) -> None:
         """
         Initializes the BaseAPIService with necessary configuration.
@@ -299,8 +303,8 @@ class BaseAPIService(ABC):
         self.token_encoding_name = token_encoding_name
         self.max_attempts = max_attempts
         self.status_tracker = status_tracker or StatusTracker()
-        self.rate_limiter = rate_limiter
         self.queue = queue or AsyncQueue()
+        self.rate_limiter = ratelimiter(max_requests_per_minute, max_tokens_per_minute)
 
     @abstractmethod
     async def call_api(self) -> Any:
@@ -316,7 +320,8 @@ class BaseAPIService(ABC):
         ... # Implementation details here
         ...
         """
-        pass
+
+        ...
 
     def handle_error(
         self,
@@ -346,19 +351,6 @@ class BaseAPIService(ABC):
         self.append_to_jsonl(data, save_filepath)
         logging.error(f"Request failed after all attempts. Saving errors: {data}")
 
-    @staticmethod
-    def append_to_jsonl(data: Any, filename: str) -> None:
-        """
-        Appends the given data to the specified JSONL file.
-
-        Args:
-            data (Any): The data to be appended in JSON Lines format.
-            filename (str): The file path to the JSONL file.
-        """
-        json_string = json.dumps(data)
-        with open(filename, "a") as f:
-            f.write(json_string + "\n")
-
     @staticmethod
     def api_endpoint_from_url(request_url: str) -> str:
         """
lionagi/utils/doc_util.py CHANGED
@@ -26,7 +26,7 @@ def dir_to_path(dir: str, ext, recursive: bool = False, flat: bool = True):
     def _dir_to_path(ext, recursive=recursive):
         tem = '**/*' if recursive else '*'
         return list(Path(dir).glob(tem + ext))
-
+
     return to_list(l_call(ext, _dir_to_path, flat=True), flat=flat)
 
 def read_text(filepath: str, clean: bool = True) -> str:
@@ -48,15 +48,15 @@ def read_text(filepath: str, clean: bool = True) -> str:
         content = f.read()
         if clean:
             # Define characters to replace and their replacements
-            replacements = {'\\': ' ', '\\\n': ' ', '\\\t': ' ', '  ': ' ', '\'': ' '}
+            replacements = {'\\': ' ', '\n': ' ', '\t': ' ', '  ': ' ', '\'': ' '}
             for old, new in replacements.items():
                 content = content.replace(old, new)
         return content
 
-def dir_to_files(dir: str, ext: str, recursive: bool = False,
-                 reader: Callable = read_text, clean: bool = True,
+def dir_to_files(dir: str, ext: str, recursive: bool = False,
+                 reader: Callable = read_text, clean: bool = True,
                  to_csv: bool = False, project: str = 'project',
-                 output_dir: str = 'data/logs/sources/', filename: Optional[str] = None,
+                 output_dir: str = 'data/logs/sources/', filename: Optional[str] = None,
                  verbose: bool = True, timestamp: bool = True, logger: Optional[DataLogger] = None):
     """
     Reads and processes files in a specified directory with the given extension.
@@ -81,9 +81,9 @@ def dir_to_files(dir: str, ext: str, recursive: bool = False,
     Examples:
         >>> logs = dir_to_files(dir='my_directory', ext='.txt', to_csv=True)
     """
-
+
     sources = dir_to_path(dir, ext, recursive)
-
+
     def split_path(path: Path) -> tuple:
         folder_name = path.parent.name
         file_name = path.name
@@ -99,9 +99,9 @@ def dir_to_files(dir: str, ext: str, recursive: bool = False,
             "file_size": len(str(content)),
             'content': content
         } if content else None
-
+
     logs = to_list(l_call(sources, to_dict, flat=True), dropna=True)
-
+
     if to_csv:
         filename = filename or f"{project}_sources.csv"
         logger = DataLogger(dir=output_dir, log=logs) if not logger else logger
@@ -109,7 +109,7 @@ def dir_to_files(dir: str, ext: str, recursive: bool = False,
 
     return logs
 
-def chunk_text(input: str, chunk_size: int, overlap: float,
+def chunk_text(input: str, chunk_size: int, overlap: float,
                threshold: int) -> List[Union[str, None]]:
     """
     Splits a string into chunks of a specified size, allowing for optional overlap between chunks.
@@ -127,19 +127,19 @@ def chunk_text(input: str, chunk_size: int, overlap: float,
     Returns:
         List[Union[str, None]]: List of text chunks.
     """
-
+
     try:
         # Ensure text is a string
         if not isinstance(input, str):
            input = str(input)
-
+
         chunks = []
         n_chunks = math.ceil(len(input) / chunk_size)
         overlap_size = int(chunk_size * overlap / 2)
-
+
         if n_chunks == 1:
             return [input]
-
+
         elif n_chunks == 2:
             chunks.append(input[:chunk_size + overlap_size])
             if len(input) - chunk_size > threshold:
@@ -147,28 +147,28 @@ def chunk_text(input: str, chunk_size: int, overlap: float,
             else:
                 return [input]
             return chunks
-
+
         elif n_chunks > 2:
             chunks.append(input[:chunk_size + overlap_size])
             for i in range(1, n_chunks - 1):
                 start_idx = chunk_size * i - overlap_size
                 end_idx = chunk_size * (i + 1) + overlap_size
                 chunks.append(input[start_idx:end_idx])
-
+
             if len(input) - chunk_size * (n_chunks - 1) > threshold:
                 chunks.append(input[chunk_size * (n_chunks - 1) - overlap_size:])
             else:
-                chunks[-1] += input[chunk_size * (n_chunks - 1):]
-
+                chunks[-1] += input[chunk_size * (n_chunks - 1) + overlap_size:]
+
             return chunks
-
+
     except Exception as e:
         raise ValueError(f"An error occurred while chunking the text. {e}")
 
-def _file_to_chunks(input: Dict[str, Any],
-                    field: str = 'content',
-                    chunk_size: int = 1500,
-                    overlap: float = 0.2,
+def _file_to_chunks(input: Dict[str, Any],
+                    field: str = 'content',
+                    chunk_size: int = 1500,
+                    overlap: float = 0.2,
                     threshold: int = 200) -> List[Dict[str, Any]]:
     """
     Splits text from a specified dictionary field into chunks and returns a list of dictionaries.
@@ -195,7 +195,7 @@ def _file_to_chunks(input: Dict[str, Any],
     try:
         out = {key: value for key, value in input.items() if key != field}
         out.update({"chunk_overlap": overlap, "chunk_threshold": threshold})
-
+
         chunks = chunk_text(input[field], chunk_size=chunk_size, overlap=overlap, threshold=threshold)
         logs = []
         for i, chunk in enumerate(chunks):
@@ -209,22 +209,22 @@ def _file_to_chunks(input: Dict[str, Any],
             logs.append(chunk_dict)
 
         return logs
-
+
     except Exception as e:
         raise ValueError(f"An error occurred while chunking the file. {e}")
-
-def file_to_chunks(input,
-                   field: str = 'content',
-                   chunk_size: int = 1500,
-                   overlap: float = 0.2,
-                   threshold: int = 200,
-                   to_csv=False,
+
+def file_to_chunks(input,
+                   field: str = 'content',
+                   chunk_size: int = 1500,
+                   overlap: float = 0.2,
+                   threshold: int = 200,
+                   to_csv=False,
                    project='project',
-                   output_dir='data/logs/sources/',
+                   output_dir='data/logs/sources/',
                    chunk_func = _file_to_chunks,
-                   filename=None,
-                   verbose=True,
-                   timestamp=True,
+                   filename=None,
+                   verbose=True,
+                   timestamp=True,
                    logger=None):
     """
     Splits text from a specified dictionary field into chunks and returns a list of dictionaries.
@@ -243,10 +243,10 @@ def file_to_chunks(input,
         timestamp: If True, include a timestamp in the exported file name.
         logger: An optional DataLogger instance for logging.
     """
-
+
     f = lambda x: chunk_func(x, field=field, chunk_size=chunk_size, overlap=overlap, threshold=threshold)
     logs = to_list(l_call(input, f), flat=True)
-
+
     if to_csv:
         filename = filename if filename else f"{project}_sources.csv"
         logger = DataLogger(log=logs) if not logger else logger
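The one behavioral change in this file is the `chunk_text` tail merge: a short tail is now appended from `chunk_size * (n_chunks - 1) + overlap_size` onward, so it no longer repeats the `overlap_size` characters the previous chunk already carried. A worked example (the expected output assumes the fixed 0.0.105 code):

```python
from lionagi.utils import chunk_text

# chunk_size=10, overlap=0.2 -> overlap_size = int(10 * 0.2 / 2) = 1
text = "abcdefghijklmnopqrstuvwxy"  # 25 chars -> n_chunks = 3

# Tail is 5 chars (25 - 20), not > threshold=5, so it merges into the last
# chunk. The old code re-added index 20 ('u'), which the last chunk already
# ended with; the fix starts the merge at index 21.
print(chunk_text(text, chunk_size=10, overlap=0.2, threshold=5))
# ['abcdefghijk', 'jklmnopqrstuvwxy']
```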