lionagi 0.0.103__py3-none-any.whl → 0.0.105__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,111 +3,226 @@ import asyncio
3
3
  from typing import Any
4
4
 
5
5
  from .conversation import Conversation
6
- from ..config.llmconfig import llmconfig
6
+ from ..utils.sys_util import to_list
7
7
  from ..utils.log_util import DataLogger
8
8
  from ..utils.api_util import StatusTracker
9
- from ..config.oaiconfig import OAIService
9
+ from ..utils.tool_util import ToolManager
10
+ from ..api.oai_service import OpenAIService
11
+
12
+ from ..api.oai_config import oai_llmconfig
13
+
10
14
 
11
15
  status_tracker = StatusTracker()
16
+ OAIService = OpenAIService()
12
17
 
13
18
  class Session():
14
19
  """
15
- A class representing a conversation session with chat completion capabilities.
20
+ A class representing a conversation session with a conversational AI system.
16
21
 
17
- This class manages conversations, interacts with chat completion services (currently OpenAI),
18
- and logs the interactions using a DataLogger.
22
+ This class manages the flow of conversation, system settings, and interactions with external tools.
19
23
 
20
24
  Attributes:
21
- conversation: An instance of the Conversation class for managing messages.
22
- system: The system identifier for the conversation session.
23
- llmconfig: Configuration parameters for language models.
24
- logger: An instance of DataLogger for logging conversation interactions.
25
- api_service: An instance of the API service for making asynchronous API calls.
25
+ conversation (Conversation): An instance of the Conversation class to manage messages.
26
+ system (str): The current system setting for the conversation.
27
+ llmconfig (dict): Configuration settings for the language model.
28
+ _logger (DataLogger): An instance of the DataLogger class for logging conversation details.
29
+ api_service: An instance of the API service for making calls to the conversational AI model.
30
+ toolmanager (ToolManager): An instance of the ToolManager class for managing external tools.
26
31
 
27
32
  Methods:
28
- initiate: Initiate a conversation session with the given instruction.
29
- followup: Continue the conversation session with a follow-up instruction.
30
- create_payload_chatcompletion: Create a payload for chat completion API calls.
31
- call_chatcompletion: Make an asynchronous call to the chat completion API.
33
+ set_dir(dir):
34
+ Set the directory for logging.
35
+
36
+ set_system(system):
37
+ Set the system for the conversation.
38
+
39
+ set_llmconfig(llmconfig):
40
+ Set the language model configuration.
41
+
42
+ set_api_service(api_service):
43
+ Set the API service for making model calls.
44
+
45
+ _output(output, invoke=True, out=True) -> Any:
46
+ Process the output, invoke tools if needed, and optionally return the output.
47
+
48
+ register_tools(tools, funcs, update=False, new=False, prefix=None, postfix=None):
49
+ Register tools and their corresponding functions.
50
+
51
+ initiate(instruction, system=None, context=None, out=True, name=None, invoke=True, **kwargs) -> Any:
52
+ Start a new conversation session with the provided instruction.
53
+
54
+ followup(instruction, system=None, context=None, out=True, name=None, invoke=True, **kwargs) -> Any:
55
+ Continue the conversation with the provided instruction.
56
+
57
+ create_payload_chatcompletion(**kwargs) -> dict:
58
+ Create a payload for chat completion based on the conversation state and configuration.
59
+
60
+ call_chatcompletion(sleep=0.1, **kwargs) -> None:
61
+ Make a call to the chat completion API and process the response.
62
+
63
+ messages_to_csv(dir=None, filename="_messages.csv", **kwargs) -> None:
64
+ Save conversation messages to a CSV file.
65
+
66
+ log_to_csv(dir=None, filename="_llmlog.csv", **kwargs) -> None:
67
+ Save conversation logs to a CSV file.
32
68
  """
33
69
 
34
- def __init__(self, system, dir=None, llmconfig=llmconfig, api_service=OAIService):
70
+ def __init__(self, system, dir=None, llmconfig=oai_llmconfig, api_service=OAIService):
35
71
  """
36
- Initialize a Session object.
72
+ Initialize a Session object with default or provided settings.
37
73
 
38
- Args:
39
- system: The system identifier for the conversation session.
40
- dir: The directory for logging interactions.
41
- llmconfig: Configuration parameters for language models.
42
- api_service: An instance of the API service for making asynchronous API calls.
74
+ Parameters:
75
+ system (str): The initial system setting for the conversation.
76
+ dir (Optional[str]): The directory for logging. Default is None.
77
+ llmconfig (Optional[dict]): Configuration settings for the language model. Default is oai_llmconfig.
78
+ api_service: An instance of the API service for making calls to the conversational AI model.
43
79
  """
44
80
  self.conversation = Conversation()
45
81
  self.system = system
46
82
  self.llmconfig = llmconfig
47
- self.logger = DataLogger(dir=dir)
83
+ self._logger = DataLogger(dir=dir)
84
+ self.api_service = api_service
85
+ self.toolmanager = ToolManager()
86
+
87
+ def set_dir(self, dir):
88
+ """
89
+ Set the directory for logging.
90
+
91
+ Parameters:
92
+ dir (str): The directory path.
93
+ """
94
+ self._logger.dir = dir
95
+
96
+ def set_system(self, system):
97
+ """
98
+ Set the system for the conversation.
99
+
100
+ Parameters:
101
+ system (str): The system setting.
102
+ """
103
+ self.conversation.change_system(system)
104
+
105
+ def set_llmconfig(self, llmconfig):
106
+ """
107
+ Set the language model configuration.
108
+
109
+ Parameters:
110
+ llmconfig (dict): Configuration settings for the language model.
111
+ """
112
+ self.llmconfig = llmconfig
113
+
114
+ def set_api_service(self, api_service):
115
+ """
116
+ Set the API service for making model calls.
117
+
118
+ Parameters:
119
+ api_service: An instance of the API service.
120
+ """
48
121
  self.api_service = api_service
49
122
 
50
- async def initiate(self, instruction, system=None, context=None, out=True, **kwargs) -> Any:
123
+ async def _output(self, output, invoke=True, out=True):
51
124
  """
52
- Initiate a conversation session with the given instruction.
125
+ Process the output, invoke tools if needed, and optionally return the output.
53
126
 
54
- Args:
55
- instruction: The user's instruction to initiate the conversation.
56
- system: The content of the system message.
57
- context: Additional context for the user instruction.
58
- out: Whether to return the output content.
127
+ Parameters:
128
+ output: The output to process.
129
+ invoke (bool): Whether to invoke tools based on the output. Default is True.
130
+ out (bool): Whether to return the output. Default is True.
59
131
 
60
132
  Returns:
61
- Any: The output content if 'out' is True, otherwise None.
133
+ Any: The processed output.
134
+ """
135
+ if invoke:
136
+ try:
137
+ func, args = self.toolmanager._get_function_call(output)
138
+ outs = await self.toolmanager.ainvoke(func, args)
139
+ self.conversation.add_messages(tool=outs)
140
+ except:
141
+ pass
142
+ if out:
143
+ return output
144
+
145
+ def register_tools(self, tools, funcs, update=False, new=False, prefix=None, postfix=None):
146
+ """
147
+ Register tools and their corresponding functions.
148
+
149
+ Parameters:
150
+ tools (list): The list of tool information dictionaries.
151
+ funcs (list): The list of corresponding functions.
152
+ update (bool): Whether to update existing functions.
153
+ new (bool): Whether to create new registries for existing functions.
154
+ prefix (Optional[str]): A prefix to add to the function names.
155
+ postfix (Optional[str]): A postfix to add to the function names.
156
+ """
157
+ funcs = to_list(funcs)
158
+ self.toolmanager.register_tools(tools, funcs, update, new, prefix, postfix)
159
+
160
+ async def initiate(self, instruction, system=None, context=None, out=True, name=None, invoke=True, **kwargs) -> Any:
161
+ """
162
+ Start a new conversation session with the provided instruction.
163
+
164
+ Parameters:
165
+ instruction (str): The instruction to initiate the conversation.
166
+ system (Optional[str]): The system setting for the conversation. Default is None.
167
+ context (Optional[dict]): Additional context for the instruction. Default is None.
168
+ out (bool): Whether to return the output. Default is True.
169
+ name (Optional[str]): The name associated with the instruction. Default is None.
170
+ invoke (bool): Whether to invoke tools based on the output. Default is True.
171
+ kwargs: Additional keyword arguments for configuration.
172
+
173
+ Returns:
174
+ Any: The processed output.
62
175
  """
63
176
  config = {**self.llmconfig, **kwargs}
64
177
  system = system or self.system
65
- self.conversation.initiate_conversation(system=system, instruction=instruction, context=context)
66
-
178
+ self.conversation.initiate_conversation(system=system, instruction=instruction, context=context, name=name)
67
179
  await self.call_chatcompletion(**config)
68
- if out:
69
- return self.conversation.responses[-1]['content']
180
+ output = self.conversation.responses[-1]['content']
181
+
182
+ return await self._output(output, invoke, out)
70
183
 
71
- async def followup(self, instruction, system=None, context=None, out=True, **kwargs) -> Any:
184
+ async def followup(self, instruction, system=None, context=None, out=True, name=None, invoke=True, **kwargs) -> Any:
72
185
  """
73
- Continue the conversation session with a follow-up instruction.
186
+ Continue the conversation with the provided instruction.
74
187
 
75
- Args:
76
- instruction: The user's follow-up instruction.
77
- system: The content of the system message.
78
- context: Additional context for the user instruction.
79
- out: Whether to return the output content.
188
+ Parameters:
189
+ instruction (str): The instruction to continue the conversation.
190
+ system (Optional[str]): The system setting for the conversation. Default is None.
191
+ context (Optional[dict]): Additional context for the instruction. Default is None.
192
+ out (bool): Whether to return the output. Default is True.
193
+ name (Optional[str]): The name associated with the instruction. Default is None.
194
+ invoke (bool): Whether to invoke tools based on the output. Default is True.
195
+ kwargs: Additional keyword arguments for configuration.
80
196
 
81
197
  Returns:
82
- Any: The output content if 'out' is True, otherwise None.
198
+ Any: The processed output.
83
199
  """
84
- self.conversation.append_last_response()
85
200
  if system:
86
201
  self.conversation.change_system(system)
87
- self.conversation.add_messages(instruction=instruction, context=context)
88
-
202
+ self.conversation.add_messages(instruction=instruction, context=context, name=name)
89
203
  config = {**self.llmconfig, **kwargs}
90
204
  await self.call_chatcompletion(**config)
91
- if out:
92
- return self.conversation.responses[-1]['content']
205
+ output = self.conversation.responses[-1]['content']
206
+
207
+ return await self._output(output, invoke, out)
93
208
 
94
209
  def create_payload_chatcompletion(self, **kwargs):
95
210
  """
96
- Create a payload for chat completion API calls.
211
+ Create a payload for chat completion based on the conversation state and configuration.
97
212
 
98
- Args:
99
- kwargs: Additional keyword arguments for customization.
213
+ Parameters:
214
+ kwargs: Additional keyword arguments for configuration.
215
+
216
+ Returns:
217
+ dict: The payload for chat completion.
100
218
  """
101
219
  # currently only openai chat completions are supported
102
220
  messages = self.conversation.messages
103
- request_url = f"https://api.openai.com/v1/chat/completions"
104
221
  config = {**self.llmconfig, **kwargs}
105
-
106
222
  payload = {
107
223
  "messages": messages,
108
224
  "model": config.get('model'),
109
225
  "frequency_penalty": config.get('frequency_penalty'),
110
- "max_tokens": config.get('max_tokens'),
111
226
  "n": config.get('n'),
112
227
  "presence_penalty": config.get('presence_penalty'),
113
228
  "response_format": config.get('response_format'),
@@ -115,35 +230,62 @@ class Session():
115
230
  "top_p": config.get('top_p'),
116
231
  }
117
232
 
118
- for key in ["seed", "stop", "stream", "tools", "tool_choice", "user"]:
119
- if config[key] is True:
233
+ for key in ["seed", "stop", "stream", "tools", "tool_choice", "user", "max_tokens"]:
234
+ if bool(config[key]) is True and str(config[key]) != "none":
120
235
  payload.update({key: config[key]})
121
-
122
- return (payload, request_url)
123
-
124
- async def call_chatcompletion(self, delay=1, **kwargs):
236
+ return payload
237
+
238
+ async def call_chatcompletion(self, sleep=0.1, **kwargs):
125
239
  """
126
- Make an asynchronous call to the chat completion API.
240
+ Make a call to the chat completion API and process the response.
127
241
 
128
- Args:
129
- delay: The delay (in seconds) between API calls.
130
- kwargs: Additional keyword arguments for customization.
242
+ Parameters:
243
+ sleep (float): The sleep duration after making the API call. Default is 0.1.
244
+ kwargs: Additional keyword arguments for configuration.
131
245
  """
132
- # currently only openai chat completions are supported
133
- payload, request_url = self.create_payload_chatcompletion(**kwargs)
246
+ endpoint = f"chat/completions"
134
247
  try:
135
248
  async with aiohttp.ClientSession() as session:
136
- completion = await self.api_service.call_api(session, request_url, payload)
249
+ payload = self.create_payload_chatcompletion(**kwargs)
250
+ completion = await self.api_service.call_api(
251
+ session, endpoint, payload)
137
252
  if "choices" in completion:
138
- completion = completion['choices'][0] # currently can only call one completion at a time, n has to be 1
139
- self.logger({"input":self.conversation.messages, "output": completion})
140
- response = {"role": "assistant", "content": completion['message']["content"]}
141
- self.conversation.responses.append(response)
253
+ self._logger({"input":payload, "output": completion})
254
+ self.conversation.add_messages(response=completion['choices'][0])
255
+ self.conversation.responses.append(self.conversation.messages[-1])
142
256
  self.conversation.response_counts += 1
143
- await asyncio.sleep(delay=delay)
257
+ await asyncio.sleep(sleep)
144
258
  status_tracker.num_tasks_succeeded += 1
145
259
  else:
146
260
  status_tracker.num_tasks_failed += 1
147
261
  except Exception as e:
148
262
  status_tracker.num_tasks_failed += 1
149
- raise e
263
+ raise e
264
+
265
+ def messages_to_csv(self, dir=None, filename="_messages.csv", **kwags):
266
+ """
267
+ Save conversation messages to a CSV file.
268
+
269
+ Parameters:
270
+ dir (Optional[str]): The directory path for saving the CSV file. Default is None.
271
+ filename (Optional[str]): The filename for the CSV file. Default is "_messages.csv".
272
+ kwargs: Additional keyword arguments for CSV file settings.
273
+ """
274
+ dir = dir or self._logger.dir
275
+ if dir is None:
276
+ raise ValueError("No directory specified.")
277
+ self.conversation.msg.to_csv(dir=dir, filename=filename, **kwags)
278
+
279
+ def log_to_csv(self, dir=None, filename="_llmlog.csv", **kwags):
280
+ """
281
+ Save conversation logs to a CSV file.
282
+
283
+ Parameters:
284
+ dir (Optional[str]): The directory path for saving the CSV file. Default is None.
285
+ filename (Optional[str]): The filename for the CSV file. Default is "_llmlog.csv".
286
+ kwargs: Additional keyword arguments for CSV file settings.
287
+ """
288
+ dir = dir or self._logger.dir
289
+ if dir is None:
290
+ raise ValueError("No directory specified.")
291
+ self._logger.to_csv(dir=dir, filename=filename, **kwags)
File without changes
lionagi/utils/__init__.py CHANGED
@@ -1,11 +1,10 @@
1
- from .sys_util import to_flat_dict, append_to_jsonl, to_list, str_to_num, make_copy, to_temp, to_csv, hold_call, ahold_call, l_call, al_call, m_call, am_call, e_call, ae_call, get_timestamp, _flatten_dict, _flatten_list, create_id, create_path
2
- from .api_util import StatusTracker, RateLimiter, BaseAPIService, AsyncQueue
1
+ from .sys_util import to_flat_dict, append_to_jsonl, to_list, str_to_num, make_copy, to_temp, to_csv, hold_call, ahold_call, l_call, al_call, m_call, am_call, e_call, ae_call, get_timestamp, create_path
3
2
  from .doc_util import dir_to_path, read_text, dir_to_files, chunk_text, file_to_chunks, file_to_chunks, get_bins
4
3
  from .log_util import DataLogger
4
+ from .tool_util import ToolManager
5
5
 
6
6
  __all__ = [
7
- "to_list", "str_to_num", "make_copy", "to_temp", "to_csv", "hold_call", "ahold_call", "l_call", "al_call", "m_call", "am_call", "e_call", "ae_call", "get_timestamp", "_flatten_dict", "_flatten_list", "create_id", "create_path", "to_flat_dict", "append_to_jsonl",
8
- "StatusTracker", "RateLimiter", "BaseAPIService", "AsyncQueue",
7
+ "to_list", "str_to_num", "make_copy", "to_temp", "to_csv", "hold_call", "ahold_call", "l_call", "al_call", "m_call", "am_call", "e_call", "ae_call", "get_timestamp", "create_path", "to_flat_dict", "append_to_jsonl",
9
8
  "dir_to_path", "read_text", "dir_to_files", "chunk_text", "file_to_chunks", "file_to_chunks", "get_bins",
10
- "DataLogger"
9
+ "DataLogger", "ToolManager"
11
10
  ]
lionagi/utils/api_util.py CHANGED
@@ -215,7 +215,8 @@ class RateLimiter(ABC):
215
215
  ...
216
216
  >>> limiter = MyRateLimiter(100, 200)
217
217
  """
218
- pass
218
+
219
+ ...
219
220
 
220
221
  @abstractmethod
221
222
  def calculate_num_token(self, payload: Dict[str, Any], api_endpoint: str) -> int:
@@ -241,7 +242,8 @@ class RateLimiter(ABC):
241
242
  >>> limiter.calculate_num_token({'data': '12345'}, 'api/send')
242
243
  0
243
244
  """
244
- pass
245
+
246
+ ...
245
247
 
246
248
  class BaseAPIService(ABC):
247
249
  """
@@ -272,9 +274,11 @@ class BaseAPIService(ABC):
272
274
  api_key: str,
273
275
  token_encoding_name: str,
274
276
  max_attempts: int,
275
- status_tracker: Optional[StatusTracker],
276
- rate_limiter: RateLimiter,
277
- queue: Optional[AsyncQueue]
277
+ max_requests_per_minute: int,
278
+ max_tokens_per_minute: int,
279
+ ratelimiter,
280
+ status_tracker: Optional[StatusTracker] = None,
281
+ queue: Optional[AsyncQueue] = None,
278
282
  ) -> None:
279
283
  """
280
284
  Initializes the BaseAPIService with necessary configuration.
@@ -299,8 +303,8 @@ class BaseAPIService(ABC):
299
303
  self.token_encoding_name = token_encoding_name
300
304
  self.max_attempts = max_attempts
301
305
  self.status_tracker = status_tracker or StatusTracker()
302
- self.rate_limiter = rate_limiter
303
306
  self.queue = queue or AsyncQueue()
307
+ self.rate_limiter = ratelimiter(max_requests_per_minute, max_tokens_per_minute)
304
308
 
305
309
  @abstractmethod
306
310
  async def call_api(self) -> Any:
@@ -316,7 +320,8 @@ class BaseAPIService(ABC):
316
320
  ... # Implementation details here
317
321
  ...
318
322
  """
319
- pass
323
+
324
+ ...
320
325
 
321
326
  def handle_error(
322
327
  self,
@@ -346,19 +351,6 @@ class BaseAPIService(ABC):
346
351
  self.append_to_jsonl(data, save_filepath)
347
352
  logging.error(f"Request failed after all attempts. Saving errors: {data}")
348
353
 
349
- @staticmethod
350
- def append_to_jsonl(data: Any, filename: str) -> None:
351
- """
352
- Appends the given data to the specified JSONL file.
353
-
354
- Args:
355
- data (Any): The data to be appended in JSON Lines format.
356
- filename (str): The file path to the JSONL file.
357
- """
358
- json_string = json.dumps(data)
359
- with open(filename, "a") as f:
360
- f.write(json_string + "\n")
361
-
362
354
  @staticmethod
363
355
  def api_endpoint_from_url(request_url: str) -> str:
364
356
  """
lionagi/utils/doc_util.py CHANGED
@@ -26,7 +26,7 @@ def dir_to_path(dir: str, ext, recursive: bool = False, flat: bool = True):
26
26
  def _dir_to_path(ext, recursive=recursive):
27
27
  tem = '**/*' if recursive else '*'
28
28
  return list(Path(dir).glob(tem + ext))
29
-
29
+
30
30
  return to_list(l_call(ext, _dir_to_path, flat=True), flat=flat)
31
31
 
32
32
  def read_text(filepath: str, clean: bool = True) -> str:
@@ -48,15 +48,15 @@ def read_text(filepath: str, clean: bool = True) -> str:
48
48
  content = f.read()
49
49
  if clean:
50
50
  # Define characters to replace and their replacements
51
- replacements = {'\\': ' ', '\\\n': ' ', '\\\t': ' ', ' ': ' ', '\'': ' '}
51
+ replacements = {'\\': ' ', '\n': ' ', '\t': ' ', ' ': ' ', '\'': ' '}
52
52
  for old, new in replacements.items():
53
53
  content = content.replace(old, new)
54
54
  return content
55
55
 
56
- def dir_to_files(dir: str, ext: str, recursive: bool = False,
57
- reader: Callable = read_text, clean: bool = True,
56
+ def dir_to_files(dir: str, ext: str, recursive: bool = False,
57
+ reader: Callable = read_text, clean: bool = True,
58
58
  to_csv: bool = False, project: str = 'project',
59
- output_dir: str = 'data/logs/sources/', filename: Optional[str] = None,
59
+ output_dir: str = 'data/logs/sources/', filename: Optional[str] = None,
60
60
  verbose: bool = True, timestamp: bool = True, logger: Optional[DataLogger] = None):
61
61
  """
62
62
  Reads and processes files in a specified directory with the given extension.
@@ -81,9 +81,9 @@ def dir_to_files(dir: str, ext: str, recursive: bool = False,
81
81
  Examples:
82
82
  >>> logs = dir_to_files(dir='my_directory', ext='.txt', to_csv=True)
83
83
  """
84
-
84
+
85
85
  sources = dir_to_path(dir, ext, recursive)
86
-
86
+
87
87
  def split_path(path: Path) -> tuple:
88
88
  folder_name = path.parent.name
89
89
  file_name = path.name
@@ -99,9 +99,9 @@ def dir_to_files(dir: str, ext: str, recursive: bool = False,
99
99
  "file_size": len(str(content)),
100
100
  'content': content
101
101
  } if content else None
102
-
102
+
103
103
  logs = to_list(l_call(sources, to_dict, flat=True), dropna=True)
104
-
104
+
105
105
  if to_csv:
106
106
  filename = filename or f"{project}_sources.csv"
107
107
  logger = DataLogger(dir=output_dir, log=logs) if not logger else logger
@@ -109,7 +109,7 @@ def dir_to_files(dir: str, ext: str, recursive: bool = False,
109
109
 
110
110
  return logs
111
111
 
112
- def chunk_text(input: str, chunk_size: int, overlap: float,
112
+ def chunk_text(input: str, chunk_size: int, overlap: float,
113
113
  threshold: int) -> List[Union[str, None]]:
114
114
  """
115
115
  Splits a string into chunks of a specified size, allowing for optional overlap between chunks.
@@ -127,19 +127,19 @@ def chunk_text(input: str, chunk_size: int, overlap: float,
127
127
  Returns:
128
128
  List[Union[str, None]]: List of text chunks.
129
129
  """
130
-
130
+
131
131
  try:
132
132
  # Ensure text is a string
133
133
  if not isinstance(input, str):
134
134
  input = str(input)
135
-
135
+
136
136
  chunks = []
137
137
  n_chunks = math.ceil(len(input) / chunk_size)
138
138
  overlap_size = int(chunk_size * overlap / 2)
139
-
139
+
140
140
  if n_chunks == 1:
141
141
  return [input]
142
-
142
+
143
143
  elif n_chunks == 2:
144
144
  chunks.append(input[:chunk_size + overlap_size])
145
145
  if len(input) - chunk_size > threshold:
@@ -147,28 +147,28 @@ def chunk_text(input: str, chunk_size: int, overlap: float,
147
147
  else:
148
148
  return [input]
149
149
  return chunks
150
-
150
+
151
151
  elif n_chunks > 2:
152
152
  chunks.append(input[:chunk_size + overlap_size])
153
153
  for i in range(1, n_chunks - 1):
154
154
  start_idx = chunk_size * i - overlap_size
155
155
  end_idx = chunk_size * (i + 1) + overlap_size
156
156
  chunks.append(input[start_idx:end_idx])
157
-
157
+
158
158
  if len(input) - chunk_size * (n_chunks - 1) > threshold:
159
159
  chunks.append(input[chunk_size * (n_chunks - 1) - overlap_size:])
160
160
  else:
161
- chunks[-1] += input[chunk_size * (n_chunks - 1):]
162
-
161
+ chunks[-1] += input[chunk_size * (n_chunks - 1) + overlap_size:]
162
+
163
163
  return chunks
164
-
164
+
165
165
  except Exception as e:
166
166
  raise ValueError(f"An error occurred while chunking the text. {e}")
167
167
 
168
- def _file_to_chunks(input: Dict[str, Any],
169
- field: str = 'content',
170
- chunk_size: int = 1500,
171
- overlap: float = 0.2,
168
+ def _file_to_chunks(input: Dict[str, Any],
169
+ field: str = 'content',
170
+ chunk_size: int = 1500,
171
+ overlap: float = 0.2,
172
172
  threshold: int = 200) -> List[Dict[str, Any]]:
173
173
  """
174
174
  Splits text from a specified dictionary field into chunks and returns a list of dictionaries.
@@ -195,7 +195,7 @@ def _file_to_chunks(input: Dict[str, Any],
195
195
  try:
196
196
  out = {key: value for key, value in input.items() if key != field}
197
197
  out.update({"chunk_overlap": overlap, "chunk_threshold": threshold})
198
-
198
+
199
199
  chunks = chunk_text(input[field], chunk_size=chunk_size, overlap=overlap, threshold=threshold)
200
200
  logs = []
201
201
  for i, chunk in enumerate(chunks):
@@ -209,22 +209,22 @@ def _file_to_chunks(input: Dict[str, Any],
209
209
  logs.append(chunk_dict)
210
210
 
211
211
  return logs
212
-
212
+
213
213
  except Exception as e:
214
214
  raise ValueError(f"An error occurred while chunking the file. {e}")
215
-
216
- def file_to_chunks(input,
217
- field: str = 'content',
218
- chunk_size: int = 1500,
219
- overlap: float = 0.2,
220
- threshold: int = 200,
221
- to_csv=False,
215
+
216
+ def file_to_chunks(input,
217
+ field: str = 'content',
218
+ chunk_size: int = 1500,
219
+ overlap: float = 0.2,
220
+ threshold: int = 200,
221
+ to_csv=False,
222
222
  project='project',
223
- output_dir='data/logs/sources/',
223
+ output_dir='data/logs/sources/',
224
224
  chunk_func = _file_to_chunks,
225
- filename=None,
226
- verbose=True,
227
- timestamp=True,
225
+ filename=None,
226
+ verbose=True,
227
+ timestamp=True,
228
228
  logger=None):
229
229
  """
230
230
  Splits text from a specified dictionary field into chunks and returns a list of dictionaries.
@@ -243,10 +243,10 @@ def file_to_chunks(input,
243
243
  timestamp: If True, include a timestamp in the exported file name.
244
244
  logger: An optional DataLogger instance for logging.
245
245
  """
246
-
246
+
247
247
  f = lambda x: chunk_func(x, field=field, chunk_size=chunk_size, overlap=overlap, threshold=threshold)
248
248
  logs = to_list(l_call(input, f), flat=True)
249
-
249
+
250
250
  if to_csv:
251
251
  filename = filename if filename else f"{project}_sources.csv"
252
252
  logger = DataLogger(log=logs) if not logger else logger