lionagi 0.0.106__py3-none-any.whl → 0.0.107__py3-none-any.whl

lionagi/utils/api_util.py CHANGED
@@ -1,27 +1,37 @@
  import asyncio
- import json
  import logging
  import re
  from abc import ABC, abstractmethod
  from dataclasses import dataclass
  from typing import Any, Callable, Dict, Generator, NoReturn, Optional

+ from .sys_util import append_to_jsonl
+
+
  @dataclass
  class StatusTracker:
- """Class for keeping track of various task statuses.
+ """
+ Class for keeping track of various task statuses.

  This class serves as a simple way to monitor different types of task
  outcomes and errors within a system. It uses dataclasses for easy
  creation and management of state.

  Attributes:
- num_tasks_started: The number of tasks that have been initiated.
- num_tasks_in_progress: The number of tasks currently being processed.
- num_tasks_succeeded: The number of tasks that have completed successfully.
- num_tasks_failed: The number of tasks that have failed.
- num_rate_limit_errors: The number of tasks that failed due to rate limiting.
- num_api_errors: The number of tasks that failed due to API errors.
- num_other_errors: The number of tasks that failed due to other errors.
+ num_tasks_started:
+ The number of tasks that have been initiated.
+ num_tasks_in_progress:
+ The number of tasks currently being processed.
+ num_tasks_succeeded:
+ The number of tasks that have completed successfully.
+ num_tasks_failed:
+ The number of tasks that have failed.
+ num_rate_limit_errors:
+ The number of tasks that failed due to rate limiting.
+ num_api_errors:
+ The number of tasks that failed due to API errors.
+ num_other_errors:
+ The number of tasks that failed due to other errors.
  """
  num_tasks_started: int = 0
  num_tasks_in_progress: int = 0
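
For orientation, a minimal sketch of updating these counters around a single task; only the field names come from the hunk above, the task itself is a hypothetical stand-in:

    # Hypothetical usage of StatusTracker; field names match the diff above.
    tracker = StatusTracker()

    def do_work():
        return "ok"  # stand-in for a real task

    tracker.num_tasks_started += 1
    tracker.num_tasks_in_progress += 1
    try:
        do_work()
        tracker.num_tasks_succeeded += 1
    except Exception:
        tracker.num_tasks_failed += 1
    finally:
        tracker.num_tasks_in_progress -= 1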
@@ -41,16 +51,24 @@ class AsyncQueue:
  concurrent task processing in an orderly and controlled manner.

  Attributes:
- queue (asyncio.Queue): A queue to hold items for asynchronous processing.
- _stop_event (asyncio.Event): An event to signal when the queue should stop processing.
+ queue (asyncio.Queue):
+ A queue to hold items for asynchronous processing.
+ _stop_event (asyncio.Event):
+ An event to signal when the queue should stop processing.

  Methods:
- enqueue(item): Add an item to the queue for processing.
- dequeue(): Remove and return an item from the queue.
- join(): Wait until all items in the queue have been processed.
- stop(): Signal to stop processing new items in the queue.
- stopped(): Check if the queue has been signaled to stop.
- process_requests(func): Process items using a provided function.
+ enqueue(item):
+ Add an item to the queue for processing.
+ dequeue():
+ Remove and return an item from the queue.
+ join():
+ Wait until all items in the queue have been processed.
+ stop():
+ Signal to stop processing new items in the queue.
+ stopped():
+ Check if the queue has been signaled to stop.
+ process_requests(func):
+ Process items using a provided function.
  """

  def __init__(self) -> None:
@@ -64,7 +82,7 @@ class AsyncQueue:
  """
  Asynchronously add an item to the queue for processing.

- Args:
+ Parameters:
  item (Any): The item to be added to the queue.

  Example:
@@ -139,7 +157,7 @@ class AsyncQueue:
  Continuously dequeues items and applies the given function to each.
  The processing stops when the queue is signaled to stop or a sentinel value (`None`) is dequeued.

- Args:
+ Parameters:
  func (Callable[[Any], Any]): A coroutine function to process items from the queue.

  Example:
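
A minimal driver for this consumer loop, assuming only the methods named in the docstring (enqueue, process_requests) and the documented `None` sentinel:

    import asyncio

    async def handle(item):
        print("processed", item)  # stand-in for real per-item work

    async def main(queue):  # queue: an AsyncQueue instance
        for item in ("a", "b", "c"):
            await queue.enqueue(item)
        await queue.enqueue(None)  # sentinel: docstring says None stops processing
        await queue.process_requests(handle)

    # asyncio.run(main(AsyncQueue()))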
@@ -166,23 +184,29 @@ class RateLimiter(ABC):
  of requests sent to or received from a network interface controller or an API.

  Attributes:
- max_requests_per_minute (int): Maximum number of requests permitted per minute.
- max_tokens_per_minute (int): Maximum number of tokens that can accumulate per minute.
- available_request_capacity (int): Current number of available request slots.
- available_token_capacity (int): Current number of available tokens.
- rate_limit_replenisher_task (asyncio.Task): Background task for replenishing rate limits.
+ max_requests_per_minute (int):
+ Maximum number of requests permitted per minute.
+ max_tokens_per_minute (int):
+ Maximum number of tokens that can accumulate per minute.
+ available_request_capacity (int):
+ Current number of available request slots.
+ available_token_capacity (int):
+ Current number of available tokens.

  Methods:
- rate_limit_replenisher: Coroutine to replenish rate limits over time.
- calculate_num_token: Method to calculate required tokens for a request.
+ rate_limit_replenisher:
+ Coroutine to replenish rate limits over time.
+ calculate_num_token:
+ Method to calculate required tokens for a request.
  """

  def __init__(self, max_requests_per_minute: int, max_tokens_per_minute: int) -> None:
  """
  Initializes the RateLimiter with specified maximum request and token limits.

- Args:
+ Parameters:
  max_requests_per_minute (int): Maximum requests allowed per minute.
+
  max_tokens_per_minute (int): Maximum tokens allowed to accumulate per minute.

  Example:
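
A concrete subclass might look like the sketch below; only the attribute and method names come from the docstring, while the once-per-minute replenish policy and the word-count token estimate are assumptions:

    import asyncio
    from typing import Any, Dict

    class SimpleRateLimiter(RateLimiter):
        async def rate_limit_replenisher(self) -> None:
            while True:
                await asyncio.sleep(60)  # assumed interval: once per minute
                self.available_request_capacity = self.max_requests_per_minute
                self.available_token_capacity = self.max_tokens_per_minute

        def calculate_num_token(self, payload: Dict[str, Any], api_endpoint: str) -> int:
            # Crude stand-in: whitespace word count, not a real tokenizer.
            return len(str(payload.get("prompt", "")).split())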
@@ -226,8 +250,9 @@ class RateLimiter(ABC):
  Subclasses should implement this method to determine the number of tokens needed based
  on the request payload and target endpoint.

- Args:
+ Parameters:
  payload (Dict[str, Any]): Payload of the request.
+
  api_endpoint (str): Target API endpoint for the request.

  Returns:
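
If the intended accounting is OpenAI-style token counting, a calculate_num_token implementation could lean on the tiktoken package; this is an assumption, since the diff does not show the method body:

    import tiktoken

    def count_tokens(text: str, encoding_name: str = "cl100k_base") -> int:
        # Encode with the named tiktoken encoding and count the token ids.
        enc = tiktoken.get_encoding(encoding_name)
        return len(enc.encode(text))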
@@ -245,6 +270,7 @@ class RateLimiter(ABC):

  ...

+
  class BaseAPIService(ABC):
  """
  Abstract base class for API services requiring asynchronous operations.
@@ -254,19 +280,30 @@ class BaseAPIService(ABC):
  subclassed for concrete implementations of specific API service interactions.

  Attributes:
- api_key (str): The API key used for authenticating with the API service.
- token_encoding_name (str): The encoding for the API token.
- max_attempts (int): The maximum number of retry attempts for API calls.
- status_tracker (StatusTracker): Tracker for API call statuses.
- rate_limiter (RateLimiter): Limiter to control the rate of API calls.
- queue (AsyncQueue): Queue for managing API call tasks.
+ api_key (str):
+ The API key used for authenticating with the API service.
+ token_encoding_name (str):
+ The encoding for the API token.
+ max_attempts (int):
+ The maximum number of retry attempts for API calls.
+ status_tracker (StatusTracker):
+ Tracker for API call statuses.
+ rate_limiter (RateLimiter):
+ Limiter to control the rate of API calls.
+ queue (AsyncQueue):
+ Queue for managing API call tasks.

  Methods:
- call_api: Abstract method to define API call mechanism in subclasses.
- handle_error: Handle errors by logging and saving details to a JSONL file.
- append_to_jsonl: Append data to a file in JSONL format.
- api_endpoint_from_url: Extract the API endpoint from a URL.
- task_id_generator_function: Generate a sequence of unique task IDs.
+ call_api:
+ Abstract method to define API call mechanism in subclasses.
+ handle_error:
+ Handle errors by logging and saving details to a JSONL file.
+ append_to_jsonl:
+ Append data to a file in JSONL format.
+ api_endpoint_from_url:
+ Extract the API endpoint from a URL.
+ task_id_generator_function:
+ Generate a sequence of unique task IDs.
  """

  def __init__(
@@ -278,17 +315,22 @@ class BaseAPIService(ABC):
  max_tokens_per_minute: int,
  ratelimiter,
  status_tracker: Optional[StatusTracker] = None,
- queue: Optional[AsyncQueue] = None,
+ queue: Optional[AsyncQueue] = None,
  ) -> None:
  """
  Initializes the BaseAPIService with necessary configuration.

- Args:
+ Parameters:
  api_key (str): The API key for authentication.
+
  token_encoding_name (str): Encoding name for the API token.
+
  max_attempts (int): Maximum number of attempts for an API call.
+
  status_tracker (Optional[StatusTracker]): Tracker for API call statuses.
-
- rate_limiter (RateLimiter): Limiter for API call rates.
+
+ ratelimiter: Limiter for API call rates.
+
  queue (Optional[AsyncQueue]): Queue for managing API tasks.

  Example:
@@ -297,7 +339,7 @@ class BaseAPIService(ABC):
  ...
  >>> service = MyAPIService(api_key="12345", token_encoding_name="utf-8",
  ... max_attempts=3, status_tracker=None,
- ... rate_limiter=my_rate_limiter, queue=None)
+ ... rate_limiter=ratelimiter, queue=None)
  """
  self.api_key = api_key
  self.token_encoding_name = token_encoding_name
@@ -305,6 +347,7 @@ class BaseAPIService(ABC):
  self.status_tracker = status_tracker or StatusTracker()
  self.queue = queue or AsyncQueue()
  self.rate_limiter = ratelimiter(max_requests_per_minute, max_tokens_per_minute)
+ self.append_to_jsonl = append_to_jsonl

  @abstractmethod
  async def call_api(self) -> Any:
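
Since call_api is the only abstract method shown, a concrete service could be sketched as follows; the loop body is hypothetical and relies only on the AsyncQueue methods documented earlier:

    class MyAPIService(BaseAPIService):
        async def call_api(self):
            # Drain the queue until stopped or the None sentinel arrives.
            while not self.queue.stopped():
                item = await self.queue.dequeue()
                if item is None:
                    break
                # ... send the request, update self.status_tracker ...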
@@ -335,10 +378,13 @@ class BaseAPIService(ABC):

  Updates the status tracker to indicate the error and saves details to a JSONL file.

- Args:
+ Parameters:
  error (Exception): The exception that was raised during the API call.
+
  payload (Any): The data payload that was used for the API call.
+
  metadata (Any): Additional metadata related to the API call.
+
  save_filepath (str): The file path where error details should be saved.
  """
  self.status_tracker.num_tasks_in_progress -= 1
@@ -356,7 +402,7 @@ class BaseAPIService(ABC):
  """
  Extracts the endpoint from an API request URL.

- Args:
+ Parameters:
  request_url (str): The URL from which to extract the API endpoint.

  Returns:
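
The diff shows only the docstring of this helper; one plausible shape, using the `re` module already imported at the top of the file (the exact pattern is an assumption):

    import re

    def api_endpoint_from_url(request_url: str) -> str:
        # e.g. "https://api.openai.com/v1/chat/completions" -> "chat/completions"
        match = re.search(r"^https://[^/]+/v\d+/(.+)$", request_url)
        return match.group(1) if match else ""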
lionagi/utils/doc_util.py CHANGED
@@ -12,8 +12,11 @@ def dir_to_path(dir: str, ext, recursive: bool = False, flat: bool = True):

  Parameters:
  dir (str): The directory path where to search for files.
+
  ext (str): The file extension to filter by.
+
  recursive (bool, optional): If True, search for files recursively in subdirectories. Defaults to False.
+
  flat (bool, optional): If True, return a flat list of file paths. Defaults to True.

  Returns:
@@ -27,14 +30,18 @@ def dir_to_path(dir: str, ext, recursive: bool = False, flat: bool = True):
  tem = '**/*' if recursive else '*'
  return list(Path(dir).glob(tem + ext))

- return to_list(l_call(ext, _dir_to_path, flat=True), flat=flat)
-
+ try:
+ return to_list(l_call(ext, _dir_to_path, flat=True), flat=flat)
+ except:
+ raise ValueError("Invalid directory or extension, please check the path")
+
  def read_text(filepath: str, clean: bool = True) -> str:
  """
  Reads the content of a text file and optionally cleans it by removing specified characters.

  Parameters:
  filepath (str): The path to the text file to be read.
+
  clean (bool, optional): If True, clean the content by removing specific unwanted characters. Defaults to True.

  Returns:
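
An illustrative call (the directory is made up). Note that the glob pattern is built as `tem + ext`, so the extension should include its dot, and the new bare `except:` will surface any failure, not just bad paths, as ValueError:

    paths = dir_to_path("data/docs/", ".txt", recursive=True)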
@@ -63,16 +70,27 @@ def dir_to_files(dir: str, ext: str, recursive: bool = False,

  Parameters:
  dir (str): The directory path where files are located.
+
  ext (str): The file extension to filter by.
+
  recursive (bool, optional): If True, search files recursively in subdirectories. Defaults to False.
+
  reader (Callable, optional): Function used to read and process the content of each file. Defaults to read_text.
+
  clean (bool, optional): If True, cleans the content by removing specified characters. Defaults to True.
+
  to_csv (bool, optional): If True, export the processed data to a CSV file. Defaults to False.
+
  project (str, optional): The name of the project. Defaults to 'project'.
+
  output_dir (str, optional): Directory path for exporting the CSV file. Defaults to 'data/logs/sources/'.
+
  filename (Optional[str], optional): Name of the CSV file, if not provided, a default will be used. Defaults to None.
+
  verbose (bool, optional): If True, print a message upon CSV export. Defaults to True.
+
  timestamp (bool, optional): If True, include a timestamp in the file name. Defaults to True.
+
  logger (Optional[DataLogger], optional): An instance of DataLogger for logging, if not provided, a new one will be created. Defaults to None.

  Returns:
@@ -84,13 +102,13 @@ def dir_to_files(dir: str, ext: str, recursive: bool = False,

  sources = dir_to_path(dir, ext, recursive)

- def split_path(path: Path) -> tuple:
+ def _split_path(path: Path) -> tuple:
  folder_name = path.parent.name
  file_name = path.name
  return (folder_name, file_name)

- def to_dict(path_: Path) -> Dict[str, Union[str, Path]]:
- folder, file = split_path(path_)
+ def _to_dict(path_: Path) -> Dict[str, Union[str, Path]]:
+ folder, file = _split_path(path_)
  content = reader(str(path_), clean=clean)
  return {
  'project': project,
@@ -100,7 +118,7 @@ def dir_to_files(dir: str, ext: str, recursive: bool = False,
  'content': content
  } if content else None

- logs = to_list(l_call(sources, to_dict, flat=True), dropna=True)
+ logs = to_list(l_call(sources, _to_dict, flat=True), dropna=True)

  if to_csv:
  filename = filename or f"{project}_sources.csv"
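
A hypothetical end-to-end call with the documented defaults (paths and project name are made up):

    files = dir_to_files("data/docs/", ".txt", recursive=True,
                         to_csv=True, project="demo")
    # each entry is a dict with at least 'project' and 'content' keys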
@@ -114,14 +132,18 @@ def chunk_text(input: str, chunk_size: int, overlap: float,
  """
  Splits a string into chunks of a specified size, allowing for optional overlap between chunks.

- Args:
+ Parameters:
  input (str): The text to be split into chunks.
+
  chunk_size (int): The size of each chunk in characters.
+
  overlap (float): A value between [0, 1] specifying the percentage of overlap between adjacent chunks.
+
  threshold (int): The minimum size for the last chunk. If the last chunk is smaller than this, it will be merged with the previous chunk.

  Raises:
  TypeError: If input text cannot be converted to a string.
+
  ValueError: If any error occurs during the chunking process.

  Returns:
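
Back-of-the-envelope reading of these parameters, assuming overlap is measured as a fraction of chunk_size (the diff does not show the formula):

    chunk_size, overlap = 1500, 0.2
    shared = int(chunk_size * overlap)  # ~300 characters shared between neighbors
    step = chunk_size - shared          # ~1200 new characters per chunk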
@@ -173,11 +195,15 @@ def _file_to_chunks(input: Dict[str, Any],
  """
  Splits text from a specified dictionary field into chunks and returns a list of dictionaries.

- Args:
+ Parameters:
  input (Dict[str, Any]): The input dictionary containing the text field to be chunked.
+
  field (str, optional): The dictionary key corresponding to the text field. Defaults to 'content'.
+
  chunk_size (int, optional): Size of each text chunk in characters. Defaults to 1500.
+
  overlap (float, optional): Percentage of overlap between adjacent chunks, in the range [0, 1]. Defaults to 0.2.
+
  threshold (int, optional): Minimum size for the last chunk. If smaller, it will be merged with the previous chunk. Defaults to 200.

  Raises:
@@ -229,23 +255,39 @@ def file_to_chunks(input,
  """
  Splits text from a specified dictionary field into chunks and returns a list of dictionaries.

- Args:
+ Parameters:
  input (List[Dict[str, Any]]): The input dictionaries containing the text field to be chunked.
+
  field (str, optional): The dictionary key corresponding to the text field. Defaults to 'content'.
+
  chunk_size (int, optional): Size of each text chunk in characters. Defaults to 1500.
+
  overlap (float, optional): Percentage of overlap between adjacent chunks, in the range [0, 1]. Defaults to 0.2.
+
  threshold (int, optional): Minimum size for the last chunk. If smaller, it will be merged with the previous chunk. Defaults to 200.
- to_csv: If True, export the processed data to a CSV file.
- project: The name of the project.
- output_dir: The directory path for exporting the CSV file.
- filename: The name of the CSV file.
- verbose: If True, print a verbose message after export.
- timestamp: If True, include a timestamp in the exported file name.
- logger: An optional DataLogger instance for logging.
+
+ to_csv (bool, optional): If True, export the processed data to a CSV file.
+
+ project (str, optional): The name of the project.
+
+ output_dir (str, optional): The directory path for exporting the CSV file.
+
+ chunk_func (function, optional): The function to be used for chunking. Defaults to _file_to_chunks.
+
+ filename (str, optional): The name of the CSV file.
+
+ verbose (bool, optional): If True, print a verbose message after export.
+
+ timestamp (bool, optional): If True, include a timestamp in the exported file name.
+
+ logger (DataLogger, optional): An optional DataLogger instance for logging.
+
+ Returns:
+ List[Dict[str, Any]]: A list of dictionaries representing the processed text chunks.
  """

- f = lambda x: chunk_func(x, field=field, chunk_size=chunk_size, overlap=overlap, threshold=threshold)
- logs = to_list(l_call(input, f), flat=True)
+ _f = lambda x: chunk_func(x, field=field, chunk_size=chunk_size, overlap=overlap, threshold=threshold)
+ logs = to_list(l_call(input, _f), flat=True)

  if to_csv:
  filename = filename if filename else f"{project}_sources.csv"
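
Combining the helpers above into a hypothetical pipeline:

    docs = dir_to_files("data/docs/", ".txt")
    chunks = file_to_chunks(docs, field="content", chunk_size=1500,
                            overlap=0.2, threshold=200)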
@@ -259,18 +301,19 @@ def get_bins(input: List[str], upper: int = 7500) -> List[List[int]]:
  Get index of elements in a list based on their consecutive cumulative sum of length,
  according to some upper threshold. Return lists of indices as bins.

- Args:
- input (List[str]): List of items to be binned.
- upper (int, optional): Upper threshold for the cumulative sum of the length of items in a bin. Default is 7500.
+ Parameters:
+ input (List[str]): List of items to be binned.
+
+ upper (int, optional): Upper threshold for the cumulative sum of the length of items in a bin. Default is 7500.

  Returns:
- List[List[int]]: List of lists, where each inner list contains the indices of the items that form a bin.
+ List[List[int]]: List of lists, where each inner list contains the indices of the items that form a bin.

  Example:
- >>> items = ['apple', 'a', 'b', 'banana', 'cheery', 'c', 'd', 'e']
- >>> upper = 10
- >>> get_bins(items, upper)
- [[0, 1, 2], [3], [4, 5, 6, 7]]
+ >>> items = ['apple', 'a', 'b', 'banana', 'cheery', 'c', 'd', 'e']
+ >>> upper = 10
+ >>> get_bins(items, upper)
+ [[0, 1, 2], [3], [4, 5, 6, 7]]
  """
  current = 0
  bins = []
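
The doctest pins down the behavior; a greedy sketch that reproduces it (not necessarily the shipped implementation, which the hunk truncates after `bins = []`):

    from typing import List

    def get_bins_sketch(items: List[str], upper: int = 7500) -> List[List[int]]:
        bins, current_bin, current = [], [], 0
        for i, item in enumerate(items):
            # Start a new bin when adding this item would exceed the threshold.
            if current_bin and current + len(item) > upper:
                bins.append(current_bin)
                current_bin, current = [], 0
            current_bin.append(i)
            current += len(item)
        if current_bin:
            bins.append(current_bin)
        return bins

    # get_bins_sketch(['apple', 'a', 'b', 'banana', 'cheery', 'c', 'd', 'e'], 10)
    # -> [[0, 1, 2], [3], [4, 5, 6, 7]]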
lionagi/utils/log_util.py CHANGED
@@ -11,14 +11,18 @@ class DataLogger:
  and setting the directory where the logs should be saved.

  Attributes:
- dir (str): The directory where the log files are to be saved.
- log (deque): A deque that stores log entries.
+ dir (str):
+ The directory where the log files are to be saved.
+ log (deque):
+ A deque that stores log entries.

  Methods:
- __call__(entry): Appends a new entry to the log.
+ __call__(entry):
+ Appends a new entry to the log.
  to_csv(dir: str, filename: str, verbose: bool, timestamp: bool, dir_exist_ok: bool, file_exist_ok: bool):
  Converts the log to a CSV format and saves it to a file.
- set_dir(dir: str): Sets the directory for saving log files.
+ set_dir(dir: str):
+ Sets the directory for saving log files.
  """

  def __init__(self, dir= None, log: list = None) -> None:
@@ -27,6 +31,7 @@ class DataLogger:


  Parameters:
  dir (str, optional): The directory where the log files will be saved. Defaults to None.
+
  log (list, optional): An initial list of log entries. Defaults to an empty deque.
  """
  self.dir = dir
 
48
53
  Parameters:
49
54
  dir (str): The directory where the CSV file will be saved.
55
+
50
56
  filename (str): The name of the CSV file.
57
+
51
58
  verbose (bool, optional): If True, prints a message after saving the log. Defaults to True.
59
+
52
60
  timestamp (bool, optional): If True, appends a timestamp to the filename. Defaults to True.
61
+
53
62
  dir_exist_ok (bool, optional): If True, overrides the existing directory if needed. Defaults to True.
63
+
54
64
  file_exist_ok (bool, optional): If True, overrides the existing file if needed. Defaults to False.
55
65
 
56
66
  Postconditions:
57
67
  Saves the log entries to a CSV file and clears the `log` attribute.
68
+
58
69
  Optionally prints a message with the number of log entries saved and the file path.
59
70
  """
60
71
  filepath = create_path(dir=dir, filename=filename, timestamp=timestamp, dir_exist_ok=dir_exist_ok)
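
A hypothetical round-trip through the documented DataLogger methods:

    logger = DataLogger(dir="data/logs/")
    logger({"input": "hi", "output": "hello"})  # __call__ appends an entry
    logger.to_csv(dir="data/logs/", filename="run.csv", verbose=True,
                  timestamp=True, dir_exist_ok=True, file_exist_ok=False)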