llm-ie 1.3.0__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
llm_ie/extractors.py CHANGED
@@ -6,8 +6,7 @@ import warnings
6
6
  import itertools
7
7
  import asyncio
8
8
  import nest_asyncio
9
- from concurrent.futures import ThreadPoolExecutor
10
- from typing import Any, Set, List, Dict, Tuple, Union, Callable, Generator, Optional, AsyncGenerator
9
+ from typing import Any, Set, List, Dict, Tuple, Union, Callable, Generator, Optional
11
10
  from llm_ie.utils import extract_json, apply_prompt_template
12
11
  from llm_ie.data_types import FrameExtractionUnit, LLMInformationExtractionFrame, LLMInformationExtractionDocument
13
12
  from llm_ie.chunkers import UnitChunker, WholeDocumentUnitChunker, SentenceUnitChunker
@@ -306,7 +305,7 @@ class StructExtractor(Extractor):
306
305
  yield {"type": "info", "data": "All units processed by LLM."}
307
306
  return units
308
307
 
309
- async def extract_async(self, text_content:Union[str, Dict[str,str]], document_key:str=None,
308
+ async def _extract_async(self, text_content:Union[str, Dict[str,str]], document_key:str=None,
310
309
  concurrent_batch_size:int=32, return_messages_log:bool=False) -> List[FrameExtractionUnit]:
311
310
  """
312
311
  This is the asynchronous version of the extract() method.
@@ -422,6 +421,28 @@ class StructExtractor(Extractor):
422
421
  for struct in structs:
423
422
  aggregated_struct.update(struct)
424
423
  return aggregated_struct
424
+
425
+ def _post_process_struct(self, units: List[FrameExtractionUnit]) -> Dict[str, Any]:
426
+ """
427
+ Helper method to post-process units into a structured dictionary.
428
+ Shared by extract_struct and extract_struct_async.
429
+ """
430
+ struct_json = []
431
+ for unit in units:
432
+ if unit.status != "success":
433
+ continue
434
+ try:
435
+ unit_struct_json = extract_json(unit.get_generated_text())
436
+ struct_json.extend(unit_struct_json)
437
+ except Exception as e:
438
+ unit.set_status("fail")
439
+ warnings.warn(f"Struct extraction failed for unit ({unit.start}, {unit.end}): {e}", RuntimeWarning)
440
+
441
+ if self.aggregation_func is None:
442
+ struct = self._default_struct_aggregate(struct_json)
443
+ else:
444
+ struct = self.aggregation_func(struct_json)
445
+ return struct
425
446
 
426
447
 
427
448
  def extract_struct(self, text_content:Union[str, Dict[str,str]], document_key:str=None,
@@ -457,7 +478,7 @@ class StructExtractor(Extractor):
457
478
  warnings.warn("verbose=True is not supported in concurrent mode.", RuntimeWarning)
458
479
 
459
480
  nest_asyncio.apply() # For Jupyter notebook. Terminal does not need this.
460
- extraction_results = asyncio.run(self.extract_async(text_content=text_content,
481
+ extraction_results = asyncio.run(self._extract_async(text_content=text_content,
461
482
  document_key=document_key,
462
483
  concurrent_batch_size=concurrent_batch_size,
463
484
  return_messages_log=return_messages_log)
@@ -470,26 +491,29 @@ class StructExtractor(Extractor):
470
491
 
471
492
  units, messages_log = extraction_results if return_messages_log else (extraction_results, None)
472
493
 
473
- struct_json = []
474
- for unit in units:
475
- if unit.status != "success":
476
- continue
477
- try:
478
- unit_struct_json = extract_json(unit.get_generated_text())
479
- struct_json.extend(unit_struct_json)
480
- except Exception as e:
481
- unit.set_status("fail")
482
- warnings.warn(f"Struct extraction failed for unit ({unit.start}, {unit.end}): {e}", RuntimeWarning)
483
-
484
- if self.aggregation_func is None:
485
- struct = self._default_struct_aggregate(struct_json)
486
- else:
487
- struct = self.aggregation_func(struct_json)
494
+ struct = self._post_process_struct(units)
488
495
 
489
496
  if return_messages_log:
490
497
  return struct, messages_log
491
498
  return struct
492
499
 
500
+ async def extract_struct_async(self, text_content:Union[str, Dict[str,str]], document_key:str=None,
501
+ concurrent_batch_size:int=32, return_messages_log:bool=False) -> Dict[str, Any]:
502
+ """
503
+ This is the async version of extract_struct.
504
+ """
505
+ extraction_results = await self._extract_async(text_content=text_content,
506
+ document_key=document_key,
507
+ concurrent_batch_size=concurrent_batch_size,
508
+ return_messages_log=return_messages_log)
509
+
510
+ units, messages_log = extraction_results if return_messages_log else (extraction_results, None)
511
+ struct = self._post_process_struct(units)
512
+
513
+ if return_messages_log:
514
+ return struct, messages_log
515
+ return struct
516
+
493
517
 
494
518
  class BasicStructExtractor(StructExtractor):
495
519
  def __init__(self, inference_engine:InferenceEngine, prompt_template:str,
@@ -725,6 +749,14 @@ class FrameExtractor(Extractor):
725
749
  """
726
750
  return NotImplemented
727
751
 
752
+ @abc.abstractmethod
753
+ async def extract_frames_async(self, text_content:Union[str, Dict[str,str]], entity_key:str,
754
+ document_key:str=None, return_messages_log:bool=False, **kwrs) -> List[LLMInformationExtractionFrame]:
755
+ """
756
+ This is the async version of extract_frames.
757
+ """
758
+ return NotImplemented
759
+
728
760
 
729
761
  class DirectFrameExtractor(FrameExtractor):
730
762
  def __init__(self, inference_engine:InferenceEngine, unit_chunker:UnitChunker,
@@ -933,7 +965,7 @@ class DirectFrameExtractor(FrameExtractor):
933
965
  yield {"type": "info", "data": "All units processed by LLM."}
934
966
  return units
935
967
 
936
- async def extract_async(self, text_content:Union[str, Dict[str,str]], document_key:str=None,
968
+ async def _extract_async(self, text_content:Union[str, Dict[str,str]], document_key:str=None,
937
969
  concurrent_batch_size:int=32, return_messages_log:bool=False) -> List[FrameExtractionUnit]:
938
970
  """
939
971
  This is the asynchronous version of the extract() method.
@@ -1040,6 +1072,45 @@ class DirectFrameExtractor(FrameExtractor):
1040
1072
  else:
1041
1073
  return units
1042
1074
 
1075
+ def _post_process_units_to_frames(self, units, case_sensitive, fuzzy_match, fuzzy_buffer_size, fuzzy_score_cutoff, allow_overlap_entities):
1076
+ ENTITY_KEY = "entity_text"
1077
+ frame_list = []
1078
+ for unit in units:
1079
+ entity_json = []
1080
+ if unit.status != "success":
1081
+ warnings.warn(f"Skipping failed unit ({unit.start}, {unit.end}): {unit.text}", RuntimeWarning)
1082
+ continue
1083
+ for entity in extract_json(gen_text=unit.gen_text):
1084
+ if ENTITY_KEY in entity:
1085
+ entity_json.append(entity)
1086
+ else:
1087
+ warnings.warn(f'Extractor output "{entity}" does not have entity_key ("{ENTITY_KEY}"). This frame will be dropped.', RuntimeWarning)
1088
+
1089
+ spans = self._find_entity_spans(text=unit.text,
1090
+ entities=[e[ENTITY_KEY] for e in entity_json],
1091
+ case_sensitive=case_sensitive,
1092
+ fuzzy_match=fuzzy_match,
1093
+ fuzzy_buffer_size=fuzzy_buffer_size,
1094
+ fuzzy_score_cutoff=fuzzy_score_cutoff,
1095
+ allow_overlap_entities=allow_overlap_entities)
1096
+ for ent, span in zip(entity_json, spans):
1097
+ if span is not None:
1098
+ start, end = span
1099
+ entity_text = unit.text[start:end]
1100
+ start += unit.start
1101
+ end += unit.start
1102
+ attr = {}
1103
+ if "attr" in ent and ent["attr"] is not None:
1104
+ attr = ent["attr"]
1105
+
1106
+ frame = LLMInformationExtractionFrame(frame_id=f"{len(frame_list)}",
1107
+ start=start,
1108
+ end=end,
1109
+ entity_text=entity_text,
1110
+ attr=attr)
1111
+ frame_list.append(frame)
1112
+ return frame_list
1113
+
1043
1114
 
1044
1115
  def extract_frames(self, text_content:Union[str, Dict[str,str]], document_key:str=None,
1045
1116
  verbose:bool=False, concurrent:bool=False, concurrent_batch_size:int=32,
@@ -1088,7 +1159,7 @@ class DirectFrameExtractor(FrameExtractor):
1088
1159
  warnings.warn("verbose=True is not supported in concurrent mode.", RuntimeWarning)
1089
1160
 
1090
1161
  nest_asyncio.apply() # For Jupyter notebook. Terminal does not need this.
1091
- extraction_results = asyncio.run(self.extract_async(text_content=text_content,
1162
+ extraction_results = asyncio.run(self._extract_async(text_content=text_content,
1092
1163
  document_key=document_key,
1093
1164
  concurrent_batch_size=concurrent_batch_size,
1094
1165
  return_messages_log=return_messages_log)
@@ -1101,248 +1172,31 @@ class DirectFrameExtractor(FrameExtractor):
1101
1172
 
1102
1173
  units, messages_log = extraction_results if return_messages_log else (extraction_results, None)
1103
1174
 
1104
- frame_list = []
1105
- for unit in units:
1106
- entity_json = []
1107
- if unit.status != "success":
1108
- warnings.warn(f"Skipping failed unit ({unit.start}, {unit.end}): {unit.text}", RuntimeWarning)
1109
- continue
1110
- for entity in extract_json(gen_text=unit.gen_text):
1111
- if ENTITY_KEY in entity:
1112
- entity_json.append(entity)
1113
- else:
1114
- warnings.warn(f'Extractor output "{entity}" does not have entity_key ("{ENTITY_KEY}"). This frame will be dropped.', RuntimeWarning)
1115
-
1116
- spans = self._find_entity_spans(text=unit.text,
1117
- entities=[e[ENTITY_KEY] for e in entity_json],
1118
- case_sensitive=case_sensitive,
1119
- fuzzy_match=fuzzy_match,
1120
- fuzzy_buffer_size=fuzzy_buffer_size,
1121
- fuzzy_score_cutoff=fuzzy_score_cutoff,
1122
- allow_overlap_entities=allow_overlap_entities)
1123
- for ent, span in zip(entity_json, spans):
1124
- if span is not None:
1125
- start, end = span
1126
- entity_text = unit.text[start:end]
1127
- start += unit.start
1128
- end += unit.start
1129
- attr = {}
1130
- if "attr" in ent and ent["attr"] is not None:
1131
- attr = ent["attr"]
1132
-
1133
- frame = LLMInformationExtractionFrame(frame_id=f"{len(frame_list)}",
1134
- start=start,
1135
- end=end,
1136
- entity_text=entity_text,
1137
- attr=attr)
1138
- frame_list.append(frame)
1175
+ frame_list = self._post_process_units_to_frames(units, case_sensitive, fuzzy_match, fuzzy_buffer_size, fuzzy_score_cutoff, allow_overlap_entities)
1139
1176
 
1140
1177
  if return_messages_log:
1141
1178
  return frame_list, messages_log
1142
1179
  return frame_list
1143
-
1144
1180
 
1145
- async def extract_frames_from_documents(self, text_contents:List[Union[str,Dict[str, any]]], document_key:str="text",
1146
- cpu_concurrency:int=4, llm_concurrency:int=32, case_sensitive:bool=False,
1147
- fuzzy_match:bool=True, fuzzy_buffer_size:float=0.2, fuzzy_score_cutoff:float=0.8,
1148
- allow_overlap_entities:bool=False, return_messages_log:bool=False) -> AsyncGenerator[Dict[str, any], None]:
1181
+ async def extract_frames_async(self, text_content:Union[str, Dict[str,str]], document_key:str=None,
1182
+ concurrent_batch_size:int=32, case_sensitive:bool=False,
1183
+ fuzzy_match:bool=True, fuzzy_buffer_size:float=0.2, fuzzy_score_cutoff:float=0.8,
1184
+ allow_overlap_entities:bool=False, return_messages_log:bool=False) -> List[LLMInformationExtractionFrame]:
1149
1185
  """
1150
- This method inputs a list of documents and yields the results for each document as soon as it is complete.
1151
-
1152
- Parameters:
1153
- -----------
1154
- text_contents : List[Union[str,Dict[str, any]]]
1155
- a list of input text contents to put in prompt template.
1156
- If str, the prompt template must has only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
1157
- If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
1158
- document_key: str, optional
1159
- The key in the `text_contents` dictionaries that holds the document text.
1160
- cpu_concurrency: int, optional
1161
- The number of parallel threads to use for CPU-bound tasks like chunking.
1162
- llm_concurrency: int, optional
1163
- The number of concurrent requests to make to the LLM.
1164
- case_sensitive : bool, Optional
1165
- if True, entity text matching will be case-sensitive.
1166
- fuzzy_match : bool, Optional
1167
- if True, fuzzy matching will be applied to find entity text.
1168
- fuzzy_buffer_size : float, Optional
1169
- the buffer size for fuzzy matching. Default is 20% of entity text length.
1170
- fuzzy_score_cutoff : float, Optional
1171
- the Jaccard score cutoff for fuzzy matching.
1172
- Matched entity text must have a score higher than this value or a None will be returned.
1173
- allow_overlap_entities : bool, Optional
1174
- if True, entities can overlap in the text.
1175
- return_messages_log : bool, Optional
1176
- if True, a list of messages will be returned.
1177
-
1178
- Yields:
1179
- -------
1180
- AsyncGenerator[Dict[str, any], None]
1181
- A dictionary for each completed document, containing its 'idx' and extracted 'frames'.
1182
- """
1183
- # Validate text_contents must be a list of str or dict, and not both
1184
- if not isinstance(text_contents, list):
1185
- raise ValueError("text_contents must be a list of strings or dictionaries.")
1186
- if all(isinstance(doc, str) for doc in text_contents):
1187
- pass
1188
- elif all(isinstance(doc, dict) for doc in text_contents):
1189
- pass
1190
- # Set CPU executor and queues
1191
- cpu_executor = ThreadPoolExecutor(max_workers=cpu_concurrency)
1192
- tasks_queue = asyncio.Queue(maxsize=llm_concurrency * 2)
1193
- # Store to track units and pending counts
1194
- results_store = {
1195
- idx: {'pending': 0, 'units': [], 'text': doc if isinstance(doc, str) else doc.get(document_key, "")}
1196
- for idx, doc in enumerate(text_contents)
1197
- }
1198
-
1199
- output_queue = asyncio.Queue()
1200
- messages_logger = MessagesLogger() if return_messages_log else None
1201
-
1202
- async def producer():
1203
- try:
1204
- for idx, text_content in enumerate(text_contents):
1205
- text = text_content if isinstance(text_content, str) else text_content.get(document_key, "")
1206
- if not text:
1207
- warnings.warn(f"Document at index {idx} is empty or missing the document key '{document_key}'.")
1208
- # signal that this document is done
1209
- await output_queue.put({'idx': idx, 'frames': []})
1210
- continue
1211
-
1212
- units = await self.unit_chunker.chunk_async(text, cpu_executor)
1213
- await self.context_chunker.fit_async(text, units, cpu_executor)
1214
- results_store[idx]['pending'] = len(units)
1215
-
1216
- # Handle cases where a document yields no units
1217
- if not units:
1218
- # signal that this document is done
1219
- await output_queue.put({'idx': idx, 'frames': []})
1220
- continue
1221
-
1222
- # Iterate through units
1223
- for unit in units:
1224
- context = await self.context_chunker.chunk_async(unit, cpu_executor)
1225
- messages = []
1226
- if self.system_prompt:
1227
- messages.append({'role': 'system', 'content': self.system_prompt})
1228
-
1229
- if not context:
1230
- if isinstance(text_content, str):
1231
- messages.append({'role': 'user', 'content': self._get_user_prompt(unit.text)})
1232
- else:
1233
- unit_content = text_content.copy()
1234
- unit_content[document_key] = unit.text
1235
- messages.append({'role': 'user', 'content': self._get_user_prompt(unit_content)})
1236
- else:
1237
- # insert context to user prompt
1238
- if isinstance(text_content, str):
1239
- messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
1240
- else:
1241
- context_content = text_content.copy()
1242
- context_content[document_key] = context
1243
- messages.append({'role': 'user', 'content': self._get_user_prompt(context_content)})
1244
- # simulate conversation where assistant confirms
1245
- messages.append({'role': 'assistant', 'content': 'Sure, please provide the unit text (e.g., sentence, line, chunk) of interest.'})
1246
- # place unit of interest
1247
- messages.append({'role': 'user', 'content': unit.text})
1248
-
1249
- await tasks_queue.put({'idx': idx, 'unit': unit, 'messages': messages})
1250
- finally:
1251
- for _ in range(llm_concurrency):
1252
- await tasks_queue.put(None)
1253
-
1254
- async def worker():
1255
- while True:
1256
- task_item = await tasks_queue.get()
1257
- if task_item is None:
1258
- tasks_queue.task_done()
1259
- break
1260
-
1261
- idx = task_item['idx']
1262
- unit = task_item['unit']
1263
- doc_results = results_store[idx]
1264
-
1265
- try:
1266
- gen_text = await self.inference_engine.chat_async(
1267
- messages=task_item['messages'], messages_logger=messages_logger
1268
- )
1269
- unit.set_generated_text(gen_text["response"])
1270
- unit.set_status("success")
1271
- doc_results['units'].append(unit)
1272
- except Exception as e:
1273
- warnings.warn(f"Error processing unit for doc idx {idx}: {e}")
1274
- finally:
1275
- doc_results['pending'] -= 1
1276
- if doc_results['pending'] <= 0:
1277
- final_frames = self._post_process_and_create_frames(doc_results, case_sensitive, fuzzy_match, fuzzy_buffer_size, fuzzy_score_cutoff, allow_overlap_entities)
1278
- output_payload = {'idx': idx, 'frames': final_frames}
1279
- if return_messages_log:
1280
- output_payload['messages_log'] = messages_logger.get_messages_log()
1281
- await output_queue.put(output_payload)
1282
-
1283
- tasks_queue.task_done()
1284
-
1285
- # Start producer and workers
1286
- producer_task = asyncio.create_task(producer())
1287
- worker_tasks = [asyncio.create_task(worker()) for _ in range(llm_concurrency)]
1288
-
1289
- # Main loop to gather results
1290
- docs_completed = 0
1291
- while docs_completed < len(text_contents):
1292
- result = await output_queue.get()
1293
- yield result
1294
- docs_completed += 1
1295
-
1296
- # Final cleanup
1297
- await producer_task
1298
- await tasks_queue.join()
1299
-
1300
- # Cancel any lingering worker tasks
1301
- for task in worker_tasks:
1302
- task.cancel()
1303
- await asyncio.gather(*worker_tasks, return_exceptions=True)
1304
-
1305
- cpu_executor.shutdown(wait=False)
1306
-
1307
-
1308
- def _post_process_and_create_frames(self, doc_results, case_sensitive, fuzzy_match, fuzzy_buffer_size, fuzzy_score_cutoff, allow_overlap_entities):
1309
- """Helper function to run post-processing logic for a completed document."""
1310
- ENTITY_KEY = "entity_text"
1311
- frame_list = []
1312
- for res in sorted(doc_results['units'], key=lambda r: r.start):
1313
- entity_json = []
1314
- for entity in extract_json(gen_text=res.gen_text):
1315
- if ENTITY_KEY in entity:
1316
- entity_json.append(entity)
1317
- else:
1318
- warnings.warn(f'Extractor output "{entity}" does not have entity_key ("{ENTITY_KEY}"). This frame will be dropped.', RuntimeWarning)
1186
+ This is the async version of extract_frames.
1187
+ """
1188
+ extraction_results = await self._extract_async(text_content=text_content,
1189
+ document_key=document_key,
1190
+ concurrent_batch_size=concurrent_batch_size,
1191
+ return_messages_log=return_messages_log)
1192
+
1193
+ units, messages_log = extraction_results if return_messages_log else (extraction_results, None)
1194
+ frame_list = self._post_process_units_to_frames(units, case_sensitive, fuzzy_match, fuzzy_buffer_size, fuzzy_score_cutoff, allow_overlap_entities)
1319
1195
 
1320
- spans = self._find_entity_spans(
1321
- text=res.text,
1322
- entities=[e[ENTITY_KEY] for e in entity_json],
1323
- case_sensitive=case_sensitive,
1324
- fuzzy_match=fuzzy_match,
1325
- fuzzy_buffer_size=fuzzy_buffer_size,
1326
- fuzzy_score_cutoff=fuzzy_score_cutoff,
1327
- allow_overlap_entities=allow_overlap_entities
1328
- )
1329
- for ent, span in zip(entity_json, spans):
1330
- if span is not None:
1331
- start, end = span
1332
- entity_text = res.text[start:end]
1333
- start += res.start
1334
- end += res.start
1335
- attr = ent.get("attr", {}) or {}
1336
- frame = LLMInformationExtractionFrame(
1337
- frame_id=f"{len(frame_list)}",
1338
- start=start,
1339
- end=end,
1340
- entity_text=entity_text,
1341
- attr=attr
1342
- )
1343
- frame_list.append(frame)
1196
+ if return_messages_log:
1197
+ return frame_list, messages_log
1344
1198
  return frame_list
1345
-
1199
+
1346
1200
 
1347
1201
  class ReviewFrameExtractor(DirectFrameExtractor):
1348
1202
  def __init__(self, unit_chunker:UnitChunker, context_chunker:ContextChunker, inference_engine:InferenceEngine,
@@ -1620,7 +1474,7 @@ class ReviewFrameExtractor(DirectFrameExtractor):
1620
1474
  for chunk in response_stream:
1621
1475
  yield chunk
1622
1476
 
1623
- async def extract_async(self, text_content:Union[str, Dict[str,str]], document_key:str=None,
1477
+ async def _extract_async(self, text_content:Union[str, Dict[str,str]], document_key:str=None,
1624
1478
  concurrent_batch_size:int=32, return_messages_log:bool=False, **kwrs) -> List[FrameExtractionUnit]:
1625
1479
  """
1626
1480
  This is the asynchronous version of the extract() method with the review step.
@@ -2123,7 +1977,7 @@ class AttributeExtractor(Extractor):
2123
1977
  return (new_frames, messages_log) if return_messages_log else new_frames
2124
1978
 
2125
1979
 
2126
- async def extract_async(self, frames:List[LLMInformationExtractionFrame], text:str, context_size:int=256,
1980
+ async def _extract_async(self, frames:List[LLMInformationExtractionFrame], text:str, context_size:int=256,
2127
1981
  concurrent_batch_size:int=32, inplace:bool=True, return_messages_log:bool=False) -> Union[None, List[LLMInformationExtractionFrame]]:
2128
1982
  """
2129
1983
  This method extracts attributes from the document asynchronously.
@@ -2195,6 +2049,16 @@ class AttributeExtractor(Extractor):
2195
2049
  else:
2196
2050
  return (new_frames, messages_logger.get_messages_log()) if return_messages_log else new_frames
2197
2051
 
2052
+ async def extract_attributes_async(self, frames:List[LLMInformationExtractionFrame], text:str, context_size:int=256,
2053
+ concurrent_batch_size:int=32, inplace:bool=True,
2054
+ return_messages_log:bool=False) -> Union[None, List[LLMInformationExtractionFrame]]:
2055
+ """
2056
+ This is the async version of extract_attributes.
2057
+ """
2058
+ return await self._extract_async(frames=frames, text=text, context_size=context_size,
2059
+ concurrent_batch_size=concurrent_batch_size, inplace=inplace, return_messages_log=return_messages_log)
2060
+
2061
+
2198
2062
  def extract_attributes(self, frames:List[LLMInformationExtractionFrame], text:str, context_size:int=256,
2199
2063
  concurrent:bool=False, concurrent_batch_size:int=32, verbose:bool=False,
2200
2064
  return_messages_log:bool=False, inplace:bool=True) -> Union[None, List[LLMInformationExtractionFrame]]:
@@ -2230,7 +2094,7 @@ class AttributeExtractor(Extractor):
2230
2094
 
2231
2095
  nest_asyncio.apply() # For Jupyter notebook. Terminal does not need this.
2232
2096
 
2233
- return asyncio.run(self.extract_async(frames=frames, text=text, context_size=context_size,
2097
+ return asyncio.run(self._extract_async(frames=frames, text=text, context_size=context_size,
2234
2098
  concurrent_batch_size=concurrent_batch_size,
2235
2099
  inplace=inplace, return_messages_log=return_messages_log))
2236
2100
  else:
@@ -2375,6 +2239,17 @@ class RelationExtractor(Extractor):
2375
2239
  return asyncio.run(self._extract_async(doc, buffer_size, concurrent_batch_size, return_messages_log))
2376
2240
  else:
2377
2241
  return self._extract(doc, buffer_size, verbose, return_messages_log)
2242
+
2243
+ async def extract_relations_async(self, doc: LLMInformationExtractionDocument, buffer_size: int = 128, concurrent_batch_size: int = 32, return_messages_log: bool = False) -> Union[List[Dict], Tuple[List[Dict], List]]:
2244
+ """
2245
+ This is the async version of extract_relations.
2246
+ """
2247
+ if not doc.has_frame():
2248
+ raise ValueError("Input document must have frames.")
2249
+ if doc.has_duplicate_frame_ids():
2250
+ raise ValueError("All frame_ids in the input document must be unique.")
2251
+
2252
+ return await self._extract_async(doc, buffer_size, concurrent_batch_size, return_messages_log)
2378
2253
 
2379
2254
 
2380
2255
  class BinaryRelationExtractor(RelationExtractor):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: llm-ie
3
- Version: 1.3.0
3
+ Version: 1.4.0
4
4
  Summary: A comprehensive toolkit that provides building blocks for LLM-based named entity recognition, attribute extraction, and relation extraction pipelines.
5
5
  License: MIT
6
6
  Author: Enshuo (David) Hsu
@@ -10,7 +10,8 @@ Classifier: Programming Language :: Python :: 3
10
10
  Classifier: Programming Language :: Python :: 3.11
11
11
  Classifier: Programming Language :: Python :: 3.12
12
12
  Requires-Dist: colorama (>=0.4.6,<0.5.0)
13
- Requires-Dist: json_repair (>=0.30,<0.31)
13
+ Requires-Dist: json_repair (>=0.30)
14
+ Requires-Dist: llm-inference-engine (>=0.1.1,<0.2.0)
14
15
  Requires-Dist: nest_asyncio (>=1.6.0,<2.0.0)
15
16
  Requires-Dist: nltk (>=3.8,<4.0)
16
17
  Description-Content-Type: text/markdown
@@ -22,10 +22,10 @@ llm_ie/asset/prompt_guide/SentenceReviewFrameExtractor_prompt_guide.txt,sha256=9
22
22
  llm_ie/asset/prompt_guide/StructExtractor_prompt_guide.txt,sha256=x8L4n_LVl6ofQu6cDE9YP4SB2FSQ4GrTee8y1XKwwwc,1922
23
23
  llm_ie/chunkers.py,sha256=b4APRwaLMU40QXVEhOK8m1DZi_jr-VCHAFwbMjqVBgA,11308
24
24
  llm_ie/data_types.py,sha256=iG_jdqhpBi33xnsfFQYayCXNBK-2N-8u1xIhoKfJzRI,18294
25
- llm_ie/engines.py,sha256=K4Zgb1dYiuopBeTLcgSAseI-VXgwtTeWf9O4EK9SQqE,63901
26
- llm_ie/extractors.py,sha256=Voexzc_sYQ3jBGkvLybazt9zVsLnnrMbsUswKciBS4I,120933
25
+ llm_ie/engines.py,sha256=Lxzj0gfbUjaU8TpWWM7MqS71Vmpqdq_mIHoLiXqOmXs,1089
26
+ llm_ie/extractors.py,sha256=5tvYfWWvwsfRHGgHYOE5080hx9tLm__eyORbimZ8UUY,115189
27
27
  llm_ie/prompt_editor.py,sha256=Hqukm2HMgsoGpXV3vZ__7CGgfMhd-UUIwTKGnfSDltM,12055
28
28
  llm_ie/utils.py,sha256=k6M4l8GsKOMcmO6UwONQ353Zk-TeoBj6HXGjlAn-JE0,3679
29
- llm_ie-1.3.0.dist-info/METADATA,sha256=GrgKPwzTXtHIBsEThNsJ6i7Z43Ghb2I5Y47mRYbSIAo,728
30
- llm_ie-1.3.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
31
- llm_ie-1.3.0.dist-info/RECORD,,
29
+ llm_ie-1.4.0.dist-info/METADATA,sha256=Y0f5yOV_GKP_qKQFje6Cw0p-EhNjsNzXVqJ839aZIBo,775
30
+ llm_ie-1.4.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
31
+ llm_ie-1.4.0.dist-info/RECORD,,
File without changes