llm-ie 1.3.0__py3-none-any.whl → 1.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
llm_ie/extractors.py
CHANGED
|
@@ -6,8 +6,7 @@ import warnings
|
|
|
6
6
|
import itertools
|
|
7
7
|
import asyncio
|
|
8
8
|
import nest_asyncio
|
|
9
|
-
from
|
|
10
|
-
from typing import Any, Set, List, Dict, Tuple, Union, Callable, Generator, Optional, AsyncGenerator
|
|
9
|
+
from typing import Any, Set, List, Dict, Tuple, Union, Callable, Generator, Optional
|
|
11
10
|
from llm_ie.utils import extract_json, apply_prompt_template
|
|
12
11
|
from llm_ie.data_types import FrameExtractionUnit, LLMInformationExtractionFrame, LLMInformationExtractionDocument
|
|
13
12
|
from llm_ie.chunkers import UnitChunker, WholeDocumentUnitChunker, SentenceUnitChunker
|
|
@@ -306,7 +305,7 @@ class StructExtractor(Extractor):
|
|
|
306
305
|
yield {"type": "info", "data": "All units processed by LLM."}
|
|
307
306
|
return units
|
|
308
307
|
|
|
309
|
-
async def
|
|
308
|
+
async def _extract_async(self, text_content:Union[str, Dict[str,str]], document_key:str=None,
|
|
310
309
|
concurrent_batch_size:int=32, return_messages_log:bool=False) -> List[FrameExtractionUnit]:
|
|
311
310
|
"""
|
|
312
311
|
This is the asynchronous version of the extract() method.
|
|
@@ -422,6 +421,28 @@ class StructExtractor(Extractor):
|
|
|
422
421
|
for struct in structs:
|
|
423
422
|
aggregated_struct.update(struct)
|
|
424
423
|
return aggregated_struct
|
|
424
|
+
|
|
425
|
+
def _post_process_struct(self, units: List[FrameExtractionUnit]) -> Dict[str, Any]:
|
|
426
|
+
"""
|
|
427
|
+
Helper method to post-process units into a structured dictionary.
|
|
428
|
+
Shared by extract_struct and extract_struct_async.
|
|
429
|
+
"""
|
|
430
|
+
struct_json = []
|
|
431
|
+
for unit in units:
|
|
432
|
+
if unit.status != "success":
|
|
433
|
+
continue
|
|
434
|
+
try:
|
|
435
|
+
unit_struct_json = extract_json(unit.get_generated_text())
|
|
436
|
+
struct_json.extend(unit_struct_json)
|
|
437
|
+
except Exception as e:
|
|
438
|
+
unit.set_status("fail")
|
|
439
|
+
warnings.warn(f"Struct extraction failed for unit ({unit.start}, {unit.end}): {e}", RuntimeWarning)
|
|
440
|
+
|
|
441
|
+
if self.aggregation_func is None:
|
|
442
|
+
struct = self._default_struct_aggregate(struct_json)
|
|
443
|
+
else:
|
|
444
|
+
struct = self.aggregation_func(struct_json)
|
|
445
|
+
return struct
|
|
425
446
|
|
|
426
447
|
|
|
427
448
|
def extract_struct(self, text_content:Union[str, Dict[str,str]], document_key:str=None,
|
|
@@ -457,7 +478,7 @@ class StructExtractor(Extractor):
|
|
|
457
478
|
warnings.warn("verbose=True is not supported in concurrent mode.", RuntimeWarning)
|
|
458
479
|
|
|
459
480
|
nest_asyncio.apply() # For Jupyter notebook. Terminal does not need this.
|
|
460
|
-
extraction_results = asyncio.run(self.
|
|
481
|
+
extraction_results = asyncio.run(self._extract_async(text_content=text_content,
|
|
461
482
|
document_key=document_key,
|
|
462
483
|
concurrent_batch_size=concurrent_batch_size,
|
|
463
484
|
return_messages_log=return_messages_log)
|
|
@@ -470,26 +491,29 @@ class StructExtractor(Extractor):
|
|
|
470
491
|
|
|
471
492
|
units, messages_log = extraction_results if return_messages_log else (extraction_results, None)
|
|
472
493
|
|
|
473
|
-
|
|
474
|
-
for unit in units:
|
|
475
|
-
if unit.status != "success":
|
|
476
|
-
continue
|
|
477
|
-
try:
|
|
478
|
-
unit_struct_json = extract_json(unit.get_generated_text())
|
|
479
|
-
struct_json.extend(unit_struct_json)
|
|
480
|
-
except Exception as e:
|
|
481
|
-
unit.set_status("fail")
|
|
482
|
-
warnings.warn(f"Struct extraction failed for unit ({unit.start}, {unit.end}): {e}", RuntimeWarning)
|
|
483
|
-
|
|
484
|
-
if self.aggregation_func is None:
|
|
485
|
-
struct = self._default_struct_aggregate(struct_json)
|
|
486
|
-
else:
|
|
487
|
-
struct = self.aggregation_func(struct_json)
|
|
494
|
+
struct = self._post_process_struct(units)
|
|
488
495
|
|
|
489
496
|
if return_messages_log:
|
|
490
497
|
return struct, messages_log
|
|
491
498
|
return struct
|
|
492
499
|
|
|
500
|
+
async def extract_struct_async(self, text_content:Union[str, Dict[str,str]], document_key:str=None,
|
|
501
|
+
concurrent_batch_size:int=32, return_messages_log:bool=False) -> Dict[str, Any]:
|
|
502
|
+
"""
|
|
503
|
+
This is the async version of extract_struct.
|
|
504
|
+
"""
|
|
505
|
+
extraction_results = await self._extract_async(text_content=text_content,
|
|
506
|
+
document_key=document_key,
|
|
507
|
+
concurrent_batch_size=concurrent_batch_size,
|
|
508
|
+
return_messages_log=return_messages_log)
|
|
509
|
+
|
|
510
|
+
units, messages_log = extraction_results if return_messages_log else (extraction_results, None)
|
|
511
|
+
struct = self._post_process_struct(units)
|
|
512
|
+
|
|
513
|
+
if return_messages_log:
|
|
514
|
+
return struct, messages_log
|
|
515
|
+
return struct
|
|
516
|
+
|
|
493
517
|
|
|
494
518
|
class BasicStructExtractor(StructExtractor):
|
|
495
519
|
def __init__(self, inference_engine:InferenceEngine, prompt_template:str,
|
|
@@ -725,6 +749,14 @@ class FrameExtractor(Extractor):
|
|
|
725
749
|
"""
|
|
726
750
|
return NotImplemented
|
|
727
751
|
|
|
752
|
+
@abc.abstractmethod
|
|
753
|
+
async def extract_frames_async(self, text_content:Union[str, Dict[str,str]], entity_key:str,
|
|
754
|
+
document_key:str=None, return_messages_log:bool=False, **kwrs) -> List[LLMInformationExtractionFrame]:
|
|
755
|
+
"""
|
|
756
|
+
This is the async version of extract_frames.
|
|
757
|
+
"""
|
|
758
|
+
return NotImplemented
|
|
759
|
+
|
|
728
760
|
|
|
729
761
|
class DirectFrameExtractor(FrameExtractor):
|
|
730
762
|
def __init__(self, inference_engine:InferenceEngine, unit_chunker:UnitChunker,
|
|
@@ -933,7 +965,7 @@ class DirectFrameExtractor(FrameExtractor):
|
|
|
933
965
|
yield {"type": "info", "data": "All units processed by LLM."}
|
|
934
966
|
return units
|
|
935
967
|
|
|
936
|
-
async def
|
|
968
|
+
async def _extract_async(self, text_content:Union[str, Dict[str,str]], document_key:str=None,
|
|
937
969
|
concurrent_batch_size:int=32, return_messages_log:bool=False) -> List[FrameExtractionUnit]:
|
|
938
970
|
"""
|
|
939
971
|
This is the asynchronous version of the extract() method.
|
|
@@ -1040,6 +1072,45 @@ class DirectFrameExtractor(FrameExtractor):
|
|
|
1040
1072
|
else:
|
|
1041
1073
|
return units
|
|
1042
1074
|
|
|
1075
|
+
def _post_process_units_to_frames(self, units, case_sensitive, fuzzy_match, fuzzy_buffer_size, fuzzy_score_cutoff, allow_overlap_entities):
|
|
1076
|
+
ENTITY_KEY = "entity_text"
|
|
1077
|
+
frame_list = []
|
|
1078
|
+
for unit in units:
|
|
1079
|
+
entity_json = []
|
|
1080
|
+
if unit.status != "success":
|
|
1081
|
+
warnings.warn(f"Skipping failed unit ({unit.start}, {unit.end}): {unit.text}", RuntimeWarning)
|
|
1082
|
+
continue
|
|
1083
|
+
for entity in extract_json(gen_text=unit.gen_text):
|
|
1084
|
+
if ENTITY_KEY in entity:
|
|
1085
|
+
entity_json.append(entity)
|
|
1086
|
+
else:
|
|
1087
|
+
warnings.warn(f'Extractor output "{entity}" does not have entity_key ("{ENTITY_KEY}"). This frame will be dropped.', RuntimeWarning)
|
|
1088
|
+
|
|
1089
|
+
spans = self._find_entity_spans(text=unit.text,
|
|
1090
|
+
entities=[e[ENTITY_KEY] for e in entity_json],
|
|
1091
|
+
case_sensitive=case_sensitive,
|
|
1092
|
+
fuzzy_match=fuzzy_match,
|
|
1093
|
+
fuzzy_buffer_size=fuzzy_buffer_size,
|
|
1094
|
+
fuzzy_score_cutoff=fuzzy_score_cutoff,
|
|
1095
|
+
allow_overlap_entities=allow_overlap_entities)
|
|
1096
|
+
for ent, span in zip(entity_json, spans):
|
|
1097
|
+
if span is not None:
|
|
1098
|
+
start, end = span
|
|
1099
|
+
entity_text = unit.text[start:end]
|
|
1100
|
+
start += unit.start
|
|
1101
|
+
end += unit.start
|
|
1102
|
+
attr = {}
|
|
1103
|
+
if "attr" in ent and ent["attr"] is not None:
|
|
1104
|
+
attr = ent["attr"]
|
|
1105
|
+
|
|
1106
|
+
frame = LLMInformationExtractionFrame(frame_id=f"{len(frame_list)}",
|
|
1107
|
+
start=start,
|
|
1108
|
+
end=end,
|
|
1109
|
+
entity_text=entity_text,
|
|
1110
|
+
attr=attr)
|
|
1111
|
+
frame_list.append(frame)
|
|
1112
|
+
return frame_list
|
|
1113
|
+
|
|
1043
1114
|
|
|
1044
1115
|
def extract_frames(self, text_content:Union[str, Dict[str,str]], document_key:str=None,
|
|
1045
1116
|
verbose:bool=False, concurrent:bool=False, concurrent_batch_size:int=32,
|
|
@@ -1088,7 +1159,7 @@ class DirectFrameExtractor(FrameExtractor):
|
|
|
1088
1159
|
warnings.warn("verbose=True is not supported in concurrent mode.", RuntimeWarning)
|
|
1089
1160
|
|
|
1090
1161
|
nest_asyncio.apply() # For Jupyter notebook. Terminal does not need this.
|
|
1091
|
-
extraction_results = asyncio.run(self.
|
|
1162
|
+
extraction_results = asyncio.run(self._extract_async(text_content=text_content,
|
|
1092
1163
|
document_key=document_key,
|
|
1093
1164
|
concurrent_batch_size=concurrent_batch_size,
|
|
1094
1165
|
return_messages_log=return_messages_log)
|
|
@@ -1101,248 +1172,31 @@ class DirectFrameExtractor(FrameExtractor):
|
|
|
1101
1172
|
|
|
1102
1173
|
units, messages_log = extraction_results if return_messages_log else (extraction_results, None)
|
|
1103
1174
|
|
|
1104
|
-
frame_list =
|
|
1105
|
-
for unit in units:
|
|
1106
|
-
entity_json = []
|
|
1107
|
-
if unit.status != "success":
|
|
1108
|
-
warnings.warn(f"Skipping failed unit ({unit.start}, {unit.end}): {unit.text}", RuntimeWarning)
|
|
1109
|
-
continue
|
|
1110
|
-
for entity in extract_json(gen_text=unit.gen_text):
|
|
1111
|
-
if ENTITY_KEY in entity:
|
|
1112
|
-
entity_json.append(entity)
|
|
1113
|
-
else:
|
|
1114
|
-
warnings.warn(f'Extractor output "{entity}" does not have entity_key ("{ENTITY_KEY}"). This frame will be dropped.', RuntimeWarning)
|
|
1115
|
-
|
|
1116
|
-
spans = self._find_entity_spans(text=unit.text,
|
|
1117
|
-
entities=[e[ENTITY_KEY] for e in entity_json],
|
|
1118
|
-
case_sensitive=case_sensitive,
|
|
1119
|
-
fuzzy_match=fuzzy_match,
|
|
1120
|
-
fuzzy_buffer_size=fuzzy_buffer_size,
|
|
1121
|
-
fuzzy_score_cutoff=fuzzy_score_cutoff,
|
|
1122
|
-
allow_overlap_entities=allow_overlap_entities)
|
|
1123
|
-
for ent, span in zip(entity_json, spans):
|
|
1124
|
-
if span is not None:
|
|
1125
|
-
start, end = span
|
|
1126
|
-
entity_text = unit.text[start:end]
|
|
1127
|
-
start += unit.start
|
|
1128
|
-
end += unit.start
|
|
1129
|
-
attr = {}
|
|
1130
|
-
if "attr" in ent and ent["attr"] is not None:
|
|
1131
|
-
attr = ent["attr"]
|
|
1132
|
-
|
|
1133
|
-
frame = LLMInformationExtractionFrame(frame_id=f"{len(frame_list)}",
|
|
1134
|
-
start=start,
|
|
1135
|
-
end=end,
|
|
1136
|
-
entity_text=entity_text,
|
|
1137
|
-
attr=attr)
|
|
1138
|
-
frame_list.append(frame)
|
|
1175
|
+
frame_list = self._post_process_units_to_frames(units, case_sensitive, fuzzy_match, fuzzy_buffer_size, fuzzy_score_cutoff, allow_overlap_entities)
|
|
1139
1176
|
|
|
1140
1177
|
if return_messages_log:
|
|
1141
1178
|
return frame_list, messages_log
|
|
1142
1179
|
return frame_list
|
|
1143
|
-
|
|
1144
1180
|
|
|
1145
|
-
async def
|
|
1146
|
-
|
|
1147
|
-
|
|
1148
|
-
|
|
1181
|
+
async def extract_frames_async(self, text_content:Union[str, Dict[str,str]], document_key:str=None,
|
|
1182
|
+
concurrent_batch_size:int=32, case_sensitive:bool=False,
|
|
1183
|
+
fuzzy_match:bool=True, fuzzy_buffer_size:float=0.2, fuzzy_score_cutoff:float=0.8,
|
|
1184
|
+
allow_overlap_entities:bool=False, return_messages_log:bool=False) -> List[LLMInformationExtractionFrame]:
|
|
1149
1185
|
"""
|
|
1150
|
-
This
|
|
1151
|
-
|
|
1152
|
-
|
|
1153
|
-
|
|
1154
|
-
|
|
1155
|
-
|
|
1156
|
-
|
|
1157
|
-
|
|
1158
|
-
|
|
1159
|
-
The key in the `text_contents` dictionaries that holds the document text.
|
|
1160
|
-
cpu_concurrency: int, optional
|
|
1161
|
-
The number of parallel threads to use for CPU-bound tasks like chunking.
|
|
1162
|
-
llm_concurrency: int, optional
|
|
1163
|
-
The number of concurrent requests to make to the LLM.
|
|
1164
|
-
case_sensitive : bool, Optional
|
|
1165
|
-
if True, entity text matching will be case-sensitive.
|
|
1166
|
-
fuzzy_match : bool, Optional
|
|
1167
|
-
if True, fuzzy matching will be applied to find entity text.
|
|
1168
|
-
fuzzy_buffer_size : float, Optional
|
|
1169
|
-
the buffer size for fuzzy matching. Default is 20% of entity text length.
|
|
1170
|
-
fuzzy_score_cutoff : float, Optional
|
|
1171
|
-
the Jaccard score cutoff for fuzzy matching.
|
|
1172
|
-
Matched entity text must have a score higher than this value or a None will be returned.
|
|
1173
|
-
allow_overlap_entities : bool, Optional
|
|
1174
|
-
if True, entities can overlap in the text.
|
|
1175
|
-
return_messages_log : bool, Optional
|
|
1176
|
-
if True, a list of messages will be returned.
|
|
1177
|
-
|
|
1178
|
-
Yields:
|
|
1179
|
-
-------
|
|
1180
|
-
AsyncGenerator[Dict[str, any], None]
|
|
1181
|
-
A dictionary for each completed document, containing its 'idx' and extracted 'frames'.
|
|
1182
|
-
"""
|
|
1183
|
-
# Validate text_contents must be a list of str or dict, and not both
|
|
1184
|
-
if not isinstance(text_contents, list):
|
|
1185
|
-
raise ValueError("text_contents must be a list of strings or dictionaries.")
|
|
1186
|
-
if all(isinstance(doc, str) for doc in text_contents):
|
|
1187
|
-
pass
|
|
1188
|
-
elif all(isinstance(doc, dict) for doc in text_contents):
|
|
1189
|
-
pass
|
|
1190
|
-
# Set CPU executor and queues
|
|
1191
|
-
cpu_executor = ThreadPoolExecutor(max_workers=cpu_concurrency)
|
|
1192
|
-
tasks_queue = asyncio.Queue(maxsize=llm_concurrency * 2)
|
|
1193
|
-
# Store to track units and pending counts
|
|
1194
|
-
results_store = {
|
|
1195
|
-
idx: {'pending': 0, 'units': [], 'text': doc if isinstance(doc, str) else doc.get(document_key, "")}
|
|
1196
|
-
for idx, doc in enumerate(text_contents)
|
|
1197
|
-
}
|
|
1198
|
-
|
|
1199
|
-
output_queue = asyncio.Queue()
|
|
1200
|
-
messages_logger = MessagesLogger() if return_messages_log else None
|
|
1201
|
-
|
|
1202
|
-
async def producer():
|
|
1203
|
-
try:
|
|
1204
|
-
for idx, text_content in enumerate(text_contents):
|
|
1205
|
-
text = text_content if isinstance(text_content, str) else text_content.get(document_key, "")
|
|
1206
|
-
if not text:
|
|
1207
|
-
warnings.warn(f"Document at index {idx} is empty or missing the document key '{document_key}'.")
|
|
1208
|
-
# signal that this document is done
|
|
1209
|
-
await output_queue.put({'idx': idx, 'frames': []})
|
|
1210
|
-
continue
|
|
1211
|
-
|
|
1212
|
-
units = await self.unit_chunker.chunk_async(text, cpu_executor)
|
|
1213
|
-
await self.context_chunker.fit_async(text, units, cpu_executor)
|
|
1214
|
-
results_store[idx]['pending'] = len(units)
|
|
1215
|
-
|
|
1216
|
-
# Handle cases where a document yields no units
|
|
1217
|
-
if not units:
|
|
1218
|
-
# signal that this document is done
|
|
1219
|
-
await output_queue.put({'idx': idx, 'frames': []})
|
|
1220
|
-
continue
|
|
1221
|
-
|
|
1222
|
-
# Iterate through units
|
|
1223
|
-
for unit in units:
|
|
1224
|
-
context = await self.context_chunker.chunk_async(unit, cpu_executor)
|
|
1225
|
-
messages = []
|
|
1226
|
-
if self.system_prompt:
|
|
1227
|
-
messages.append({'role': 'system', 'content': self.system_prompt})
|
|
1228
|
-
|
|
1229
|
-
if not context:
|
|
1230
|
-
if isinstance(text_content, str):
|
|
1231
|
-
messages.append({'role': 'user', 'content': self._get_user_prompt(unit.text)})
|
|
1232
|
-
else:
|
|
1233
|
-
unit_content = text_content.copy()
|
|
1234
|
-
unit_content[document_key] = unit.text
|
|
1235
|
-
messages.append({'role': 'user', 'content': self._get_user_prompt(unit_content)})
|
|
1236
|
-
else:
|
|
1237
|
-
# insert context to user prompt
|
|
1238
|
-
if isinstance(text_content, str):
|
|
1239
|
-
messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
|
|
1240
|
-
else:
|
|
1241
|
-
context_content = text_content.copy()
|
|
1242
|
-
context_content[document_key] = context
|
|
1243
|
-
messages.append({'role': 'user', 'content': self._get_user_prompt(context_content)})
|
|
1244
|
-
# simulate conversation where assistant confirms
|
|
1245
|
-
messages.append({'role': 'assistant', 'content': 'Sure, please provide the unit text (e.g., sentence, line, chunk) of interest.'})
|
|
1246
|
-
# place unit of interest
|
|
1247
|
-
messages.append({'role': 'user', 'content': unit.text})
|
|
1248
|
-
|
|
1249
|
-
await tasks_queue.put({'idx': idx, 'unit': unit, 'messages': messages})
|
|
1250
|
-
finally:
|
|
1251
|
-
for _ in range(llm_concurrency):
|
|
1252
|
-
await tasks_queue.put(None)
|
|
1253
|
-
|
|
1254
|
-
async def worker():
|
|
1255
|
-
while True:
|
|
1256
|
-
task_item = await tasks_queue.get()
|
|
1257
|
-
if task_item is None:
|
|
1258
|
-
tasks_queue.task_done()
|
|
1259
|
-
break
|
|
1260
|
-
|
|
1261
|
-
idx = task_item['idx']
|
|
1262
|
-
unit = task_item['unit']
|
|
1263
|
-
doc_results = results_store[idx]
|
|
1264
|
-
|
|
1265
|
-
try:
|
|
1266
|
-
gen_text = await self.inference_engine.chat_async(
|
|
1267
|
-
messages=task_item['messages'], messages_logger=messages_logger
|
|
1268
|
-
)
|
|
1269
|
-
unit.set_generated_text(gen_text["response"])
|
|
1270
|
-
unit.set_status("success")
|
|
1271
|
-
doc_results['units'].append(unit)
|
|
1272
|
-
except Exception as e:
|
|
1273
|
-
warnings.warn(f"Error processing unit for doc idx {idx}: {e}")
|
|
1274
|
-
finally:
|
|
1275
|
-
doc_results['pending'] -= 1
|
|
1276
|
-
if doc_results['pending'] <= 0:
|
|
1277
|
-
final_frames = self._post_process_and_create_frames(doc_results, case_sensitive, fuzzy_match, fuzzy_buffer_size, fuzzy_score_cutoff, allow_overlap_entities)
|
|
1278
|
-
output_payload = {'idx': idx, 'frames': final_frames}
|
|
1279
|
-
if return_messages_log:
|
|
1280
|
-
output_payload['messages_log'] = messages_logger.get_messages_log()
|
|
1281
|
-
await output_queue.put(output_payload)
|
|
1282
|
-
|
|
1283
|
-
tasks_queue.task_done()
|
|
1284
|
-
|
|
1285
|
-
# Start producer and workers
|
|
1286
|
-
producer_task = asyncio.create_task(producer())
|
|
1287
|
-
worker_tasks = [asyncio.create_task(worker()) for _ in range(llm_concurrency)]
|
|
1288
|
-
|
|
1289
|
-
# Main loop to gather results
|
|
1290
|
-
docs_completed = 0
|
|
1291
|
-
while docs_completed < len(text_contents):
|
|
1292
|
-
result = await output_queue.get()
|
|
1293
|
-
yield result
|
|
1294
|
-
docs_completed += 1
|
|
1295
|
-
|
|
1296
|
-
# Final cleanup
|
|
1297
|
-
await producer_task
|
|
1298
|
-
await tasks_queue.join()
|
|
1299
|
-
|
|
1300
|
-
# Cancel any lingering worker tasks
|
|
1301
|
-
for task in worker_tasks:
|
|
1302
|
-
task.cancel()
|
|
1303
|
-
await asyncio.gather(*worker_tasks, return_exceptions=True)
|
|
1304
|
-
|
|
1305
|
-
cpu_executor.shutdown(wait=False)
|
|
1306
|
-
|
|
1307
|
-
|
|
1308
|
-
def _post_process_and_create_frames(self, doc_results, case_sensitive, fuzzy_match, fuzzy_buffer_size, fuzzy_score_cutoff, allow_overlap_entities):
|
|
1309
|
-
"""Helper function to run post-processing logic for a completed document."""
|
|
1310
|
-
ENTITY_KEY = "entity_text"
|
|
1311
|
-
frame_list = []
|
|
1312
|
-
for res in sorted(doc_results['units'], key=lambda r: r.start):
|
|
1313
|
-
entity_json = []
|
|
1314
|
-
for entity in extract_json(gen_text=res.gen_text):
|
|
1315
|
-
if ENTITY_KEY in entity:
|
|
1316
|
-
entity_json.append(entity)
|
|
1317
|
-
else:
|
|
1318
|
-
warnings.warn(f'Extractor output "{entity}" does not have entity_key ("{ENTITY_KEY}"). This frame will be dropped.', RuntimeWarning)
|
|
1186
|
+
This is the async version of extract_frames.
|
|
1187
|
+
"""
|
|
1188
|
+
extraction_results = await self._extract_async(text_content=text_content,
|
|
1189
|
+
document_key=document_key,
|
|
1190
|
+
concurrent_batch_size=concurrent_batch_size,
|
|
1191
|
+
return_messages_log=return_messages_log)
|
|
1192
|
+
|
|
1193
|
+
units, messages_log = extraction_results if return_messages_log else (extraction_results, None)
|
|
1194
|
+
frame_list = self._post_process_units_to_frames(units, case_sensitive, fuzzy_match, fuzzy_buffer_size, fuzzy_score_cutoff, allow_overlap_entities)
|
|
1319
1195
|
|
|
1320
|
-
|
|
1321
|
-
|
|
1322
|
-
entities=[e[ENTITY_KEY] for e in entity_json],
|
|
1323
|
-
case_sensitive=case_sensitive,
|
|
1324
|
-
fuzzy_match=fuzzy_match,
|
|
1325
|
-
fuzzy_buffer_size=fuzzy_buffer_size,
|
|
1326
|
-
fuzzy_score_cutoff=fuzzy_score_cutoff,
|
|
1327
|
-
allow_overlap_entities=allow_overlap_entities
|
|
1328
|
-
)
|
|
1329
|
-
for ent, span in zip(entity_json, spans):
|
|
1330
|
-
if span is not None:
|
|
1331
|
-
start, end = span
|
|
1332
|
-
entity_text = res.text[start:end]
|
|
1333
|
-
start += res.start
|
|
1334
|
-
end += res.start
|
|
1335
|
-
attr = ent.get("attr", {}) or {}
|
|
1336
|
-
frame = LLMInformationExtractionFrame(
|
|
1337
|
-
frame_id=f"{len(frame_list)}",
|
|
1338
|
-
start=start,
|
|
1339
|
-
end=end,
|
|
1340
|
-
entity_text=entity_text,
|
|
1341
|
-
attr=attr
|
|
1342
|
-
)
|
|
1343
|
-
frame_list.append(frame)
|
|
1196
|
+
if return_messages_log:
|
|
1197
|
+
return frame_list, messages_log
|
|
1344
1198
|
return frame_list
|
|
1345
|
-
|
|
1199
|
+
|
|
1346
1200
|
|
|
1347
1201
|
class ReviewFrameExtractor(DirectFrameExtractor):
|
|
1348
1202
|
def __init__(self, unit_chunker:UnitChunker, context_chunker:ContextChunker, inference_engine:InferenceEngine,
|
|
@@ -1620,7 +1474,7 @@ class ReviewFrameExtractor(DirectFrameExtractor):
|
|
|
1620
1474
|
for chunk in response_stream:
|
|
1621
1475
|
yield chunk
|
|
1622
1476
|
|
|
1623
|
-
async def
|
|
1477
|
+
async def _extract_async(self, text_content:Union[str, Dict[str,str]], document_key:str=None,
|
|
1624
1478
|
concurrent_batch_size:int=32, return_messages_log:bool=False, **kwrs) -> List[FrameExtractionUnit]:
|
|
1625
1479
|
"""
|
|
1626
1480
|
This is the asynchronous version of the extract() method with the review step.
|
|
@@ -2123,7 +1977,7 @@ class AttributeExtractor(Extractor):
|
|
|
2123
1977
|
return (new_frames, messages_log) if return_messages_log else new_frames
|
|
2124
1978
|
|
|
2125
1979
|
|
|
2126
|
-
async def
|
|
1980
|
+
async def _extract_async(self, frames:List[LLMInformationExtractionFrame], text:str, context_size:int=256,
|
|
2127
1981
|
concurrent_batch_size:int=32, inplace:bool=True, return_messages_log:bool=False) -> Union[None, List[LLMInformationExtractionFrame]]:
|
|
2128
1982
|
"""
|
|
2129
1983
|
This method extracts attributes from the document asynchronously.
|
|
@@ -2195,6 +2049,16 @@ class AttributeExtractor(Extractor):
|
|
|
2195
2049
|
else:
|
|
2196
2050
|
return (new_frames, messages_logger.get_messages_log()) if return_messages_log else new_frames
|
|
2197
2051
|
|
|
2052
|
+
async def extract_attributes_async(self, frames:List[LLMInformationExtractionFrame], text:str, context_size:int=256,
|
|
2053
|
+
concurrent_batch_size:int=32, inplace:bool=True,
|
|
2054
|
+
return_messages_log:bool=False) -> Union[None, List[LLMInformationExtractionFrame]]:
|
|
2055
|
+
"""
|
|
2056
|
+
This is the async version of extract_attributes.
|
|
2057
|
+
"""
|
|
2058
|
+
return await self._extract_async(frames=frames, text=text, context_size=context_size,
|
|
2059
|
+
concurrent_batch_size=concurrent_batch_size, inplace=inplace, return_messages_log=return_messages_log)
|
|
2060
|
+
|
|
2061
|
+
|
|
2198
2062
|
def extract_attributes(self, frames:List[LLMInformationExtractionFrame], text:str, context_size:int=256,
|
|
2199
2063
|
concurrent:bool=False, concurrent_batch_size:int=32, verbose:bool=False,
|
|
2200
2064
|
return_messages_log:bool=False, inplace:bool=True) -> Union[None, List[LLMInformationExtractionFrame]]:
|
|
@@ -2230,7 +2094,7 @@ class AttributeExtractor(Extractor):
|
|
|
2230
2094
|
|
|
2231
2095
|
nest_asyncio.apply() # For Jupyter notebook. Terminal does not need this.
|
|
2232
2096
|
|
|
2233
|
-
return asyncio.run(self.
|
|
2097
|
+
return asyncio.run(self._extract_async(frames=frames, text=text, context_size=context_size,
|
|
2234
2098
|
concurrent_batch_size=concurrent_batch_size,
|
|
2235
2099
|
inplace=inplace, return_messages_log=return_messages_log))
|
|
2236
2100
|
else:
|
|
@@ -2375,6 +2239,17 @@ class RelationExtractor(Extractor):
|
|
|
2375
2239
|
return asyncio.run(self._extract_async(doc, buffer_size, concurrent_batch_size, return_messages_log))
|
|
2376
2240
|
else:
|
|
2377
2241
|
return self._extract(doc, buffer_size, verbose, return_messages_log)
|
|
2242
|
+
|
|
2243
|
+
async def extract_relations_async(self, doc: LLMInformationExtractionDocument, buffer_size: int = 128, concurrent_batch_size: int = 32, return_messages_log: bool = False) -> Union[List[Dict], Tuple[List[Dict], List]]:
|
|
2244
|
+
"""
|
|
2245
|
+
This is the async version of extract_relations.
|
|
2246
|
+
"""
|
|
2247
|
+
if not doc.has_frame():
|
|
2248
|
+
raise ValueError("Input document must have frames.")
|
|
2249
|
+
if doc.has_duplicate_frame_ids():
|
|
2250
|
+
raise ValueError("All frame_ids in the input document must be unique.")
|
|
2251
|
+
|
|
2252
|
+
return await self._extract_async(doc, buffer_size, concurrent_batch_size, return_messages_log)
|
|
2378
2253
|
|
|
2379
2254
|
|
|
2380
2255
|
class BinaryRelationExtractor(RelationExtractor):
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: llm-ie
|
|
3
|
-
Version: 1.
|
|
3
|
+
Version: 1.4.0
|
|
4
4
|
Summary: A comprehensive toolkit that provides building blocks for LLM-based named entity recognition, attribute extraction, and relation extraction pipelines.
|
|
5
5
|
License: MIT
|
|
6
6
|
Author: Enshuo (David) Hsu
|
|
@@ -10,7 +10,8 @@ Classifier: Programming Language :: Python :: 3
|
|
|
10
10
|
Classifier: Programming Language :: Python :: 3.11
|
|
11
11
|
Classifier: Programming Language :: Python :: 3.12
|
|
12
12
|
Requires-Dist: colorama (>=0.4.6,<0.5.0)
|
|
13
|
-
Requires-Dist: json_repair (>=0.30
|
|
13
|
+
Requires-Dist: json_repair (>=0.30)
|
|
14
|
+
Requires-Dist: llm-inference-engine (>=0.1.1,<0.2.0)
|
|
14
15
|
Requires-Dist: nest_asyncio (>=1.6.0,<2.0.0)
|
|
15
16
|
Requires-Dist: nltk (>=3.8,<4.0)
|
|
16
17
|
Description-Content-Type: text/markdown
|
|
@@ -22,10 +22,10 @@ llm_ie/asset/prompt_guide/SentenceReviewFrameExtractor_prompt_guide.txt,sha256=9
|
|
|
22
22
|
llm_ie/asset/prompt_guide/StructExtractor_prompt_guide.txt,sha256=x8L4n_LVl6ofQu6cDE9YP4SB2FSQ4GrTee8y1XKwwwc,1922
|
|
23
23
|
llm_ie/chunkers.py,sha256=b4APRwaLMU40QXVEhOK8m1DZi_jr-VCHAFwbMjqVBgA,11308
|
|
24
24
|
llm_ie/data_types.py,sha256=iG_jdqhpBi33xnsfFQYayCXNBK-2N-8u1xIhoKfJzRI,18294
|
|
25
|
-
llm_ie/engines.py,sha256=
|
|
26
|
-
llm_ie/extractors.py,sha256=
|
|
25
|
+
llm_ie/engines.py,sha256=Lxzj0gfbUjaU8TpWWM7MqS71Vmpqdq_mIHoLiXqOmXs,1089
|
|
26
|
+
llm_ie/extractors.py,sha256=5tvYfWWvwsfRHGgHYOE5080hx9tLm__eyORbimZ8UUY,115189
|
|
27
27
|
llm_ie/prompt_editor.py,sha256=Hqukm2HMgsoGpXV3vZ__7CGgfMhd-UUIwTKGnfSDltM,12055
|
|
28
28
|
llm_ie/utils.py,sha256=k6M4l8GsKOMcmO6UwONQ353Zk-TeoBj6HXGjlAn-JE0,3679
|
|
29
|
-
llm_ie-1.
|
|
30
|
-
llm_ie-1.
|
|
31
|
-
llm_ie-1.
|
|
29
|
+
llm_ie-1.4.0.dist-info/METADATA,sha256=Y0f5yOV_GKP_qKQFje6Cw0p-EhNjsNzXVqJ839aZIBo,775
|
|
30
|
+
llm_ie-1.4.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
31
|
+
llm_ie-1.4.0.dist-info/RECORD,,
|
|
File without changes
|