llm-ie 0.4.6__tar.gz → 0.4.7__tar.gz
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- {llm_ie-0.4.6 → llm_ie-0.4.7}/PKG-INFO +9 -5
- {llm_ie-0.4.6 → llm_ie-0.4.7}/README.md +8 -4
- {llm_ie-0.4.6 → llm_ie-0.4.7}/pyproject.toml +1 -1
- {llm_ie-0.4.6 → llm_ie-0.4.7}/src/llm_ie/extractors.py +287 -63
- {llm_ie-0.4.6 → llm_ie-0.4.7}/src/llm_ie/__init__.py +0 -0
- {llm_ie-0.4.6 → llm_ie-0.4.7}/src/llm_ie/asset/PromptEditor_prompts/chat.txt +0 -0
- {llm_ie-0.4.6 → llm_ie-0.4.7}/src/llm_ie/asset/PromptEditor_prompts/comment.txt +0 -0
- {llm_ie-0.4.6 → llm_ie-0.4.7}/src/llm_ie/asset/PromptEditor_prompts/rewrite.txt +0 -0
- {llm_ie-0.4.6 → llm_ie-0.4.7}/src/llm_ie/asset/PromptEditor_prompts/system.txt +0 -0
- {llm_ie-0.4.6 → llm_ie-0.4.7}/src/llm_ie/asset/default_prompts/ReviewFrameExtractor_addition_review_prompt.txt +0 -0
- {llm_ie-0.4.6 → llm_ie-0.4.7}/src/llm_ie/asset/default_prompts/ReviewFrameExtractor_revision_review_prompt.txt +0 -0
- {llm_ie-0.4.6 → llm_ie-0.4.7}/src/llm_ie/asset/default_prompts/SentenceReviewFrameExtractor_addition_review_prompt.txt +0 -0
- {llm_ie-0.4.6 → llm_ie-0.4.7}/src/llm_ie/asset/default_prompts/SentenceReviewFrameExtractor_revision_review_prompt.txt +0 -0
- {llm_ie-0.4.6 → llm_ie-0.4.7}/src/llm_ie/asset/prompt_guide/BasicFrameExtractor_prompt_guide.txt +0 -0
- {llm_ie-0.4.6 → llm_ie-0.4.7}/src/llm_ie/asset/prompt_guide/BinaryRelationExtractor_prompt_guide.txt +0 -0
- {llm_ie-0.4.6 → llm_ie-0.4.7}/src/llm_ie/asset/prompt_guide/MultiClassRelationExtractor_prompt_guide.txt +0 -0
- {llm_ie-0.4.6 → llm_ie-0.4.7}/src/llm_ie/asset/prompt_guide/ReviewFrameExtractor_prompt_guide.txt +0 -0
- {llm_ie-0.4.6 → llm_ie-0.4.7}/src/llm_ie/asset/prompt_guide/SentenceCoTFrameExtractor_prompt_guide.txt +0 -0
- {llm_ie-0.4.6 → llm_ie-0.4.7}/src/llm_ie/asset/prompt_guide/SentenceFrameExtractor_prompt_guide.txt +0 -0
- {llm_ie-0.4.6 → llm_ie-0.4.7}/src/llm_ie/asset/prompt_guide/SentenceReviewFrameExtractor_prompt_guide.txt +0 -0
- {llm_ie-0.4.6 → llm_ie-0.4.7}/src/llm_ie/data_types.py +0 -0
- {llm_ie-0.4.6 → llm_ie-0.4.7}/src/llm_ie/engines.py +0 -0
- {llm_ie-0.4.6 → llm_ie-0.4.7}/src/llm_ie/prompt_editor.py +0 -0
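The substantive change in this release is in `extractors.py`: a new `return_messages_log` flag is threaded through the `extract()`, `extract_frames()`, and `extract_relations()` methods of every extractor, so callers can retrieve the full prompt/response chat transcripts alongside the parsed results. A minimal usage sketch follows; the engine class, model name, prompt template, and input text are illustrative assumptions — only the `return_messages_log` behavior is taken from this diff.

```python
# Sketch of the new return_messages_log flag (added in 0.4.7).
# OllamaInferenceEngine, the model name, and the prompt template are
# placeholders/assumptions; any llm-ie inference engine should work the same way.
from llm_ie.engines import OllamaInferenceEngine
from llm_ie.extractors import BasicFrameExtractor

engine = OllamaInferenceEngine(model_name="llama3.1")  # placeholder model
prompt_template = "Extract diagnoses as JSON from: {{input}}"  # placeholder template
extractor = BasicFrameExtractor(inference_engine=engine, prompt_template=prompt_template)

# Default behavior is unchanged: a list of frames.
frames = extractor.extract_frames(text_content="Patient denies chest pain.",
                                  entity_key="entity_text")

# New in 0.4.7: with the flag set, a (frames, messages_log) tuple is returned.
frames, messages_log = extractor.extract_frames(text_content="Patient denies chest pain.",
                                                entity_key="entity_text",
                                                return_messages_log=True)
# messages_log is a list of chat transcripts; each transcript is a list of
# {"role": ..., "content": ...} dicts ending with the assistant response.
```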
{llm_ie-0.4.6 → llm_ie-0.4.7}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: llm-ie
-Version: 0.4.6
+Version: 0.4.7
 Summary: An LLM-powered tool that transforms everyday language into robust information extraction pipelines.
 License: MIT
 Author: Enshuo (David) Hsu
@@ -1206,10 +1206,14 @@ We benchmarked the frame and relation extractors on biomedical information extra
 ## Citation
 For more information and benchmarks, please check our paper:
 ```bibtex
-@article{
-  title={LLM-IE:
+@article{hsu2025llm,
+  title={LLM-IE: a python package for biomedical generative information extraction with large language models},
   author={Hsu, Enshuo and Roberts, Kirk},
-  journal={
-
+  journal={JAMIA open},
+  volume={8},
+  number={2},
+  pages={ooaf012},
+  year={2025},
+  publisher={Oxford University Press}
 }
 ```
{llm_ie-0.4.6 → llm_ie-0.4.7}/README.md
@@ -1189,10 +1189,14 @@ We benchmarked the frame and relation extractors on biomedical information extra
 ## Citation
 For more information and benchmarks, please check our paper:
 ```bibtex
-@article{
-  title={LLM-IE:
+@article{hsu2025llm,
+  title={LLM-IE: a python package for biomedical generative information extraction with large language models},
   author={Hsu, Enshuo and Roberts, Kirk},
-  journal={
-
+  journal={JAMIA open},
+  volume={8},
+  number={2},
+  pages={ooaf012},
+  year={2025},
+  publisher={Oxford University Press}
 }
 ```
{llm_ie-0.4.6 → llm_ie-0.4.7}/src/llm_ie/extractors.py
@@ -288,7 +288,7 @@ class FrameExtractor(Extractor):
         return entity_spans

     @abc.abstractmethod
-    def extract(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=2048, **kwrs) -> str:
+    def extract(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=2048, return_messages_log:bool=False, **kwrs) -> str:
         """
         This method inputs text content and outputs a string generated by LLM

@@ -300,6 +300,8 @@ class FrameExtractor(Extractor):
             If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
         max_new_tokens : str, Optional
             the max number of new tokens LLM can generate.
+        return_messages_log : bool, Optional
+            if True, a list of messages will be returned.

         Return : str
             the output from LLM. Need post-processing.
@@ -309,7 +311,7 @@ class FrameExtractor(Extractor):

     @abc.abstractmethod
     def extract_frames(self, text_content:Union[str, Dict[str,str]], entity_key:str, max_new_tokens:int=2048,
-                       document_key:str=None, **kwrs) -> List[LLMInformationExtractionFrame]:
+                       document_key:str=None, return_messages_log:bool=False, **kwrs) -> List[LLMInformationExtractionFrame]:
         """
         This method inputs text content and outputs a list of LLMInformationExtractionFrame
         It use the extract() method and post-process outputs into frames.
@@ -327,6 +329,8 @@ class FrameExtractor(Extractor):
         document_key : str, Optional
             specify the key in text_content where document text is.
             If text_content is str, this parameter will be ignored.
+        return_messages_log : bool, Optional
+            if True, a list of messages will be returned.

         Return : str
             a list of frames.
@@ -357,7 +361,7 @@ class BasicFrameExtractor(FrameExtractor):


     def extract(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=2048,
-                temperature:float=0.0, stream:bool=False, **kwrs) -> str:
+                temperature:float=0.0, stream:bool=False, return_messages_log:bool=False, **kwrs) -> str:
         """
         This method inputs a text and outputs a string generated by LLM.

@@ -373,6 +377,8 @@ class BasicFrameExtractor(FrameExtractor):
             the temperature for token sampling.
         stream : bool, Optional
             if True, LLM generated text will be printed in terminal in real-time.
+        return_messages_log : bool, Optional
+            if True, a list of messages will be returned.

         Return : str
             the output from LLM. Need post-processing.
@@ -390,13 +396,19 @@ class BasicFrameExtractor(FrameExtractor):
                                              **kwrs
                                              )

+        if return_messages_log:
+            messages.append({"role": "assistant", "content": response})
+            messages_log = [messages]
+            return response, messages_log
+
        return response


     def extract_frames(self, text_content:Union[str, Dict[str,str]], entity_key:str, max_new_tokens:int=2048,
                        temperature:float=0.0, document_key:str=None, stream:bool=False,
                        case_sensitive:bool=False, fuzzy_match:bool=True, fuzzy_buffer_size:float=0.2,
-                       fuzzy_score_cutoff:float=0.8, allow_overlap_entities:bool=False, **kwrs) -> List[LLMInformationExtractionFrame]:
+                       fuzzy_score_cutoff:float=0.8, allow_overlap_entities:bool=False,
+                       return_messages_log:bool=False, **kwrs) -> List[LLMInformationExtractionFrame]:
         """
         This method inputs a text and outputs a list of LLMInformationExtractionFrame
         It use the extract() method and post-process outputs into frames.
@@ -430,6 +442,8 @@ class BasicFrameExtractor(FrameExtractor):
         allow_overlap_entities : bool, Optional
             if True, entities can overlap in the text.
             Note that this can cause multiple frames to be generated on the same entity span if they have same entity text.
+        return_messages_log : bool, Optional
+            if True, a list of messages will be returned.

         Return : str
             a list of frames.
@@ -442,11 +456,13 @@ class BasicFrameExtractor(FrameExtractor):
             text = text_content[document_key]

         frame_list = []
-        gen_text = self.extract(text_content=text_content,
-                                max_new_tokens=max_new_tokens,
-                                temperature=temperature,
-                                stream=stream,
-                                **kwrs)
+        extraction_results = self.extract(text_content=text_content,
+                                          max_new_tokens=max_new_tokens,
+                                          temperature=temperature,
+                                          stream=stream,
+                                          return_messages_log=return_messages_log,
+                                          **kwrs)
+        gen_text, messages_log = extraction_results if return_messages_log else (extraction_results, None)

         entity_json = []
         for entity in self._extract_json(gen_text=gen_text):
@@ -472,6 +488,10 @@ class BasicFrameExtractor(FrameExtractor):
                                                      entity_text=text[start:end],
                                                      attr={k: v for k, v in ent.items() if k != entity_key and v != ""})
                 frame_list.append(frame)
+
+        if return_messages_log:
+            return frame_list, messages_log
+
         return frame_list

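Because the flag changes the return arity, the diff uses a small unpacking idiom internally; the same idiom works at call sites that toggle the flag dynamically (sketch, with `extractor` built as in the example above):

```python
# Handle both return shapes of extract() at one call site, mirroring the
# idiom the diff adds inside extract_frames().
want_log = True
result = extractor.extract(text_content="Patient denies chest pain.",
                           return_messages_log=want_log)
gen_text, messages_log = result if want_log else (result, None)
```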
@@ -518,7 +538,7 @@ class ReviewFrameExtractor(BasicFrameExtractor):


     def extract(self, text_content:Union[str, Dict[str,str]],
-                max_new_tokens:int=4096, temperature:float=0.0, stream:bool=False, **kwrs) -> str:
+                max_new_tokens:int=4096, temperature:float=0.0, stream:bool=False, return_messages_log:bool=False, **kwrs) -> str:
         """
         This method inputs a text and outputs a string generated by LLM.

@@ -534,6 +554,8 @@ class ReviewFrameExtractor(BasicFrameExtractor):
             the temperature for token sampling.
         stream : bool, Optional
             if True, LLM generated text will be printed in terminal in real-time.
+        return_messages_log : bool, Optional
+            if True, a list of messages will be returned.

         Return : str
             the output from LLM. Need post-processing.
@@ -570,10 +592,18 @@ class ReviewFrameExtractor(BasicFrameExtractor):
                        )

         # Output
+        output_text = ""
         if self.review_mode == "revision":
-            return review
+            output_text = review
         elif self.review_mode == "addition":
-            return initial + '\n' + review
+            output_text = initial + '\n' + review
+
+        if return_messages_log:
+            messages.append({"role": "assistant", "content": review})
+            messages_log = [messages]
+            return output_text, messages_log
+
+        return output_text


 class SentenceFrameExtractor(FrameExtractor):
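The `ReviewFrameExtractor` change also makes the two review modes explicit: `"revision"` returns only the second-pass output, while `"addition"` concatenates the initial and review outputs. A hedged sketch follows; the constructor arguments, including the review prompt, are assumptions for illustration.

```python
# Review-mode semantics per the new output_text logic, with the message log.
# Constructor arguments here are assumptions, not confirmed by this diff.
from llm_ie.extractors import ReviewFrameExtractor

reviewer = ReviewFrameExtractor(inference_engine=engine,
                                prompt_template=prompt_template,
                                review_prompt="Check for missed entities.",  # placeholder
                                review_mode="addition")
gen_text, messages_log = reviewer.extract(text_content="Patient denies chest pain.",
                                          return_messages_log=True)
# messages_log[0] is the single conversation, ending with the review turn:
# {"role": "assistant", "content": review}
```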
@@ -657,7 +687,7 @@ class SentenceFrameExtractor(FrameExtractor):


     def extract(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=512,
-                document_key:str=None, temperature:float=0.0, stream:bool=False, **kwrs) -> List[Dict[str,str]]:
+                document_key:str=None, temperature:float=0.0, stream:bool=False, return_messages_log:bool=False, **kwrs) -> List[Dict[str,str]]:
         """
         This method inputs a text and outputs a list of outputs per sentence.

@@ -676,6 +706,8 @@ class SentenceFrameExtractor(FrameExtractor):
             the temperature for token sampling.
         stream : bool, Optional
             if True, LLM generated text will be printed in terminal in real-time.
+        return_messages_log : bool, Optional
+            if True, a list of messages will be returned.

         Return : str
             the output from LLM. Need post-processing.
@@ -690,6 +722,9 @@ class SentenceFrameExtractor(FrameExtractor):
                 raise ValueError("document_key must be provided when text_content is dict.")
             sentences = self._get_sentences(text_content[document_key])

+        if return_messages_log:
+            messages_log = []
+
         # generate sentence by sentence
         for i, sent in enumerate(sentences):
             # construct chat messages
@@ -701,10 +736,20 @@ class SentenceFrameExtractor(FrameExtractor):

            if self.context_sentences == 0:
                # no context, just place sentence of interest
-               messages.append({'role': 'user', 'content': self._get_user_prompt(sent['sentence_text'])})
+               if isinstance(text_content, str):
+                   messages.append({'role': 'user', 'content': self._get_user_prompt(sent['sentence_text'])})
+               else:
+                   sentence_content = text_content.copy()
+                   sentence_content[document_key] = sent['sentence_text']
+                   messages.append({'role': 'user', 'content': self._get_user_prompt(sentence_content)})
            else:
                # insert context
-               messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
+               if isinstance(text_content, str):
+                   messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
+               else:
+                   context_content = text_content.copy()
+                   context_content[document_key] = context
+                   messages.append({'role': 'user', 'content': self._get_user_prompt(context_content)})
                # simulate conversation
                messages.append({'role': 'assistant', 'content': 'Sure, please provide the sentence of interest.'})
                # place sentence of interest
@@ -724,6 +769,10 @@ class SentenceFrameExtractor(FrameExtractor):
                                                  stream=stream,
                                                  **kwrs
                                                  )
+
+            if return_messages_log:
+                messages.append({"role": "assistant", "content": gen_text})
+                messages_log.append(messages)

             # add to output
             output.append({'sentence_start': sent['start'],
@@ -731,11 +780,15 @@ class SentenceFrameExtractor(FrameExtractor):
                            'sentence_text': sent['sentence_text'],
                            'gen_text': gen_text})

+        if return_messages_log:
+            return output, messages_log
+
         return output


     async def extract_async(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=512,
-                            document_key:str=None, temperature:float=0.0, concurrent_batch_size:int=32, **kwrs) -> List[Dict[str,str]]:
+                            document_key:str=None, temperature:float=0.0, concurrent_batch_size:int=32,
+                            return_messages_log:bool=False, **kwrs) -> List[Dict[str,str]]:
         """
         The asynchronous version of the extract() method.

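`SentenceFrameExtractor.extract()` returns one dict per sentence; with the new flag it additionally returns one chat transcript per sentence, in the same order. A sketch of consuming both (extractor construction assumed, as in the earlier example):

```python
# Per-sentence outputs and their transcripts, zipped back together.
outputs, messages_log = sentence_extractor.extract(
    text_content="Patient denies chest pain. BP 120/80.",
    return_messages_log=True)
for sent, transcript in zip(outputs, messages_log):
    print(sent['sentence_start'], sent['sentence_end'], sent['gen_text'])
    assert transcript[-1]['role'] == 'assistant'  # appended by the new code
```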
@@ -754,6 +807,11 @@ class SentenceFrameExtractor(FrameExtractor):
             the temperature for token sampling.
         concurrent_batch_size : int, Optional
             the number of sentences to process in concurrent.
+        return_messages_log : bool, Optional
+            if True, a list of messages will be returned.
+
+        Return : str
+            the output from LLM. Need post-processing.
         """
         # Check if self.inference_engine.chat_async() is implemented
         if not hasattr(self.inference_engine, 'chat_async'):
@@ -769,10 +827,14 @@ class SentenceFrameExtractor(FrameExtractor):
                 raise ValueError("document_key must be provided when text_content is dict.")
             sentences = self._get_sentences(text_content[document_key])

+        if return_messages_log:
+            messages_log = []
+
         # generate sentence by sentence
         for i in range(0, len(sentences), concurrent_batch_size):
             tasks = []
             batch = sentences[i:i + concurrent_batch_size]
+            batch_messages = []
             for j, sent in enumerate(batch):
                 # construct chat messages
                 messages = []
@@ -783,10 +845,20 @@ class SentenceFrameExtractor(FrameExtractor):

                if self.context_sentences == 0:
                    # no context, just place sentence of interest
-                   messages.append({'role': 'user', 'content': self._get_user_prompt(sent['sentence_text'])})
+                   if isinstance(text_content, str):
+                       messages.append({'role': 'user', 'content': self._get_user_prompt(sent['sentence_text'])})
+                   else:
+                       sentence_content = text_content.copy()
+                       sentence_content[document_key] = sent['sentence_text']
+                       messages.append({'role': 'user', 'content': self._get_user_prompt(sentence_content)})
                else:
                    # insert context
-                   messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
+                   if isinstance(text_content, str):
+                       messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
+                   else:
+                       context_content = text_content.copy()
+                       context_content[document_key] = context
+                       messages.append({'role': 'user', 'content': self._get_user_prompt(context_content)})
                    # simulate conversation
                    messages.append({'role': 'assistant', 'content': 'Sure, please provide the sentence of interest.'})
                    # place sentence of interest
@@ -802,16 +874,25 @@ class SentenceFrameExtractor(FrameExtractor):
                     )
                 )
                 tasks.append(task)
+                batch_messages.append(messages)

             # Wait until the batch is done, collect results and move on to next batch
             responses = await asyncio.gather(*tasks)

             # Collect outputs
-            for gen_text, sent in zip(responses, batch):
+            for gen_text, sent, messages in zip(responses, batch, batch_messages):
+                if return_messages_log:
+                    messages.append({"role": "assistant", "content": gen_text})
+                    messages_log.append(messages)
+
                 output.append({'sentence_start': sent['start'],
                                'sentence_end': sent['end'],
                                'sentence_text': sent['sentence_text'],
                                'gen_text': gen_text})
+
+        if return_messages_log:
+            return output, messages_log
+
         return output

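The async path keeps a `batch_messages` list parallel to `tasks` so that, after `asyncio.gather()`, each response can be re-attached to the conversation that produced it. Below is a generic sketch of that same pattern in isolation, not the package's actual internals; `chat_async` and `build_messages` are stand-in parameters.

```python
import asyncio

async def run_batches(items, chat_async, build_messages, batch_size=32):
    # Generic form of the batching pattern in extract_async(): schedule one
    # chat per item, await the batch, then zip responses back to their inputs.
    output = []
    for i in range(0, len(items), batch_size):
        batch = items[i:i + batch_size]
        batch_messages = [build_messages(item) for item in batch]
        responses = await asyncio.gather(
            *(chat_async(messages=m) for m in batch_messages))
        for item, response, messages in zip(batch, responses, batch_messages):
            messages.append({"role": "assistant", "content": response})
            output.append((item, response, messages))
    return output
```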
@@ -819,7 +900,7 @@ class SentenceFrameExtractor(FrameExtractor):
                        document_key:str=None, temperature:float=0.0, stream:bool=False,
                        concurrent:bool=False, concurrent_batch_size:int=32,
                        case_sensitive:bool=False, fuzzy_match:bool=True, fuzzy_buffer_size:float=0.2, fuzzy_score_cutoff:float=0.8,
-                       allow_overlap_entities:bool=False, **kwrs) -> List[LLMInformationExtractionFrame]:
+                       allow_overlap_entities:bool=False, return_messages_log:bool=False, **kwrs) -> List[LLMInformationExtractionFrame]:
         """
         This method inputs a text and outputs a list of LLMInformationExtractionFrame
         It use the extract() method and post-process outputs into frames.
@@ -857,6 +938,8 @@ class SentenceFrameExtractor(FrameExtractor):
         allow_overlap_entities : bool, Optional
             if True, entities can overlap in the text.
             Note that this can cause multiple frames to be generated on the same entity span if they have same entity text.
+        return_messages_log : bool, Optional
+            if True, a list of messages will be returned.

         Return : str
             a list of frames.
@@ -866,20 +949,25 @@ class SentenceFrameExtractor(FrameExtractor):
                 warnings.warn("stream=True is not supported in concurrent mode.", RuntimeWarning)

             nest_asyncio.apply() # For Jupyter notebook. Terminal does not need this.
-            llm_output_sentences = asyncio.run(self.extract_async(text_content=text_content,
-                                                                  max_new_tokens=max_new_tokens,
-                                                                  document_key=document_key,
-                                                                  temperature=temperature,
-                                                                  concurrent_batch_size=concurrent_batch_size,
-                                                                  **kwrs)
-                                               )
+            extraction_results = asyncio.run(self.extract_async(text_content=text_content,
+                                                                max_new_tokens=max_new_tokens,
+                                                                document_key=document_key,
+                                                                temperature=temperature,
+                                                                concurrent_batch_size=concurrent_batch_size,
+                                                                return_messages_log=return_messages_log,
+                                                                **kwrs)
+                                             )
         else:
-            llm_output_sentences = self.extract(text_content=text_content,
+            extraction_results = self.extract(text_content=text_content,
                                                 max_new_tokens=max_new_tokens,
                                                 document_key=document_key,
                                                 temperature=temperature,
                                                 stream=stream,
+                                                return_messages_log=return_messages_log,
                                                 **kwrs)
+
+        llm_output_sentences, messages_log = extraction_results if return_messages_log else (extraction_results, None)
+
         frame_list = []
         for sent in llm_output_sentences:
             entity_json = []
@@ -908,6 +996,9 @@ class SentenceFrameExtractor(FrameExtractor):
                                                          entity_text=entity_text,
                                                          attr={k: v for k, v in ent.items() if k != entity_key and v != ""})
                     frame_list.append(frame)
+
+        if return_messages_log:
+            return frame_list, messages_log
         return frame_list

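`extract_frames()` forwards the flag down either path, so concurrent runs also produce a log. Sketch (extractor construction assumed); the dict input with `document_key` is the form the new per-sentence message construction supports:

```python
# Concurrent sentence-level extraction with a message log. Requires an
# engine that implements chat_async(), per the hasattr check in the diff.
frames, messages_log = sentence_extractor.extract_frames(
    text_content={"note": "Patient denies chest pain. BP 120/80."},
    entity_key="entity_text",
    document_key="note",
    concurrent=True,
    concurrent_batch_size=16,
    return_messages_log=True)
```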
@@ -963,7 +1054,7 @@ class SentenceReviewFrameExtractor(SentenceFrameExtractor):


     def extract(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=512,
-                document_key:str=None, temperature:float=0.0, stream:bool=False, **kwrs) -> List[Dict[str,str]]:
+                document_key:str=None, temperature:float=0.0, stream:bool=False, return_messages_log:bool=False, **kwrs) -> List[Dict[str,str]]:
         """
         This method inputs a text and outputs a list of outputs per sentence.

@@ -982,6 +1073,8 @@ class SentenceReviewFrameExtractor(SentenceFrameExtractor):
             the temperature for token sampling.
         stream : bool, Optional
             if True, LLM generated text will be printed in terminal in real-time.
+        return_messages_log : bool, Optional
+            if True, a list of messages will be returned.

         Return : str
             the output from LLM. Need post-processing.
@@ -996,6 +1089,9 @@ class SentenceReviewFrameExtractor(SentenceFrameExtractor):
                 raise ValueError("document_key must be provided when text_content is dict.")
             sentences = self._get_sentences(text_content[document_key])

+        if return_messages_log:
+            messages_log = []
+
         # generate sentence by sentence
         for i, sent in enumerate(sentences):
             # construct chat messages
@@ -1007,10 +1103,20 @@ class SentenceReviewFrameExtractor(SentenceFrameExtractor):

            if self.context_sentences == 0:
                # no context, just place sentence of interest
-               messages.append({'role': 'user', 'content': self._get_user_prompt(sent['sentence_text'])})
+               if isinstance(text_content, str):
+                   messages.append({'role': 'user', 'content': self._get_user_prompt(sent['sentence_text'])})
+               else:
+                   sentence_content = text_content.copy()
+                   sentence_content[document_key] = sent['sentence_text']
+                   messages.append({'role': 'user', 'content': self._get_user_prompt(sentence_content)})
            else:
                # insert context
-               messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
+               if isinstance(text_content, str):
+                   messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
+               else:
+                   context_content = text_content.copy()
+                   context_content[document_key] = context
+                   messages.append({'role': 'user', 'content': self._get_user_prompt(context_content)})
                # simulate conversation
                messages.append({'role': 'assistant', 'content': 'Sure, please provide the sentence of interest.'})
                # place sentence of interest
@@ -1033,6 +1139,7 @@ class SentenceReviewFrameExtractor(SentenceFrameExtractor):
             # Review
             if stream:
                 print(f"\n{Fore.YELLOW}Review:{Style.RESET_ALL}")
+
             messages.append({'role': 'assistant', 'content': initial})
             messages.append({'role': 'user', 'content': self.review_prompt})

@@ -1050,15 +1157,23 @@ class SentenceReviewFrameExtractor(SentenceFrameExtractor):
             elif self.review_mode == "addition":
                 gen_text = initial + '\n' + review

+            if return_messages_log:
+                messages.append({"role": "assistant", "content": review})
+                messages_log.append(messages)
+
             # add to output
             output.append({'sentence_start': sent['start'],
                            'sentence_end': sent['end'],
                            'sentence_text': sent['sentence_text'],
                            'gen_text': gen_text})
+
+        if return_messages_log:
+            return output, messages_log
+
         return output

     async def extract_async(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=512,
-                            document_key:str=None, temperature:float=0.0, concurrent_batch_size:int=32, **kwrs) -> List[Dict[str,str]]:
+                            document_key:str=None, temperature:float=0.0, concurrent_batch_size:int=32, return_messages_log:bool=False, **kwrs) -> List[Dict[str,str]]:
         """
         The asynchronous version of the extract() method.

@@ -1077,6 +1192,8 @@ class SentenceReviewFrameExtractor(SentenceFrameExtractor):
             the temperature for token sampling.
         concurrent_batch_size : int, Optional
             the number of sentences to process in concurrent.
+        return_messages_log : bool, Optional
+            if True, a list of messages will be returned.

         Return : str
             the output from LLM. Need post-processing.
@@ -1095,6 +1212,9 @@ class SentenceReviewFrameExtractor(SentenceFrameExtractor):
                 raise ValueError("document_key must be provided when text_content is dict.")
             sentences = self._get_sentences(text_content[document_key])

+        if return_messages_log:
+            messages_log = []
+
         # generate initial outputs sentence by sentence
         for i in range(0, len(sentences), concurrent_batch_size):
             messages_list = []
@@ -1111,10 +1231,20 @@ class SentenceReviewFrameExtractor(SentenceFrameExtractor):

                if self.context_sentences == 0:
                    # no context, just place sentence of interest
-                   messages.append({'role': 'user', 'content': self._get_user_prompt(sent['sentence_text'])})
+                   if isinstance(text_content, str):
+                       messages.append({'role': 'user', 'content': self._get_user_prompt(sent['sentence_text'])})
+                   else:
+                       sentence_content = text_content.copy()
+                       sentence_content[document_key] = sent['sentence_text']
+                       messages.append({'role': 'user', 'content': self._get_user_prompt(sentence_content)})
                else:
                    # insert context
-                   messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
+                   if isinstance(text_content, str):
+                       messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
+                   else:
+                       context_content = text_content.copy()
+                       context_content[document_key] = context
+                       messages.append({'role': 'user', 'content': self._get_user_prompt(context_content)})
                    # simulate conversation
                    messages.append({'role': 'assistant', 'content': 'Sure, please provide the sentence of interest.'})
                    # place sentence of interest
@@ -1175,11 +1305,19 @@ class SentenceReviewFrameExtractor(SentenceFrameExtractor):
             elif self.review_mode == "addition":
                 gen_text = init['gen_text'] + '\n' + rev['gen_text']

+            if return_messages_log:
+                messages = init["messages"]
+                messages.append({"role": "assistant", "content": rev['gen_text']})
+                messages_log.append(messages)
+
             # add to output
             output.append({'sentence_start': init['sentence_start'],
                            'sentence_end': init['sentence_end'],
                            'sentence_text': init['sentence_text'],
                            'gen_text': gen_text})
+
+        if return_messages_log:
+            return output, messages_log
         return output

@@ -1221,7 +1359,7 @@ class SentenceCoTFrameExtractor(SentenceFrameExtractor):


     def extract(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=512,
-                document_key:str=None, temperature:float=0.0, stream:bool=False, **kwrs) -> List[Dict[str,str]]:
+                document_key:str=None, temperature:float=0.0, stream:bool=False, return_messages_log:bool=False, **kwrs) -> List[Dict[str,str]]:
         """
         This method inputs a text and outputs a list of outputs per sentence.

@@ -1240,6 +1378,8 @@ class SentenceCoTFrameExtractor(SentenceFrameExtractor):
             the temperature for token sampling.
         stream : bool, Optional
             if True, LLM generated text will be printed in terminal in real-time.
+        return_messages_log : bool, Optional
+            if True, a list of messages will be returned.

         Return : str
             the output from LLM. Need post-processing.
@@ -1252,6 +1392,9 @@ class SentenceCoTFrameExtractor(SentenceFrameExtractor):
         elif isinstance(text_content, dict):
             sentences = self._get_sentences(text_content[document_key])

+        if return_messages_log:
+            messages_log = []
+
         # generate sentence by sentence
         for i, sent in enumerate(sentences):
             # construct chat messages
@@ -1263,10 +1406,20 @@ class SentenceCoTFrameExtractor(SentenceFrameExtractor):

            if self.context_sentences == 0:
                # no context, just place sentence of interest
-               messages.append({'role': 'user', 'content': self._get_user_prompt(sent['sentence_text'])})
+               if isinstance(text_content, str):
+                   messages.append({'role': 'user', 'content': self._get_user_prompt(sent['sentence_text'])})
+               else:
+                   sentence_content = text_content.copy()
+                   sentence_content[document_key] = sent['sentence_text']
+                   messages.append({'role': 'user', 'content': self._get_user_prompt(sentence_content)})
            else:
                # insert context
-               messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
+               if isinstance(text_content, str):
+                   messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
+               else:
+                   context_content = text_content.copy()
+                   context_content[document_key] = context
+                   messages.append({'role': 'user', 'content': self._get_user_prompt(context_content)})
                # simulate conversation
                messages.append({'role': 'assistant', 'content': 'Sure, please provide the sentence of interest.'})
                # place sentence of interest
@@ -1286,11 +1439,18 @@ class SentenceCoTFrameExtractor(SentenceFrameExtractor):
                                                  **kwrs
                                                  )

+            if return_messages_log:
+                messages.append({"role": "assistant", "content": gen_text})
+                messages_log.append(messages)
+
             # add to output
             output.append({'sentence_start': sent['start'],
                            'sentence_end': sent['end'],
                            'sentence_text': sent['sentence_text'],
                            'gen_text': gen_text})
+
+        if return_messages_log:
+            return output, messages_log
         return output

@@ -1361,7 +1521,7 @@ class RelationExtractor(Extractor):

     @abc.abstractmethod
     def extract_relations(self, doc:LLMInformationExtractionDocument, buffer_size:int=100, max_new_tokens:int=128,
-                          temperature:float=0.0, stream:bool=False, **kwrs) -> List[Dict]:
+                          temperature:float=0.0, stream:bool=False, return_messages_log:bool=False, **kwrs) -> List[Dict]:
         """
         This method considers all combinations of two frames.

@@ -1377,6 +1537,8 @@ class RelationExtractor(Extractor):
             the temperature for token sampling.
         stream : bool, Optional
             if True, LLM generated text will be printed in terminal in real-time.
+        return_messages_log : bool, Optional
+            if True, a list of messages will be returned.

         Return : List[Dict]
             a list of dict with {"frame_1", "frame_2"} for all relations.
@@ -1446,7 +1608,7 @@ class BinaryRelationExtractor(RelationExtractor):


     def extract(self, doc:LLMInformationExtractionDocument, buffer_size:int=100, max_new_tokens:int=128,
-                temperature:float=0.0, stream:bool=False, **kwrs) -> List[Dict]:
+                temperature:float=0.0, stream:bool=False, return_messages_log:bool=False, **kwrs) -> List[Dict]:
         """
         This method considers all combinations of two frames. Use the possible_relation_func to filter impossible pairs.
         Outputs pairs that are related.
@@ -1463,11 +1625,17 @@ class BinaryRelationExtractor(RelationExtractor):
             the temperature for token sampling.
         stream : bool, Optional
             if True, LLM generated text will be printed in terminal in real-time.
+        return_messages_log : bool, Optional
+            if True, a list of messages will be returned.

         Return : List[Dict]
             a list of dict with {"frame_1_id", "frame_2_id"}.
         """
         pairs = itertools.combinations(doc.frames, 2)
+
+        if return_messages_log:
+            messages_log = []
+
         output = []
         for frame_1, frame_2 in pairs:
             pos_rel = self.possible_relation_func(frame_1, frame_2)
@@ -1495,13 +1663,19 @@ class BinaryRelationExtractor(RelationExtractor):
                                )
                 rel_json = self._extract_json(gen_text)
                 if self._post_process(rel_json):
-                    output.append({'
+                    output.append({'frame_1_id':frame_1.frame_id, 'frame_2_id':frame_2.frame_id})
+
+                if return_messages_log:
+                    messages.append({"role": "assistant", "content": gen_text})
+                    messages_log.append(messages)

+        if return_messages_log:
+            return output, messages_log
         return output


     async def extract_async(self, doc:LLMInformationExtractionDocument, buffer_size:int=100, max_new_tokens:int=128,
-                            temperature:float=0.0, concurrent_batch_size:int=32, **kwrs) -> List[Dict]:
+                            temperature:float=0.0, concurrent_batch_size:int=32, return_messages_log:bool=False, **kwrs) -> List[Dict]:
         """
         This is the asynchronous version of the extract() method.

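For `BinaryRelationExtractor`, the output dicts now use explicit `frame_1_id`/`frame_2_id` keys, and the log records one transcript per frame pair actually sent to the LLM (pairs rejected by `possible_relation_func` never reach the model). Sketch; the document and extractor construction are assumed:

```python
# Related pairs by frame id, plus transcripts for every LLM-evaluated pair.
relations, messages_log = binary_extractor.extract(doc, return_messages_log=True)
# relations, e.g.: [{'frame_1_id': '0', 'frame_2_id': '3'}, ...]
# len(messages_log) >= len(relations): unrelated pairs still produce a transcript.
```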
@@ -1517,6 +1691,8 @@ class BinaryRelationExtractor(RelationExtractor):
             the temperature for token sampling.
         concurrent_batch_size : int, Optional
             the number of frame pairs to process in concurrent.
+        return_messages_log : bool, Optional
+            if True, a list of messages will be returned.

         Return : List[Dict]
             a list of dict with {"frame_1", "frame_2"}.
@@ -1526,12 +1702,17 @@ class BinaryRelationExtractor(RelationExtractor):
             raise NotImplementedError(f"{self.inference_engine.__class__.__name__} does not have chat_async() method.")

         pairs = itertools.combinations(doc.frames, 2)
+        if return_messages_log:
+            messages_log = []
+
         n_frames = len(doc.frames)
         num_pairs = (n_frames * (n_frames-1)) // 2
-
-        tasks = []
+        output = []
         for i in range(0, num_pairs, concurrent_batch_size):
+            rel_pair_list = []
+            tasks = []
             batch = list(itertools.islice(pairs, concurrent_batch_size))
+            batch_messages = []
             for frame_1, frame_2 in batch:
                 pos_rel = self.possible_relation_func(frame_1, frame_2)

@@ -1546,6 +1727,7 @@ class BinaryRelationExtractor(RelationExtractor):
                                                    "frame_1": str(frame_1.to_dict()),
                                                    "frame_2": str(frame_2.to_dict())}
                                                    )})
+
                 task = asyncio.create_task(
                     self.inference_engine.chat_async(
                         messages=messages,
@@ -1555,20 +1737,27 @@ class BinaryRelationExtractor(RelationExtractor):
                     )
                 )
                 tasks.append(task)
+                batch_messages.append(messages)

             responses = await asyncio.gather(*tasks)

-
-
-
-
-                    output.append(d)
+            for d, response, messages in zip(rel_pair_list, responses, batch_messages):
+                if return_messages_log:
+                    messages.append({"role": "assistant", "content": response})
+                    messages_log.append(messages)

+                rel_json = self._extract_json(response)
+                if self._post_process(rel_json):
+                    output.append(d)
+
+        if return_messages_log:
+            return output, messages_log
         return output


     def extract_relations(self, doc:LLMInformationExtractionDocument, buffer_size:int=100, max_new_tokens:int=128,
-                          temperature:float=0.0, concurrent:bool=False, concurrent_batch_size:int=32, stream:bool=False, **kwrs) -> List[Dict]:
+                          temperature:float=0.0, concurrent:bool=False, concurrent_batch_size:int=32,
+                          stream:bool=False, return_messages_log:bool=False, **kwrs) -> List[Dict]:
         """
         This method considers all combinations of two frames. Use the possible_relation_func to filter impossible pairs.

@@ -1588,6 +1777,8 @@ class BinaryRelationExtractor(RelationExtractor):
             the number of frame pairs to process in concurrent.
         stream : bool, Optional
             if True, LLM generated text will be printed in terminal in real-time.
+        return_messages_log : bool, Optional
+            if True, a list of messages will be returned.

         Return : List[Dict]
             a list of dict with {"frame_1", "frame_2"} for all relations.
@@ -1608,6 +1799,7 @@ class BinaryRelationExtractor(RelationExtractor):
                                                    max_new_tokens=max_new_tokens,
                                                    temperature=temperature,
                                                    concurrent_batch_size=concurrent_batch_size,
+                                                   return_messages_log=return_messages_log,
                                                    **kwrs)
                                                )
         else:
@@ -1616,6 +1808,7 @@ class BinaryRelationExtractor(RelationExtractor):
                                   max_new_tokens=max_new_tokens,
                                   temperature=temperature,
                                   stream=stream,
+                                  return_messages_log=return_messages_log,
                                   **kwrs)

@@ -1689,7 +1882,7 @@ class MultiClassRelationExtractor(RelationExtractor):


     def extract(self, doc:LLMInformationExtractionDocument, buffer_size:int=100, max_new_tokens:int=128,
-                temperature:float=0.0, stream:bool=False, **kwrs) -> List[Dict]:
+                temperature:float=0.0, stream:bool=False, return_messages_log:bool=False, **kwrs) -> List[Dict]:
         """
         This method considers all combinations of two frames. Use the possible_relation_types_func to filter impossible pairs.

@@ -1705,11 +1898,17 @@ class MultiClassRelationExtractor(RelationExtractor):
             the temperature for token sampling.
         stream : bool, Optional
             if True, LLM generated text will be printed in terminal in real-time.
+        return_messages_log : bool, Optional
+            if True, a list of messages will be returned.

         Return : List[Dict]
-            a list of dict with {"
+            a list of dict with {"frame_1_id", "frame_2_id", "relation"} for all frame pairs.
         """
         pairs = itertools.combinations(doc.frames, 2)
+
+        if return_messages_log:
+            messages_log = []
+
         output = []
         for frame_1, frame_2 in pairs:
             pos_rel_types = self.possible_relation_types_func(frame_1, frame_2)
@@ -1736,16 +1935,23 @@ class MultiClassRelationExtractor(RelationExtractor):
                                    stream=stream,
                                    **kwrs
                                    )
+
+            if return_messages_log:
+                messages.append({"role": "assistant", "content": gen_text})
+                messages_log.append(messages)
+
             rel_json = self._extract_json(gen_text)
             rel = self._post_process(rel_json, pos_rel_types)
             if rel:
-                output.append({'
+                output.append({'frame_1_id':frame_1.frame_id, 'frame_2_id':frame_2.frame_id, 'relation':rel})

+        if return_messages_log:
+            return output, messages_log
         return output


     async def extract_async(self, doc:LLMInformationExtractionDocument, buffer_size:int=100, max_new_tokens:int=128,
-                            temperature:float=0.0, concurrent_batch_size:int=32, **kwrs) -> List[Dict]:
+                            temperature:float=0.0, concurrent_batch_size:int=32, return_messages_log:bool=False, **kwrs) -> List[Dict]:
         """
         This is the asynchronous version of the extract() method.

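`MultiClassRelationExtractor` returns the same pair keys plus a `relation` label chosen from the types allowed by `possible_relation_types_func`. Sketch (construction assumed; the label shown is illustrative):

```python
# Labeled relations between frame pairs, with the optional message log.
relations, messages_log = mc_extractor.extract(doc, return_messages_log=True)
# relations, e.g.:
# [{'frame_1_id': '0', 'frame_2_id': '3', 'relation': 'treats'}, ...]
```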
@@ -1761,21 +1967,28 @@ class MultiClassRelationExtractor(RelationExtractor):
             the temperature for token sampling.
         concurrent_batch_size : int, Optional
             the number of frame pairs to process in concurrent.
+        return_messages_log : bool, Optional
+            if True, a list of messages will be returned.

         Return : List[Dict]
-            a list of dict with {"
+            a list of dict with {"frame_1_id", "frame_2_id", "relation"} for all frame pairs.
         """
         # Check if self.inference_engine.chat_async() is implemented
         if not hasattr(self.inference_engine, 'chat_async'):
             raise NotImplementedError(f"{self.inference_engine.__class__.__name__} does not have chat_async() method.")

         pairs = itertools.combinations(doc.frames, 2)
+        if return_messages_log:
+            messages_log = []
+
         n_frames = len(doc.frames)
         num_pairs = (n_frames * (n_frames-1)) // 2
-
-        tasks = []
+        output = []
         for i in range(0, num_pairs, concurrent_batch_size):
+            rel_pair_list = []
+            tasks = []
             batch = list(itertools.islice(pairs, concurrent_batch_size))
+            batch_messages = []
             for frame_1, frame_2 in batch:
                 pos_rel_types = self.possible_relation_types_func(frame_1, frame_2)

@@ -1800,21 +2013,28 @@ class MultiClassRelationExtractor(RelationExtractor):
                     )
                 )
                 tasks.append(task)
+                batch_messages.append(messages)

             responses = await asyncio.gather(*tasks)

-
-
-
-
-
-
+            for d, response, messages in zip(rel_pair_list, responses, batch_messages):
+                if return_messages_log:
+                    messages.append({"role": "assistant", "content": response})
+                    messages_log.append(messages)
+
+                rel_json = self._extract_json(response)
+                rel = self._post_process(rel_json, d['pos_rel_types'])
+                if rel:
+                    output.append({'frame_1_id':d['frame_1'], 'frame_2_id':d['frame_2'], 'relation':rel})

+        if return_messages_log:
+            return output, messages_log
         return output


     def extract_relations(self, doc:LLMInformationExtractionDocument, buffer_size:int=100, max_new_tokens:int=128,
-                          temperature:float=0.0, concurrent:bool=False, concurrent_batch_size:int=32, stream:bool=False, **kwrs) -> List[Dict]:
+                          temperature:float=0.0, concurrent:bool=False, concurrent_batch_size:int=32,
+                          stream:bool=False, return_messages_log:bool=False, **kwrs) -> List[Dict]:
         """
         This method considers all combinations of two frames. Use the possible_relation_types_func to filter impossible pairs.

@@ -1834,6 +2054,8 @@ class MultiClassRelationExtractor(RelationExtractor):
             the number of frame pairs to process in concurrent.
         stream : bool, Optional
             if True, LLM generated text will be printed in terminal in real-time.
+        return_messages_log : bool, Optional
+            if True, a list of messages will be returned.

         Return : List[Dict]
             a list of dict with {"frame_1", "frame_2", "relation"} for all relations.
|
|
|
1854
2076
|
max_new_tokens=max_new_tokens,
|
|
1855
2077
|
temperature=temperature,
|
|
1856
2078
|
concurrent_batch_size=concurrent_batch_size,
|
|
2079
|
+
return_messages_log=return_messages_log,
|
|
1857
2080
|
**kwrs)
|
|
1858
2081
|
)
|
|
1859
2082
|
else:
|
|
@@ -1862,5 +2085,6 @@ class MultiClassRelationExtractor(RelationExtractor):
                                   max_new_tokens=max_new_tokens,
                                   temperature=temperature,
                                   stream=stream,
+                                  return_messages_log=return_messages_log,
                                   **kwrs)
