llm-ie 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff compares publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registry.
- llm_ie/__init__.py +2 -2
- llm_ie/engines.py +497 -250
- llm_ie/extractors.py +100 -251
- llm_ie/prompt_editor.py +13 -13
- {llm_ie-1.0.0.dist-info → llm_ie-1.1.0.dist-info}/METADATA +2 -2
- {llm_ie-1.0.0.dist-info → llm_ie-1.1.0.dist-info}/RECORD +7 -7
- {llm_ie-1.0.0.dist-info → llm_ie-1.1.0.dist-info}/WHEEL +0 -0
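The change running through this release is an API cleanup: per-call generation parameters (max_new_tokens, temperature, and the catch-all **kwrs) are removed from the extractor methods, and a verbose flag replaces the per-call stream arguments. A minimal usage sketch of the new calling convention follows; the engine class and its constructor arguments are assumptions (llm_ie/engines.py also changed in this release and is not shown in this section), while the extractor-facing parameters are taken from the extractors.py hunks below.

# Sketch only: OllamaInferenceEngine and model_name are assumed. Generation
# settings (max tokens, temperature) now belong to the engine layer rather
# than to each extractor call.
from llm_ie.engines import OllamaInferenceEngine
from llm_ie.extractors import BasicFrameExtractor

engine = OllamaInferenceEngine(model_name="llama3.1:8b")  # assumed constructor
extractor = BasicFrameExtractor(
    inference_engine=engine,
    prompt_template='Return JSON lines with an "entity_text" key.\n\nInput:\n{{input}}',
)

# 1.0.0 style (no longer accepted): generation settings threaded through the call.
# frames = extractor.extract_frames(text_content=note, entity_key="entity_text",
#                                   max_new_tokens=1024, temperature=0.0)

# 1.1.0 style: extraction options only; verbose=True prints generation live.
frames = extractor.extract_frames(text_content="Patient denies chest pain.", verbose=True)
print(frames)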
llm_ie/extractors.py
CHANGED
@@ -17,7 +17,7 @@ from colorama import Fore, Style
 
 
 class Extractor:
-    def __init__(self, inference_engine:InferenceEngine, prompt_template:str, system_prompt:str=None, **kwrs):
+    def __init__(self, inference_engine:InferenceEngine, prompt_template:str, system_prompt:str=None):
         """
         This is the abstract class for (frame and relation) extractors.
         Input LLM inference engine, system prompt (optional), prompt template (with instruction, few-shot examples).
@@ -172,7 +172,7 @@ class Extractor:
 class FrameExtractor(Extractor):
     from nltk.tokenize import RegexpTokenizer
     def __init__(self, inference_engine:InferenceEngine, unit_chunker:UnitChunker,
-                 prompt_template:str, system_prompt:str=None, context_chunker:ContextChunker=None, **kwrs):
+                 prompt_template:str, system_prompt:str=None, context_chunker:ContextChunker=None):
         """
         This is the abstract class for frame extraction.
         Input LLM inference engine, system prompt (optional), prompt template (with instruction, few-shot examples).
@@ -192,8 +192,7 @@ class FrameExtractor(Extractor):
         """
         super().__init__(inference_engine=inference_engine,
                          prompt_template=prompt_template,
-                         system_prompt=system_prompt,
-                         **kwrs)
+                         system_prompt=system_prompt)
 
         self.unit_chunker = unit_chunker
         if context_chunker is None:
@@ -332,7 +331,7 @@ class FrameExtractor(Extractor):
         return entity_spans
 
     @abc.abstractmethod
-    def extract(self, text_content:Union[str, Dict[str,str]],
+    def extract(self, text_content:Union[str, Dict[str,str]], return_messages_log:bool=False, **kwrs) -> str:
         """
         This method inputs text content and outputs a string generated by LLM
 
@@ -342,8 +341,6 @@ class FrameExtractor(Extractor):
            the input text content to put in prompt template.
            If str, the prompt template must has only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
            If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
-        max_new_tokens : str, Optional
-            the max number of new tokens LLM can generate.
         return_messages_log : bool, Optional
            if True, a list of messages will be returned.
 
@@ -354,7 +351,7 @@ class FrameExtractor(Extractor):
 
 
     @abc.abstractmethod
-    def extract_frames(self, text_content:Union[str, Dict[str,str]], entity_key:str, 
+    def extract_frames(self, text_content:Union[str, Dict[str,str]], entity_key:str,
                        document_key:str=None, return_messages_log:bool=False, **kwrs) -> List[LLMInformationExtractionFrame]:
         """
         This method inputs text content and outputs a list of LLMInformationExtractionFrame
@@ -368,8 +365,6 @@ class FrameExtractor(Extractor):
            If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
         entity_key : str
            the key (in ouptut JSON) for entity text. Any extraction that does not include entity key will be dropped.
-        max_new_tokens : str, Optional
-            the max number of new tokens LLM should generate.
         document_key : str, Optional
            specify the key in text_content where document text is.
            If text_content is str, this parameter will be ignored.
@@ -384,7 +379,7 @@ class FrameExtractor(Extractor):
 
 class DirectFrameExtractor(FrameExtractor):
     def __init__(self, inference_engine:InferenceEngine, unit_chunker:UnitChunker,
-                 prompt_template:str, system_prompt:str=None, context_chunker:ContextChunker=None, **kwrs):
+                 prompt_template:str, system_prompt:str=None, context_chunker:ContextChunker=None):
         """
         This class is for general unit-context frame extraction.
         Input LLM inference engine, system prompt (optional), prompt template (with instruction, few-shot examples).
@@ -406,12 +401,11 @@ class DirectFrameExtractor(FrameExtractor):
                          unit_chunker=unit_chunker,
                          prompt_template=prompt_template,
                          system_prompt=system_prompt,
-                         context_chunker=context_chunker,
-                         **kwrs)
+                         context_chunker=context_chunker)
 
 
-    def extract(self, text_content:Union[str, Dict[str,str]],
-                document_key:str=None,
+    def extract(self, text_content:Union[str, Dict[str,str]],
+                document_key:str=None, verbose:bool=False, return_messages_log:bool=False) -> List[FrameExtractionUnitResult]:
         """
         This method inputs a text and outputs a list of outputs per unit.
 
@@ -421,13 +415,9 @@ class DirectFrameExtractor(FrameExtractor):
            the input text content to put in prompt template.
            If str, the prompt template must has only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
            If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
-        max_new_tokens : int, Optional
-            the max number of new tokens LLM should generate.
         document_key : str, Optional
            specify the key in text_content where document text is.
            If text_content is str, this parameter will be ignored.
-        temperature : float, Optional
-            the temperature for token sampling.
         verbose : bool, Optional
            if True, LLM generated text will be printed in terminal in real-time.
         return_messages_log : bool, Optional
@@ -491,27 +481,12 @@ class DirectFrameExtractor(FrameExtractor):
 
                 print(f"{Fore.BLUE}Extraction:{Style.RESET_ALL}")
 
-                response_stream = self.inference_engine.chat(
-                    messages=messages,
-                    max_new_tokens=max_new_tokens,
-                    temperature=temperature,
-                    stream=True,
-                    **kwrs
-                )
-
-                gen_text = ""
-                for chunk in response_stream:
-                    gen_text += chunk
-                    print(chunk, end='', flush=True)
 
-
-
-
-
-
-                    stream=False,
-                    **kwrs
-                )
+            gen_text = self.inference_engine.chat(
+                messages=messages,
+                verbose=verbose,
+                stream=False
+            )
 
             if return_messages_log:
                 messages.append({"role": "assistant", "content": gen_text})
@@ -530,8 +505,8 @@ class DirectFrameExtractor(FrameExtractor):
 
         return output
 
-    def stream(self, text_content: Union[str, Dict[str, str]],
-
+    def stream(self, text_content: Union[str, Dict[str, str]],
+               document_key: str = None) -> Generator[Dict[str, Any], None, List[FrameExtractionUnitResult]]:
         """
         Streams LLM responses per unit with structured event types,
         and returns collected data for post-processing.
@@ -542,7 +517,8 @@ class DirectFrameExtractor(FrameExtractor):
         - {"type": "info", "data": str_message}: General informational messages.
         - {"type": "unit", "data": dict_unit_info}: Signals start of a new unit. dict_unit_info contains {'id', 'text', 'start', 'end'}
         - {"type": "context", "data": str_context}: Context string for the current unit.
-        - {"type": "
+        - {"type": "reasoning", "data": str_chunk}: A reasoning model thinking chunk from the LLM.
+        - {"type": "response", "data": str_chunk}: A response/answer chunk from the LLM.
 
         Returns:
         --------
@@ -601,13 +577,10 @@ class DirectFrameExtractor(FrameExtractor):
 
             response_stream = self.inference_engine.chat(
                 messages=messages,
-
-                temperature=temperature,
-                stream=True,
-                **kwrs
+                stream=True
             )
             for chunk in response_stream:
-                yield
+                yield chunk
                 current_gen_text += chunk
 
             # Store the result for this unit
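The stream() changes above define a typed event protocol: the extractor now forwards event dicts from the engine (yield chunk), and the docstring adds separate reasoning and response chunk types. A sketch of a consumer loop; the event names and the unit payload keys come from the docstring above, the extractor setup is assumed as in the earlier sketch, and the collected results are read from StopIteration.value per the Generator[Dict[str, Any], None, List[FrameExtractionUnitResult]] annotation.

# Sketch only: drives DirectFrameExtractor.stream() and dispatches on event type.
def run_stream(extractor, note: str) -> None:
    gen = extractor.stream(text_content=note)
    while True:
        try:
            event = next(gen)
        except StopIteration as stop:
            # The generator's return value is the list of per-unit results.
            print(f"\ncollected {len(stop.value)} unit results")
            break
        if event["type"] == "unit":
            info = event["data"]  # {'id', 'text', 'start', 'end'} per the docstring
            print(f"\n--- unit {info['id']} [{info['start']}:{info['end']}] ---")
        elif event["type"] == "reasoning":
            print(event["data"], end="")  # reasoning-model thinking chunks
        elif event["type"] == "response":
            print(event["data"], end="")  # answer chunks
        elif event["type"] in ("context", "info"):
            print(f"\n[{event['type']}] {event['data']}")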
@@ -622,8 +595,8 @@ class DirectFrameExtractor(FrameExtractor):
         yield {"type": "info", "data": "All units processed by LLM."}
         return collected_results
 
-    async def extract_async(self, text_content:Union[str, Dict[str,str]],
-                            concurrent_batch_size:int=32, return_messages_log:bool=False
+    async def extract_async(self, text_content:Union[str, Dict[str,str]], document_key:str=None,
+                            concurrent_batch_size:int=32, return_messages_log:bool=False) -> List[FrameExtractionUnitResult]:
         """
         This is the asynchronous version of the extract() method.
 
@@ -633,13 +606,9 @@ class DirectFrameExtractor(FrameExtractor):
            the input text content to put in prompt template.
            If str, the prompt template must has only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
            If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
-        max_new_tokens : int, Optional
-            the max number of new tokens LLM should generate.
         document_key : str, Optional
            specify the key in text_content where document text is.
            If text_content is str, this parameter will be ignored.
-        temperature : float, Optional
-            the temperature for token sampling.
         concurrent_batch_size : int, Optional
            the batch size for concurrent processing.
         return_messages_log : bool, Optional
@@ -701,17 +670,14 @@ class DirectFrameExtractor(FrameExtractor):
         # Process units concurrently with asyncio.Semaphore
         semaphore = asyncio.Semaphore(concurrent_batch_size)
 
-        async def semaphore_helper(task_data: Dict,
+        async def semaphore_helper(task_data: Dict, **kwrs):
             unit = task_data["unit"]
             messages = task_data["messages"]
             original_index = task_data["original_index"]
 
             async with semaphore:
                 gen_text = await self.inference_engine.chat_async(
-                    messages=messages,
-                    max_new_tokens=max_new_tokens,
-                    temperature=temperature,
-                    **kwrs
+                    messages=messages
                 )
                 return {"original_index": original_index, "unit": unit, "gen_text": gen_text, "messages": messages}
 
@@ -719,10 +685,7 @@ class DirectFrameExtractor(FrameExtractor):
         tasks = []
         for task_inp in tasks_input:
             task = asyncio.create_task(semaphore_helper(
-                task_inp,
-                max_new_tokens=max_new_tokens,
-                temperature=temperature,
-                **kwrs
+                task_inp
             ))
             tasks.append(task)
 
@@ -759,11 +722,10 @@ class DirectFrameExtractor(FrameExtractor):
         return output
 
 
-    def extract_frames(self, text_content:Union[str, Dict[str,str]],
-
-                       concurrent:bool=False, concurrent_batch_size:int=32,
+    def extract_frames(self, text_content:Union[str, Dict[str,str]], document_key:str=None,
+                       verbose:bool=False, concurrent:bool=False, concurrent_batch_size:int=32,
                        case_sensitive:bool=False, fuzzy_match:bool=True, fuzzy_buffer_size:float=0.2, fuzzy_score_cutoff:float=0.8,
-                       allow_overlap_entities:bool=False, return_messages_log:bool=False
+                       allow_overlap_entities:bool=False, return_messages_log:bool=False) -> List[LLMInformationExtractionFrame]:
         """
         This method inputs a text and outputs a list of LLMInformationExtractionFrame
         It use the extract() method and post-process outputs into frames.
@@ -774,13 +736,9 @@ class DirectFrameExtractor(FrameExtractor):
            the input text content to put in prompt template.
            If str, the prompt template must has only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
            If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
-        max_new_tokens : str, Optional
-            the max number of new tokens LLM should generate.
         document_key : str, Optional
            specify the key in text_content where document text is.
            If text_content is str, this parameter will be ignored.
-        temperature : float, Optional
-            the temperature for token sampling.
         verbose : bool, Optional
            if True, LLM generated text will be printed in terminal in real-time.
         concurrent : bool, Optional
@@ -812,21 +770,15 @@ class DirectFrameExtractor(FrameExtractor):
 
             nest_asyncio.apply() # For Jupyter notebook. Terminal does not need this.
             extraction_results = asyncio.run(self.extract_async(text_content=text_content,
-                                                                max_new_tokens=max_new_tokens,
                                                                 document_key=document_key,
-                                                                temperature=temperature,
                                                                 concurrent_batch_size=concurrent_batch_size,
-                                                                return_messages_log=return_messages_log,
-                                                                **kwrs)
+                                                                return_messages_log=return_messages_log)
                                              )
         else:
             extraction_results = self.extract(text_content=text_content,
-                                              max_new_tokens=max_new_tokens,
                                               document_key=document_key,
-                                              temperature=temperature,
                                               verbose=verbose,
-                                              return_messages_log=return_messages_log,
-                                              **kwrs)
+                                              return_messages_log=return_messages_log)
 
         llm_output_results, messages_log = extraction_results if return_messages_log else (extraction_results, None)
 
@@ -869,8 +821,8 @@ class DirectFrameExtractor(FrameExtractor):
 
 
 class ReviewFrameExtractor(DirectFrameExtractor):
-    def __init__(self, unit_chunker:UnitChunker, context_chunker:ContextChunker,
-
+    def __init__(self, unit_chunker:UnitChunker, context_chunker:ContextChunker, inference_engine:InferenceEngine,
+                 prompt_template:str, review_mode:str, review_prompt:str=None, system_prompt:str=None):
         """
         This class add a review step after the DirectFrameExtractor.
         The Review process asks LLM to review its output and:
@@ -901,8 +853,7 @@ class ReviewFrameExtractor(DirectFrameExtractor):
                          unit_chunker=unit_chunker,
                          prompt_template=prompt_template,
                          system_prompt=system_prompt,
-                         context_chunker=context_chunker,
-                         **kwrs)
+                         context_chunker=context_chunker)
         # check review mode
         if review_mode not in {"addition", "revision"}:
             raise ValueError('review_mode must be one of {"addition", "revision"}.')
@@ -939,8 +890,8 @@ class ReviewFrameExtractor(DirectFrameExtractor):
         if self.review_prompt is None:
             raise ValueError(f"Cannot find review prompt for {self.__class__.__name__} in the package. Please provide a review_prompt.")
 
-    def extract(self, text_content:Union[str, Dict[str,str]],
-
+    def extract(self, text_content:Union[str, Dict[str,str]], document_key:str=None,
+                verbose:bool=False, return_messages_log:bool=False) -> List[FrameExtractionUnitResult]:
         """
         This method inputs a text and outputs a list of outputs per unit.
 
@@ -950,13 +901,9 @@ class ReviewFrameExtractor(DirectFrameExtractor):
            the input text content to put in prompt template.
            If str, the prompt template must has only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
            If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
-        max_new_tokens : int, Optional
-            the max number of new tokens LLM should generate.
         document_key : str, Optional
            specify the key in text_content where document text is.
            If text_content is str, this parameter will be ignored.
-        temperature : float, Optional
-            the temperature for token sampling.
         verbose : bool, Optional
            if True, LLM generated text will be printed in terminal in real-time.
         return_messages_log : bool, Optional
@@ -1020,28 +967,13 @@ class ReviewFrameExtractor(DirectFrameExtractor):
                 print(f"{Fore.YELLOW}Context:{Style.RESET_ALL}\n{context}\n")
 
                 print(f"{Fore.BLUE}Extraction:{Style.RESET_ALL}")
-
-                response_stream = self.inference_engine.chat(
-                    messages=messages,
-                    max_new_tokens=max_new_tokens,
-                    temperature=temperature,
-                    stream=True,
-                    **kwrs
-                )
-
-                initial = ""
-                for chunk in response_stream:
-                    initial += chunk
-                    print(chunk, end='', flush=True)
 
-
-
-
-
-
-                    **kwrs
-                )
+
+            initial = self.inference_engine.chat(
+                messages=messages,
+                verbose=verbose,
+                stream=False
+            )
 
             if return_messages_log:
                 messages.append({"role": "assistant", "content": initial})
@@ -1053,29 +985,12 @@ class ReviewFrameExtractor(DirectFrameExtractor):
 
             messages.append({'role': 'assistant', 'content': initial})
             messages.append({'role': 'user', 'content': self.review_prompt})
-
-            if verbose:
-                response_stream = self.inference_engine.chat(
-                    messages=messages,
-                    max_new_tokens=max_new_tokens,
-                    temperature=temperature,
-                    stream=True,
-                    **kwrs
-                )
-
-                review = ""
-                for chunk in response_stream:
-                    review += chunk
-                    print(chunk, end='', flush=True)
 
-
-
-
-
-                    stream=False,
-                    **kwrs
-                )
+            review = self.inference_engine.chat(
+                messages=messages,
+                verbose=verbose,
+                stream=False
+            )
 
             # Output
             if self.review_mode == "revision":
@@ -1101,8 +1016,7 @@ class ReviewFrameExtractor(DirectFrameExtractor):
         return output
 
 
-    def stream(self, text_content:Union[str, Dict[str,str]],
-               document_key:str=None, temperature:float=0.0, **kwrs) -> Generator[str, None, None]:
+    def stream(self, text_content:Union[str, Dict[str,str]], document_key:str=None) -> Generator[str, None, None]:
         """
         This method inputs a text and outputs a list of outputs per unit.
 
@@ -1112,13 +1026,9 @@ class ReviewFrameExtractor(DirectFrameExtractor):
            the input text content to put in prompt template.
            If str, the prompt template must has only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
            If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
-        max_new_tokens : int, Optional
-            the max number of new tokens LLM should generate.
         document_key : str, Optional
            specify the key in text_content where document text is.
            If text_content is str, this parameter will be ignored.
-        temperature : float, Optional
-            the temperature for token sampling.
 
         Return : List[FrameExtractionUnitResult]
            the output from LLM for each unit. Contains the start, end, text, and generated text.
@@ -1176,10 +1086,7 @@ class ReviewFrameExtractor(DirectFrameExtractor):
 
             response_stream = self.inference_engine.chat(
                 messages=messages,
-
-                temperature=temperature,
-                stream=True,
-                **kwrs
+                stream=True
             )
 
             initial = ""
@@ -1195,16 +1102,13 @@ class ReviewFrameExtractor(DirectFrameExtractor):
 
             response_stream = self.inference_engine.chat(
                 messages=messages,
-
-                temperature=temperature,
-                stream=True,
-                **kwrs
+                stream=True
             )
 
             for chunk in response_stream:
                 yield chunk
 
-    async def extract_async(self, text_content:Union[str, Dict[str,str]],
+    async def extract_async(self, text_content:Union[str, Dict[str,str]], document_key:str=None,
                             concurrent_batch_size:int=32, return_messages_log:bool=False, **kwrs) -> List[FrameExtractionUnitResult]:
         """
         This is the asynchronous version of the extract() method with the review step.
@@ -1215,13 +1119,9 @@ class ReviewFrameExtractor(DirectFrameExtractor):
            the input text content to put in prompt template.
            If str, the prompt template must has only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
            If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
-        max_new_tokens : int, Optional
-            the max number of new tokens LLM should generate.
         document_key : str, Optional
            specify the key in text_content where document text is.
            If text_content is str, this parameter will be ignored.
-        temperature : float, Optional
-            the temperature for token sampling.
         concurrent_batch_size : int, Optional
            the batch size for concurrent processing.
         return_messages_log : bool, Optional
@@ -1282,17 +1182,14 @@ class ReviewFrameExtractor(DirectFrameExtractor):
 
         semaphore = asyncio.Semaphore(concurrent_batch_size)
 
-        async def initial_semaphore_helper(task_data: Dict
+        async def initial_semaphore_helper(task_data: Dict):
             unit = task_data["unit"]
             messages = task_data["messages"]
             original_index = task_data["original_index"]
 
             async with semaphore:
                 gen_text = await self.inference_engine.chat_async(
-                    messages=messages,
-                    max_new_tokens=max_new_tokens,
-                    temperature=temperature,
-                    **kwrs
+                    messages=messages
                 )
                 # Return initial generation result along with the messages used and the unit
                 return {"original_index": original_index, "unit": unit, "initial_gen_text": gen_text, "initial_messages": messages}
@@ -1300,10 +1197,7 @@ class ReviewFrameExtractor(DirectFrameExtractor):
         # Create and gather initial generation tasks
         initial_tasks = [
             asyncio.create_task(initial_semaphore_helper(
-                task_inp,
-                max_new_tokens=max_new_tokens,
-                temperature=temperature,
-                **kwrs
+                task_inp
             ))
             for task_inp in initial_tasks_input
         ]
@@ -1333,16 +1227,13 @@ class ReviewFrameExtractor(DirectFrameExtractor):
             })
 
 
-        async def review_semaphore_helper(task_data: Dict,
+        async def review_semaphore_helper(task_data: Dict, **kwrs):
             messages = task_data["messages"]
             original_index = task_data["original_index"]
 
             async with semaphore:
                 review_gen_text = await self.inference_engine.chat_async(
-                    messages=messages,
-                    max_new_tokens=max_new_tokens,
-                    temperature=temperature,
-                    **kwrs
+                    messages=messages
                 )
                 # Combine initial and review results
                 task_data["review_gen_text"] = review_gen_text
@@ -1354,10 +1245,7 @@ class ReviewFrameExtractor(DirectFrameExtractor):
         # Create and gather review tasks
         review_tasks = [
             asyncio.create_task(review_semaphore_helper(
-                task_inp,
-                max_new_tokens=max_new_tokens,
-                temperature=temperature,
-                **kwrs
+                task_inp
             ))
             for task_inp in review_tasks_input
         ]
@@ -1405,7 +1293,7 @@ class ReviewFrameExtractor(DirectFrameExtractor):
 
 
 class BasicFrameExtractor(DirectFrameExtractor):
-    def __init__(self, inference_engine:InferenceEngine, prompt_template:str, system_prompt:str=None, **kwrs):
+    def __init__(self, inference_engine:InferenceEngine, prompt_template:str, system_prompt:str=None):
         """
         This class diretly prompt LLM for frame extraction.
         Input system prompt (optional), prompt template (with instruction, few-shot examples),
@@ -1424,11 +1312,10 @@ class BasicFrameExtractor(DirectFrameExtractor):
                          unit_chunker=WholeDocumentUnitChunker(),
                          prompt_template=prompt_template,
                          system_prompt=system_prompt,
-                         context_chunker=NoContextChunker(),
-                         **kwrs)
+                         context_chunker=NoContextChunker())
 
 class BasicReviewFrameExtractor(ReviewFrameExtractor):
-    def __init__(self, inference_engine:InferenceEngine, prompt_template:str, review_mode:str, review_prompt:str=None, system_prompt:str=None, **kwrs):
+    def __init__(self, inference_engine:InferenceEngine, prompt_template:str, review_mode:str, review_prompt:str=None, system_prompt:str=None):
         """
         This class add a review step after the BasicFrameExtractor.
         The Review process asks LLM to review its output and:
@@ -1457,13 +1344,12 @@ class BasicReviewFrameExtractor(ReviewFrameExtractor):
                          review_mode=review_mode,
                          review_prompt=review_prompt,
                          system_prompt=system_prompt,
-                         context_chunker=NoContextChunker(),
-                         **kwrs)
+                         context_chunker=NoContextChunker())
 
 
 class SentenceFrameExtractor(DirectFrameExtractor):
     def __init__(self, inference_engine:InferenceEngine, prompt_template:str, system_prompt:str=None,
-                 context_sentences:Union[str, int]="all", **kwrs):
+                 context_sentences:Union[str, int]="all"):
         """
         This class performs sentence-by-sentence information extraction.
         The process is as follows:
@@ -1507,14 +1393,13 @@ class SentenceFrameExtractor(DirectFrameExtractor):
                          unit_chunker=SentenceUnitChunker(),
                          prompt_template=prompt_template,
                          system_prompt=system_prompt,
-                         context_chunker=context_chunker,
-                         **kwrs)
+                         context_chunker=context_chunker)
 
 
 class SentenceReviewFrameExtractor(ReviewFrameExtractor):
     def __init__(self, inference_engine:InferenceEngine, prompt_template:str,
                  review_mode:str, review_prompt:str=None, system_prompt:str=None,
-                 context_sentences:Union[str, int]="all", **kwrs):
+                 context_sentences:Union[str, int]="all"):
         """
         This class adds a review step after the SentenceFrameExtractor.
         For each sentence, the review process asks LLM to review its output and:
@@ -1561,12 +1446,11 @@ class SentenceReviewFrameExtractor(ReviewFrameExtractor):
                          review_mode=review_mode,
                          review_prompt=review_prompt,
                          system_prompt=system_prompt,
-                         context_chunker=context_chunker,
-                         **kwrs)
+                         context_chunker=context_chunker)
 
 
 class RelationExtractor(Extractor):
-    def __init__(self, inference_engine:InferenceEngine, prompt_template:str, system_prompt:str=None, **kwrs):
+    def __init__(self, inference_engine:InferenceEngine, prompt_template:str, system_prompt:str=None):
         """
         This is the abstract class for relation extraction.
         Input LLM inference engine, system prompt (optional), prompt template (with instruction, few-shot examples).
@@ -1582,8 +1466,7 @@ class RelationExtractor(Extractor):
         """
         super().__init__(inference_engine=inference_engine,
                          prompt_template=prompt_template,
-                         system_prompt=system_prompt,
-                         **kwrs)
+                         system_prompt=system_prompt)
 
     def _get_ROI(self, frame_1:LLMInformationExtractionFrame, frame_2:LLMInformationExtractionFrame,
                  text:str, buffer_size:int=100) -> str:
@@ -1659,7 +1542,7 @@ class RelationExtractor(Extractor):
 
 class BinaryRelationExtractor(RelationExtractor):
     def __init__(self, inference_engine:InferenceEngine, prompt_template:str, possible_relation_func: Callable,
-                 system_prompt:str=None, **kwrs):
+                 system_prompt:str=None):
         """
         This class extracts binary (yes/no) relations between two entities.
         Input LLM inference engine, system prompt (optional), prompt template (with instruction, few-shot examples).
@@ -1677,8 +1560,7 @@ class BinaryRelationExtractor(RelationExtractor):
         """
         super().__init__(inference_engine=inference_engine,
                          prompt_template=prompt_template,
-                         system_prompt=system_prompt,
-                         **kwrs)
+                         system_prompt=system_prompt)
 
         if possible_relation_func:
             # Check if possible_relation_func is a function
@@ -1718,8 +1600,8 @@ class BinaryRelationExtractor(RelationExtractor):
                 return False
 
 
-    def extract(self, doc:LLMInformationExtractionDocument, buffer_size:int=100,
-
+    def extract(self, doc:LLMInformationExtractionDocument, buffer_size:int=100, verbose:bool=False,
+                return_messages_log:bool=False) -> List[Dict]:
         """
         This method considers all combinations of two frames. Use the possible_relation_func to filter impossible pairs.
         Outputs pairs that are related.
@@ -1730,11 +1612,7 @@ class BinaryRelationExtractor(RelationExtractor):
            a document with frames.
         buffer_size : int, Optional
            the number of characters before and after the two frames in the ROI text.
-
-            the max number of new tokens LLM should generate.
-        temperature : float, Optional
-            the temperature for token sampling.
-        stream : bool, Optional
+        verbose : bool, Optional
            if True, LLM generated text will be printed in terminal in real-time.
         return_messages_log : bool, Optional
            if True, a list of messages will be returned.
@@ -1753,7 +1631,7 @@ class BinaryRelationExtractor(RelationExtractor):
 
             if pos_rel:
                 roi_text = self._get_ROI(frame_1, frame_2, doc.text, buffer_size=buffer_size)
-                if stream:
+                if verbose:
                     print(f"\n\n{Fore.GREEN}ROI text:{Style.RESET_ALL} \n{roi_text}\n")
                     print(f"{Fore.BLUE}Extraction:{Style.RESET_ALL}")
                 messages = []
@@ -1767,10 +1645,7 @@ class BinaryRelationExtractor(RelationExtractor):
 
                 gen_text = self.inference_engine.chat(
                     messages=messages,
-
-                    temperature=temperature,
-                    stream=stream,
-                    **kwrs
+                    verbose=verbose
                 )
                 rel_json = self._extract_json(gen_text)
                 if self._post_process(rel_json):
@@ -1785,8 +1660,8 @@ class BinaryRelationExtractor(RelationExtractor):
         return output
 
 
-    async def extract_async(self, doc:LLMInformationExtractionDocument, buffer_size:int=100,
-
+    async def extract_async(self, doc:LLMInformationExtractionDocument, buffer_size:int=100,
+                            concurrent_batch_size:int=32, return_messages_log:bool=False) -> List[Dict]:
         """
         This is the asynchronous version of the extract() method.
 
@@ -1841,10 +1716,7 @@ class BinaryRelationExtractor(RelationExtractor):
 
                 task = asyncio.create_task(
                     self.inference_engine.chat_async(
-                        messages=messages,
-                        max_new_tokens=max_new_tokens,
-                        temperature=temperature,
-                        **kwrs
+                        messages=messages
                     )
                 )
                 tasks.append(task)
@@ -1866,9 +1738,9 @@ class BinaryRelationExtractor(RelationExtractor):
         return output
 
 
-    def extract_relations(self, doc:LLMInformationExtractionDocument, buffer_size:int=100,
-
-
+    def extract_relations(self, doc:LLMInformationExtractionDocument, buffer_size:int=100,
+                          concurrent:bool=False, concurrent_batch_size:int=32, verbose:bool=False,
+                          return_messages_log:bool=False) -> List[Dict]:
         """
         This method considers all combinations of two frames. Use the possible_relation_func to filter impossible pairs.
 
@@ -1878,15 +1750,11 @@ class BinaryRelationExtractor(RelationExtractor):
            a document with frames.
         buffer_size : int, Optional
            the number of characters before and after the two frames in the ROI text.
-        max_new_tokens : str, Optional
-            the max number of new tokens LLM should generate.
-        temperature : float, Optional
-            the temperature for token sampling.
         concurrent: bool, Optional
            if True, the extraction will be done in concurrent.
         concurrent_batch_size : int, Optional
            the number of frame pairs to process in concurrent.
-
+        verbose : bool, Optional
            if True, LLM generated text will be printed in terminal in real-time.
         return_messages_log : bool, Optional
            if True, a list of messages will be returned.
@@ -1901,31 +1769,25 @@ class BinaryRelationExtractor(RelationExtractor):
             raise ValueError("All frame_ids in the input document must be unique.")
 
         if concurrent:
-            if stream:
+            if verbose:
                 warnings.warn("stream=True is not supported in concurrent mode.", RuntimeWarning)
 
             nest_asyncio.apply() # For Jupyter notebook. Terminal does not need this.
             return asyncio.run(self.extract_async(doc=doc,
                                                   buffer_size=buffer_size,
-                                                  max_new_tokens=max_new_tokens,
-                                                  temperature=temperature,
                                                   concurrent_batch_size=concurrent_batch_size,
-                                                  return_messages_log=return_messages_log,
-                                                  **kwrs)
+                                                  return_messages_log=return_messages_log)
                                )
         else:
             return self.extract(doc=doc,
                                 buffer_size=buffer_size,
-
-
-                                stream=stream,
-                                return_messages_log=return_messages_log,
-                                **kwrs)
+                                verbose=verbose,
+                                return_messages_log=return_messages_log)
 
 
 class MultiClassRelationExtractor(RelationExtractor):
     def __init__(self, inference_engine:InferenceEngine, prompt_template:str, possible_relation_types_func: Callable,
-                 system_prompt:str=None, **kwrs):
+                 system_prompt:str=None):
         """
         This class extracts relations with relation types.
         Input LLM inference engine, system prompt (optional), prompt template (with instruction, few-shot examples).
@@ -1944,8 +1806,7 @@ class MultiClassRelationExtractor(RelationExtractor):
         """
         super().__init__(inference_engine=inference_engine,
                          prompt_template=prompt_template,
-                         system_prompt=system_prompt,
-                         **kwrs)
+                         system_prompt=system_prompt)
 
         if possible_relation_types_func:
             # Check if possible_relation_types_func is a function
@@ -1992,8 +1853,7 @@ class MultiClassRelationExtractor(RelationExtractor):
                 return None
 
 
-    def extract(self, doc:LLMInformationExtractionDocument, buffer_size:int=100,
-                temperature:float=0.0, stream:bool=False, return_messages_log:bool=False, **kwrs) -> List[Dict]:
+    def extract(self, doc:LLMInformationExtractionDocument, buffer_size:int=100, verbose:bool=False, return_messages_log:bool=False) -> List[Dict]:
         """
         This method considers all combinations of two frames. Use the possible_relation_types_func to filter impossible pairs.
 
@@ -2026,7 +1886,7 @@ class MultiClassRelationExtractor(RelationExtractor):
 
             if pos_rel_types:
                 roi_text = self._get_ROI(frame_1, frame_2, doc.text, buffer_size=buffer_size)
-                if stream:
+                if verbose:
                     print(f"\n\n{Fore.GREEN}ROI text:{Style.RESET_ALL} \n{roi_text}\n")
                     print(f"{Fore.BLUE}Extraction:{Style.RESET_ALL}")
                 messages = []
@@ -2041,10 +1901,8 @@ class MultiClassRelationExtractor(RelationExtractor):
 
                 gen_text = self.inference_engine.chat(
                     messages=messages,
-
-
-                    stream=stream,
-                    **kwrs
+                    stream=False,
+                    verbose=verbose
                 )
 
                 if return_messages_log:
@@ -2061,8 +1919,8 @@ class MultiClassRelationExtractor(RelationExtractor):
         return output
 
 
-    async def extract_async(self, doc:LLMInformationExtractionDocument, buffer_size:int=100,
-
+    async def extract_async(self, doc:LLMInformationExtractionDocument, buffer_size:int=100,
+                            concurrent_batch_size:int=32, return_messages_log:bool=False) -> List[Dict]:
         """
         This is the asynchronous version of the extract() method.
 
@@ -2117,10 +1975,7 @@ class MultiClassRelationExtractor(RelationExtractor):
                 )})
                 task = asyncio.create_task(
                     self.inference_engine.chat_async(
-                        messages=messages,
-                        max_new_tokens=max_new_tokens,
-                        temperature=temperature,
-                        **kwrs
+                        messages=messages
                     )
                 )
                 tasks.append(task)
@@ -2143,9 +1998,9 @@ class MultiClassRelationExtractor(RelationExtractor):
         return output
 
 
-    def extract_relations(self, doc:LLMInformationExtractionDocument, buffer_size:int=100,
-
-
+    def extract_relations(self, doc:LLMInformationExtractionDocument, buffer_size:int=100,
+                          concurrent:bool=False, concurrent_batch_size:int=32,
+                          verbose:bool=False, return_messages_log:bool=False, **kwrs) -> List[Dict]:
         """
         This method considers all combinations of two frames. Use the possible_relation_types_func to filter impossible pairs.
 
@@ -2178,24 +2033,18 @@ class MultiClassRelationExtractor(RelationExtractor):
             raise ValueError("All frame_ids in the input document must be unique.")
 
         if concurrent:
-            if stream:
+            if verbose:
                 warnings.warn("stream=True is not supported in concurrent mode.", RuntimeWarning)
 
             nest_asyncio.apply() # For Jupyter notebook. Terminal does not need this.
            return asyncio.run(self.extract_async(doc=doc,
                                                   buffer_size=buffer_size,
-                                                  max_new_tokens=max_new_tokens,
-                                                  temperature=temperature,
                                                   concurrent_batch_size=concurrent_batch_size,
-                                                  return_messages_log=return_messages_log,
-                                                  **kwrs)
+                                                  return_messages_log=return_messages_log)
                                )
         else:
             return self.extract(doc=doc,
                                 buffer_size=buffer_size,
-
-
-                                stream=stream,
-                                return_messages_log=return_messages_log,
-                                **kwrs)
+                                verbose=verbose,
+                                return_messages_log=return_messages_log)
 