llm-ie 0.2.2__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llm_ie/asset/PromptEditor_prompts/chat.txt +5 -0
- llm_ie/asset/PromptEditor_prompts/rewrite.txt +3 -1
- llm_ie/asset/PromptEditor_prompts/system.txt +1 -0
- llm_ie/asset/default_prompts/ReviewFrameExtractor_addition_review_prompt.txt +3 -0
- llm_ie/asset/default_prompts/ReviewFrameExtractor_revision_review_prompt.txt +2 -0
- llm_ie/asset/default_prompts/SentenceReviewFrameExtractor_addition_review_prompt.txt +4 -0
- llm_ie/asset/default_prompts/SentenceReviewFrameExtractor_revision_review_prompt.txt +3 -0
- llm_ie/asset/prompt_guide/BasicFrameExtractor_prompt_guide.txt +117 -7
- llm_ie/asset/prompt_guide/BinaryRelationExtractor_prompt_guide.txt +32 -12
- llm_ie/asset/prompt_guide/MultiClassRelationExtractor_prompt_guide.txt +35 -12
- llm_ie/asset/prompt_guide/ReviewFrameExtractor_prompt_guide.txt +117 -7
- llm_ie/asset/prompt_guide/SentenceCoTFrameExtractor_prompt_guide.txt +217 -0
- llm_ie/asset/prompt_guide/SentenceFrameExtractor_prompt_guide.txt +129 -24
- llm_ie/asset/prompt_guide/SentenceReviewFrameExtractor_prompt_guide.txt +145 -0
- llm_ie/engines.py +1 -1
- llm_ie/extractors.py +331 -24
- llm_ie/prompt_editor.py +150 -8
- {llm_ie-0.2.2.dist-info → llm_ie-0.3.1.dist-info}/METADATA +89 -44
- llm_ie-0.3.1.dist-info/RECORD +23 -0
- llm_ie-0.2.2.dist-info/RECORD +0 -15
- {llm_ie-0.2.2.dist-info → llm_ie-0.3.1.dist-info}/WHEEL +0 -0
llm_ie/extractors.py
CHANGED
@@ -8,6 +8,7 @@ import itertools
 from typing import List, Dict, Tuple, Union, Callable
 from llm_ie.data_types import LLMInformationExtractionFrame, LLMInformationExtractionDocument
 from llm_ie.engines import InferenceEngine
+from colorama import Fore, Style
 
 
 class Extractor:
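The new `colorama` dependency drives the colorized streaming output added throughout this file. As a quick orientation (standard colorama behavior, not code from this diff), `Fore` and `Style` expand to ANSI escape sequences around the text they wrap:

```python
from colorama import Fore, Style

# Fore.GREEN opens a green foreground; Style.RESET_ALL restores the default style.
print(f"{Fore.GREEN}Sentence:{Style.RESET_ALL}\nThe patient denies chest pain.")
```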
@@ -73,18 +74,49 @@ class Extractor:
 
         return prompt
 
+    def _find_dict_strings(self, text: str) -> List[str]:
+        """
+        Extracts balanced JSON-like dictionaries from a string, even if nested.
+
+        Parameters:
+        -----------
+        text : str
+            the input text containing JSON-like structures.
+
+        Returns : List[str]
+            A list of valid JSON-like strings representing dictionaries.
+        """
+        open_brace = 0
+        start = -1
+        json_objects = []
+
+        for i, char in enumerate(text):
+            if char == '{':
+                if open_brace == 0:
+                    # start of a new JSON object
+                    start = i
+                open_brace += 1
+            elif char == '}':
+                open_brace -= 1
+                if open_brace == 0 and start != -1:
+                    json_objects.append(text[start:i + 1])
+                    start = -1
+
+        return json_objects
+
+
     def _extract_json(self, gen_text:str) -> List[Dict[str, str]]:
         """
         This method inputs a generated text and output a JSON of information tuples
         """
-        pattern = r'\{.*?\}'
         out = []
-
+        dict_str_list = self._find_dict_strings(gen_text)
+        for dict_str in dict_str_list:
             try:
-
-                out.append(
+                dict_obj = json.loads(dict_str)
+                out.append(dict_obj)
             except json.JSONDecodeError:
-
+                warnings.warn(f'Post-processing failed:\n{dict_str}', RuntimeWarning)
         return out
 
 
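A standalone sketch of why this change matters: the old non-greedy regex `\{.*?\}` stops at the first closing brace, so any nested attribute dictionary comes back unbalanced, while brace counting recovers the full object. The sample `gen_text` below is illustrative, not from the package:

```python
import json
import re

gen_text = 'Frames: {"entity_text": "aspirin", "attributes": {"dose": "81 mg"}}'

# Old behavior (0.2.2): the non-greedy match ends at the first '}',
# so the nested dictionary is truncated and fails json.loads().
m = re.findall(r'\{.*?\}', gen_text)[0]
try:
    json.loads(m)
except json.JSONDecodeError:
    print("regex match is unbalanced:", m)

# New behavior (0.3.1): brace counting returns the full balanced span.
def find_dict_strings(text):  # standalone copy of the logic added above
    open_brace, start, objs = 0, -1, []
    for i, ch in enumerate(text):
        if ch == '{':
            if open_brace == 0:
                start = i
            open_brace += 1
        elif ch == '}':
            open_brace -= 1
            if open_brace == 0 and start != -1:
                objs.append(text[start:i + 1])
                start = -1
    return objs

print(json.loads(find_dict_strings(gen_text)[0]))
# {'entity_text': 'aspirin', 'attributes': {'dose': '81 mg'}}
```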
@@ -244,7 +276,7 @@ class BasicFrameExtractor(FrameExtractor):
 
 
     def extract_frames(self, text_content:Union[str, Dict[str,str]], entity_key:str, max_new_tokens:int=2048,
-                       temperature:float=0.0, document_key:str=None, **kwrs) -> List[LLMInformationExtractionFrame]:
+                       temperature:float=0.0, case_sensitive:bool=False, document_key:str=None, **kwrs) -> List[LLMInformationExtractionFrame]:
         """
         This method inputs a text and outputs a list of LLMInformationExtractionFrame
         It use the extract() method and post-process outputs into frames.
@@ -261,6 +293,8 @@ class BasicFrameExtractor(FrameExtractor):
             the max number of new tokens LLM should generate.
         temperature : float, Optional
             the temperature for token sampling.
+        case_sensitive : bool, Optional
+            if True, entity text matching will be case-sensitive.
         document_key : str, Optional
             specify the key in text_content where document text is.
             If text_content is str, this parameter will be ignored.
@@ -271,7 +305,14 @@ class BasicFrameExtractor(FrameExtractor):
         frame_list = []
         gen_text = self.extract(text_content=text_content,
                                 max_new_tokens=max_new_tokens, temperature=temperature, **kwrs)
-
+
+        entity_json = []
+        for entity in self._extract_json(gen_text=gen_text):
+            if entity_key in entity:
+                entity_json.append(entity)
+            else:
+                warnings.warn(f'Extractor output "{entity}" does not have entity_key ("{entity_key}"). This frame will be dropped.', RuntimeWarning)
+
         if isinstance(text_content, str):
             text = text_content
         elif isinstance(text_content, dict):
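The new filtering step guards against outputs that parse as JSON but lack the expected key. A minimal standalone illustration of the behavior (the sample dicts are made up):

```python
import warnings

entity_key = "entity_text"
parsed = [{"entity_text": "aspirin"}, {"entity": "missing the expected key"}]

entity_json = []
for entity in parsed:  # mirrors the filtering loop added above
    if entity_key in entity:
        entity_json.append(entity)
    else:
        warnings.warn(f'Extractor output "{entity}" does not have entity_key ("{entity_key}"). This frame will be dropped.', RuntimeWarning)

print(entity_json)  # [{'entity_text': 'aspirin'}]
```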
@@ -279,7 +320,7 @@ class BasicFrameExtractor(FrameExtractor):
 
         spans = self._find_entity_spans(text=text,
                                         entities=[e[entity_key] for e in entity_json],
-                                        case_sensitive=
+                                        case_sensitive=case_sensitive)
 
         for i, (ent, span) in enumerate(zip(entity_json, spans)):
             if span is not None:
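Putting the two `extract_frames` changes together, a hypothetical call site; the engine, prompt template, and input text are placeholders rather than code from this release:

```python
from llm_ie.extractors import BasicFrameExtractor

# `engine` is any InferenceEngine implementing chat(); construction omitted.
extractor = BasicFrameExtractor(inference_engine=engine,
                                prompt_template="Extract medications from: {{text}}")

frames = extractor.extract_frames(text_content=note_text,
                                  entity_key="entity_text",
                                  case_sensitive=True)  # new in 0.3.1; default stays False
```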
@@ -294,8 +335,8 @@ class BasicFrameExtractor(FrameExtractor):
 
 
 class ReviewFrameExtractor(BasicFrameExtractor):
-    def __init__(self, inference_engine:InferenceEngine, prompt_template:str,
-                 review_mode:str, system_prompt:str=None, **kwrs):
+    def __init__(self, inference_engine:InferenceEngine, prompt_template:str,
+                 review_mode:str, review_prompt:str=None,system_prompt:str=None, **kwrs):
         """
         This class add a review step after the BasicFrameExtractor.
         The Review process asks LLM to review its output and:
@@ -309,8 +350,9 @@ class ReviewFrameExtractor(BasicFrameExtractor):
             the LLM inferencing engine object. Must implements the chat() method.
         prompt_template : str
             prompt template with "{{<placeholder name>}}" placeholder.
-        review_prompt : str
-            the prompt text that ask LLM to review. Specify addition or revision in the instruction.
+        review_prompt : str: Optional
+            the prompt text that ask LLM to review. Specify addition or revision in the instruction.
+            if not provided, a default review prompt will be used.
         review_mode : str
             review mode. Must be one of {"addition", "revision"}
             addition mode only ask LLM to add new frames, while revision mode ask LLM to regenerate.
@@ -319,11 +361,20 @@ class ReviewFrameExtractor(BasicFrameExtractor):
         """
         super().__init__(inference_engine=inference_engine, prompt_template=prompt_template,
                          system_prompt=system_prompt, **kwrs)
-        self.review_prompt = review_prompt
         if review_mode not in {"addition", "revision"}:
             raise ValueError('review_mode must be one of {"addition", "revision"}.')
         self.review_mode = review_mode
 
+        if review_prompt:
+            self.review_prompt = review_prompt
+        else:
+            file_path = importlib.resources.files('llm_ie.asset.default_prompts').\
+                        joinpath(f"{self.__class__.__name__}_{self.review_mode}_review_prompt.txt")
+            with open(file_path, 'r') as f:
+                self.review_prompt = f.read()
+
+            warnings.warn(f'Custom review prompt not provided. The default review prompt is used:\n"{self.review_prompt}"', UserWarning)
+
 
     def extract(self, text_content:Union[str, Dict[str,str]],
                 max_new_tokens:int=4096, temperature:float=0.0, stream:bool=False, **kwrs) -> str:
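The default-prompt fallback derives a file name from the class name and review mode, which lines up with the asset files added at the top of this diff (e.g. `llm_ie/asset/default_prompts/ReviewFrameExtractor_addition_review_prompt.txt`). A sketch of the resolution logic on its own:

```python
import importlib.resources

# Mirrors the fallback above: <ClassName>_<review_mode>_review_prompt.txt
class_name, review_mode = "ReviewFrameExtractor", "addition"
file_path = importlib.resources.files('llm_ie.asset.default_prompts') \
    .joinpath(f"{class_name}_{review_mode}_review_prompt.txt")

with open(file_path, 'r') as f:
    review_prompt = f.read()
```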
|
|
@@ -346,12 +397,15 @@ class ReviewFrameExtractor(BasicFrameExtractor):
|
|
|
346
397
|
Return : str
|
|
347
398
|
the output from LLM. Need post-processing.
|
|
348
399
|
"""
|
|
349
|
-
# Pormpt extraction
|
|
350
400
|
messages = []
|
|
351
401
|
if self.system_prompt:
|
|
352
402
|
messages.append({'role': 'system', 'content': self.system_prompt})
|
|
353
403
|
|
|
354
404
|
messages.append({'role': 'user', 'content': self._get_user_prompt(text_content)})
|
|
405
|
+
# Initial output
|
|
406
|
+
if stream:
|
|
407
|
+
print(f"{Fore.BLUE}Initial Output:{Style.RESET_ALL}")
|
|
408
|
+
|
|
355
409
|
initial = self.inference_engine.chat(
|
|
356
410
|
messages=messages,
|
|
357
411
|
max_new_tokens=max_new_tokens,
|
|
@@ -364,6 +418,8 @@ class ReviewFrameExtractor(BasicFrameExtractor):
         messages.append({'role': 'assistant', 'content': initial})
         messages.append({'role': 'user', 'content': self.review_prompt})
 
+        if stream:
+            print(f"\n{Fore.YELLOW}Review:{Style.RESET_ALL}")
         review = self.inference_engine.chat(
             messages=messages,
             max_new_tokens=max_new_tokens,
@@ -428,7 +484,7 @@ class SentenceFrameExtractor(FrameExtractor):
 
 
     def extract(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=512,
-                document_key:str=None, multi_turn:bool=
+                document_key:str=None, multi_turn:bool=False, temperature:float=0.0, stream:bool=False, **kwrs) -> List[Dict[str,str]]:
         """
         This method inputs a text and outputs a list of outputs per sentence.
 
@@ -476,8 +532,8 @@ class SentenceFrameExtractor(FrameExtractor):
         for sent in sentences:
             messages.append({'role': 'user', 'content': sent['sentence_text']})
             if stream:
-                print(f"\n\
-                print("Extraction:")
+                print(f"\n\n{Fore.GREEN}Sentence: {Style.RESET_ALL}\n{sent['sentence_text']}\n")
+                print(f"{Fore.BLUE}Extraction:{Style.RESET_ALL}")
 
             gen_text = self.inference_engine.chat(
                 messages=messages,
@@ -503,7 +559,8 @@ class SentenceFrameExtractor(FrameExtractor):
 
 
     def extract_frames(self, text_content:Union[str, Dict[str,str]], entity_key:str, max_new_tokens:int=512,
-                       document_key:str=None, multi_turn:bool=
+                       document_key:str=None, multi_turn:bool=False, temperature:float=0.0, case_sensitive:bool=False,
+                       stream:bool=False, **kwrs) -> List[LLMInformationExtractionFrame]:
         """
         This method inputs a text and outputs a list of LLMInformationExtractionFrame
         It use the extract() method and post-process outputs into frames.
@@ -529,6 +586,8 @@ class SentenceFrameExtractor(FrameExtractor):
             can better utilize the KV caching.
         temperature : float, Optional
             the temperature for token sampling.
+        case_sensitive : bool, Optional
+            if True, entity text matching will be case-sensitive.
         stream : bool, Optional
             if True, LLM generated text will be printed in terminal in real-time.
 
@@ -544,9 +603,15 @@ class SentenceFrameExtractor(FrameExtractor):
                                                 **kwrs)
         frame_list = []
         for sent in llm_output_sentence:
-            entity_json =
+            entity_json = []
+            for entity in self._extract_json(gen_text=sent['gen_text']):
+                if entity_key in entity:
+                    entity_json.append(entity)
+                else:
+                    warnings.warn(f'Extractor output "{entity}" does not have entity_key ("{entity_key}"). This frame will be dropped.', RuntimeWarning)
+
             spans = self._find_entity_spans(text=sent['sentence_text'],
-
+                                            entities=[e[entity_key] for e in entity_json], case_sensitive=case_sensitive)
             for ent, span in zip(entity_json, spans):
                 if span is not None:
                     start, end = span
@@ -561,6 +626,248 @@ class SentenceFrameExtractor(FrameExtractor):
         return frame_list
 
 
+class SentenceReviewFrameExtractor(SentenceFrameExtractor):
+    def __init__(self, inference_engine:InferenceEngine, prompt_template:str,
+                 review_mode:str, review_prompt:str=None, system_prompt:str=None, **kwrs):
+        """
+        This class adds a review step after the SentenceFrameExtractor.
+        For each sentence, the review process asks LLM to review its output and:
+        1. add more frames while keeping current. This is efficient for boosting recall.
+        2. or, regenerate frames (add new and delete existing).
+        Use the review_mode parameter to specify. Note that the review_prompt should instruct LLM accordingly.
+
+        Parameters:
+        ----------
+        inference_engine : InferenceEngine
+            the LLM inferencing engine object. Must implements the chat() method.
+        prompt_template : str
+            prompt template with "{{<placeholder name>}}" placeholder.
+        review_prompt : str: Optional
+            the prompt text that ask LLM to review. Specify addition or revision in the instruction.
+            if not provided, a default review prompt will be used.
+        review_mode : str
+            review mode. Must be one of {"addition", "revision"}
+            addition mode only ask LLM to add new frames, while revision mode ask LLM to regenerate.
+        system_prompt : str, Optional
+            system prompt.
+        """
+        super().__init__(inference_engine=inference_engine, prompt_template=prompt_template,
+                         system_prompt=system_prompt, **kwrs)
+
+        if review_mode not in {"addition", "revision"}:
+            raise ValueError('review_mode must be one of {"addition", "revision"}.')
+        self.review_mode = review_mode
+
+        if review_prompt:
+            self.review_prompt = review_prompt
+        else:
+            file_path = importlib.resources.files('llm_ie.asset.default_prompts').\
+                        joinpath(f"{self.__class__.__name__}_{self.review_mode}_review_prompt.txt")
+            with open(file_path, 'r') as f:
+                self.review_prompt = f.read()
+
+            warnings.warn(f'Custom review prompt not provided. The default review prompt is used:\n"{self.review_prompt}"', UserWarning)
+
+
+    def extract(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=512,
+                document_key:str=None, multi_turn:bool=False, temperature:float=0.0, stream:bool=False, **kwrs) -> List[Dict[str,str]]:
+        """
+        This method inputs a text and outputs a list of outputs per sentence.
+
+        Parameters:
+        ----------
+        text_content : Union[str, Dict[str,str]]
+            the input text content to put in prompt template.
+            If str, the prompt template must has only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
+            If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
+        max_new_tokens : str, Optional
+            the max number of new tokens LLM should generate.
+        document_key : str, Optional
+            specify the key in text_content where document text is.
+            If text_content is str, this parameter will be ignored.
+        multi_turn : bool, Optional
+            multi-turn conversation prompting.
+            If True, sentences and LLM outputs will be appended to the input message and carry-over.
+            If False, only the current sentence is prompted.
+            For LLM inference engines that supports prompt cache (e.g., Llama.Cpp, Ollama), use multi-turn conversation prompting
+            can better utilize the KV caching.
+        temperature : float, Optional
+            the temperature for token sampling.
+        stream : bool, Optional
+            if True, LLM generated text will be printed in terminal in real-time.
+
+        Return : str
+            the output from LLM. Need post-processing.
+        """
+        # define output
+        output = []
+        # sentence tokenization
+        if isinstance(text_content, str):
+            sentences = self._get_sentences(text_content)
+        elif isinstance(text_content, dict):
+            sentences = self._get_sentences(text_content[document_key])
+        # construct chat messages
+        messages = []
+        if self.system_prompt:
+            messages.append({'role': 'system', 'content': self.system_prompt})
+
+        messages.append({'role': 'user', 'content': self._get_user_prompt(text_content)})
+        messages.append({'role': 'assistant', 'content': 'Sure, please start with the first sentence.'})
+
+        # generate sentence by sentence
+        for sent in sentences:
+            messages.append({'role': 'user', 'content': sent['sentence_text']})
+            if stream:
+                print(f"\n\n{Fore.GREEN}Sentence: {Style.RESET_ALL}\n{sent['sentence_text']}\n")
+                print(f"{Fore.BLUE}Initial Output:{Style.RESET_ALL}")
+
+            initial = self.inference_engine.chat(
+                messages=messages,
+                max_new_tokens=max_new_tokens,
+                temperature=temperature,
+                stream=stream,
+                **kwrs
+            )
+
+            # Review
+            if stream:
+                print(f"\n{Fore.YELLOW}Review:{Style.RESET_ALL}")
+            messages.append({'role': 'assistant', 'content': initial})
+            messages.append({'role': 'user', 'content': self.review_prompt})
+
+            review = self.inference_engine.chat(
+                messages=messages,
+                max_new_tokens=max_new_tokens,
+                temperature=temperature,
+                stream=stream,
+                **kwrs
+            )
+
+            # Output
+            if self.review_mode == "revision":
+                gen_text = review
+            elif self.review_mode == "addition":
+                gen_text = initial + '\n' + review
+
+            if multi_turn:
+                # update chat messages with LLM outputs
+                messages.append({'role': 'assistant', 'content': review})
+            else:
+                # delete sentence and review so that message is reset
+                del messages[-3:]
+
+            # add to output
+            output.append({'sentence_start': sent['start'],
+                           'sentence_end': sent['end'],
+                           'sentence_text': sent['sentence_text'],
+                           'gen_text': gen_text})
+        return output
+
+
+class SentenceCoTFrameExtractor(SentenceFrameExtractor):
+    from nltk.tokenize.punkt import PunktSentenceTokenizer
+    def __init__(self, inference_engine:InferenceEngine, prompt_template:str, system_prompt:str=None, **kwrs):
+        """
+        This class performs sentence-based Chain-of-thoughts (CoT) information extraction.
+        A simulated chat follows this process:
+        1. system prompt (optional)
+        2. user instructions (schema, background, full text, few-shot example...)
+        3. user input first sentence
+        4. assistant analyze the sentence
+        5. assistant extract outputs
+        6. repeat #3, #4, #5
+
+        Input system prompt (optional), prompt template (with user instructions),
+        and specify a LLM.
+
+        Parameters
+        ----------
+        inference_engine : InferenceEngine
+            the LLM inferencing engine object. Must implements the chat() method.
+        prompt_template : str
+            prompt template with "{{<placeholder name>}}" placeholder.
+        system_prompt : str, Optional
+            system prompt.
+        """
+        super().__init__(inference_engine=inference_engine, prompt_template=prompt_template,
+                         system_prompt=system_prompt, **kwrs)
+
+
+    def extract(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=512,
+                document_key:str=None, multi_turn:bool=False, temperature:float=0.0, stream:bool=False, **kwrs) -> List[Dict[str,str]]:
+        """
+        This method inputs a text and outputs a list of outputs per sentence.
+
+        Parameters:
+        ----------
+        text_content : Union[str, Dict[str,str]]
+            the input text content to put in prompt template.
+            If str, the prompt template must has only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
+            If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
+        max_new_tokens : str, Optional
+            the max number of new tokens LLM should generate.
+        document_key : str, Optional
+            specify the key in text_content where document text is.
+            If text_content is str, this parameter will be ignored.
+        multi_turn : bool, Optional
+            multi-turn conversation prompting.
+            If True, sentences and LLM outputs will be appended to the input message and carry-over.
+            If False, only the current sentence is prompted.
+            For LLM inference engines that supports prompt cache (e.g., Llama.Cpp, Ollama), use multi-turn conversation prompting
+            can better utilize the KV caching.
+        temperature : float, Optional
+            the temperature for token sampling.
+        stream : bool, Optional
+            if True, LLM generated text will be printed in terminal in real-time.
+
+        Return : str
+            the output from LLM. Need post-processing.
+        """
+        # define output
+        output = []
+        # sentence tokenization
+        if isinstance(text_content, str):
+            sentences = self._get_sentences(text_content)
+        elif isinstance(text_content, dict):
+            sentences = self._get_sentences(text_content[document_key])
+        # construct chat messages
+        messages = []
+        if self.system_prompt:
+            messages.append({'role': 'system', 'content': self.system_prompt})
+
+        messages.append({'role': 'user', 'content': self._get_user_prompt(text_content)})
+        messages.append({'role': 'assistant', 'content': 'Sure, please start with the first sentence.'})
+
+        # generate sentence by sentence
+        for sent in sentences:
+            messages.append({'role': 'user', 'content': sent['sentence_text']})
+            if stream:
+                print(f"\n\n{Fore.GREEN}Sentence: {Style.RESET_ALL}\n{sent['sentence_text']}\n")
+                print(f"{Fore.BLUE}CoT:{Style.RESET_ALL}")
+
+            gen_text = self.inference_engine.chat(
+                messages=messages,
+                max_new_tokens=max_new_tokens,
+                temperature=temperature,
+                stream=stream,
+                **kwrs
+            )
+
+            if multi_turn:
+                # update chat messages with LLM outputs
+                messages.append({'role': 'assistant', 'content': gen_text})
+            else:
+                # delete sentence so that message is reset
+                del messages[-1]
+
+            # add to output
+            output.append({'sentence_start': sent['start'],
+                           'sentence_end': sent['end'],
+                           'sentence_text': sent['sentence_text'],
+                           'gen_text': gen_text})
+        return output
+
+
 class RelationExtractor(Extractor):
     def __init__(self, inference_engine:InferenceEngine, prompt_template:str, system_prompt:str=None, **kwrs):
         """
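A hypothetical usage sketch for the new sentence-level review class; `engine`, `prompt_template`, and `note_text` are placeholders, not code from this release:

```python
from llm_ie.extractors import SentenceReviewFrameExtractor

extractor = SentenceReviewFrameExtractor(
    inference_engine=engine,
    prompt_template=prompt_template,
    review_mode="addition",  # omit review_prompt to fall back to the packaged default
)

# Each element of `outputs` is a dict with sentence offsets and the raw generation:
#   {'sentence_start': ..., 'sentence_end': ..., 'sentence_text': ..., 'gen_text': ...}
outputs = extractor.extract(text_content=note_text, multi_turn=False, stream=True)
```

Note the design choice visible in the diff: in "addition" mode, `gen_text` is `initial + '\n' + review`, so frames from both passes survive post-processing; in "revision" mode only the review output is kept.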
@@ -721,8 +1028,8 @@ class BinaryRelationExtractor(RelationExtractor):
         """
         roi_text = self._get_ROI(frame_1, frame_2, text, buffer_size=buffer_size)
         if stream:
-            print(f"\n\
-            print("Extraction:")
+            print(f"\n\n{Fore.GREEN}ROI text:{Style.RESET_ALL} \n{roi_text}\n")
+            print(f"{Fore.BLUE}Extraction:{Style.RESET_ALL}")
 
         messages = []
         if self.system_prompt:
@@ -873,8 +1180,8 @@ class MultiClassRelationExtractor(RelationExtractor):
         """
         roi_text = self._get_ROI(frame_1, frame_2, text, buffer_size=buffer_size)
         if stream:
-            print(f"\n\
-            print("Extraction:")
+            print(f"\n\n{Fore.GREEN}ROI text:{Style.RESET_ALL} \n{roi_text}\n")
+            print(f"{Fore.BLUE}Extraction:{Style.RESET_ALL}")
 
         messages = []
         if self.system_prompt: