llm-ie 0.2.2__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
llm_ie/extractors.py CHANGED
@@ -8,6 +8,7 @@ import itertools
 from typing import List, Dict, Tuple, Union, Callable
 from llm_ie.data_types import LLMInformationExtractionFrame, LLMInformationExtractionDocument
 from llm_ie.engines import InferenceEngine
+from colorama import Fore, Style
 
 
 class Extractor:
@@ -73,18 +74,49 @@ class Extractor:
 
         return prompt
 
+    def _find_dict_strings(self, text: str) -> List[str]:
+        """
+        Extracts balanced JSON-like dictionaries from a string, even if nested.
+
+        Parameters:
+        -----------
+        text : str
+            the input text containing JSON-like structures.
+
+        Returns : List[str]
+            a list of valid JSON-like strings representing dictionaries.
+        """
+        open_brace = 0
+        start = -1
+        json_objects = []
+
+        for i, char in enumerate(text):
+            if char == '{':
+                if open_brace == 0:
+                    # start of a new JSON object
+                    start = i
+                open_brace += 1
+            elif char == '}':
+                open_brace -= 1
+                if open_brace == 0 and start != -1:
+                    json_objects.append(text[start:i + 1])
+                    start = -1
+
+        return json_objects
+
+
     def _extract_json(self, gen_text:str) -> List[Dict[str, str]]:
         """
         This method inputs a generated text and outputs a JSON of information tuples.
         """
-        pattern = r'\{.*?\}'
         out = []
-        for match in re.findall(pattern, gen_text, re.DOTALL):
+        dict_str_list = self._find_dict_strings(gen_text)
+        for dict_str in dict_str_list:
             try:
-                tup_dict = json.loads(match)
-                out.append(tup_dict)
+                dict_obj = json.loads(dict_str)
+                out.append(dict_obj)
             except json.JSONDecodeError:
-                print(f'Post-processing failed at:\n{match}')
+                warnings.warn(f'Post-processing failed:\n{dict_str}', RuntimeWarning)
         return out
 
 
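The new `_find_dict_strings` scanner tracks brace depth, so nested dictionaries survive intact; the old non-greedy regex stopped at the first `}` and truncated them. A minimal standalone sketch of the difference (the sample text is illustrative, not from the package):

```python
import json
import re

# An LLM answer with a nested dictionary inside.
text = 'Here it is: {"entity_text": "aspirin", "attr": {"dose": "81 mg"}} Done.'

# Old approach (0.2.2): non-greedy regex stops at the first '}', yielding invalid JSON.
print(re.findall(r'\{.*?\}', text, re.DOTALL))
# ['{"entity_text": "aspirin", "attr": {"dose": "81 mg"}']

# New approach (0.3.1): scan characters and track brace depth.
def find_dict_strings(text):
    open_brace, start, objects = 0, -1, []
    for i, char in enumerate(text):
        if char == '{':
            if open_brace == 0:
                start = i          # start of a new JSON object
            open_brace += 1
        elif char == '}':
            open_brace -= 1
            if open_brace == 0 and start != -1:
                objects.append(text[start:i + 1])
                start = -1
    return objects

print([json.loads(s) for s in find_dict_strings(text)])
# [{'entity_text': 'aspirin', 'attr': {'dose': '81 mg'}}]
```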
@@ -244,7 +276,7 @@ class BasicFrameExtractor(FrameExtractor):
 
 
     def extract_frames(self, text_content:Union[str, Dict[str,str]], entity_key:str, max_new_tokens:int=2048,
-                       temperature:float=0.0, document_key:str=None, **kwrs) -> List[LLMInformationExtractionFrame]:
+                       temperature:float=0.0, case_sensitive:bool=False, document_key:str=None, **kwrs) -> List[LLMInformationExtractionFrame]:
         """
         This method inputs a text and outputs a list of LLMInformationExtractionFrame
         It uses the extract() method and post-processes outputs into frames.
@@ -261,6 +293,8 @@ class BasicFrameExtractor(FrameExtractor):
             the max number of new tokens LLM should generate.
         temperature : float, Optional
             the temperature for token sampling.
+        case_sensitive : bool, Optional
+            if True, entity text matching will be case-sensitive.
         document_key : str, Optional
             specify the key in text_content where document text is.
             If text_content is str, this parameter will be ignored.
@@ -271,7 +305,14 @@ class BasicFrameExtractor(FrameExtractor):
         frame_list = []
         gen_text = self.extract(text_content=text_content,
                                 max_new_tokens=max_new_tokens, temperature=temperature, **kwrs)
-        entity_json = self._extract_json(gen_text=gen_text)
+
+        entity_json = []
+        for entity in self._extract_json(gen_text=gen_text):
+            if entity_key in entity:
+                entity_json.append(entity)
+            else:
+                warnings.warn(f'Extractor output "{entity}" does not have entity_key ("{entity_key}"). This frame will be dropped.', RuntimeWarning)
+
         if isinstance(text_content, str):
             text = text_content
         elif isinstance(text_content, dict):
@@ -279,7 +320,7 @@ class BasicFrameExtractor(FrameExtractor):
 
         spans = self._find_entity_spans(text=text,
                                         entities=[e[entity_key] for e in entity_json],
-                                        case_sensitive=False)
+                                        case_sensitive=case_sensitive)
 
         for i, (ent, span) in enumerate(zip(entity_json, spans)):
             if span is not None:
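Callers can now control matching case when frames are grounded back to the source text. A hypothetical usage sketch (`engine`, `template`, `note`, and the `"entity_text"` key are placeholders, not from the package):

```python
# Hypothetical setup; BasicFrameExtractor is from llm-ie, the rest is illustrative.
extractor = BasicFrameExtractor(inference_engine=engine, prompt_template=template)

# Default behavior is unchanged: case-insensitive span matching.
frames = extractor.extract_frames(text_content=note, entity_key="entity_text")

# New in 0.3.1: require exact-case matches when locating entity spans.
frames = extractor.extract_frames(text_content=note, entity_key="entity_text",
                                  case_sensitive=True)
```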
@@ -294,8 +335,8 @@ class BasicFrameExtractor(FrameExtractor):
 
 
 class ReviewFrameExtractor(BasicFrameExtractor):
-    def __init__(self, inference_engine:InferenceEngine, prompt_template:str, review_prompt:str,
-                 review_mode:str, system_prompt:str=None, **kwrs):
+    def __init__(self, inference_engine:InferenceEngine, prompt_template:str,
+                 review_mode:str, review_prompt:str=None, system_prompt:str=None, **kwrs):
         """
         This class adds a review step after the BasicFrameExtractor.
         The review process asks the LLM to review its output and:
@@ -309,8 +350,9 @@ class ReviewFrameExtractor(BasicFrameExtractor):
             the LLM inferencing engine object. Must implement the chat() method.
         prompt_template : str
             prompt template with "{{<placeholder name>}}" placeholder.
-        review_prompt : str
-            the prompt text that asks the LLM to review. Specify addition or revision in the instruction.
+        review_prompt : str, Optional
+            the prompt text that asks the LLM to review. Specify addition or revision in the instruction.
+            If not provided, a default review prompt will be used.
         review_mode : str
             review mode. Must be one of {"addition", "revision"}.
             addition mode only asks the LLM to add new frames, while revision mode asks it to regenerate.
@@ -319,11 +361,20 @@ class ReviewFrameExtractor(BasicFrameExtractor):
         """
         super().__init__(inference_engine=inference_engine, prompt_template=prompt_template,
                          system_prompt=system_prompt, **kwrs)
-        self.review_prompt = review_prompt
         if review_mode not in {"addition", "revision"}:
             raise ValueError('review_mode must be one of {"addition", "revision"}.')
         self.review_mode = review_mode
 
+        if review_prompt:
+            self.review_prompt = review_prompt
+        else:
+            file_path = importlib.resources.files('llm_ie.asset.default_prompts').\
+                joinpath(f"{self.__class__.__name__}_{self.review_mode}_review_prompt.txt")
+            with open(file_path, 'r') as f:
+                self.review_prompt = f.read()
+
+            warnings.warn(f'Custom review prompt not provided. The default review prompt is used:\n"{self.review_prompt}"', UserWarning)
+
 
     def extract(self, text_content:Union[str, Dict[str,str]],
                 max_new_tokens:int=4096, temperature:float=0.0, stream:bool=False, **kwrs) -> str:
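With `review_prompt` now optional, the constructor falls back to a packaged default that matches the chosen `review_mode`. A hypothetical construction (`engine` and `template` are placeholders):

```python
# Hypothetical usage; `engine` and `template` are placeholders for this sketch.
# Omitting review_prompt loads the packaged default for the given review_mode
# (llm_ie/asset/default_prompts/ReviewFrameExtractor_addition_review_prompt.txt)
# and emits a UserWarning showing the prompt that will be used.
reviewer = ReviewFrameExtractor(inference_engine=engine,
                                prompt_template=template,
                                review_mode="addition")

# A custom prompt still takes precedence when supplied.
reviewer = ReviewFrameExtractor(inference_engine=engine,
                                prompt_template=template,
                                review_mode="revision",
                                review_prompt="Review your output and regenerate the frames...")
```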
@@ -346,12 +397,15 @@ class ReviewFrameExtractor(BasicFrameExtractor):
         Return : str
             the output from the LLM. Needs post-processing.
         """
-        # Pormpt extraction
         messages = []
         if self.system_prompt:
             messages.append({'role': 'system', 'content': self.system_prompt})
 
         messages.append({'role': 'user', 'content': self._get_user_prompt(text_content)})
+        # Initial output
+        if stream:
+            print(f"{Fore.BLUE}Initial Output:{Style.RESET_ALL}")
+
         initial = self.inference_engine.chat(
             messages=messages,
             max_new_tokens=max_new_tokens,
@@ -364,6 +418,8 @@ class ReviewFrameExtractor(BasicFrameExtractor):
         messages.append({'role': 'assistant', 'content': initial})
         messages.append({'role': 'user', 'content': self.review_prompt})
 
+        if stream:
+            print(f"\n{Fore.YELLOW}Review:{Style.RESET_ALL}")
         review = self.inference_engine.chat(
             messages=messages,
             max_new_tokens=max_new_tokens,
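The review pass continues the same conversation: the first answer is appended as an assistant turn and the review prompt as a user turn before a second chat() call, with the new colorama labels marking each pass when streaming. A condensed sketch of that flow (`engine` and the prompt strings are placeholders):

```python
# Condensed sketch of the two-pass review flow; `engine` and prompts are placeholders.
messages = [{'role': 'user', 'content': 'Extract frames from: ...'}]  # filled template
initial = engine.chat(messages=messages, max_new_tokens=4096)

messages.append({'role': 'assistant', 'content': initial})            # keep first answer
messages.append({'role': 'user', 'content': 'Review and add any missed frames.'})
review = engine.chat(messages=messages, max_new_tokens=4096)

# Per the class docstring: "addition" keeps both outputs, "revision" keeps the review.
gen_text = initial + '\n' + review
```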
@@ -428,7 +484,7 @@ class SentenceFrameExtractor(FrameExtractor):
 
 
     def extract(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=512,
-                document_key:str=None, multi_turn:bool=True, temperature:float=0.0, stream:bool=False, **kwrs) -> List[Dict[str,str]]:
+                document_key:str=None, multi_turn:bool=False, temperature:float=0.0, stream:bool=False, **kwrs) -> List[Dict[str,str]]:
         """
         This method inputs a text and outputs a list of outputs per sentence.
 
@@ -476,8 +532,8 @@ class SentenceFrameExtractor(FrameExtractor):
         for sent in sentences:
             messages.append({'role': 'user', 'content': sent['sentence_text']})
             if stream:
-                print(f"\n\nSentence: \n{sent['sentence_text']}\n")
-                print("Extraction:")
+                print(f"\n\n{Fore.GREEN}Sentence: {Style.RESET_ALL}\n{sent['sentence_text']}\n")
+                print(f"{Fore.BLUE}Extraction:{Style.RESET_ALL}")
 
             gen_text = self.inference_engine.chat(
                 messages=messages,
@@ -503,7 +559,8 @@ class SentenceFrameExtractor(FrameExtractor):
 
 
     def extract_frames(self, text_content:Union[str, Dict[str,str]], entity_key:str, max_new_tokens:int=512,
-                       document_key:str=None, multi_turn:bool=True, temperature:float=0.0, stream:bool=False, **kwrs) -> List[LLMInformationExtractionFrame]:
+                       document_key:str=None, multi_turn:bool=False, temperature:float=0.0, case_sensitive:bool=False,
+                       stream:bool=False, **kwrs) -> List[LLMInformationExtractionFrame]:
         """
         This method inputs a text and outputs a list of LLMInformationExtractionFrame
         It uses the extract() method and post-processes outputs into frames.
@@ -529,6 +586,8 @@ class SentenceFrameExtractor(FrameExtractor):
             can better utilize the KV caching.
         temperature : float, Optional
             the temperature for token sampling.
+        case_sensitive : bool, Optional
+            if True, entity text matching will be case-sensitive.
         stream : bool, Optional
             if True, LLM generated text will be printed in terminal in real-time.
 
@@ -544,9 +603,15 @@ class SentenceFrameExtractor(FrameExtractor):
                                             **kwrs)
         frame_list = []
         for sent in llm_output_sentence:
-            entity_json = self._extract_json(gen_text=sent['gen_text'])
+            entity_json = []
+            for entity in self._extract_json(gen_text=sent['gen_text']):
+                if entity_key in entity:
+                    entity_json.append(entity)
+                else:
+                    warnings.warn(f'Extractor output "{entity}" does not have entity_key ("{entity_key}"). This frame will be dropped.', RuntimeWarning)
+
             spans = self._find_entity_spans(text=sent['sentence_text'],
-                                            entities=[e[entity_key] for e in entity_json], case_sensitive=False)
+                                            entities=[e[entity_key] for e in entity_json], case_sensitive=case_sensitive)
             for ent, span in zip(entity_json, spans):
                 if span is not None:
                     start, end = span
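Two behavioral changes meet here: `multi_turn` now defaults to False (each sentence is prompted fresh), and outputs missing the entity key are dropped with a RuntimeWarning instead of raising a KeyError in the span lookup. A hypothetical call (`engine`, `template`, and `note` are placeholders):

```python
# Hypothetical usage; `engine`, `template`, and `note` are placeholders.
extractor = SentenceFrameExtractor(inference_engine=engine, prompt_template=template)

frames = extractor.extract_frames(
    text_content=note,
    entity_key="entity_text",   # outputs lacking this key are now skipped with a warning
    multi_turn=True,            # opt back into carry-over prompting (new default: False)
    case_sensitive=False,       # default; set True for exact-case span matching
)
```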
@@ -561,6 +626,248 @@ class SentenceFrameExtractor(FrameExtractor):
         return frame_list
 
 
+class SentenceReviewFrameExtractor(SentenceFrameExtractor):
+    def __init__(self, inference_engine:InferenceEngine, prompt_template:str,
+                 review_mode:str, review_prompt:str=None, system_prompt:str=None, **kwrs):
+        """
+        This class adds a review step after the SentenceFrameExtractor.
+        For each sentence, the review process asks the LLM to review its output and:
+            1. add more frames while keeping the current ones. This is efficient for boosting recall.
+            2. or, regenerate frames (add new and delete existing).
+        Use the review_mode parameter to specify. Note that the review_prompt should instruct the LLM accordingly.
+
+        Parameters:
+        ----------
+        inference_engine : InferenceEngine
+            the LLM inferencing engine object. Must implement the chat() method.
+        prompt_template : str
+            prompt template with "{{<placeholder name>}}" placeholder.
+        review_prompt : str, Optional
+            the prompt text that asks the LLM to review. Specify addition or revision in the instruction.
+            If not provided, a default review prompt will be used.
+        review_mode : str
+            review mode. Must be one of {"addition", "revision"}.
+            addition mode only asks the LLM to add new frames, while revision mode asks it to regenerate.
+        system_prompt : str, Optional
+            system prompt.
+        """
+        super().__init__(inference_engine=inference_engine, prompt_template=prompt_template,
+                         system_prompt=system_prompt, **kwrs)
+
+        if review_mode not in {"addition", "revision"}:
+            raise ValueError('review_mode must be one of {"addition", "revision"}.')
+        self.review_mode = review_mode
+
+        if review_prompt:
+            self.review_prompt = review_prompt
+        else:
+            file_path = importlib.resources.files('llm_ie.asset.default_prompts').\
+                joinpath(f"{self.__class__.__name__}_{self.review_mode}_review_prompt.txt")
+            with open(file_path, 'r') as f:
+                self.review_prompt = f.read()
+
+            warnings.warn(f'Custom review prompt not provided. The default review prompt is used:\n"{self.review_prompt}"', UserWarning)
+
+
+    def extract(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=512,
+                document_key:str=None, multi_turn:bool=False, temperature:float=0.0, stream:bool=False, **kwrs) -> List[Dict[str,str]]:
+        """
+        This method inputs a text and outputs a list of outputs per sentence.
+
+        Parameters:
+        ----------
+        text_content : Union[str, Dict[str,str]]
+            the input text content to put in prompt template.
+            If str, the prompt template must have only one placeholder {{<placeholder name>}}, regardless of placeholder name.
+            If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
+        max_new_tokens : int, Optional
+            the max number of new tokens LLM should generate.
+        document_key : str, Optional
+            specify the key in text_content where document text is.
+            If text_content is str, this parameter will be ignored.
+        multi_turn : bool, Optional
+            multi-turn conversation prompting.
+            If True, sentences and LLM outputs will be appended to the input message and carried over.
+            If False, only the current sentence is prompted.
+            For LLM inference engines that support prompt caching (e.g., Llama.Cpp, Ollama), multi-turn conversation prompting
+            can better utilize the KV cache.
+        temperature : float, Optional
+            the temperature for token sampling.
+        stream : bool, Optional
+            if True, LLM generated text will be printed in terminal in real-time.
+
+        Return : List[Dict[str,str]]
+            the per-sentence outputs from the LLM. Need post-processing.
+        """
+        # define output
+        output = []
+        # sentence tokenization
+        if isinstance(text_content, str):
+            sentences = self._get_sentences(text_content)
+        elif isinstance(text_content, dict):
+            sentences = self._get_sentences(text_content[document_key])
+        # construct chat messages
+        messages = []
+        if self.system_prompt:
+            messages.append({'role': 'system', 'content': self.system_prompt})
+
+        messages.append({'role': 'user', 'content': self._get_user_prompt(text_content)})
+        messages.append({'role': 'assistant', 'content': 'Sure, please start with the first sentence.'})
+
+        # generate sentence by sentence
+        for sent in sentences:
+            messages.append({'role': 'user', 'content': sent['sentence_text']})
+            if stream:
+                print(f"\n\n{Fore.GREEN}Sentence: {Style.RESET_ALL}\n{sent['sentence_text']}\n")
+                print(f"{Fore.BLUE}Initial Output:{Style.RESET_ALL}")
+
+            initial = self.inference_engine.chat(
+                messages=messages,
+                max_new_tokens=max_new_tokens,
+                temperature=temperature,
+                stream=stream,
+                **kwrs
+            )
+
+            # Review
+            if stream:
+                print(f"\n{Fore.YELLOW}Review:{Style.RESET_ALL}")
+            messages.append({'role': 'assistant', 'content': initial})
+            messages.append({'role': 'user', 'content': self.review_prompt})
+
+            review = self.inference_engine.chat(
+                messages=messages,
+                max_new_tokens=max_new_tokens,
+                temperature=temperature,
+                stream=stream,
+                **kwrs
+            )
+
+            # Output
+            if self.review_mode == "revision":
+                gen_text = review
+            elif self.review_mode == "addition":
+                gen_text = initial + '\n' + review
+
+            if multi_turn:
+                # update chat messages with LLM outputs
+                messages.append({'role': 'assistant', 'content': review})
+            else:
+                # delete sentence and review so that message is reset
+                del messages[-3:]
+
+            # add to output
+            output.append({'sentence_start': sent['start'],
+                           'sentence_end': sent['end'],
+                           'sentence_text': sent['sentence_text'],
+                           'gen_text': gen_text})
+        return output
+
+
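The new SentenceReviewFrameExtractor applies the same two-pass review loop to every sentence; as with ReviewFrameExtractor, omitting `review_prompt` loads a packaged default. A hypothetical construction (`engine`, `template`, and `note` are placeholders):

```python
# Hypothetical usage; `engine`, `template`, and `note` are placeholders.
extractor = SentenceReviewFrameExtractor(inference_engine=engine,
                                         prompt_template=template,
                                         review_mode="addition")  # default prompt loaded

# Each sentence gets an initial pass plus a review pass before post-processing.
frames = extractor.extract_frames(text_content=note, entity_key="entity_text")
```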
+class SentenceCoTFrameExtractor(SentenceFrameExtractor):
+    from nltk.tokenize.punkt import PunktSentenceTokenizer
+    def __init__(self, inference_engine:InferenceEngine, prompt_template:str, system_prompt:str=None, **kwrs):
+        """
+        This class performs sentence-based Chain-of-Thought (CoT) information extraction.
+        A simulated chat follows this process:
+            1. system prompt (optional)
+            2. user instructions (schema, background, full text, few-shot example...)
+            3. user inputs first sentence
+            4. assistant analyzes the sentence
+            5. assistant extracts outputs
+            6. repeat #3, #4, #5
+
+        Input a system prompt (optional) and a prompt template (with user instructions),
+        and specify an LLM.
+
+        Parameters
+        ----------
+        inference_engine : InferenceEngine
+            the LLM inferencing engine object. Must implement the chat() method.
+        prompt_template : str
+            prompt template with "{{<placeholder name>}}" placeholder.
+        system_prompt : str, Optional
+            system prompt.
+        """
+        super().__init__(inference_engine=inference_engine, prompt_template=prompt_template,
+                         system_prompt=system_prompt, **kwrs)
+
+
+    def extract(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=512,
+                document_key:str=None, multi_turn:bool=False, temperature:float=0.0, stream:bool=False, **kwrs) -> List[Dict[str,str]]:
+        """
+        This method inputs a text and outputs a list of outputs per sentence.
+
+        Parameters:
+        ----------
+        text_content : Union[str, Dict[str,str]]
+            the input text content to put in prompt template.
+            If str, the prompt template must have only one placeholder {{<placeholder name>}}, regardless of placeholder name.
+            If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
+        max_new_tokens : int, Optional
+            the max number of new tokens LLM should generate.
+        document_key : str, Optional
+            specify the key in text_content where document text is.
+            If text_content is str, this parameter will be ignored.
+        multi_turn : bool, Optional
+            multi-turn conversation prompting.
+            If True, sentences and LLM outputs will be appended to the input message and carried over.
+            If False, only the current sentence is prompted.
+            For LLM inference engines that support prompt caching (e.g., Llama.Cpp, Ollama), multi-turn conversation prompting
+            can better utilize the KV cache.
+        temperature : float, Optional
+            the temperature for token sampling.
+        stream : bool, Optional
+            if True, LLM generated text will be printed in terminal in real-time.
+
+        Return : List[Dict[str,str]]
+            the per-sentence outputs from the LLM. Need post-processing.
+        """
+        # define output
+        output = []
+        # sentence tokenization
+        if isinstance(text_content, str):
+            sentences = self._get_sentences(text_content)
+        elif isinstance(text_content, dict):
+            sentences = self._get_sentences(text_content[document_key])
+        # construct chat messages
+        messages = []
+        if self.system_prompt:
+            messages.append({'role': 'system', 'content': self.system_prompt})
+
+        messages.append({'role': 'user', 'content': self._get_user_prompt(text_content)})
+        messages.append({'role': 'assistant', 'content': 'Sure, please start with the first sentence.'})
+
+        # generate sentence by sentence
+        for sent in sentences:
+            messages.append({'role': 'user', 'content': sent['sentence_text']})
+            if stream:
+                print(f"\n\n{Fore.GREEN}Sentence: {Style.RESET_ALL}\n{sent['sentence_text']}\n")
+                print(f"{Fore.BLUE}CoT:{Style.RESET_ALL}")
+
+            gen_text = self.inference_engine.chat(
+                messages=messages,
+                max_new_tokens=max_new_tokens,
+                temperature=temperature,
+                stream=stream,
+                **kwrs
+            )
+
+            if multi_turn:
+                # update chat messages with LLM outputs
+                messages.append({'role': 'assistant', 'content': gen_text})
+            else:
+                # delete sentence so that message is reset
+                del messages[-1]
+
+            # add to output
+            output.append({'sentence_start': sent['start'],
+                           'sentence_end': sent['end'],
+                           'sentence_text': sent['sentence_text'],
+                           'gen_text': gen_text})
+        return output
+
+
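SentenceCoTFrameExtractor leaves the model's reasoning in `gen_text`, so the prompt template is expected to elicit an analysis step before the extraction. A hypothetical sketch (`engine` and `note` are placeholders; the CoT-style instruction below is illustrative, not a prompt shipped with llm-ie):

```python
# Hypothetical usage; the template is an illustrative CoT-style instruction.
template = (
    "Extract medications as JSON dicts with key \"entity_text\".\n"
    "First write a short analysis of the sentence, then output the JSON.\n"
    "Text: {{note}}"
)
extractor = SentenceCoTFrameExtractor(inference_engine=engine, prompt_template=template)
frames = extractor.extract_frames(text_content={"note": note}, entity_key="entity_text",
                                  document_key="note")
# The balanced-brace JSON parser skips the free-text analysis around the dicts.
```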
 class RelationExtractor(Extractor):
     def __init__(self, inference_engine:InferenceEngine, prompt_template:str, system_prompt:str=None, **kwrs):
         """
@@ -721,8 +1028,8 @@ class BinaryRelationExtractor(RelationExtractor):
         """
         roi_text = self._get_ROI(frame_1, frame_2, text, buffer_size=buffer_size)
         if stream:
-            print(f"\n\nROI text: \n{roi_text}\n")
-            print("Extraction:")
+            print(f"\n\n{Fore.GREEN}ROI text:{Style.RESET_ALL} \n{roi_text}\n")
+            print(f"{Fore.BLUE}Extraction:{Style.RESET_ALL}")
 
         messages = []
         if self.system_prompt:
@@ -873,8 +1180,8 @@ class MultiClassRelationExtractor(RelationExtractor):
         """
         roi_text = self._get_ROI(frame_1, frame_2, text, buffer_size=buffer_size)
         if stream:
-            print(f"\n\nROI text: \n{roi_text}\n")
-            print("Extraction:")
+            print(f"\n\n{Fore.GREEN}ROI text:{Style.RESET_ALL} \n{roi_text}\n")
+            print(f"{Fore.BLUE}Extraction:{Style.RESET_ALL}")
 
         messages = []
         if self.system_prompt: