llm-ie 0.2.2__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
llm_ie/extractors.py CHANGED
@@ -8,6 +8,7 @@ import itertools
 from typing import List, Dict, Tuple, Union, Callable
 from llm_ie.data_types import LLMInformationExtractionFrame, LLMInformationExtractionDocument
 from llm_ie.engines import InferenceEngine
+from colorama import Fore, Style
 
 
 class Extractor:
@@ -73,18 +74,49 @@ class Extractor:
 
         return prompt
 
+    def _find_dict_strings(self, text: str) -> List[str]:
+        """
+        Extracts balanced JSON-like dictionaries from a string, even if nested.
+
+        Parameters:
+        -----------
+        text : str
+            the input text containing JSON-like structures.
+
+        Returns : List[str]
+            a list of valid JSON-like strings representing dictionaries.
+        """
+        open_brace = 0
+        start = -1
+        json_objects = []
+
+        for i, char in enumerate(text):
+            if char == '{':
+                if open_brace == 0:
+                    # start of a new JSON object
+                    start = i
+                open_brace += 1
+            elif char == '}':
+                open_brace -= 1
+                if open_brace == 0 and start != -1:
+                    json_objects.append(text[start:i + 1])
+                    start = -1
+
+        return json_objects
+
+
     def _extract_json(self, gen_text:str) -> List[Dict[str, str]]:
         """
         This method inputs a generated text and outputs a JSON of information tuples.
         """
-        pattern = r'\{.*?\}'
         out = []
-        for match in re.findall(pattern, gen_text, re.DOTALL):
+        dict_str_list = self._find_dict_strings(gen_text)
+        for dict_str in dict_str_list:
             try:
-                tup_dict = json.loads(match)
-                out.append(tup_dict)
+                dict_obj = json.loads(dict_str)
+                out.append(dict_obj)
             except json.JSONDecodeError:
-                print(f'Post-processing failed at:\n{match}')
+                warnings.warn(f'Post-processing failed:\n{dict_str}', RuntimeWarning)
         return out
 
 
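The new `_find_dict_strings` scanner tracks brace depth, so nested dictionaries survive intact; the old non-greedy regex stopped at the first `}` and truncated them. A minimal standalone sketch of the difference (the sample text is illustrative, not from the package):

```python
import json
import re

# An LLM answer with a nested dictionary inside.
text = 'Here it is: {"entity_text": "aspirin", "attr": {"dose": "81 mg"}} Done.'

# Old approach (0.2.2): non-greedy regex stops at the first '}', yielding invalid JSON.
print(re.findall(r'\{.*?\}', text, re.DOTALL))
# ['{"entity_text": "aspirin", "attr": {"dose": "81 mg"}']

# New approach (0.3.1): scan characters and track brace depth.
def find_dict_strings(text):
    open_brace, start, objects = 0, -1, []
    for i, char in enumerate(text):
        if char == '{':
            if open_brace == 0:
                start = i          # start of a new JSON object
            open_brace += 1
        elif char == '}':
            open_brace -= 1
            if open_brace == 0 and start != -1:
                objects.append(text[start:i + 1])
                start = -1
    return objects

print([json.loads(s) for s in find_dict_strings(text)])
# [{'entity_text': 'aspirin', 'attr': {'dose': '81 mg'}}]
```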
@@ -244,7 +276,7 @@ class BasicFrameExtractor(FrameExtractor):
 
 
     def extract_frames(self, text_content:Union[str, Dict[str,str]], entity_key:str, max_new_tokens:int=2048,
-                       temperature:float=0.0, document_key:str=None, **kwrs) -> List[LLMInformationExtractionFrame]:
+                       temperature:float=0.0, case_sensitive:bool=False, document_key:str=None, **kwrs) -> List[LLMInformationExtractionFrame]:
         """
         This method inputs a text and outputs a list of LLMInformationExtractionFrame
         It uses the extract() method and post-processes outputs into frames.
@@ -261,6 +293,8 @@ class BasicFrameExtractor(FrameExtractor):
             the max number of new tokens LLM should generate.
         temperature : float, Optional
             the temperature for token sampling.
+        case_sensitive : bool, Optional
+            if True, entity text matching will be case-sensitive.
         document_key : str, Optional
             specify the key in text_content where document text is.
             If text_content is str, this parameter will be ignored.
@@ -271,7 +305,14 @@ class BasicFrameExtractor(FrameExtractor):
         frame_list = []
         gen_text = self.extract(text_content=text_content,
                                 max_new_tokens=max_new_tokens, temperature=temperature, **kwrs)
-        entity_json = self._extract_json(gen_text=gen_text)
+
+        entity_json = []
+        for entity in self._extract_json(gen_text=gen_text):
+            if entity_key in entity:
+                entity_json.append(entity)
+            else:
+                warnings.warn(f'Extractor output "{entity}" does not have entity_key ("{entity_key}"). This frame will be dropped.', RuntimeWarning)
+
         if isinstance(text_content, str):
             text = text_content
         elif isinstance(text_content, dict):
@@ -279,7 +320,7 @@ class BasicFrameExtractor(FrameExtractor):
 
         spans = self._find_entity_spans(text=text,
                                         entities=[e[entity_key] for e in entity_json],
-                                        case_sensitive=False)
+                                        case_sensitive=case_sensitive)
 
         for i, (ent, span) in enumerate(zip(entity_json, spans)):
             if span is not None:
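Callers can now control matching case when frames are grounded back to the source text. A hypothetical usage sketch (`engine`, `template`, `note`, and the `"entity_text"` key are placeholders, not from the package):

```python
# Hypothetical setup; BasicFrameExtractor is from llm-ie, the rest is illustrative.
extractor = BasicFrameExtractor(inference_engine=engine, prompt_template=template)

# Default behavior is unchanged: case-insensitive span matching.
frames = extractor.extract_frames(text_content=note, entity_key="entity_text")

# New in 0.3.1: require exact-case matches when locating entity spans.
frames = extractor.extract_frames(text_content=note, entity_key="entity_text",
                                  case_sensitive=True)
```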
@@ -294,8 +335,8 @@ class BasicFrameExtractor(FrameExtractor):
 
 
 class ReviewFrameExtractor(BasicFrameExtractor):
-    def __init__(self, inference_engine:InferenceEngine, prompt_template:str, review_prompt:str,
-                 review_mode:str, system_prompt:str=None, **kwrs):
+    def __init__(self, inference_engine:InferenceEngine, prompt_template:str,
+                 review_mode:str, review_prompt:str=None, system_prompt:str=None, **kwrs):
         """
         This class adds a review step after the BasicFrameExtractor.
         The review process asks the LLM to review its output and:
@@ -309,8 +350,9 @@ class ReviewFrameExtractor(BasicFrameExtractor):
             the LLM inferencing engine object. Must implement the chat() method.
         prompt_template : str
             prompt template with "{{<placeholder name>}}" placeholder.
-        review_prompt : str
-            the prompt text that asks the LLM to review. Specify addition or revision in the instruction.
+        review_prompt : str, Optional
+            the prompt text that asks the LLM to review. Specify addition or revision in the instruction.
+            If not provided, a default review prompt will be used.
         review_mode : str
             review mode. Must be one of {"addition", "revision"}.
             addition mode only asks the LLM to add new frames, while revision mode asks it to regenerate.
@@ -319,11 +361,20 @@ class ReviewFrameExtractor(BasicFrameExtractor):
         """
         super().__init__(inference_engine=inference_engine, prompt_template=prompt_template,
                          system_prompt=system_prompt, **kwrs)
-        self.review_prompt = review_prompt
         if review_mode not in {"addition", "revision"}:
             raise ValueError('review_mode must be one of {"addition", "revision"}.')
         self.review_mode = review_mode
 
+        if review_prompt:
+            self.review_prompt = review_prompt
+        else:
+            file_path = importlib.resources.files('llm_ie.asset.default_prompts').\
+                joinpath(f"{self.__class__.__name__}_{self.review_mode}_review_prompt.txt")
+            with open(file_path, 'r') as f:
+                self.review_prompt = f.read()
+
+            warnings.warn(f'Custom review prompt not provided. The default review prompt is used:\n"{self.review_prompt}"', UserWarning)
+
 
     def extract(self, text_content:Union[str, Dict[str,str]],
                 max_new_tokens:int=4096, temperature:float=0.0, stream:bool=False, **kwrs) -> str:
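With `review_prompt` now optional, the constructor falls back to a packaged default that matches the chosen `review_mode`. A hypothetical construction (`engine` and `template` are placeholders):

```python
# Hypothetical usage; `engine` and `template` are placeholders for this sketch.
# Omitting review_prompt loads the packaged default for the given review_mode
# (llm_ie/asset/default_prompts/ReviewFrameExtractor_addition_review_prompt.txt)
# and emits a UserWarning showing the prompt that will be used.
reviewer = ReviewFrameExtractor(inference_engine=engine,
                                prompt_template=template,
                                review_mode="addition")

# A custom prompt still takes precedence when supplied.
reviewer = ReviewFrameExtractor(inference_engine=engine,
                                prompt_template=template,
                                review_mode="revision",
                                review_prompt="Review your output and regenerate the frames...")
```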
@@ -346,12 +397,15 @@ class ReviewFrameExtractor(BasicFrameExtractor):
         Return : str
             the output from the LLM. Needs post-processing.
         """
-        # Pormpt extraction
         messages = []
         if self.system_prompt:
             messages.append({'role': 'system', 'content': self.system_prompt})
 
         messages.append({'role': 'user', 'content': self._get_user_prompt(text_content)})
+        # Initial output
+        if stream:
+            print(f"{Fore.BLUE}Initial Output:{Style.RESET_ALL}")
+
         initial = self.inference_engine.chat(
             messages=messages,
             max_new_tokens=max_new_tokens,
@@ -364,6 +418,8 @@ class ReviewFrameExtractor(BasicFrameExtractor):
         messages.append({'role': 'assistant', 'content': initial})
         messages.append({'role': 'user', 'content': self.review_prompt})
 
+        if stream:
+            print(f"\n{Fore.YELLOW}Review:{Style.RESET_ALL}")
         review = self.inference_engine.chat(
             messages=messages,
             max_new_tokens=max_new_tokens,
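The review pass continues the same conversation: the first answer is appended as an assistant turn and the review prompt as a user turn before a second chat() call, with the new colorama labels marking each pass when streaming. A condensed sketch of that flow (`engine` and the prompt strings are placeholders):

```python
# Condensed sketch of the two-pass review flow; `engine` and prompts are placeholders.
messages = [{'role': 'user', 'content': 'Extract frames from: ...'}]  # filled template
initial = engine.chat(messages=messages, max_new_tokens=4096)

messages.append({'role': 'assistant', 'content': initial})            # keep first answer
messages.append({'role': 'user', 'content': 'Review and add any missed frames.'})
review = engine.chat(messages=messages, max_new_tokens=4096)

# Per the class docstring: "addition" keeps both outputs, "revision" keeps the review.
gen_text = initial + '\n' + review
```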
@@ -428,7 +484,7 @@ class SentenceFrameExtractor(FrameExtractor):
 
 
     def extract(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=512,
-                document_key:str=None, multi_turn:bool=True, temperature:float=0.0, stream:bool=False, **kwrs) -> List[Dict[str,str]]:
+                document_key:str=None, multi_turn:bool=False, temperature:float=0.0, stream:bool=False, **kwrs) -> List[Dict[str,str]]:
         """
         This method inputs a text and outputs a list of outputs per sentence.
 
@@ -476,8 +532,8 @@ class SentenceFrameExtractor(FrameExtractor):
         for sent in sentences:
             messages.append({'role': 'user', 'content': sent['sentence_text']})
             if stream:
-                print(f"\n\nSentence: \n{sent['sentence_text']}\n")
-                print("Extraction:")
+                print(f"\n\n{Fore.GREEN}Sentence: {Style.RESET_ALL}\n{sent['sentence_text']}\n")
+                print(f"{Fore.BLUE}Extraction:{Style.RESET_ALL}")
 
             gen_text = self.inference_engine.chat(
                 messages=messages,
@@ -503,7 +559,8 @@ class SentenceFrameExtractor(FrameExtractor):
 
 
     def extract_frames(self, text_content:Union[str, Dict[str,str]], entity_key:str, max_new_tokens:int=512,
-                       document_key:str=None, multi_turn:bool=True, temperature:float=0.0, stream:bool=False, **kwrs) -> List[LLMInformationExtractionFrame]:
+                       document_key:str=None, multi_turn:bool=False, temperature:float=0.0, case_sensitive:bool=False,
+                       stream:bool=False, **kwrs) -> List[LLMInformationExtractionFrame]:
         """
         This method inputs a text and outputs a list of LLMInformationExtractionFrame
         It uses the extract() method and post-processes outputs into frames.
@@ -529,6 +586,8 @@ class SentenceFrameExtractor(FrameExtractor):
             can better utilize the KV caching.
         temperature : float, Optional
             the temperature for token sampling.
+        case_sensitive : bool, Optional
+            if True, entity text matching will be case-sensitive.
         stream : bool, Optional
             if True, LLM generated text will be printed in terminal in real-time.
 
@@ -544,9 +603,15 @@ class SentenceFrameExtractor(FrameExtractor):
                                             **kwrs)
         frame_list = []
         for sent in llm_output_sentence:
-            entity_json = self._extract_json(gen_text=sent['gen_text'])
+            entity_json = []
+            for entity in self._extract_json(gen_text=sent['gen_text']):
+                if entity_key in entity:
+                    entity_json.append(entity)
+                else:
+                    warnings.warn(f'Extractor output "{entity}" does not have entity_key ("{entity_key}"). This frame will be dropped.', RuntimeWarning)
+
             spans = self._find_entity_spans(text=sent['sentence_text'],
-                                            entities=[e[entity_key] for e in entity_json], case_sensitive=False)
+                                            entities=[e[entity_key] for e in entity_json], case_sensitive=case_sensitive)
             for ent, span in zip(entity_json, spans):
                 if span is not None:
                     start, end = span
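Two behavioral changes meet here: `multi_turn` now defaults to False (each sentence is prompted fresh), and outputs missing the entity key are dropped with a RuntimeWarning instead of raising a KeyError in the span lookup. A hypothetical call (`engine`, `template`, and `note` are placeholders):

```python
# Hypothetical usage; `engine`, `template`, and `note` are placeholders.
extractor = SentenceFrameExtractor(inference_engine=engine, prompt_template=template)

frames = extractor.extract_frames(
    text_content=note,
    entity_key="entity_text",   # outputs lacking this key are now skipped with a warning
    multi_turn=True,            # opt back into carry-over prompting (new default: False)
    case_sensitive=False,       # default; set True for exact-case span matching
)
```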
@@ -561,6 +626,248 @@ class SentenceFrameExtractor(FrameExtractor):
         return frame_list
 
 
+class SentenceReviewFrameExtractor(SentenceFrameExtractor):
+    def __init__(self, inference_engine:InferenceEngine, prompt_template:str,
+                 review_mode:str, review_prompt:str=None, system_prompt:str=None, **kwrs):
+        """
+        This class adds a review step after the SentenceFrameExtractor.
+        For each sentence, the review process asks the LLM to review its output and:
+            1. add more frames while keeping the current ones. This is efficient for boosting recall.
+            2. or, regenerate frames (add new and delete existing).
+        Use the review_mode parameter to specify. Note that the review_prompt should instruct the LLM accordingly.
+
+        Parameters:
+        ----------
+        inference_engine : InferenceEngine
+            the LLM inferencing engine object. Must implement the chat() method.
+        prompt_template : str
+            prompt template with "{{<placeholder name>}}" placeholder.
+        review_prompt : str, Optional
+            the prompt text that asks the LLM to review. Specify addition or revision in the instruction.
+            If not provided, a default review prompt will be used.
+        review_mode : str
+            review mode. Must be one of {"addition", "revision"}.
+            addition mode only asks the LLM to add new frames, while revision mode asks it to regenerate.
+        system_prompt : str, Optional
+            system prompt.
+        """
+        super().__init__(inference_engine=inference_engine, prompt_template=prompt_template,
+                         system_prompt=system_prompt, **kwrs)
+
+        if review_mode not in {"addition", "revision"}:
+            raise ValueError('review_mode must be one of {"addition", "revision"}.')
+        self.review_mode = review_mode
+
+        if review_prompt:
+            self.review_prompt = review_prompt
+        else:
+            file_path = importlib.resources.files('llm_ie.asset.default_prompts').\
+                joinpath(f"{self.__class__.__name__}_{self.review_mode}_review_prompt.txt")
+            with open(file_path, 'r') as f:
+                self.review_prompt = f.read()
+
+            warnings.warn(f'Custom review prompt not provided. The default review prompt is used:\n"{self.review_prompt}"', UserWarning)
+
+
+    def extract(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=512,
+                document_key:str=None, multi_turn:bool=False, temperature:float=0.0, stream:bool=False, **kwrs) -> List[Dict[str,str]]:
+        """
+        This method inputs a text and outputs a list of outputs per sentence.
+
+        Parameters:
+        ----------
+        text_content : Union[str, Dict[str,str]]
+            the input text content to put in prompt template.
+            If str, the prompt template must have only one placeholder {{<placeholder name>}}, regardless of placeholder name.
+            If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
+        max_new_tokens : int, Optional
+            the max number of new tokens LLM should generate.
+        document_key : str, Optional
+            specify the key in text_content where document text is.
+            If text_content is str, this parameter will be ignored.
+        multi_turn : bool, Optional
+            multi-turn conversation prompting.
+            If True, sentences and LLM outputs will be appended to the input message and carried over.
+            If False, only the current sentence is prompted.
+            For LLM inference engines that support prompt caching (e.g., Llama.Cpp, Ollama), multi-turn conversation prompting
+            can better utilize the KV cache.
+        temperature : float, Optional
+            the temperature for token sampling.
+        stream : bool, Optional
+            if True, LLM generated text will be printed in terminal in real-time.
+
+        Return : List[Dict[str,str]]
+            the per-sentence outputs from the LLM. Need post-processing.
+        """
+        # define output
+        output = []
+        # sentence tokenization
+        if isinstance(text_content, str):
+            sentences = self._get_sentences(text_content)
+        elif isinstance(text_content, dict):
+            sentences = self._get_sentences(text_content[document_key])
+        # construct chat messages
+        messages = []
+        if self.system_prompt:
+            messages.append({'role': 'system', 'content': self.system_prompt})
+
+        messages.append({'role': 'user', 'content': self._get_user_prompt(text_content)})
+        messages.append({'role': 'assistant', 'content': 'Sure, please start with the first sentence.'})
+
+        # generate sentence by sentence
+        for sent in sentences:
+            messages.append({'role': 'user', 'content': sent['sentence_text']})
+            if stream:
+                print(f"\n\n{Fore.GREEN}Sentence: {Style.RESET_ALL}\n{sent['sentence_text']}\n")
+                print(f"{Fore.BLUE}Initial Output:{Style.RESET_ALL}")
+
+            initial = self.inference_engine.chat(
+                messages=messages,
+                max_new_tokens=max_new_tokens,
+                temperature=temperature,
+                stream=stream,
+                **kwrs
+            )
+
+            # Review
+            if stream:
+                print(f"\n{Fore.YELLOW}Review:{Style.RESET_ALL}")
+            messages.append({'role': 'assistant', 'content': initial})
+            messages.append({'role': 'user', 'content': self.review_prompt})
+
+            review = self.inference_engine.chat(
+                messages=messages,
+                max_new_tokens=max_new_tokens,
+                temperature=temperature,
+                stream=stream,
+                **kwrs
+            )
+
+            # Output
+            if self.review_mode == "revision":
+                gen_text = review
+            elif self.review_mode == "addition":
+                gen_text = initial + '\n' + review
+
+            if multi_turn:
+                # update chat messages with LLM outputs
+                messages.append({'role': 'assistant', 'content': review})
+            else:
+                # delete sentence and review so that message is reset
+                del messages[-3:]
+
+            # add to output
+            output.append({'sentence_start': sent['start'],
+                           'sentence_end': sent['end'],
+                           'sentence_text': sent['sentence_text'],
+                           'gen_text': gen_text})
+        return output
+
+
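The new SentenceReviewFrameExtractor applies the same two-pass review loop to every sentence; as with ReviewFrameExtractor, omitting `review_prompt` loads a packaged default. A hypothetical construction (`engine`, `template`, and `note` are placeholders):

```python
# Hypothetical usage; `engine`, `template`, and `note` are placeholders.
extractor = SentenceReviewFrameExtractor(inference_engine=engine,
                                         prompt_template=template,
                                         review_mode="addition")  # default prompt loaded

# Each sentence gets an initial pass plus a review pass before post-processing.
frames = extractor.extract_frames(text_content=note, entity_key="entity_text")
```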
+class SentenceCoTFrameExtractor(SentenceFrameExtractor):
+    from nltk.tokenize.punkt import PunktSentenceTokenizer
+    def __init__(self, inference_engine:InferenceEngine, prompt_template:str, system_prompt:str=None, **kwrs):
+        """
+        This class performs sentence-based Chain-of-Thought (CoT) information extraction.
+        A simulated chat follows this process:
+            1. system prompt (optional)
+            2. user instructions (schema, background, full text, few-shot example...)
+            3. user inputs first sentence
+            4. assistant analyzes the sentence
+            5. assistant extracts outputs
+            6. repeat #3, #4, #5
+
+        Input a system prompt (optional) and a prompt template (with user instructions),
+        and specify an LLM.
+
+        Parameters
+        ----------
+        inference_engine : InferenceEngine
+            the LLM inferencing engine object. Must implement the chat() method.
+        prompt_template : str
+            prompt template with "{{<placeholder name>}}" placeholder.
+        system_prompt : str, Optional
+            system prompt.
+        """
+        super().__init__(inference_engine=inference_engine, prompt_template=prompt_template,
+                         system_prompt=system_prompt, **kwrs)
+
+
+    def extract(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=512,
+                document_key:str=None, multi_turn:bool=False, temperature:float=0.0, stream:bool=False, **kwrs) -> List[Dict[str,str]]:
+        """
+        This method inputs a text and outputs a list of outputs per sentence.
+
+        Parameters:
+        ----------
+        text_content : Union[str, Dict[str,str]]
+            the input text content to put in prompt template.
+            If str, the prompt template must have only one placeholder {{<placeholder name>}}, regardless of placeholder name.
+            If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
+        max_new_tokens : int, Optional
+            the max number of new tokens LLM should generate.
+        document_key : str, Optional
+            specify the key in text_content where document text is.
+            If text_content is str, this parameter will be ignored.
+        multi_turn : bool, Optional
+            multi-turn conversation prompting.
+            If True, sentences and LLM outputs will be appended to the input message and carried over.
+            If False, only the current sentence is prompted.
+            For LLM inference engines that support prompt caching (e.g., Llama.Cpp, Ollama), multi-turn conversation prompting
+            can better utilize the KV cache.
+        temperature : float, Optional
+            the temperature for token sampling.
+        stream : bool, Optional
+            if True, LLM generated text will be printed in terminal in real-time.
+
+        Return : List[Dict[str,str]]
+            the per-sentence outputs from the LLM. Need post-processing.
+        """
+        # define output
+        output = []
+        # sentence tokenization
+        if isinstance(text_content, str):
+            sentences = self._get_sentences(text_content)
+        elif isinstance(text_content, dict):
+            sentences = self._get_sentences(text_content[document_key])
+        # construct chat messages
+        messages = []
+        if self.system_prompt:
+            messages.append({'role': 'system', 'content': self.system_prompt})
+
+        messages.append({'role': 'user', 'content': self._get_user_prompt(text_content)})
+        messages.append({'role': 'assistant', 'content': 'Sure, please start with the first sentence.'})
+
+        # generate sentence by sentence
+        for sent in sentences:
+            messages.append({'role': 'user', 'content': sent['sentence_text']})
+            if stream:
+                print(f"\n\n{Fore.GREEN}Sentence: {Style.RESET_ALL}\n{sent['sentence_text']}\n")
+                print(f"{Fore.BLUE}CoT:{Style.RESET_ALL}")
+
+            gen_text = self.inference_engine.chat(
+                messages=messages,
+                max_new_tokens=max_new_tokens,
+                temperature=temperature,
+                stream=stream,
+                **kwrs
+            )
+
+            if multi_turn:
+                # update chat messages with LLM outputs
+                messages.append({'role': 'assistant', 'content': gen_text})
+            else:
+                # delete sentence so that message is reset
+                del messages[-1]
+
+            # add to output
+            output.append({'sentence_start': sent['start'],
+                           'sentence_end': sent['end'],
+                           'sentence_text': sent['sentence_text'],
+                           'gen_text': gen_text})
+        return output
+
+
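SentenceCoTFrameExtractor leaves the model's reasoning in `gen_text`, so the prompt template is expected to elicit an analysis step before the extraction. A hypothetical sketch (`engine` and `note` are placeholders; the CoT-style instruction below is illustrative, not a prompt shipped with llm-ie):

```python
# Hypothetical usage; the template is an illustrative CoT-style instruction.
template = (
    "Extract medications as JSON dicts with key \"entity_text\".\n"
    "First write a short analysis of the sentence, then output the JSON.\n"
    "Text: {{note}}"
)
extractor = SentenceCoTFrameExtractor(inference_engine=engine, prompt_template=template)
frames = extractor.extract_frames(text_content={"note": note}, entity_key="entity_text",
                                  document_key="note")
# The balanced-brace JSON parser skips the free-text analysis around the dicts.
```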
 class RelationExtractor(Extractor):
     def __init__(self, inference_engine:InferenceEngine, prompt_template:str, system_prompt:str=None, **kwrs):
         """
@@ -721,8 +1028,8 @@ class BinaryRelationExtractor(RelationExtractor):
         """
         roi_text = self._get_ROI(frame_1, frame_2, text, buffer_size=buffer_size)
         if stream:
-            print(f"\n\nROI text: \n{roi_text}\n")
-            print("Extraction:")
+            print(f"\n\n{Fore.GREEN}ROI text:{Style.RESET_ALL} \n{roi_text}\n")
+            print(f"{Fore.BLUE}Extraction:{Style.RESET_ALL}")
 
         messages = []
         if self.system_prompt:
@@ -873,8 +1180,8 @@ class MultiClassRelationExtractor(RelationExtractor):
         """
         roi_text = self._get_ROI(frame_1, frame_2, text, buffer_size=buffer_size)
         if stream:
-            print(f"\n\nROI text: \n{roi_text}\n")
-            print("Extraction:")
+            print(f"\n\n{Fore.GREEN}ROI text:{Style.RESET_ALL} \n{roi_text}\n")
+            print(f"{Fore.BLUE}Extraction:{Style.RESET_ALL}")
 
         messages = []
         if self.system_prompt: