llm-ie 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
llm_ie/extractors.py CHANGED
@@ -5,10 +5,11 @@ import inspect
  import importlib.resources
  import warnings
  import itertools
- from typing import List, Dict, Tuple, Union, Callable
+ from typing import Set, List, Dict, Tuple, Union, Callable
  from llm_ie.data_types import LLMInformationExtractionFrame, LLMInformationExtractionDocument
  from llm_ie.engines import InferenceEngine
  from colorama import Fore, Style
+ from nltk.tokenize import RegexpTokenizer


  class Extractor:
@@ -37,7 +38,7 @@ class Extractor:
  This method returns the pre-defined prompt guideline for the extractor from the package asset.
  """
  file_path = importlib.resources.files('llm_ie.asset.prompt_guide').joinpath(f"{cls.__name__}_prompt_guide.txt")
- with open(file_path, 'r') as f:
+ with open(file_path, 'r', encoding="utf-8") as f:
  return f.read()

@@ -139,9 +140,71 @@ class FrameExtractor(Extractor):
  prompt_template=prompt_template,
  system_prompt=system_prompt,
  **kwrs)
-
+ self.tokenizer = RegexpTokenizer(r'\w+|[^\w\s]')
+
+
+ def _jaccard_score(self, s1:Set[str], s2:Set[str]) -> float:
+ """
+ This method calculates the Jaccard score between two sets of word tokens.
+ """
+ return len(s1.intersection(s2)) / len(s1.union(s2))
+
+
+ def _get_word_tokens(self, text) -> Tuple[List[str], List[Tuple[int]]]:
+ """
+ This method tokenizes the input text into a list of word tokens and their spans.
+ """
+ tokens = []
+ spans = []
+ for span in self.tokenizer.span_tokenize(text):
+ spans.append(span)
+ start, end = span
+ tokens.append(text[start:end])
+ return tokens, spans
+
+
+ def _get_closest_substring(self, text:str, pattern:str, buffer_size:float=0.2) -> Tuple[Tuple[int, int], float]:
+ """
+ This method finds the closest (highest Jaccard score) substring in text that matches the pattern.
+ the substring must start with the same word token as the pattern. This is due to the observation that
+ LLM often generate the first few words consistently.
+
+ Parameters
+ ----------
+ text : str
+ the input text.
+ pattern : str
+ the pattern to match.
+ buffer_size : float, Optional
+ the buffer size for the matching window. Default is 20% of pattern length.
+
+ Returns : Tuple[Tuple[int, int], float]
+ a tuple of 2-tuple span and Jaccard score.
+ """
+ text_tokens, text_spans = self._get_word_tokens(text)
+ pattern_tokens, _ = self._get_word_tokens(pattern)
+ pattern_tokens_set = set(pattern_tokens)
+ window_size = len(pattern_tokens)
+ window_size_min = int(window_size * (1 - buffer_size))
+ window_size_max = int(window_size * (1 + buffer_size))
+ closest_substring_span = None
+ best_score = 0
+
+ for i in range(len(text_tokens) - window_size_max):
+ for w in range(window_size_min, window_size_max):
+ sub_str_tokens = text_tokens[i:i + w]
+ if sub_str_tokens[0] == pattern_tokens[0]:
+ score = self._jaccard_score(set(sub_str_tokens), pattern_tokens_set)
+ if score > best_score:
+ best_score = score
+ sub_string_word_spans = text_spans[i:i + w]
+ closest_substring_span = (sub_string_word_spans[0][0], sub_string_word_spans[-1][-1])

- def _find_entity_spans(self, text: str, entities: List[str], case_sensitive:bool=False) -> List[Tuple[int]]:
+ return closest_substring_span, best_score
+
+
+ def _find_entity_spans(self, text: str, entities: List[str], case_sensitive:bool=False,
+ fuzzy_match:bool=True, fuzzy_buffer_size:float=0.2, fuzzy_score_cutoff:float=0.8) -> List[Tuple[int]]:
  """
  This function inputs a text and a list of entity text,
  outputs a list of spans (2-tuple) for each entity.
@@ -151,19 +214,46 @@ class FrameExtractor(Extractor):
  ----------
  text : str
  text that contains entities
+ entities : List[str]
+ a list of entity text to find in the text
+ case_sensitive : bool, Optional
+ if True, entity text matching will be case-sensitive.
+ fuzzy_match : bool, Optional
+ if True, fuzzy matching will be applied to find entity text.
+ fuzzy_buffer_size : float, Optional
+ the buffer size for fuzzy matching. Default is 20% of entity text length.
+ fuzzy_score_cutoff : float, Optional
+ the Jaccard score cutoff for fuzzy matching.
+ Matched entity text must have a score higher than this value or a None will be returned.
  """
+ # Handle case sensitivity
+ if not case_sensitive:
+ text = text.lower()
+
+ # Match entities
  entity_spans = []
- for entity in entities:
- if case_sensitive:
- match = re.search(re.escape(entity), text)
- else:
- match = re.search(re.escape(entity), text, re.IGNORECASE)
-
+ for entity in entities:
+ if not case_sensitive:
+ entity = entity.lower()
+
+ # Exact match
+ match = re.search(re.escape(entity), text)
  if match:
  start, end = match.span()
  entity_spans.append((start, end))
  # Replace the found entity with spaces to avoid finding the same instance again
  text = text[:start] + ' ' * (end - start) + text[end:]
+ # Fuzzy match
+ elif fuzzy_match:
+ closest_substring_span, best_score = self._get_closest_substring(text, entity, buffer_size=fuzzy_buffer_size)
+ if best_score >= fuzzy_score_cutoff and closest_substring_span:
+ entity_spans.append(closest_substring_span)
+ # Replace the found entity with spaces to avoid finding the same instance again
+ text = text[:closest_substring_span[0]] + ' ' * (closest_substring_span[1] - closest_substring_span[0]) + text[closest_substring_span[1]:]
+ else:
+ entity_spans.append(None)
+
+ # No match
  else:
  entity_spans.append(None)

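The fallback added above handles entities that the LLM quotes slightly differently from the source text: a word-token window slides over the text, anchored on the entity's first word token, and each window is scored against the quoted entity with the Jaccard index. Below is a minimal standalone sketch of that windowed-Jaccard idea, assuming nltk is installed; the closest_substring function and the example sentence are illustrative, not part of the package.

# Illustrative sketch of the windowed Jaccard matching added above;
# not the package's internal implementation.
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+|[^\w\s]')   # same word/punctuation pattern as the diff

def closest_substring(text, pattern, buffer_size=0.2):
    """Return (span, score) of the substring of text most similar to pattern."""
    text_spans = list(tokenizer.span_tokenize(text))
    text_tokens = [text[s:e] for s, e in text_spans]
    pattern_tokens = tokenizer.tokenize(pattern)
    pattern_set = set(pattern_tokens)
    n = len(pattern_tokens)
    w_min, w_max = int(n * (1 - buffer_size)), int(n * (1 + buffer_size))
    best_span, best_score = None, 0.0
    for i in range(len(text_tokens) - w_max):
        if text_tokens[i] != pattern_tokens[0]:   # anchor on the first word token
            continue
        for w in range(w_min, w_max):             # try window sizes around the pattern length
            window = set(text_tokens[i:i + w])
            score = len(window & pattern_set) / len(window | pattern_set)
            if score > best_score:
                best_score = score
                best_span = (text_spans[i][0], text_spans[i + w - 1][1])
    return best_span, best_score

# The LLM may quote "blood pressure was 140 over 90 on admission" while the
# note reads "140/90"; the windowed match still recovers a character span.
text = "On exam, blood pressure was 140/90 on admission and heart rate was 80."
span, score = closest_substring(text, "blood pressure was 140 over 90 on admission")
print(span, round(score, 2), text[span[0]:span[1]])
# prints something like: (9, 47) 0.78 blood pressure was 140/90 on admission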
@@ -276,7 +366,9 @@ class BasicFrameExtractor(FrameExtractor):


  def extract_frames(self, text_content:Union[str, Dict[str,str]], entity_key:str, max_new_tokens:int=2048,
- temperature:float=0.0, case_sensitive:bool=False, document_key:str=None, **kwrs) -> List[LLMInformationExtractionFrame]:
+ temperature:float=0.0, document_key:str=None, stream:bool=False,
+ case_sensitive:bool=False, fuzzy_match:bool=True, fuzzy_buffer_size:float=0.2,
+ fuzzy_score_cutoff:float=0.8, **kwrs) -> List[LLMInformationExtractionFrame]:
  """
  This method inputs a text and outputs a list of LLMInformationExtractionFrame
  It use the extract() method and post-process outputs into frames.
@@ -293,18 +385,30 @@ class BasicFrameExtractor(FrameExtractor):
  the max number of new tokens LLM should generate.
  temperature : float, Optional
  the temperature for token sampling.
- case_sensitive : bool, Optional
- if True, entity text matching will be case-sensitive.
  document_key : str, Optional
  specify the key in text_content where document text is.
  If text_content is str, this parameter will be ignored.
+ stream : bool, Optional
+ if True, LLM generated text will be printed in terminal in real-time.
+ case_sensitive : bool, Optional
+ if True, entity text matching will be case-sensitive.
+ fuzzy_match : bool, Optional
+ if True, fuzzy matching will be applied to find entity text.
+ fuzzy_buffer_size : float, Optional
+ the buffer size for fuzzy matching. Default is 20% of entity text length.
+ fuzzy_score_cutoff : float, Optional
+ the Jaccard score cutoff for fuzzy matching.
+ Matched entity text must have a score higher than this value or a None will be returned.

  Return : str
  a list of frames.
  """
  frame_list = []
  gen_text = self.extract(text_content=text_content,
- max_new_tokens=max_new_tokens, temperature=temperature, **kwrs)
+ max_new_tokens=max_new_tokens,
+ temperature=temperature,
+ stream=stream,
+ **kwrs)

  entity_json = []
  for entity in self._extract_json(gen_text=gen_text):
@@ -320,7 +424,10 @@ class BasicFrameExtractor(FrameExtractor):

  spans = self._find_entity_spans(text=text,
  entities=[e[entity_key] for e in entity_json],
- case_sensitive=case_sensitive)
+ case_sensitive=case_sensitive,
+ fuzzy_match=fuzzy_match,
+ fuzzy_buffer_size=fuzzy_buffer_size,
+ fuzzy_score_cutoff=fuzzy_score_cutoff)

  for i, (ent, span) in enumerate(zip(entity_json, spans)):
  if span is not None:
@@ -328,7 +435,7 @@ class BasicFrameExtractor(FrameExtractor):
  frame = LLMInformationExtractionFrame(frame_id=f"{i}",
  start=start,
  end=end,
- entity_text=ent[entity_key],
+ entity_text=text[start:end],
  attr={k: v for k, v in ent.items() if k != entity_key and v != ""})
  frame_list.append(frame)
  return frame_list
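The new keyword arguments are exposed directly on BasicFrameExtractor.extract_frames, and each frame's entity_text is now the grounded source span text[start:end] rather than the string the LLM returned. A hedged usage sketch follows; the OllamaInferenceEngine name, the positional constructor call, and the entity_key value come from the package README and the user's own prompt design, not from this diff.

# Hypothetical usage sketch for the 0.3.3 extract_frames() arguments.
from llm_ie.engines import OllamaInferenceEngine
from llm_ie.extractors import BasicFrameExtractor

engine = OllamaInferenceEngine(model_name="llama3.1")   # any InferenceEngine should work here
prompt_template = "..."   # your frame-extraction prompt (see the package prompt guide assets)

extractor = BasicFrameExtractor(engine, prompt_template)
frames = extractor.extract_frames(
    text_content="On exam, blood pressure was 140/90 on admission.",
    entity_key="entity_text",     # key in the LLM's JSON output that quotes the entity
    stream=True,                  # print LLM generation in real time
    case_sensitive=False,
    fuzzy_match=True,             # fall back to the windowed Jaccard matching
    fuzzy_buffer_size=0.2,
    fuzzy_score_cutoff=0.8,
)
# Each frame's entity_text is sliced from the source text, so spans and text agree.
for frame in frames:
    print(frame.start, frame.end, frame.entity_text)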
@@ -370,7 +477,7 @@ class ReviewFrameExtractor(BasicFrameExtractor):
  else:
  file_path = importlib.resources.files('llm_ie.asset.default_prompts').\
  joinpath(f"{self.__class__.__name__}_{self.review_mode}_review_prompt.txt")
- with open(file_path, 'r') as f:
+ with open(file_path, 'r', encoding="utf-8") as f:
  self.review_prompt = f.read()

  warnings.warn(f'Custom review prompt not provided. The default review prompt is used:\n"{self.review_prompt}"', UserWarning)
@@ -559,8 +666,9 @@ class SentenceFrameExtractor(FrameExtractor):


  def extract_frames(self, text_content:Union[str, Dict[str,str]], entity_key:str, max_new_tokens:int=512,
- document_key:str=None, multi_turn:bool=False, temperature:float=0.0, case_sensitive:bool=False,
- stream:bool=False, **kwrs) -> List[LLMInformationExtractionFrame]:
+ document_key:str=None, multi_turn:bool=False, temperature:float=0.0, stream:bool=False,
+ case_sensitive:bool=False, fuzzy_match:bool=True, fuzzy_buffer_size:float=0.2, fuzzy_score_cutoff:float=0.8,
+ **kwrs) -> List[LLMInformationExtractionFrame]:
  """
  This method inputs a text and outputs a list of LLMInformationExtractionFrame
  It use the extract() method and post-process outputs into frames.
@@ -586,10 +694,17 @@ class SentenceFrameExtractor(FrameExtractor):
  can better utilize the KV caching.
  temperature : float, Optional
  the temperature for token sampling.
- case_sensitive : bool, Optional
- if True, entity text matching will be case-sensitive.
  stream : bool, Optional
  if True, LLM generated text will be printed in terminal in real-time.
+ case_sensitive : bool, Optional
+ if True, entity text matching will be case-sensitive.
+ fuzzy_match : bool, Optional
+ if True, fuzzy matching will be applied to find entity text.
+ fuzzy_buffer_size : float, Optional
+ the buffer size for fuzzy matching. Default is 20% of entity text length.
+ fuzzy_score_cutoff : float, Optional
+ the Jaccard score cutoff for fuzzy matching.
+ Matched entity text must have a score higher than this value or a None will be returned.

  Return : str
  a list of frames.
@@ -611,16 +726,21 @@ class SentenceFrameExtractor(FrameExtractor):
  warnings.warn(f'Extractor output "{entity}" does not have entity_key ("{entity_key}"). This frame will be dropped.', RuntimeWarning)

  spans = self._find_entity_spans(text=sent['sentence_text'],
- entities=[e[entity_key] for e in entity_json], case_sensitive=case_sensitive)
+ entities=[e[entity_key] for e in entity_json],
+ case_sensitive=case_sensitive,
+ fuzzy_match=fuzzy_match,
+ fuzzy_buffer_size=fuzzy_buffer_size,
+ fuzzy_score_cutoff=fuzzy_score_cutoff)
  for ent, span in zip(entity_json, spans):
  if span is not None:
  start, end = span
+ entity_text = sent['sentence_text'][start:end]
  start += sent['sentence_start']
  end += sent['sentence_start']
  frame = LLMInformationExtractionFrame(frame_id=f"{len(frame_list)}",
  start=start,
  end=end,
- entity_text=ent[entity_key],
+ entity_text=entity_text,
  attr={k: v for k, v in ent.items() if k != entity_key and v != ""})
  frame_list.append(frame)
  return frame_list
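In the sentence-level extractor, spans returned by _find_entity_spans are relative to a single sentence, so the entity text is now sliced from the sentence before the offsets are shifted into document coordinates. A toy sketch of that bookkeeping, with invented values:

# Sketch of the span arithmetic above; the sentence dict mirrors the
# sent['sentence_text'] / sent['sentence_start'] fields used in the diff.
document = "Mild headache reported. Patient denies chest pain or dyspnea."
sentence = {"sentence_text": "Patient denies chest pain or dyspnea.",
            "sentence_start": 24}

start, end = 15, 25                                  # sentence-relative span of "chest pain"
entity_text = sentence["sentence_text"][start:end]   # slice first, while offsets are local
start += sentence["sentence_start"]                  # then shift to document coordinates
end += sentence["sentence_start"]

assert document[start:end] == entity_text == "chest pain"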
@@ -663,7 +783,7 @@ class SentenceReviewFrameExtractor(SentenceFrameExtractor):
  else:
  file_path = importlib.resources.files('llm_ie.asset.default_prompts').\
  joinpath(f"{self.__class__.__name__}_{self.review_mode}_review_prompt.txt")
- with open(file_path, 'r') as f:
+ with open(file_path, 'r', encoding="utf-8") as f:
  self.review_prompt = f.read()

  warnings.warn(f'Custom review prompt not provided. The default review prompt is used:\n"{self.review_prompt}"', UserWarning)
llm_ie/prompt_editor.py CHANGED
@@ -5,8 +5,6 @@ from llm_ie.engines import InferenceEngine
  from llm_ie.extractors import FrameExtractor
  import re
  from colorama import Fore, Style
- import ipywidgets as widgets
- from IPython.display import display, HTML


  class PromptEditor:
@@ -121,6 +119,16 @@ class PromptEditor:
  """
  This method runs an interactive chat session in Jupyter/IPython using ipywidgets to help users write prompt templates.
  """
+ # Check if ipywidgets is installed
+ if importlib.util.find_spec("ipywidgets") is None:
+ raise ImportError("ipywidgets not found. Please install ipywidgets (```pip install ipywidgets```).")
+ import ipywidgets as widgets
+
+ # Check if IPython is installed
+ if importlib.util.find_spec("IPython") is None:
+ raise ImportError("IPython not found. Please install IPython (```pip install ipython```).")
+ from IPython.display import display, HTML
+
  # Load the chat prompt template from the resources
  file_path = importlib.resources.files('llm_ie.asset.PromptEditor_prompts').joinpath('chat.txt')
  with open(file_path, 'r') as f:
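The chat helper now probes for its optional dependencies at call time instead of importing them at module load, so ipywidgets and IPython are no longer required just to install and import the package. A generic sketch of the same pattern; require_optional is a hypothetical helper, not part of llm-ie:

# Optional-dependency pattern: probe with importlib.util.find_spec, then
# import lazily inside the code path that actually needs the module.
import importlib.util

def require_optional(module_name, pip_name):
    """Raise a helpful ImportError when an optional dependency is missing."""
    if importlib.util.find_spec(module_name) is None:
        raise ImportError(f"{module_name} not found. Please install it (pip install {pip_name}).")

require_optional("ipywidgets", "ipywidgets")   # raises only when the extra is absent
import ipywidgets as widgets                   # safe to import after the check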
llm_ie-0.3.1.dist-info/METADATA → llm_ie-0.3.3.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: llm-ie
- Version: 0.3.1
+ Version: 0.3.3
  Summary: An LLM-powered tool that transforms everyday language into robust information extraction pipelines.
  License: MIT
  Author: Enshuo (David) Hsu
llm_ie-0.3.1.dist-info/RECORD → llm_ie-0.3.3.dist-info/RECORD CHANGED
@@ -16,8 +16,8 @@ llm_ie/asset/prompt_guide/SentenceFrameExtractor_prompt_guide.txt,sha256=oKH_QeD
  llm_ie/asset/prompt_guide/SentenceReviewFrameExtractor_prompt_guide.txt,sha256=oKH_QeDgpw771ZdHk3L7DYz2Jvfm7OolUoTiJyMJI30,9541
  llm_ie/data_types.py,sha256=hPz3WOeAzfn2QKmb0CxHmRdQWZQ4G9zq8U-RJBVFdYk,14329
  llm_ie/engines.py,sha256=PTYs7s_iCPmI-yFUCVCPY_cMGS77ma2VGoz4rdNkODI,9308
- llm_ie/extractors.py,sha256=xgkicRzBPRaQPiKWmQJ5b_aiNv9VEc85jzBA7cQXic8,58331
- llm_ie/prompt_editor.py,sha256=3h_2yIe7OV4auv4Vb9Zdx2q26UhC0xp9c4tt_yDr78I,8144
- llm_ie-0.3.1.dist-info/METADATA,sha256=eJCzg7G_ivz0CcP9KycSeHo986se6tqA8cKLtQyTtw4,41266
- llm_ie-0.3.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
- llm_ie-0.3.1.dist-info/RECORD,,
+ llm_ie/extractors.py,sha256=yBdIcevjMfwto85Jb0KkRMN-AjIMk92fD5yWB3Qm8MY,64408
+ llm_ie/prompt_editor.py,sha256=Xc5ZHsEnM8-YYITokIsM6BVsf2Ec_8ajJDaldPf-P8U,8577
+ llm_ie-0.3.3.dist-info/METADATA,sha256=CeTsMNtWhEWCvOqHWSXu0KqOgDp3kMwN2WtBF4N-4zE,41266
+ llm_ie-0.3.3.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+ llm_ie-0.3.3.dist-info/RECORD,,