PyPI - llm-ie - Versions diffs - 0.4.7__py3-none-any.whl → 1.1.0__py3-none-any.whl - Mend

llm-ie 0.4.7__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

llm_ie/__init__.py +6 -4
llm_ie/asset/default_prompts/BasicReviewFrameExtractor_addition_review_prompt.txt +3 -0
llm_ie/asset/default_prompts/BasicReviewFrameExtractor_revision_review_prompt.txt +2 -0
llm_ie/asset/default_prompts/ReviewFrameExtractor_addition_review_prompt.txt +2 -1
llm_ie/asset/default_prompts/ReviewFrameExtractor_revision_review_prompt.txt +2 -1
llm_ie/asset/prompt_guide/BasicFrameExtractor_prompt_guide.txt +104 -86
llm_ie/asset/prompt_guide/BasicReviewFrameExtractor_prompt_guide.txt +163 -0
llm_ie/asset/prompt_guide/DirectFrameExtractor_prompt_guide.txt +163 -0
llm_ie/asset/prompt_guide/ReviewFrameExtractor_prompt_guide.txt +103 -85
llm_ie/asset/prompt_guide/SentenceFrameExtractor_prompt_guide.txt +103 -86
llm_ie/asset/prompt_guide/SentenceReviewFrameExtractor_prompt_guide.txt +103 -86
llm_ie/chunkers.py +191 -0
llm_ie/data_types.py +75 -1
llm_ie/engines.py +600 -262
llm_ie/extractors.py +859 -899
llm_ie/prompt_editor.py +45 -12
llm_ie-1.1.0.dist-info/METADATA +18 -0
llm_ie-1.1.0.dist-info/RECORD +27 -0
llm_ie/asset/prompt_guide/SentenceCoTFrameExtractor_prompt_guide.txt +0 -217
llm_ie-0.4.7.dist-info/METADATA +0 -1219
llm_ie-0.4.7.dist-info/RECORD +0 -23
{llm_ie-0.4.7.dist-info → llm_ie-1.1.0.dist-info}/WHEEL +0 -0

llm_ie/chunkers.py ADDED Viewed

@@ -0,0 +1,191 @@
+import abc
+from typing import Set, List, Dict, Tuple, Union, Callable
+from llm_ie.data_types import FrameExtractionUnit
+class UnitChunker(abc.ABC):
+    def __init__(self):
+        """
+        This is the abstract class for frame extraction unit chunkers.
+        A unit chunker splits a document into units (e.g., sentences). LLMs process unit by unit.
+        """
+        pass
+    def chunk(self, text:str) -> List[FrameExtractionUnit]:
+        """
+        Split the document text into frame extraction units. Subclasses must override this method.
+        Parameters:
+        ----------
+        text : str
+            The document text.
+        Returns : List[FrameExtractionUnit]
+            The frame extraction units of the document.
+        Raises:
+        ------
+        NotImplementedError
+            Always, when called on a class that did not override chunk().
+        """
+        # NotImplemented is the sentinel reserved for binary-operator fallbacks. Returning it
+        # here would give callers a non-list value that only fails later, when iterated.
+        raise NotImplementedError(f"{type(self).__name__} must implement chunk().")
+class WholeDocumentUnitChunker(UnitChunker):
+    def __init__(self):
+        """
+        Unit chunker that performs no splitting: the entire document becomes one unit.
+        """
+        super().__init__()
+    def chunk(self, text:str) -> List[FrameExtractionUnit]:
+        """
+        Wrap the whole document in a single frame extraction unit.
+        Parameters:
+        ----------
+        text : str
+            The document text.
+        Returns : List[FrameExtractionUnit]
+            A one-element list whose unit spans characters 0 to len(text) and holds the full text.
+        """
+        whole_document = FrameExtractionUnit(start=0, end=len(text), text=text)
+        return [whole_document]
+class SentenceUnitChunker(UnitChunker):
+    # Imported in the class body, so the tokenizer class is reachable as self.PunktSentenceTokenizer.
+    from nltk.tokenize.punkt import PunktSentenceTokenizer
+    def __init__(self):
+        """
+        Unit chunker that splits a document into sentences with the NLTK PunktSentenceTokenizer.
+        """
+        super().__init__()
+    def chunk(self, text:str) -> List[FrameExtractionUnit]:
+        """
+        Split the document into one frame extraction unit per sentence.
+        Parameters:
+        ----------
+        text : str
+            The document text.
+        Returns : List[FrameExtractionUnit]
+            One unit per span reported by the tokenizer; each unit's text is text[start:end].
+        """
+        tokenizer = self.PunktSentenceTokenizer()
+        return [
+            FrameExtractionUnit(start=begin, end=stop, text=text[begin:stop])
+            for begin, stop in tokenizer.span_tokenize(text)
+        ]
+class TextLineUnitChunker(UnitChunker):
+    def __init__(self):
+        """
+        Unit chunker that splits a document into its text lines.
+        """
+        super().__init__()
+    def chunk(self, text:str) -> List[FrameExtractionUnit]:
+        """
+        Split the document on '\\n' and return one frame extraction unit per line.
+        Parameters:
+        ----------
+        text : str
+            The document text.
+        Returns : List[FrameExtractionUnit]
+            One unit per line (empty lines included), with start/end offsets into the document.
+        """
+        units = []
+        offset = 0
+        for content in text.split('\n'):
+            stop = offset + len(content)
+            units.append(FrameExtractionUnit(start=offset, end=stop, text=content))
+            # Step over the '\n' separator that split() removed.
+            offset = stop + 1
+        return units
+class ContextChunker(abc.ABC):
+    def __init__(self):
+        """
+        This is the abstract class for context chunkers. Given a frame extraction unit,
+        a context chunker returns the context for it.
+        """
+        pass
+    def fit(self, text:str, units:List[FrameExtractionUnit]):
+        """
+        Prepare the chunker for a document. Subclasses must override this method.
+        Parameters:
+        ----------
+        text : str
+            The document text.
+        units : List[FrameExtractionUnit]
+            The list of frame extraction units.
+        Raises:
+        ------
+        NotImplementedError
+            Always, when called on a class that did not override fit().
+        """
+        raise NotImplementedError(f"{type(self).__name__} must implement fit().")
+    def chunk(self, unit:FrameExtractionUnit) -> str:
+        """
+        Return the context for a frame extraction unit. Subclasses must override this method.
+        Parameters:
+        ----------
+        unit : FrameExtractionUnit
+            The frame extraction unit.
+        Returns : str
+            The context for the frame extraction unit.
+        Raises:
+        ------
+        NotImplementedError
+            Always, when called on a class that did not override chunk().
+        """
+        # NotImplemented is the sentinel reserved for binary-operator fallbacks. Returning it
+        # here would give callers a non-string value instead of a clear error.
+        raise NotImplementedError(f"{type(self).__name__} must implement chunk().")
+class NoContextChunker(ContextChunker):
+    def __init__(self):
+        """
+        Context chunker that supplies no context at all.
+        """
+        super().__init__()
+    def fit(self, text:str, units:List[FrameExtractionUnit]):
+        """
+        Accept the document and its units; nothing is stored.
+        Parameters:
+        ----------
+        text : str
+            The document text.
+        units : List[FrameExtractionUnit]
+            The list of frame extraction units.
+        """
+        return None
+    def chunk(self, unit:FrameExtractionUnit) -> str:
+        """
+        Return an empty string as the context, whatever the unit.
+        """
+        no_context = ""
+        return no_context
+class WholeDocumentContextChunker(ContextChunker):
+    def __init__(self):
+        """
+        Context chunker that supplies the entire document as the context of every unit.
+        """
+        super().__init__()
+        # Stays None until fit() is called.
+        self.text = None
+    def fit(self, text:str, units:List[FrameExtractionUnit]):
+        """
+        Remember the document text; the units are not used.
+        Parameters:
+        ----------
+        text : str
+            The document text.
+        units : List[FrameExtractionUnit]
+            The list of frame extraction units.
+        """
+        self.text = text
+    def chunk(self, unit:FrameExtractionUnit) -> str:
+        """
+        Return the stored document text. Raises ValueError if fit() has not been called.
+        """
+        if self.text is not None:
+            return self.text
+        raise ValueError("The context chunker has not been fitted yet. Please call fit() before chunk().")
+class SlideWindowContextChunker(ContextChunker):
+    def __init__(self, window_size:int):
+        """
+        Context chunker that supplies a sliding window of neighbouring units as context.
+        For example, +-2 sentences around a unit sentence.
+        Parameters:
+        ----------
+        window_size : int
+            Number of units taken on each side of the target unit.
+        """
+        super().__init__()
+        self.window_size = window_size
+        # Stays None until fit() is called.
+        self.units = None
+    def fit(self, text:str, units:List[FrameExtractionUnit]):
+        """
+        Store a sorted copy of the units; the document text is not used.
+        Parameters:
+        ----------
+        text : str
+            The document text.
+        units : List[FrameExtractionUnit]
+            The list of frame extraction units.
+        """
+        ordered = list(units)
+        ordered.sort()
+        self.units = ordered
+    def chunk(self, unit:FrameExtractionUnit) -> str:
+        """
+        Return the texts of the units inside the window around `unit` (the unit itself
+        included), joined by single spaces. The window is clipped at both ends of the unit list.
+        Raises ValueError if fit() has not been called or if `unit` is not among the fitted units.
+        """
+        if self.units is None:
+            raise ValueError("The context chunker has not been fitted yet. Please call fit() before chunk().")
+        position = self.units.index(unit)
+        first = max(0, position - self.window_size)
+        stop = min(len(self.units), position + self.window_size + 1)
+        return " ".join(self.units[i].text for i in range(first, stop))

llm_ie/data_types.py CHANGED Viewed

@@ -1,9 +1,83 @@
+from dataclasses import dataclass
 from typing import List, Dict, Tuple, Iterable, Callable
 import importlib.util
 import warnings
 import json
+@dataclass
+class FrameExtractionUnit:
+    def __init__(self, start:int, end:int, text:str):
+        """
+        A single unit of document text (for example, a sentence) on which frame extraction is run.
+        A FrameExtractor prompts with one unit at a time to extract frames.
+        Parameters
+        ----------
+        start : int
+            start character position of the unit text, relative to the whole document
+        end : int
+            end character position of the unit text, relative to the whole document
+        text : str
+            the unit text. Should be the exact string by [start:end]
+        """
+        self.start, self.end, self.text = start, end, text
+    def __eq__(self, other):
+        # Equality is decided by the character span only; the text is not compared.
+        if isinstance(other, FrameExtractionUnit):
+            if self.start != other.start:
+                return False
+            return self.end == other.end
+        return NotImplemented
+    def __hash__(self):
+        # Hash the same fields that __eq__ compares.
+        span = (self.start, self.end)
+        return hash(span)
+    def __lt__(self, other):
+        # Units order by their start position.
+        if isinstance(other, FrameExtractionUnit):
+            return self.start < other.start
+        return NotImplemented
+    def __repr__(self):
+        # Only the first 100 characters of the text are shown.
+        return f"FrameExtractionUnit(start={self.start}, end={self.end}, text='{self.text[:100]}...')"
+@dataclass
+class FrameExtractionUnitResult:
+    def __init__(self, start:int, end:int, text:str, gen_text:str):
+        """
+        This class holds a frame extraction unit (for example, a sentence) together with
+        the text the LLM generated for it.
+        Parameters
+        ----------
+        start : int
+            start character position of the unit text, relative to the whole document
+        end : int
+            end character position of the unit text, relative to the whole document
+        text : str
+            the unit text. Should be the exact string by [start:end]
+        gen_text : str
+            the generated text by LLM (ideally) following '[{"entity_text": "xxx", "attr": {"key": "value"}}]' format. Does not contain spans (start/end).
+        """
+        self.start = start
+        self.end = end
+        self.text = text
+        self.gen_text = gen_text
+    def __eq__(self, other):
+        # Compare against this class, not FrameExtractionUnit. With the wrong class, two equal
+        # results compared unequal (while hashing equal) and comparing with a plain unit
+        # raised AttributeError on gen_text.
+        if not isinstance(other, FrameExtractionUnitResult):
+            return NotImplemented
+        return (self.start == other.start and self.end == other.end and self.text == other.text and self.gen_text == other.gen_text)
+    def __hash__(self):
+        # Hash the same fields that __eq__ compares.
+        return hash((self.start, self.end, self.text, self.gen_text))
+    def __repr__(self):
+        # Only the first 100 characters of the unit text and of the generated text are shown.
+        return f"FrameExtractionUnitResult(start={self.start}, end={self.end}, text='{self.text[:100]}...', gen_text='{self.gen_text[:100]}...')"
+@dataclass
 class LLMInformationExtractionFrame:
     def __init__(self, frame_id:str, start:int, end:int, entity_text:str, attr:Dict[str,str]=None):
         """
@@ -33,7 +107,7 @@ class LLMInformationExtractionFrame:
         if attr:
             self.attr = attr.copy()
         else:
-            self.attr = None
+            self.attr = {}
     def is_equal(self, frame:"LLMInformationExtractionFrame") -> bool:
         """

llm-ie 0.4.7__py3-none-any.whl → 1.1.0__py3-none-any.whl

llm-ie 0.4.7__py3-none-any.whl → 1.1.0__py3-none-any.whl