llm-ie 1.2.1__py3-none-any.whl → 1.2.3__py3-none-any.whl
This diff shows the content of publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- llm_ie/__init__.py +5 -4
- llm_ie/chunkers.py +78 -4
- llm_ie/data_types.py +23 -37
- llm_ie/engines.py +663 -112
- llm_ie/extractors.py +357 -206
- llm_ie/prompt_editor.py +4 -4
- {llm_ie-1.2.1.dist-info → llm_ie-1.2.3.dist-info}/METADATA +1 -1
- {llm_ie-1.2.1.dist-info → llm_ie-1.2.3.dist-info}/RECORD +9 -9
- {llm_ie-1.2.1.dist-info → llm_ie-1.2.3.dist-info}/WHEEL +0 -0
llm_ie/__init__.py
CHANGED
@@ -1,11 +1,12 @@
 from .data_types import LLMInformationExtractionFrame, LLMInformationExtractionDocument
-from .engines import BasicLLMConfig, Qwen3LLMConfig, OpenAIReasoningLLMConfig
+from .engines import BasicLLMConfig, ReasoningLLMConfig, Qwen3LLMConfig, OpenAIReasoningLLMConfig
+from .engines import LlamaCppInferenceEngine, OllamaInferenceEngine, HuggingFaceHubInferenceEngine, VLLMInferenceEngine, OpenRouterInferenceEngine, OpenAIInferenceEngine, AzureOpenAIInferenceEngine, LiteLLMInferenceEngine
 from .extractors import DirectFrameExtractor, ReviewFrameExtractor, BasicFrameExtractor, BasicReviewFrameExtractor, SentenceFrameExtractor, SentenceReviewFrameExtractor, AttributeExtractor, BinaryRelationExtractor, MultiClassRelationExtractor
-from .chunkers import UnitChunker, WholeDocumentUnitChunker, SentenceUnitChunker, TextLineUnitChunker, ContextChunker, NoContextChunker, WholeDocumentContextChunker, SlideWindowContextChunker
+from .chunkers import UnitChunker, WholeDocumentUnitChunker, SentenceUnitChunker, TextLineUnitChunker, SeparatorUnitChunker, ContextChunker, NoContextChunker, WholeDocumentContextChunker, SlideWindowContextChunker
 from .prompt_editor import PromptEditor

 __all__ = ["LLMInformationExtractionFrame", "LLMInformationExtractionDocument",
-           "BasicLLMConfig", "Qwen3LLMConfig", "OpenAIReasoningLLMConfig", "LlamaCppInferenceEngine", "OllamaInferenceEngine", "HuggingFaceHubInferenceEngine", "OpenAIInferenceEngine", "AzureOpenAIInferenceEngine", "LiteLLMInferenceEngine",
+           "BasicLLMConfig", "ReasoningLLMConfig", "Qwen3LLMConfig", "OpenAIReasoningLLMConfig", "LlamaCppInferenceEngine", "OllamaInferenceEngine", "HuggingFaceHubInferenceEngine", "VLLMInferenceEngine", "OpenRouterInferenceEngine", "OpenAIInferenceEngine", "AzureOpenAIInferenceEngine", "LiteLLMInferenceEngine",
            "DirectFrameExtractor", "ReviewFrameExtractor", "BasicFrameExtractor", "BasicReviewFrameExtractor", "SentenceFrameExtractor", "SentenceReviewFrameExtractor", "AttributeExtractor", "BinaryRelationExtractor", "MultiClassRelationExtractor",
-           "UnitChunker", "WholeDocumentUnitChunker", "SentenceUnitChunker", "TextLineUnitChunker", "ContextChunker", "NoContextChunker", "WholeDocumentContextChunker", "SlideWindowContextChunker",
+           "UnitChunker", "WholeDocumentUnitChunker", "SentenceUnitChunker", "TextLineUnitChunker", "SeparatorUnitChunker", "ContextChunker", "NoContextChunker", "WholeDocumentContextChunker", "SlideWindowContextChunker",
            "PromptEditor"]
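
Note: with the expanded imports above, the new symbols are exposed from the package root. A minimal sketch, assuming llm-ie 1.2.3 is installed; only names newly added to __all__ in this release are shown:

    # Newly exported in 1.2.3 (sketch, assuming the 1.2.3 wheel above is installed)
    from llm_ie import (
        ReasoningLLMConfig,
        VLLMInferenceEngine,
        OpenRouterInferenceEngine,
        SeparatorUnitChunker,
    )
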
llm_ie/chunkers.py
CHANGED
@@ -1,5 +1,7 @@
 import abc
 from typing import Set, List, Dict, Tuple, Union, Callable
+import asyncio
+import uuid
 from llm_ie.data_types import FrameExtractionUnit


@@ -11,7 +13,8 @@ class UnitChunker(abc.ABC):
         """
         pass

-    def chunk(self, text:str) -> List[FrameExtractionUnit]:
+    @abc.abstractmethod
+    def chunk(self, text:str, doc_id:str=None) -> List[FrameExtractionUnit]:
         """
         Parameters:
         ----------
@@ -20,6 +23,12 @@ class UnitChunker(abc.ABC):
         """
         return NotImplemented

+    async def chunk_async(self, text:str, doc_id:str=None, executor=None) -> List[FrameExtractionUnit]:
+        """
+        asynchronous version of chunk method.
+        """
+        loop = asyncio.get_running_loop()
+        return await loop.run_in_executor(executor, self.chunk, text, doc_id)

 class WholeDocumentUnitChunker(UnitChunker):
     def __init__(self):
@@ -28,7 +37,7 @@ class WholeDocumentUnitChunker(UnitChunker):
         """
         super().__init__()

-    def chunk(self, text:str) -> List[FrameExtractionUnit]:
+    def chunk(self, text:str, doc_id:str=None) -> List[FrameExtractionUnit]:
         """
         Parameters:
         ----------
@@ -36,11 +45,49 @@ class WholeDocumentUnitChunker(UnitChunker):
             The document text.
         """
         return [FrameExtractionUnit(
+            doc_id=doc_id if doc_id is not None else str(uuid.uuid4()),
             start=0,
             end=len(text),
             text=text
         )]

+class SeparatorUnitChunker(UnitChunker):
+    def __init__(self, sep:str):
+        """
+        This class chunks a document by separator provided.
+
+        Parameters:
+        ----------
+        sep : str
+            a separator string.
+        """
+        super().__init__()
+        if not isinstance(sep, str):
+            raise ValueError("sep must be a string")
+
+        self.sep = sep
+
+    def chunk(self, text:str, doc_id:str=None) -> List[FrameExtractionUnit]:
+        """
+        Parameters:
+        ----------
+        text : str
+            The document text.
+        """
+        paragraphs = text.split(self.sep)
+        paragraph_units = []
+        start = 0
+        for paragraph in paragraphs:
+            end = start + len(paragraph)
+            paragraph_units.append(FrameExtractionUnit(
+                doc_id=doc_id if doc_id is not None else str(uuid.uuid4()),
+                start=start,
+                end=end,
+                text=paragraph
+            ))
+            start = end + len(self.sep)
+        return paragraph_units
+

 class SentenceUnitChunker(UnitChunker):
     from nltk.tokenize.punkt import PunktSentenceTokenizer
@@ -50,7 +97,7 @@ class SentenceUnitChunker(UnitChunker):
         """
         super().__init__()

-    def chunk(self, text:str) -> List[FrameExtractionUnit]:
+    def chunk(self, text:str, doc_id:str=None) -> List[FrameExtractionUnit]:
         """
         Parameters:
         ----------
@@ -60,6 +107,7 @@ class SentenceUnitChunker(UnitChunker):
         sentences = []
         for start, end in self.PunktSentenceTokenizer().span_tokenize(text):
             sentences.append(FrameExtractionUnit(
+                doc_id=doc_id if doc_id is not None else str(uuid.uuid4()),
                 start=start,
                 end=end,
                 text=text[start:end]
@@ -74,7 +122,7 @@ class TextLineUnitChunker(UnitChunker):
         """
         super().__init__()

-    def chunk(self, text:str) -> List[FrameExtractionUnit]:
+    def chunk(self, text:str, doc_id:str=None) -> List[FrameExtractionUnit]:
         """
         Parameters:
         ----------
@@ -87,6 +135,7 @@ class TextLineUnitChunker(UnitChunker):
         for line in lines:
             end = start + len(line)
             line_units.append(FrameExtractionUnit(
+                doc_id=doc_id if doc_id is not None else str(uuid.uuid4()),
                 start=start,
                 end=end,
                 text=line
@@ -103,6 +152,24 @@ class ContextChunker(abc.ABC):
         """
         pass

+    @abc.abstractmethod
+    def fit(self, text:str, units:List[FrameExtractionUnit]):
+        """
+        Parameters:
+        ----------
+        text : str
+            The document text.
+        """
+        pass
+
+    async def fit_async(self, text:str, units:List[FrameExtractionUnit], executor=None):
+        """
+        asynchronous version of fit method.
+        """
+        loop = asyncio.get_running_loop()
+        return await loop.run_in_executor(executor, self.fit, text, units)
+
+    @abc.abstractmethod
     def chunk(self, unit:FrameExtractionUnit) -> str:
         """
         Parameters:
@@ -115,6 +182,13 @@ class ContextChunker(abc.ABC):
         """
         return NotImplemented

+    async def chunk_async(self, unit:FrameExtractionUnit, executor=None) -> str:
+        """
+        asynchronous version of chunk method.
+        """
+        loop = asyncio.get_running_loop()
+        return await loop.run_in_executor(executor, self.chunk, unit)
+

 class NoContextChunker(ContextChunker):
     def __init__(self):
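
In summary, chunkers.py gains a SeparatorUnitChunker, threads doc_id through every chunk() implementation (falling back to a random UUID when none is given), and adds default chunk_async/fit_async wrappers that run the synchronous methods in an executor. A minimal usage sketch; the sample text, separator, and doc_id below are illustrative, not taken from the package:

    import asyncio
    from llm_ie import SeparatorUnitChunker

    # Illustrative document: paragraphs separated by blank lines.
    text = "First paragraph.\n\nSecond paragraph.\n\nThird paragraph."
    chunker = SeparatorUnitChunker(sep="\n\n")

    # Synchronous chunking; every unit carries the same doc_id.
    units = chunker.chunk(text, doc_id="doc-001")
    for unit in units:
        # unit.text matches text[unit.start:unit.end]
        print(unit.doc_id, unit.start, unit.end, repr(unit.text))

    # chunk_async simply runs chunk() in an executor (None = default executor).
    units_async = asyncio.run(chunker.chunk_async(text, doc_id="doc-001"))
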
llm_ie/data_types.py
CHANGED
@@ -7,13 +7,15 @@ import json

 @dataclass
 class FrameExtractionUnit:
-    def __init__(self, start:int, end:int, text:str):
+    def __init__(self, doc_id:str, start:int, end:int, text:str):
         """
         This class holds the unit text for frame extraction, for example, a sentence.
         FrameExtractor prompt it one at a time to extract frames.

         Parameters
         ----------
+        doc_id : str, Optional
+            document ID.
         start : int
             start character position of the unit text, relative to the whole document
         end : int
@@ -21,9 +23,28 @@ class FrameExtractionUnit:
         text : str
             the unit text. Should be the exact string by [start:end]
         """
+        self.doc_id = doc_id
         self.start = start
         self.end = end
         self.text = text
+        # status: "pending", "success", "fail"
+        self.status = "pending"
+        # generated text by LLM
+        self.gen_text = None
+
+    def get_status(self) -> str:
+        return self.status
+
+    def set_status(self, status:str):
+        if status not in {"pending", "success", "fail"}:
+            raise ValueError('status must be one of {"pending", "success", "fail"}.')
+        self.status = status
+
+    def get_generated_text(self) -> str:
+        return self.gen_text
+
+    def set_generated_text(self, gen_text:str):
+        self.gen_text = gen_text

     def __eq__(self, other):
         if not isinstance(other, FrameExtractionUnit):
@@ -39,43 +60,8 @@ class FrameExtractionUnit:
         return self.start < other.start

     def __repr__(self):
-        return f"FrameExtractionUnit(start={self.start}, end={self.end}, text='{self.text[:100]}...')"
-
-
-@dataclass
-class FrameExtractionUnitResult:
-    def __init__(self, start:int, end:int, text:str, gen_text:str):
-        """
-        This class holds the unit text for frame extraction, for example, a sentence.
-        FrameExtractor prompt it one at a time to extract frames.
-
-        Parameters
-        ----------
-        start : int
-            start character position of the unit text, relative to the whole document
-        end : int
-            end character position of the unit text, relative to the whole document
-        text : str
-            the unit text. Should be the exact string by [start:end]
-        gen_text : str
-            the generated text by LLM (ideally) following '[{"entity_text": "xxx", "attr": {"key": "value"}}]' format. Does not contain spans (start/end).
-        """
-        self.start = start
-        self.end = end
-        self.text = text
-        self.gen_text = gen_text
-
-    def __eq__(self, other):
-        if not isinstance(other, FrameExtractionUnit):
-            return NotImplemented
-        return (self.start == other.start and self.end == other.end and self.text == other.text and self.gen_text == other.gen_text)
+        return f"FrameExtractionUnit(doc_id={self.doc_id}, start={self.start}, end={self.end}, status={self.status}, text='{self.text[:100]}...')"

-    def __hash__(self):
-        return hash((self.start, self.end, self.text, self.gen_text))
-
-    def __repr__(self):
-        return f"FrameExtractionUnitResult(start={self.start}, end={self.end}, text='{self.text[:100]}...', gen_text='{self.gen_text[:100]}...')"
-

 @dataclass
 class LLMInformationExtractionFrame: