llm-ie 1.2.4__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
llm_ie/extractors.py CHANGED
@@ -6,8 +6,7 @@ import warnings
  import itertools
  import asyncio
  import nest_asyncio
- from concurrent.futures import ThreadPoolExecutor
- from typing import Any, Set, List, Dict, Tuple, Union, Callable, Generator, Optional, AsyncGenerator
+ from typing import Any, Set, List, Dict, Tuple, Union, Callable, Generator, Optional
  from llm_ie.utils import extract_json, apply_prompt_template
  from llm_ie.data_types import FrameExtractionUnit, LLMInformationExtractionFrame, LLMInformationExtractionDocument
  from llm_ie.chunkers import UnitChunker, WholeDocumentUnitChunker, SentenceUnitChunker
@@ -98,6 +97,451 @@ class Extractor:
          return apply_prompt_template(self.prompt_template, text_content)


+ class StructExtractor(Extractor):
+     def __init__(self, inference_engine:InferenceEngine, unit_chunker:UnitChunker, prompt_template:str,
+                  system_prompt:str=None, context_chunker:ContextChunker=None, aggregation_func:Callable=None):
+         """
+         This class is for unanchored structured information extraction.
+         Inputs an LLM inference engine, a system prompt (optional), and a prompt template (with instructions and few-shot examples).
+
+         Parameters:
+         ----------
+         inference_engine : InferenceEngine
+             the LLM inference engine object. Must implement the chat() method.
+         unit_chunker : UnitChunker
+             the unit chunker object that determines how to chunk the document text into units.
+         prompt_template : str
+             prompt template with "{{<placeholder name>}}" placeholders.
+         system_prompt : str, Optional
+             system prompt.
+         context_chunker : ContextChunker
+             the context chunker object that determines how to get context for each unit.
+         aggregation_func : Callable
+             a function that inputs a list of structured information (dict)
+             and outputs an aggregated structured information (dict).
+             If not specified, the default is to merge all dicts by updating keys and overwriting values sequentially.
+         """
+         super().__init__(inference_engine=inference_engine,
+                          prompt_template=prompt_template,
+                          system_prompt=system_prompt)
+
+         self.unit_chunker = unit_chunker
+         self.context_chunker = context_chunker
+         self.aggregation_func = aggregation_func
+
+
+     def extract(self, text_content:Union[str, Dict[str,str]], document_key:str=None,
+                 verbose:bool=False, return_messages_log:bool=False) -> List[FrameExtractionUnit]:
+         """
+         This method inputs text content and outputs a list of FrameExtractionUnit with the LLM-generated text attached.
+
+         Parameters:
+         ----------
+         text_content : Union[str, Dict[str,str]]
+             the input text content to put in prompt template.
+             If str, the prompt template must have exactly 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
+             If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
+         document_key : str, Optional
+             specify the key in text_content where document text is.
+             If text_content is str, this parameter will be ignored.
+         verbose : bool, Optional
+             if True, LLM generated text will be printed in terminal in real-time.
+         return_messages_log : bool, Optional
+             if True, a list of messages will be returned.
+
+         Return : List[FrameExtractionUnit]
+             the output from LLM for each unit. Needs post-processing.
+         """
+         # unit chunking
+         if isinstance(text_content, str):
+             doc_text = text_content
+
+         elif isinstance(text_content, dict):
+             if document_key is None:
+                 raise ValueError("document_key must be provided when text_content is dict.")
+             doc_text = text_content[document_key]
+
+         units = self.unit_chunker.chunk(doc_text)
+         # context chunker init
+         self.context_chunker.fit(doc_text, units)
+
+         # messages log
+         messages_logger = MessagesLogger() if return_messages_log else None
+
+         # generate unit by unit
+         for i, unit in enumerate(units):
+             try:
+                 # construct chat messages
+                 messages = []
+                 if self.system_prompt:
+                     messages.append({'role': 'system', 'content': self.system_prompt})
+
+                 context = self.context_chunker.chunk(unit)
+
+                 if context == "":
+                     # no context, just place unit in user prompt
+                     if isinstance(text_content, str):
+                         messages.append({'role': 'user', 'content': self._get_user_prompt(unit.text)})
+                     else:
+                         unit_content = text_content.copy()
+                         unit_content[document_key] = unit.text
+                         messages.append({'role': 'user', 'content': self._get_user_prompt(unit_content)})
+                 else:
+                     # insert context to user prompt
+                     if isinstance(text_content, str):
+                         messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
+                     else:
+                         context_content = text_content.copy()
+                         context_content[document_key] = context
+                         messages.append({'role': 'user', 'content': self._get_user_prompt(context_content)})
+                     # simulate conversation where assistant confirms
+                     messages.append({'role': 'assistant', 'content': 'Sure, please provide the unit text (e.g., sentence, line, chunk) of interest.'})
+                     # place unit of interest
+                     messages.append({'role': 'user', 'content': unit.text})
+
+                 if verbose:
+                     print(f"\n\n{Fore.GREEN}Unit {i + 1}/{len(units)}:{Style.RESET_ALL}\n{unit.text}\n")
+                     if context != "":
+                         print(f"{Fore.YELLOW}Context:{Style.RESET_ALL}\n{context}\n")
+
+                     print(f"{Fore.BLUE}Extraction:{Style.RESET_ALL}")
+
+
+                 gen_text = self.inference_engine.chat(
+                     messages=messages,
+                     verbose=verbose,
+                     stream=False,
+                     messages_logger=messages_logger
+                 )
+
+                 # add generated text to unit
+                 unit.set_generated_text(gen_text["response"])
+                 unit.set_status("success")
+             except Exception as e:
+                 unit.set_status("fail")
+                 warnings.warn(f"LLM inference failed for unit {i} ({unit.start}, {unit.end}): {e}", RuntimeWarning)
+
+         if return_messages_log:
+             return units, messages_logger.get_messages_log()
+
+         return units
+
+     def stream(self, text_content: Union[str, Dict[str, str]],
+                document_key: str = None) -> Generator[Dict[str, Any], None, List[FrameExtractionUnit]]:
+         """
+         Streams LLM responses per unit with structured event types,
+         and returns collected data for post-processing.
+
+         Yields:
+         -------
+         Dict[str, Any]: (type, data)
+             - {"type": "info", "data": str_message}: General informational messages.
+             - {"type": "unit", "data": dict_unit_info}: Signals start of a new unit. dict_unit_info contains {'id', 'text', 'start', 'end'}
+             - {"type": "context", "data": str_context}: Context string for the current unit.
+             - {"type": "reasoning", "data": str_chunk}: A reasoning model thinking chunk from the LLM.
+             - {"type": "response", "data": str_chunk}: A response/answer chunk from the LLM.
+
+         Returns:
+         --------
+         List[FrameExtractionUnit]:
+             A list of FrameExtractionUnit objects, each containing the
+             original unit details and the fully accumulated 'gen_text' from the LLM.
+         """
+         if isinstance(text_content, str):
+             doc_text = text_content
+         elif isinstance(text_content, dict):
+             if document_key is None:
+                 raise ValueError("document_key must be provided when text_content is dict.")
+             if document_key not in text_content:
+                 raise ValueError(f"document_key '{document_key}' not found in text_content.")
+             doc_text = text_content[document_key]
+         else:
+             raise TypeError("text_content must be a string or a dictionary.")
+
+         units: List[FrameExtractionUnit] = self.unit_chunker.chunk(doc_text)
+         self.context_chunker.fit(doc_text, units)
+
+         yield {"type": "info", "data": f"Starting LLM processing for {len(units)} units."}
+
+         for i, unit in enumerate(units):
+             unit_info_payload = {"id": i, "text": unit.text, "start": unit.start, "end": unit.end}
+             yield {"type": "unit", "data": unit_info_payload}
+
+             messages = []
+             if self.system_prompt:
+                 messages.append({'role': 'system', 'content': self.system_prompt})
+
+             context_str = self.context_chunker.chunk(unit)
+
+             # Construct prompt input based on whether text_content was str or dict
+             if context_str:
+                 yield {"type": "context", "data": context_str}
+                 prompt_input_for_context = context_str
+                 if isinstance(text_content, dict):
+                     context_content_dict = text_content.copy()
+                     context_content_dict[document_key] = context_str
+                     prompt_input_for_context = context_content_dict
+                 messages.append({'role': 'user', 'content': self._get_user_prompt(prompt_input_for_context)})
+                 messages.append({'role': 'assistant', 'content': 'Sure, please provide the unit text (e.g., sentence, line, chunk) of interest.'})
+                 messages.append({'role': 'user', 'content': unit.text})
+             else:  # No context
+                 prompt_input_for_unit = unit.text
+                 if isinstance(text_content, dict):
+                     unit_content_dict = text_content.copy()
+                     unit_content_dict[document_key] = unit.text
+                     prompt_input_for_unit = unit_content_dict
+                 messages.append({'role': 'user', 'content': self._get_user_prompt(prompt_input_for_unit)})
+
+             current_gen_text = ""
+
+             response_stream = self.inference_engine.chat(
+                 messages=messages,
+                 stream=True
+             )
+             for chunk in response_stream:
+                 yield chunk
+                 if chunk["type"] == "response":
+                     current_gen_text += chunk["data"]
+
+             # Store the result for this unit
+             unit.set_generated_text(current_gen_text)
+             unit.set_status("success")
+
+         yield {"type": "info", "data": "All units processed by LLM."}
+         return units
+
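For illustration, one way a caller might consume these events and also recover the generator's return value (the extractor and input text are placeholders, not part of this diff):

def run_stream(extractor, note_text):
    # stream() is a generator; its return value (the units) arrives via StopIteration.
    gen = extractor.stream(text_content=note_text)
    try:
        while True:
            event = next(gen)
            if event["type"] == "response":
                print(event["data"], end="", flush=True)  # live answer chunks
    except StopIteration as stop:
        return stop.value  # List[FrameExtractionUnit] with gen_text populated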
+     async def _extract_async(self, text_content:Union[str, Dict[str,str]], document_key:str=None,
+                              concurrent_batch_size:int=32, return_messages_log:bool=False) -> List[FrameExtractionUnit]:
+         """
+         This is the asynchronous version of the extract() method.
+
+         Parameters:
+         ----------
+         text_content : Union[str, Dict[str,str]]
+             the input text content to put in prompt template.
+             If str, the prompt template must have exactly 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
+             If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
+         document_key : str, Optional
+             specify the key in text_content where document text is.
+             If text_content is str, this parameter will be ignored.
+         concurrent_batch_size : int, Optional
+             the batch size for concurrent processing.
+         return_messages_log : bool, Optional
+             if True, a list of messages will be returned.
+
+         Return : List[FrameExtractionUnit]
+             the output from LLM for each unit. Contains the start, end, text, and generated text.
+         """
+         if isinstance(text_content, str):
+             doc_text = text_content
+         elif isinstance(text_content, dict):
+             if document_key is None:
+                 raise ValueError("document_key must be provided when text_content is dict.")
+             if document_key not in text_content:
+                 raise ValueError(f"document_key '{document_key}' not found in text_content dictionary.")
+             doc_text = text_content[document_key]
+         else:
+             raise TypeError("text_content must be a string or a dictionary.")
+
+         units = self.unit_chunker.chunk(doc_text)
+
+         # context chunker init
+         self.context_chunker.fit(doc_text, units)
+
+         # messages logger init
+         messages_logger = MessagesLogger() if return_messages_log else None
+
+         # Prepare inputs for all units first
+         tasks_input = []
+         for i, unit in enumerate(units):
+             # construct chat messages
+             messages = []
+             if self.system_prompt:
+                 messages.append({'role': 'system', 'content': self.system_prompt})
+
+             context = self.context_chunker.chunk(unit)
+
+             if context == "":
+                 # no context, just place unit in user prompt
+                 if isinstance(text_content, str):
+                     messages.append({'role': 'user', 'content': self._get_user_prompt(unit.text)})
+                 else:
+                     unit_content = text_content.copy()
+                     unit_content[document_key] = unit.text
+                     messages.append({'role': 'user', 'content': self._get_user_prompt(unit_content)})
+             else:
+                 # insert context to user prompt
+                 if isinstance(text_content, str):
+                     messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
+                 else:
+                     context_content = text_content.copy()
+                     context_content[document_key] = context
+                     messages.append({'role': 'user', 'content': self._get_user_prompt(context_content)})
+                 # simulate conversation where assistant confirms
+                 messages.append({'role': 'assistant', 'content': 'Sure, please provide the unit text (e.g., sentence, line, chunk) of interest.'})
+                 # place unit of interest
+                 messages.append({'role': 'user', 'content': unit.text})
+
+             # Store unit and messages together for the task
+             tasks_input.append({"unit": unit, "messages": messages, "original_index": i})
+
+         # Process units concurrently with asyncio.Semaphore
+         semaphore = asyncio.Semaphore(concurrent_batch_size)
+
+         async def semaphore_helper(task_data: Dict, **kwrs):
+             unit = task_data["unit"]
+             messages = task_data["messages"]
+
+             async with semaphore:
+                 gen_text = await self.inference_engine.chat_async(
+                     messages=messages,
+                     messages_logger=messages_logger
+                 )
+
+             unit.set_generated_text(gen_text["response"])
+             unit.set_status("success")
+
+         # Create and gather tasks
+         tasks = []
+         for task_inp in tasks_input:
+             task = asyncio.create_task(semaphore_helper(
+                 task_inp
+             ))
+             tasks.append(task)
+
+         await asyncio.gather(*tasks)
+
+         # Return units
+         if return_messages_log:
+             return units, messages_logger.get_messages_log()
+         else:
+             return units
+
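Stripped of the extractor details, the concurrency control above is the standard semaphore-bounded gather pattern; a self-contained sketch:

import asyncio

async def bounded_gather(coros, limit=32):
    # At most `limit` coroutines run at once; the rest wait on the semaphore.
    sem = asyncio.Semaphore(limit)

    async def run(coro):
        async with sem:
            return await coro

    return await asyncio.gather(*(run(c) for c in coros))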
+     def _default_struct_aggregate(self, structs: List[Dict[str, Any]]) -> Dict[str, Any]:
+         """
+         Given a list of structured information (dict), aggregate them into a single dict by sequentially updating keys
+         and overwriting values.
+         """
+         aggregated_struct = {}
+         for struct in structs:
+             aggregated_struct.update(struct)
+         return aggregated_struct
+
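Because aggregation_func receives the full list of per-unit dicts, callers can replace this last-write-wins default; for example, a hypothetical aggregator that keeps every value seen per key:

from collections import defaultdict
from typing import Any, Dict, List

def collect_all(structs: List[Dict[str, Any]]) -> Dict[str, Any]:
    # Keep every extracted value per key instead of overwriting.
    merged = defaultdict(list)
    for struct in structs:
        for key, value in struct.items():
            merged[key].append(value)
    return dict(merged)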
+     def _post_process_struct(self, units: List[FrameExtractionUnit]) -> Dict[str, Any]:
+         """
+         Helper method to post-process units into a structured dictionary.
+         Shared by extract_struct and extract_struct_async.
+         """
+         struct_json = []
+         for unit in units:
+             if unit.status != "success":
+                 continue
+             try:
+                 unit_struct_json = extract_json(unit.get_generated_text())
+                 struct_json.extend(unit_struct_json)
+             except Exception as e:
+                 unit.set_status("fail")
+                 warnings.warn(f"Struct extraction failed for unit ({unit.start}, {unit.end}): {e}", RuntimeWarning)
+
+         if self.aggregation_func is None:
+             struct = self._default_struct_aggregate(struct_json)
+         else:
+             struct = self.aggregation_func(struct_json)
+         return struct
+
+
+     def extract_struct(self, text_content:Union[str, Dict[str,str]], document_key:str=None,
+                        verbose:bool=False, concurrent:bool=False, concurrent_batch_size:int=32,
+                        return_messages_log:bool=False) -> Dict[str, Any]:
+         """
+         This method inputs a document text and outputs aggregated structured information (dict).
+         It uses the extract() method and post-processes the outputs into structured information.
+
+         Parameters:
+         ----------
+         text_content : Union[str, Dict[str,str]]
+             the input text content to put in prompt template.
+             If str, the prompt template must have exactly 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
+             If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
+         document_key : str, Optional
+             specify the key in text_content where document text is.
+             If text_content is str, this parameter will be ignored.
+         verbose : bool, Optional
+             if True, LLM generated text will be printed in terminal in real-time.
+         concurrent : bool, Optional
+             if True, the units will be processed concurrently.
+         concurrent_batch_size : int, Optional
+             the number of units to process concurrently. Only used when `concurrent` is True.
+         return_messages_log : bool, Optional
+             if True, a list of messages will be returned.
+
+         Return : Dict[str, Any]
+             the aggregated unanchored structured information.
+         """
+         if concurrent:
+             if verbose:
+                 warnings.warn("verbose=True is not supported in concurrent mode.", RuntimeWarning)
+
+             nest_asyncio.apply()  # For Jupyter notebook. Terminal does not need this.
+             extraction_results = asyncio.run(self._extract_async(text_content=text_content,
+                                                                  document_key=document_key,
+                                                                  concurrent_batch_size=concurrent_batch_size,
+                                                                  return_messages_log=return_messages_log)
+                                              )
+         else:
+             extraction_results = self.extract(text_content=text_content,
+                                               document_key=document_key,
+                                               verbose=verbose,
+                                               return_messages_log=return_messages_log)
+
+         units, messages_log = extraction_results if return_messages_log else (extraction_results, None)
+
+         struct = self._post_process_struct(units)
+
+         if return_messages_log:
+             return struct, messages_log
+         return struct
+
+     async def extract_struct_async(self, text_content:Union[str, Dict[str,str]], document_key:str=None,
+                                    concurrent_batch_size:int=32, return_messages_log:bool=False) -> Dict[str, Any]:
+         """
+         This is the async version of extract_struct.
+         """
+         extraction_results = await self._extract_async(text_content=text_content,
+                                                        document_key=document_key,
+                                                        concurrent_batch_size=concurrent_batch_size,
+                                                        return_messages_log=return_messages_log)
+
+         units, messages_log = extraction_results if return_messages_log else (extraction_results, None)
+         struct = self._post_process_struct(units)
+
+         if return_messages_log:
+             return struct, messages_log
+         return struct
+
+
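Inside an already-running event loop (a web handler, or a notebook cell with an active loop), extract_struct_async sidesteps the nest_asyncio/asyncio.run path used by extract_struct(concurrent=True). A hedged sketch, with `extractor` and `note_text` as placeholders:

import asyncio

async def main(extractor, note_text):
    # Placeholder names; any configured StructExtractor and document text work here.
    struct = await extractor.extract_struct_async(text_content=note_text,
                                                  concurrent_batch_size=16)
    return struct

# struct = asyncio.run(main(extractor, note_text))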
+ class BasicStructExtractor(StructExtractor):
+     def __init__(self, inference_engine:InferenceEngine, prompt_template:str,
+                  system_prompt:str=None, aggregation_func:Callable=None):
+         """
+         This class prompts the LLM with the whole document at once for structured information extraction.
+         Inputs an LLM inference engine, a system prompt (optional), and a prompt template (with instructions and few-shot examples).
+
+         Parameters:
+         ----------
+         inference_engine : InferenceEngine
+             the LLM inference engine object. Must implement the chat() method.
+         prompt_template : str
+             prompt template with "{{<placeholder name>}}" placeholder.
+         system_prompt : str, Optional
+             system prompt.
+         aggregation_func : Callable
+             a function that inputs a list of structured information (dict)
+             and outputs an aggregated structured information (dict).
+             If not specified, the default is to merge all dicts by updating keys and overwriting values sequentially.
+         """
+         super().__init__(inference_engine=inference_engine,
+                          unit_chunker=WholeDocumentUnitChunker(),
+                          prompt_template=prompt_template,
+                          system_prompt=system_prompt,
+                          context_chunker=WholeDocumentContextChunker(),
+                          aggregation_func=aggregation_func)
+
+
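For illustration, a minimal usage sketch: BasicStructExtractor pairs a WholeDocumentUnitChunker with a WholeDocumentContextChunker, so one call covers the whole document. The engine import path and constructor arguments below are assumptions for illustration, not part of this diff:

from llm_ie.engines import OpenAIInferenceEngine  # assumed import path and engine class

prompt_template = '''Extract the patient's age and diagnosis. Return JSON like {"age": 49, "diagnosis": "..."}.
Document:
{{document}}'''

engine = OpenAIInferenceEngine(model="gpt-4o-mini")  # hypothetical constructor arguments
extractor = BasicStructExtractor(inference_engine=engine, prompt_template=prompt_template)
struct = extractor.extract_struct(text_content="49 y/o male with hypertension.", verbose=True)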
  class FrameExtractor(Extractor):
      from nltk.tokenize import RegexpTokenizer
      def __init__(self, inference_engine:InferenceEngine, unit_chunker:UnitChunker,
@@ -300,11 +744,19 @@ class FrameExtractor(Extractor):
          return_messages_log : bool, Optional
              if True, a list of messages will be returned.

-         Return : str
+         Return : List[LLMInformationExtractionFrame]
              a list of frames.
          """
          return NotImplemented

+     @abc.abstractmethod
+     async def extract_frames_async(self, text_content:Union[str, Dict[str,str]], entity_key:str,
+                                    document_key:str=None, return_messages_log:bool=False, **kwrs) -> List[LLMInformationExtractionFrame]:
+         """
+         This is the async version of extract_frames.
+         """
+         return NotImplemented
+

  class DirectFrameExtractor(FrameExtractor):
      def __init__(self, inference_engine:InferenceEngine, unit_chunker:UnitChunker,
@@ -513,7 +965,7 @@ class DirectFrameExtractor(FrameExtractor):
          yield {"type": "info", "data": "All units processed by LLM."}
          return units

-     async def extract_async(self, text_content:Union[str, Dict[str,str]], document_key:str=None,
+     async def _extract_async(self, text_content:Union[str, Dict[str,str]], document_key:str=None,
                              concurrent_batch_size:int=32, return_messages_log:bool=False) -> List[FrameExtractionUnit]:
          """
          This is the asynchronous version of the extract() method.
@@ -620,6 +1072,45 @@ class DirectFrameExtractor(FrameExtractor):
          else:
              return units

+     def _post_process_units_to_frames(self, units, case_sensitive, fuzzy_match, fuzzy_buffer_size, fuzzy_score_cutoff, allow_overlap_entities):
+         ENTITY_KEY = "entity_text"
+         frame_list = []
+         for unit in units:
+             entity_json = []
+             if unit.status != "success":
+                 warnings.warn(f"Skipping failed unit ({unit.start}, {unit.end}): {unit.text}", RuntimeWarning)
+                 continue
+             for entity in extract_json(gen_text=unit.gen_text):
+                 if ENTITY_KEY in entity:
+                     entity_json.append(entity)
+                 else:
+                     warnings.warn(f'Extractor output "{entity}" does not have entity_key ("{ENTITY_KEY}"). This frame will be dropped.', RuntimeWarning)
+
+             spans = self._find_entity_spans(text=unit.text,
+                                             entities=[e[ENTITY_KEY] for e in entity_json],
+                                             case_sensitive=case_sensitive,
+                                             fuzzy_match=fuzzy_match,
+                                             fuzzy_buffer_size=fuzzy_buffer_size,
+                                             fuzzy_score_cutoff=fuzzy_score_cutoff,
+                                             allow_overlap_entities=allow_overlap_entities)
+             for ent, span in zip(entity_json, spans):
+                 if span is not None:
+                     start, end = span
+                     entity_text = unit.text[start:end]
+                     start += unit.start
+                     end += unit.start
+                     attr = {}
+                     if "attr" in ent and ent["attr"] is not None:
+                         attr = ent["attr"]
+
+                     frame = LLMInformationExtractionFrame(frame_id=f"{len(frame_list)}",
+                                                           start=start,
+                                                           end=end,
+                                                           entity_text=entity_text,
+                                                           attr=attr)
+                     frame_list.append(frame)
+         return frame_list
+
1114
 
624
1115
  def extract_frames(self, text_content:Union[str, Dict[str,str]], document_key:str=None,
625
1116
  verbose:bool=False, concurrent:bool=False, concurrent_batch_size:int=32,
@@ -659,7 +1150,7 @@ class DirectFrameExtractor(FrameExtractor):
659
1150
  return_messages_log : bool, Optional
660
1151
  if True, a list of messages will be returned.
661
1152
 
662
- Return : str
1153
+ Return : List[LLMInformationExtractionFrame]
663
1154
  a list of frames.
664
1155
  """
665
1156
  ENTITY_KEY = "entity_text"
@@ -668,7 +1159,7 @@ class DirectFrameExtractor(FrameExtractor):
668
1159
  warnings.warn("verbose=True is not supported in concurrent mode.", RuntimeWarning)
669
1160
 
670
1161
  nest_asyncio.apply() # For Jupyter notebook. Terminal does not need this.
671
- extraction_results = asyncio.run(self.extract_async(text_content=text_content,
1162
+ extraction_results = asyncio.run(self._extract_async(text_content=text_content,
672
1163
  document_key=document_key,
673
1164
  concurrent_batch_size=concurrent_batch_size,
674
1165
  return_messages_log=return_messages_log)
@@ -681,248 +1172,31 @@ class DirectFrameExtractor(FrameExtractor):
681
1172
 
682
1173
  units, messages_log = extraction_results if return_messages_log else (extraction_results, None)
683
1174
 
684
- frame_list = []
685
- for unit in units:
686
- entity_json = []
687
- if unit.status != "success":
688
- warnings.warn(f"Skipping failed unit ({unit.start}, {unit.end}): {unit.text}", RuntimeWarning)
689
- continue
690
- for entity in extract_json(gen_text=unit.gen_text):
691
- if ENTITY_KEY in entity:
692
- entity_json.append(entity)
693
- else:
694
- warnings.warn(f'Extractor output "{entity}" does not have entity_key ("{ENTITY_KEY}"). This frame will be dropped.', RuntimeWarning)
695
-
696
- spans = self._find_entity_spans(text=unit.text,
697
- entities=[e[ENTITY_KEY] for e in entity_json],
698
- case_sensitive=case_sensitive,
699
- fuzzy_match=fuzzy_match,
700
- fuzzy_buffer_size=fuzzy_buffer_size,
701
- fuzzy_score_cutoff=fuzzy_score_cutoff,
702
- allow_overlap_entities=allow_overlap_entities)
703
- for ent, span in zip(entity_json, spans):
704
- if span is not None:
705
- start, end = span
706
- entity_text = unit.text[start:end]
707
- start += unit.start
708
- end += unit.start
709
- attr = {}
710
- if "attr" in ent and ent["attr"] is not None:
711
- attr = ent["attr"]
712
-
713
- frame = LLMInformationExtractionFrame(frame_id=f"{len(frame_list)}",
714
- start=start,
715
- end=end,
716
- entity_text=entity_text,
717
- attr=attr)
718
- frame_list.append(frame)
1175
+ frame_list = self._post_process_units_to_frames(units, case_sensitive, fuzzy_match, fuzzy_buffer_size, fuzzy_score_cutoff, allow_overlap_entities)
719
1176
 
720
1177
  if return_messages_log:
721
1178
  return frame_list, messages_log
722
1179
  return frame_list
723
-
724
1180
 
725
- async def extract_frames_from_documents(self, text_contents:List[Union[str,Dict[str, any]]], document_key:str="text",
726
- cpu_concurrency:int=4, llm_concurrency:int=32, case_sensitive:bool=False,
727
- fuzzy_match:bool=True, fuzzy_buffer_size:float=0.2, fuzzy_score_cutoff:float=0.8,
728
- allow_overlap_entities:bool=False, return_messages_log:bool=False) -> AsyncGenerator[Dict[str, any], None]:
1181
+ async def extract_frames_async(self, text_content:Union[str, Dict[str,str]], document_key:str=None,
1182
+ concurrent_batch_size:int=32, case_sensitive:bool=False,
1183
+ fuzzy_match:bool=True, fuzzy_buffer_size:float=0.2, fuzzy_score_cutoff:float=0.8,
1184
+ allow_overlap_entities:bool=False, return_messages_log:bool=False) -> List[LLMInformationExtractionFrame]:
729
1185
  """
730
- This method inputs a list of documents and yields the results for each document as soon as it is complete.
731
-
732
- Parameters:
733
- -----------
734
- text_contents : List[Union[str,Dict[str, any]]]
735
- a list of input text contents to put in prompt template.
736
- If str, the prompt template must has only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
737
- If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
738
- document_key: str, optional
739
- The key in the `text_contents` dictionaries that holds the document text.
740
- cpu_concurrency: int, optional
741
- The number of parallel threads to use for CPU-bound tasks like chunking.
742
- llm_concurrency: int, optional
743
- The number of concurrent requests to make to the LLM.
744
- case_sensitive : bool, Optional
745
- if True, entity text matching will be case-sensitive.
746
- fuzzy_match : bool, Optional
747
- if True, fuzzy matching will be applied to find entity text.
748
- fuzzy_buffer_size : float, Optional
749
- the buffer size for fuzzy matching. Default is 20% of entity text length.
750
- fuzzy_score_cutoff : float, Optional
751
- the Jaccard score cutoff for fuzzy matching.
752
- Matched entity text must have a score higher than this value or a None will be returned.
753
- allow_overlap_entities : bool, Optional
754
- if True, entities can overlap in the text.
755
- return_messages_log : bool, Optional
756
- if True, a list of messages will be returned.
757
-
758
- Yields:
759
- -------
760
- AsyncGenerator[Dict[str, any], None]
761
- A dictionary for each completed document, containing its 'idx' and extracted 'frames'.
762
- """
763
- # Validate text_contents must be a list of str or dict, and not both
764
- if not isinstance(text_contents, list):
765
- raise ValueError("text_contents must be a list of strings or dictionaries.")
766
- if all(isinstance(doc, str) for doc in text_contents):
767
- pass
768
- elif all(isinstance(doc, dict) for doc in text_contents):
769
- pass
770
- # Set CPU executor and queues
771
- cpu_executor = ThreadPoolExecutor(max_workers=cpu_concurrency)
772
- tasks_queue = asyncio.Queue(maxsize=llm_concurrency * 2)
773
- # Store to track units and pending counts
774
- results_store = {
775
- idx: {'pending': 0, 'units': [], 'text': doc if isinstance(doc, str) else doc.get(document_key, "")}
776
- for idx, doc in enumerate(text_contents)
777
- }
778
-
779
- output_queue = asyncio.Queue()
780
- messages_logger = MessagesLogger() if return_messages_log else None
781
-
782
- async def producer():
783
- try:
784
- for idx, text_content in enumerate(text_contents):
785
- text = text_content if isinstance(text_content, str) else text_content.get(document_key, "")
786
- if not text:
787
- warnings.warn(f"Document at index {idx} is empty or missing the document key '{document_key}'.")
788
- # signal that this document is done
789
- await output_queue.put({'idx': idx, 'frames': []})
790
- continue
791
-
792
- units = await self.unit_chunker.chunk_async(text, cpu_executor)
793
- await self.context_chunker.fit_async(text, units, cpu_executor)
794
- results_store[idx]['pending'] = len(units)
795
-
796
- # Handle cases where a document yields no units
797
- if not units:
798
- # signal that this document is done
799
- await output_queue.put({'idx': idx, 'frames': []})
800
- continue
801
-
802
- # Iterate through units
803
- for unit in units:
804
- context = await self.context_chunker.chunk_async(unit, cpu_executor)
805
- messages = []
806
- if self.system_prompt:
807
- messages.append({'role': 'system', 'content': self.system_prompt})
808
-
809
- if not context:
810
- if isinstance(text_content, str):
811
- messages.append({'role': 'user', 'content': self._get_user_prompt(unit.text)})
812
- else:
813
- unit_content = text_content.copy()
814
- unit_content[document_key] = unit.text
815
- messages.append({'role': 'user', 'content': self._get_user_prompt(unit_content)})
816
- else:
817
- # insert context to user prompt
818
- if isinstance(text_content, str):
819
- messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
820
- else:
821
- context_content = text_content.copy()
822
- context_content[document_key] = context
823
- messages.append({'role': 'user', 'content': self._get_user_prompt(context_content)})
824
- # simulate conversation where assistant confirms
825
- messages.append({'role': 'assistant', 'content': 'Sure, please provide the unit text (e.g., sentence, line, chunk) of interest.'})
826
- # place unit of interest
827
- messages.append({'role': 'user', 'content': unit.text})
828
-
829
- await tasks_queue.put({'idx': idx, 'unit': unit, 'messages': messages})
830
- finally:
831
- for _ in range(llm_concurrency):
832
- await tasks_queue.put(None)
833
-
834
- async def worker():
835
- while True:
836
- task_item = await tasks_queue.get()
837
- if task_item is None:
838
- tasks_queue.task_done()
839
- break
840
-
841
- idx = task_item['idx']
842
- unit = task_item['unit']
843
- doc_results = results_store[idx]
844
-
845
- try:
846
- gen_text = await self.inference_engine.chat_async(
847
- messages=task_item['messages'], messages_logger=messages_logger
848
- )
849
- unit.set_generated_text(gen_text["response"])
850
- unit.set_status("success")
851
- doc_results['units'].append(unit)
852
- except Exception as e:
853
- warnings.warn(f"Error processing unit for doc idx {idx}: {e}")
854
- finally:
855
- doc_results['pending'] -= 1
856
- if doc_results['pending'] <= 0:
857
- final_frames = self._post_process_and_create_frames(doc_results, case_sensitive, fuzzy_match, fuzzy_buffer_size, fuzzy_score_cutoff, allow_overlap_entities)
858
- output_payload = {'idx': idx, 'frames': final_frames}
859
- if return_messages_log:
860
- output_payload['messages_log'] = messages_logger.get_messages_log()
861
- await output_queue.put(output_payload)
862
-
863
- tasks_queue.task_done()
864
-
865
- # Start producer and workers
866
- producer_task = asyncio.create_task(producer())
867
- worker_tasks = [asyncio.create_task(worker()) for _ in range(llm_concurrency)]
868
-
869
- # Main loop to gather results
870
- docs_completed = 0
871
- while docs_completed < len(text_contents):
872
- result = await output_queue.get()
873
- yield result
874
- docs_completed += 1
875
-
876
- # Final cleanup
877
- await producer_task
878
- await tasks_queue.join()
879
-
880
- # Cancel any lingering worker tasks
881
- for task in worker_tasks:
882
- task.cancel()
883
- await asyncio.gather(*worker_tasks, return_exceptions=True)
884
-
885
- cpu_executor.shutdown(wait=False)
886
-
887
-
888
- def _post_process_and_create_frames(self, doc_results, case_sensitive, fuzzy_match, fuzzy_buffer_size, fuzzy_score_cutoff, allow_overlap_entities):
889
- """Helper function to run post-processing logic for a completed document."""
890
- ENTITY_KEY = "entity_text"
891
- frame_list = []
892
- for res in sorted(doc_results['units'], key=lambda r: r.start):
893
- entity_json = []
894
- for entity in extract_json(gen_text=res.gen_text):
895
- if ENTITY_KEY in entity:
896
- entity_json.append(entity)
897
- else:
898
- warnings.warn(f'Extractor output "{entity}" does not have entity_key ("{ENTITY_KEY}"). This frame will be dropped.', RuntimeWarning)
1186
+ This is the async version of extract_frames.
1187
+ """
1188
+ extraction_results = await self._extract_async(text_content=text_content,
1189
+ document_key=document_key,
1190
+ concurrent_batch_size=concurrent_batch_size,
1191
+ return_messages_log=return_messages_log)
1192
+
1193
+ units, messages_log = extraction_results if return_messages_log else (extraction_results, None)
1194
+ frame_list = self._post_process_units_to_frames(units, case_sensitive, fuzzy_match, fuzzy_buffer_size, fuzzy_score_cutoff, allow_overlap_entities)
899
1195
 
900
- spans = self._find_entity_spans(
901
- text=res.text,
902
- entities=[e[ENTITY_KEY] for e in entity_json],
903
- case_sensitive=case_sensitive,
904
- fuzzy_match=fuzzy_match,
905
- fuzzy_buffer_size=fuzzy_buffer_size,
906
- fuzzy_score_cutoff=fuzzy_score_cutoff,
907
- allow_overlap_entities=allow_overlap_entities
908
- )
909
- for ent, span in zip(entity_json, spans):
910
- if span is not None:
911
- start, end = span
912
- entity_text = res.text[start:end]
913
- start += res.start
914
- end += res.start
915
- attr = ent.get("attr", {}) or {}
916
- frame = LLMInformationExtractionFrame(
917
- frame_id=f"{len(frame_list)}",
918
- start=start,
919
- end=end,
920
- entity_text=entity_text,
921
- attr=attr
922
- )
923
- frame_list.append(frame)
1196
+ if return_messages_log:
1197
+ return frame_list, messages_log
924
1198
  return frame_list
925
-
1199
+
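A hedged sketch of driving the new extract_frames_async from async code, recovering the multi-document behavior of the removed extract_frames_from_documents by gathering one call per document (`extractor` and `notes` are placeholders):

import asyncio

async def extract_all(extractor, notes):
    # One extract_frames_async call per document; per-call concurrency is
    # bounded internally by concurrent_batch_size.
    results = await asyncio.gather(*(
        extractor.extract_frames_async(text_content=note, concurrent_batch_size=16)
        for note in notes
    ))
    return results  # List[List[LLMInformationExtractionFrame]]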

  class ReviewFrameExtractor(DirectFrameExtractor):
      def __init__(self, unit_chunker:UnitChunker, context_chunker:ContextChunker, inference_engine:InferenceEngine,
@@ -1200,7 +1474,7 @@ class ReviewFrameExtractor(DirectFrameExtractor):
          for chunk in response_stream:
              yield chunk

-     async def extract_async(self, text_content:Union[str, Dict[str,str]], document_key:str=None,
+     async def _extract_async(self, text_content:Union[str, Dict[str,str]], document_key:str=None,
                              concurrent_batch_size:int=32, return_messages_log:bool=False, **kwrs) -> List[FrameExtractionUnit]:
          """
          This is the asynchronous version of the extract() method with the review step.
@@ -1703,7 +1977,7 @@ class AttributeExtractor(Extractor):
          return (new_frames, messages_log) if return_messages_log else new_frames


-     async def extract_async(self, frames:List[LLMInformationExtractionFrame], text:str, context_size:int=256,
+     async def _extract_async(self, frames:List[LLMInformationExtractionFrame], text:str, context_size:int=256,
                              concurrent_batch_size:int=32, inplace:bool=True, return_messages_log:bool=False) -> Union[None, List[LLMInformationExtractionFrame]]:
          """
          This method extracts attributes from the document asynchronously.
@@ -1775,6 +2049,16 @@ class AttributeExtractor(Extractor):
          else:
              return (new_frames, messages_logger.get_messages_log()) if return_messages_log else new_frames

+     async def extract_attributes_async(self, frames:List[LLMInformationExtractionFrame], text:str, context_size:int=256,
+                                        concurrent_batch_size:int=32, inplace:bool=True,
+                                        return_messages_log:bool=False) -> Union[None, List[LLMInformationExtractionFrame]]:
+         """
+         This is the async version of extract_attributes.
+         """
+         return await self._extract_async(frames=frames, text=text, context_size=context_size,
+                                          concurrent_batch_size=concurrent_batch_size, inplace=inplace, return_messages_log=return_messages_log)
+
+
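extract_attributes_async is a thin public wrapper over the renamed _extract_async; a minimal sketch of awaiting it (placeholder names):

async def annotate(extractor, frames, text):
    # inplace=False returns new frames instead of mutating the input ones.
    return await extractor.extract_attributes_async(frames=frames, text=text,
                                                    context_size=256, inplace=False)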
      def extract_attributes(self, frames:List[LLMInformationExtractionFrame], text:str, context_size:int=256,
                             concurrent:bool=False, concurrent_batch_size:int=32, verbose:bool=False,
                             return_messages_log:bool=False, inplace:bool=True) -> Union[None, List[LLMInformationExtractionFrame]]:
@@ -1810,7 +2094,7 @@ class AttributeExtractor(Extractor):

              nest_asyncio.apply()  # For Jupyter notebook. Terminal does not need this.

-             return asyncio.run(self.extract_async(frames=frames, text=text, context_size=context_size,
+             return asyncio.run(self._extract_async(frames=frames, text=text, context_size=context_size,
                                                    concurrent_batch_size=concurrent_batch_size,
                                                    inplace=inplace, return_messages_log=return_messages_log))
          else:
@@ -1955,6 +2239,17 @@ class RelationExtractor(Extractor):
              return asyncio.run(self._extract_async(doc, buffer_size, concurrent_batch_size, return_messages_log))
          else:
              return self._extract(doc, buffer_size, verbose, return_messages_log)
+
+     async def extract_relations_async(self, doc: LLMInformationExtractionDocument, buffer_size: int = 128, concurrent_batch_size: int = 32, return_messages_log: bool = False) -> Union[List[Dict], Tuple[List[Dict], List]]:
+         """
+         This is the async version of extract_relations.
+         """
+         if not doc.has_frame():
+             raise ValueError("Input document must have frames.")
+         if doc.has_duplicate_frame_ids():
+             raise ValueError("All frame_ids in the input document must be unique.")
+
+         return await self._extract_async(doc, buffer_size, concurrent_batch_size, return_messages_log)
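Likewise for relations: the async entry point keeps the same guards as the sync path, so invalid documents fail fast before any LLM calls. A sketch with placeholder names:

async def get_relations(extractor, doc):
    # Raises ValueError if `doc` has no frames or duplicate frame_ids.
    return await extractor.extract_relations_async(doc, buffer_size=128,
                                                   concurrent_batch_size=16)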


  class BinaryRelationExtractor(RelationExtractor):