PyPI - llm-ie - Versions diffs - 1.2.4__py3-none-any.whl → 1.3.0__py3-none-any.whl - Mend

llm-ie 1.2.4__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

llm_ie/__init__.py +2 -2
llm_ie/asset/prompt_guide/AttributeExtractor_prompt_guide.txt +2 -2
llm_ie/asset/prompt_guide/StructExtractor_prompt_guide.txt +53 -0
llm_ie/data_types.py +72 -44
llm_ie/extractors.py +422 -2
{llm_ie-1.2.4.dist-info → llm_ie-1.3.0.dist-info}/METADATA +1 -1
{llm_ie-1.2.4.dist-info → llm_ie-1.3.0.dist-info}/RECORD +8 -7
{llm_ie-1.2.4.dist-info → llm_ie-1.3.0.dist-info}/WHEEL +0 -0

llm_ie/__init__.py CHANGED Viewed

@@ -1,12 +1,12 @@
 from .data_types import LLMInformationExtractionFrame, LLMInformationExtractionDocument
 from .engines import BasicLLMConfig, ReasoningLLMConfig, Qwen3LLMConfig, OpenAIReasoningLLMConfig
 from .engines import LlamaCppInferenceEngine, OllamaInferenceEngine, HuggingFaceHubInferenceEngine, VLLMInferenceEngine, SGLangInferenceEngine, OpenRouterInferenceEngine, OpenAIInferenceEngine, AzureOpenAIInferenceEngine, LiteLLMInferenceEngine
-from .extractors import DirectFrameExtractor, ReviewFrameExtractor, BasicFrameExtractor, BasicReviewFrameExtractor, SentenceFrameExtractor, SentenceReviewFrameExtractor, AttributeExtractor, BinaryRelationExtractor, MultiClassRelationExtractor
+from .extractors import StructExtractor, BasicStructExtractor, DirectFrameExtractor, ReviewFrameExtractor, BasicFrameExtractor, BasicReviewFrameExtractor, SentenceFrameExtractor, SentenceReviewFrameExtractor, AttributeExtractor, BinaryRelationExtractor, MultiClassRelationExtractor
 from .chunkers import UnitChunker, WholeDocumentUnitChunker, SentenceUnitChunker, TextLineUnitChunker, SeparatorUnitChunker, LLMUnitChunker, ContextChunker, NoContextChunker, WholeDocumentContextChunker, SlideWindowContextChunker
 from .prompt_editor import PromptEditor
 __all__ = ["LLMInformationExtractionFrame", "LLMInformationExtractionDocument",
            "BasicLLMConfig", "ReasoningLLMConfig", "Qwen3LLMConfig", "OpenAIReasoningLLMConfig", "LlamaCppInferenceEngine", "OllamaInferenceEngine", "HuggingFaceHubInferenceEngine", "VLLMInferenceEngine", "SGLangInferenceEngine", "OpenRouterInferenceEngine", "OpenAIInferenceEngine", "AzureOpenAIInferenceEngine", "LiteLLMInferenceEngine",
-           "DirectFrameExtractor", "ReviewFrameExtractor", "BasicFrameExtractor", "BasicReviewFrameExtractor", "SentenceFrameExtractor", "SentenceReviewFrameExtractor", "AttributeExtractor", "BinaryRelationExtractor", "MultiClassRelationExtractor",
+           "StructExtractor", "BasicStructExtractor", "DirectFrameExtractor", "ReviewFrameExtractor", "BasicFrameExtractor", "BasicReviewFrameExtractor", "SentenceFrameExtractor", "SentenceReviewFrameExtractor", "AttributeExtractor", "BinaryRelationExtractor", "MultiClassRelationExtractor",
            "UnitChunker", "WholeDocumentUnitChunker", "SentenceUnitChunker", "TextLineUnitChunker", "SeparatorUnitChunker", "LLMUnitChunker", "ContextChunker", "NoContextChunker", "WholeDocumentContextChunker", "SlideWindowContextChunker",
            "PromptEditor"]

llm_ie/asset/prompt_guide/AttributeExtractor_prompt_guide.txt CHANGED Viewed

@@ -7,7 +7,7 @@ Prompt Template Design:
    List the attributes to extract, and provide clear definitions for each one.
 3. Output Format Definition:
-   The output should be a JSON list, where each attribute be a key. The values could be any structure (e.g., str, int, List[str]).
+   The output should be a JSON object, where each attribute is a key. The values could be any structure (e.g., str, int, List[str]).
 4. Optional: Hints:
    Provide itemized hints for the information extractors to guide the extraction process. Remind the prompted agent to be truthful. Emphasize that the prompted agent is supposed to perform the task instead of writing code or instructing other agents to do it.
@@ -37,7 +37,7 @@ Example:
     Your output should follow the JSON format:
     {"Date": "<MM/DD/YYYY>", "Status": "<status>"}
-    I am only interested in the content between []. Do not explain your answer.
+    I am only interested in the content between {}. Do not explain your answer.
     ### Hints
     - If the date is not complete, use the first available date in the context. For example, if the date is 01/2023, you should return 01/01/2023.

llm_ie/asset/prompt_guide/StructExtractor_prompt_guide.txt ADDED Viewed

@@ -0,0 +1,53 @@
+Prompt Template Design:
+1. Task Description:
+   Provide a detailed description of the task, including the background and the type of task (e.g., structured data extraction task).
+2. Schema Definition:
+   List the key-value pairs to extract, and provide clear definitions for each one.
+3. Output Format Definition:
+   The output should be a JSON. The values could be any structure (e.g., str, int, List[str]).
+4. Optional: Hints:
+   Provide itemized hints for the information extractors to guide the extraction process. Remind the prompted agent to be truthful. Emphasize that the prompted agent is supposed to perform the task instead of writing code or instructing other agents to do it.
+5. Optional: Examples:
+   Include examples in the format:
+    Input: ...
+    Output: ...
+6. Context:
+   The template must include a placeholder {{input}} for the document or chunk.
+Example:
+    ### Task description
+    This is a structured data extraction task. Given a medical report, you need to extract structured patient data from it.
+    ### Schema definition
+        "PatientName" which is the name of the patient,
+        "Age" which is the age of the patient in years,
+        "MRN" which is the medical record number of the patient.
+    ### Output format definition
+    Your output should follow the JSON format:
+    ```json
+   {
+      "PatientName": "<patient_name>",
+      "Age": <age_in_years>,
+      "MRN": "<medical_record_number>"
+   }
+   ```
+    I am only interested in the content between {}. Do not explain your answer.
+    ### Hints
+    - Make sure to extract the exact patient name as it appears in the report.
+    - You are supposed to perform the extraction task instead of writing code or instructing other agents to do it.
+    - If some values are not available, you should return "not specified".
+    ### Context
+    The text below is from the medical report:
+    "{{input}}"

llm_ie/data_types.py CHANGED Viewed

@@ -141,7 +141,7 @@ class LLMInformationExtractionFrame:
 class LLMInformationExtractionDocument:
-    def __init__(self, doc_id:str=None, filename:str=None, text:str=None,
+    def __init__(self, doc_id:str=None, filename:str=None, text:str=None, struct:Dict=None,
                  frames:List[LLMInformationExtractionFrame]=None, relations:List[Dict[str,str]]=None):
         """
         This class holds LLM-extracted frames, handles save/ load.
@@ -154,6 +154,8 @@ class LLMInformationExtractionDocument:
             the path to a JSON file of a saved LLMInformationExtractionDocument
         text : str, Optional
             document text
+        struct : Dict, Optional
+            a dictionary of unanchored structure information
         frames : List[LLMInformationExtractionFrame], Optional
             a list of LLMInformationExtractionFrame
         relations : List[Dict[str,str]], Optional
@@ -168,12 +170,28 @@ class LLMInformationExtractionDocument:
                 llm_ie = json.load(json_file)
             if 'doc_id' in llm_ie.keys():
                 self.doc_id = llm_ie['doc_id']
+            else:
+                raise ValueError("doc_id key not found in the file.")
             if 'text' in llm_ie.keys():
                 self.text = llm_ie['text']
+            else:
+                raise ValueError("text key not found in the file.")
+            if 'struct' in llm_ie.keys():
+                self.struct = llm_ie['struct']
+            else:
+                self.struct = {}
             if 'frames' in llm_ie.keys():
                 self.frames = [LLMInformationExtractionFrame.from_dict(d) for d in llm_ie['frames']]
+            else:
+                self.frames = []
             if 'relations' in llm_ie.keys():
                 self.relations = llm_ie['relations']
+            else:
+                self.relations = []
         # create object from raw inputs
         else:
@@ -181,9 +199,15 @@ class LLMInformationExtractionDocument:
                 raise TypeError("doc_id must be a string.")
             self.doc_id = doc_id
             self.text = text
+            self.struct = struct.copy() if struct is not None else {}
             self.frames = frames.copy() if frames is not None else []
             self.relations = relations.copy() if relations is not None else []
+    def has_struct(self) -> bool:
+        """
+        This method checks if there is any unanchored structure information.
+        """
+        return bool(self.struct)
     def has_frame(self) -> bool:
         """
@@ -228,6 +252,18 @@ class LLMInformationExtractionDocument:
         return None
+    def set_struct(self, struct:Dict):
+        """
+        This method sets the unanchored structure information.
+        Parameters
+        ----------
+        struct : Dict
+            a dictionary of unanchored structure information
+        """
+        if not isinstance(struct, Dict):
+            raise TypeError("struct must be a dictionary.")
+        self.struct = struct.copy()
     def add_frame(self, frame:LLMInformationExtractionFrame, valid_mode:str=None, create_id:bool=False) -> bool:
         """
@@ -326,10 +362,12 @@ class LLMInformationExtractionDocument:
     def __repr__(self, N_top_chars:int=100) -> str:
         text_to_print = self.text[0:N_top_chars]
+        struct_key_count = len(self.struct.keys())
         frame_count = len(self.frames)
         relation_count = len(self.relations)
         return ''.join((f'LLMInformationExtractionDocument(doc_id: "{self.doc_id}"\n',
                         f'text: "{text_to_print}...",\n',
+                        f'struct keys: {struct_key_count}\n',
                         f'frames: {frame_count}\n',
                         f'relations: {relation_count}'))
@@ -338,6 +376,7 @@ class LLMInformationExtractionDocument:
         with open(filename, 'w') as json_file:
             json.dump({'doc_id': self.doc_id,
                         'text': self.text,
+                        'struct': self.struct,
                         'frames': [frame.to_dict() for frame in self.frames],
                         'relations': self.relations},
                         json_file, indent=4)
@@ -346,16 +385,22 @@ class LLMInformationExtractionDocument:
     def _viz_preprocess(self) -> Tuple:
         """
-        This method preprocesses the entities and relations for visualization.
+        This method preprocesses the struct, entities and relations for visualization.
         """
         if importlib.util.find_spec("ie_viz") is None:
-            raise ImportError("ie_viz not found. Please install ie_viz (```pip install ie-viz```).")
+            raise ImportError("ie_viz not found. Please install ie_viz (```pip install -U ie-viz```).")
+        # Struct
+        if self.has_struct():
+            struct = self.struct
+        else:
+            struct = {}
+        # Entities
         if self.has_frame():
             entities = [{"entity_id": frame.frame_id, "start": frame.start, "end": frame.end, "attr": frame.attr} for frame in self.frames]
         else:
-            raise ValueError("No frames in the document.")
+            entities = None
+        # Relations
         if self.has_relation():
             relations = []
             for relation in self.relations:
@@ -364,7 +409,7 @@ class LLMInformationExtractionDocument:
         else:
             relations = None
-        return entities, relations
+        return struct, entities, relations
     def viz_serve(self, host: str = '0.0.0.0', port: int = 5000, theme:str = "light", title:str="Frames Visualization",
@@ -388,29 +433,20 @@ class LLMInformationExtractionDocument:
             The function to be used for mapping the entity attributes to colors. When provided, the color_attr_key and
             theme will be overwritten. The function must take an entity dictionary as input and return a color string (hex).
         """
-        entities, relations = self._viz_preprocess()
+        struct, entities, relations = self._viz_preprocess()
         from ie_viz import serve
-        try:
-            serve(text=self.text,
-                    entities=entities,
-                    relations=relations,
-                    host=host,
-                    port=port,
-                    theme=theme,
-                    title=title,
-                    color_attr_key=color_attr_key,
-                    color_map_func=color_map_func)
-        except TypeError:
-            warnings.warn("The version of ie_viz is not the latest. Please update to the latest version (pip install --upgrade ie-viz) for complete features.", UserWarning)
-            serve(text=self.text,
-                    entities=entities,
-                    relations=relations,
-                    host=host,
-                    port=port,
-                    theme=theme,
-                    color_attr_key=color_attr_key,
-                    color_map_func=color_map_func)
+        serve(text=self.text,
+              struct=struct,
+              entities=entities,
+              relations=relations,
+              host=host,
+              port=port,
+              theme=theme,
+              title=title,
+              color_attr_key=color_attr_key,
+              color_map_func=color_map_func)
     def viz_render(self, theme:str = "light", color_attr_key:str=None, color_map_func:Callable=None,
                    title:str="Frames Visualization") -> str:
@@ -429,22 +465,14 @@ class LLMInformationExtractionDocument:
         title : str, Optional
             the title of the HTML.
         """
-        entities, relations = self._viz_preprocess()
+        struct, entities, relations = self._viz_preprocess()
         from ie_viz import render
-        try:
-            return render(text=self.text,
-                        entities=entities,
-                        relations=relations,
-                        theme=theme,
-                        title=title,
-                        color_attr_key=color_attr_key,
-                        color_map_func=color_map_func)
-        except TypeError:
-                warnings.warn("The version of ie_viz is not the latest. Please update to the latest version (pip install --upgrade ie-viz) for complete features.", UserWarning)
-                return render(text=self.text,
-                        entities=entities,
-                        relations=relations,
-                        theme=theme,
-                        color_attr_key=color_attr_key,
-                        color_map_func=color_map_func)
+        return render(text=self.text,
+                      struct=struct,
+                      entities=entities,
+                      relations=relations,
+                      theme=theme,
+                      title=title,
+                      color_attr_key=color_attr_key,
+                      color_map_func=color_map_func)

llm_ie/extractors.py CHANGED Viewed

@@ -98,6 +98,426 @@ class Extractor:
         return apply_prompt_template(self.prompt_template, text_content)
+class StructExtractor(Extractor):
+    def __init__(self, inference_engine:InferenceEngine, unit_chunker:UnitChunker, prompt_template:str,
+                 system_prompt:str=None, context_chunker:ContextChunker=None, aggregation_func:Callable=None):
+        """
+        This class is for unanchored structured information extraction.
+        Input LLM inference engine, system prompt (optional), prompt template (with instruction, few-shot examples).
+        Parameters:
+        ----------
+        inference_engine : InferenceEngine
+            the LLM inferencing engine object. Must implement the chat() method.
+        unit_chunker : UnitChunker
+            the unit chunker object that determines how to chunk the document text into units.
+        prompt_template : str
+            prompt template with "{{<placeholder name>}}" placeholder.
+        system_prompt : str, Optional
+            system prompt.
+        context_chunker : ContextChunker
+            the context chunker object that determines how to get context for each unit.
+        aggregation_func : Callable
+            a function that inputs a list of structured information (dict)
+            and outputs an aggregated structured information (dict).
+            if not specified, the default is to merge all dicts by updating keys and overwriting values sequentially.
+        """
+        super().__init__(inference_engine=inference_engine,
+                         prompt_template=prompt_template,
+                         system_prompt=system_prompt)
+        self.unit_chunker = unit_chunker
+        self.context_chunker = context_chunker
+        self.aggregation_func = aggregation_func
+    def extract(self, text_content:Union[str, Dict[str,str]], document_key:str=None,
+                verbose:bool=False, return_messages_log:bool=False) -> List[FrameExtractionUnit]:
+        """
+        This method inputs text content and outputs a list of units, each carrying the text generated by the LLM.
+        Parameters:
+        ----------
+        text_content : Union[str, Dict[str,str]]
+            the input text content to put in prompt template.
+            If str, the prompt template must have only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
+            If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
+        return_messages_log : bool, Optional
+            if True, a list of messages will be returned.
+        Return : List[FrameExtractionUnit]
+            the output from LLM. Need post-processing.
+        """
+        # unit chunking
+        if isinstance(text_content, str):
+            doc_text = text_content
+        elif isinstance(text_content, dict):
+            if document_key is None:
+                raise ValueError("document_key must be provided when text_content is dict.")
+            doc_text = text_content[document_key]
+        units = self.unit_chunker.chunk(doc_text)
+        # context chunker init
+        self.context_chunker.fit(doc_text, units)
+        # messages log
+        messages_logger = MessagesLogger() if return_messages_log else None
+        # generate unit by unit
+        for i, unit in enumerate(units):
+            try:
+                # construct chat messages
+                messages = []
+                if self.system_prompt:
+                    messages.append({'role': 'system', 'content': self.system_prompt})
+                context = self.context_chunker.chunk(unit)
+                if context == "":
+                    # no context, just place unit in user prompt
+                    if isinstance(text_content, str):
+                        messages.append({'role': 'user', 'content': self._get_user_prompt(unit.text)})
+                    else:
+                        unit_content = text_content.copy()
+                        unit_content[document_key] = unit.text
+                        messages.append({'role': 'user', 'content': self._get_user_prompt(unit_content)})
+                else:
+                    # insert context to user prompt
+                    if isinstance(text_content, str):
+                        messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
+                    else:
+                        context_content = text_content.copy()
+                        context_content[document_key] = context
+                        messages.append({'role': 'user', 'content': self._get_user_prompt(context_content)})
+                    # simulate conversation where assistant confirms
+                    messages.append({'role': 'assistant', 'content': 'Sure, please provide the unit text (e.g., sentence, line, chunk) of interest.'})
+                    # place unit of interest
+                    messages.append({'role': 'user', 'content': unit.text})
+                if verbose:
+                    print(f"\n\n{Fore.GREEN}Unit {i + 1}/{len(units)}:{Style.RESET_ALL}\n{unit.text}\n")
+                    if context != "":
+                        print(f"{Fore.YELLOW}Context:{Style.RESET_ALL}\n{context}\n")
+                    print(f"{Fore.BLUE}Extraction:{Style.RESET_ALL}")
+                gen_text = self.inference_engine.chat(
+                                messages=messages,
+                                verbose=verbose,
+                                stream=False,
+                                messages_logger=messages_logger
+                            )
+                # add generated text to unit
+                unit.set_generated_text(gen_text["response"])
+                unit.set_status("success")
+            except Exception as e:
+                unit.set_status("fail")
+                warnings.warn(f"LLM inference failed for unit {i} ({unit.start}, {unit.end}): {e}", RuntimeWarning)
+        if return_messages_log:
+            return units, messages_logger.get_messages_log()
+        return units
+    def stream(self, text_content: Union[str, Dict[str, str]],
+               document_key: str = None) -> Generator[Dict[str, Any], None, List[FrameExtractionUnit]]:
+        """
+        Streams LLM responses per unit with structured event types,
+        and returns collected data for post-processing.
+        Yields:
+        -------
+        Dict[str, Any]: (type, data)
+            - {"type": "info", "data": str_message}: General informational messages.
+            - {"type": "unit", "data": dict_unit_info}: Signals start of a new unit. dict_unit_info contains {'id', 'text', 'start', 'end'}
+            - {"type": "context", "data": str_context}: Context string for the current unit.
+            - {"type": "reasoning", "data": str_chunk}: A reasoning model thinking chunk from the LLM.
+            - {"type": "response", "data": str_chunk}: A response/answer chunk from the LLM.
+        Returns:
+        --------
+        List[FrameExtractionUnit]:
+            A list of FrameExtractionUnit objects, each containing the
+            original unit details and the fully accumulated 'gen_text' from the LLM.
+        """
+        if isinstance(text_content, str):
+            doc_text = text_content
+        elif isinstance(text_content, dict):
+            if document_key is None:
+                raise ValueError("document_key must be provided when text_content is dict.")
+            if document_key not in text_content:
+                raise ValueError(f"document_key '{document_key}' not found in text_content.")
+            doc_text = text_content[document_key]
+        else:
+            raise TypeError("text_content must be a string or a dictionary.")
+        units: List[FrameExtractionUnit] = self.unit_chunker.chunk(doc_text)
+        self.context_chunker.fit(doc_text, units)
+        yield {"type": "info", "data": f"Starting LLM processing for {len(units)} units."}
+        for i, unit in enumerate(units):
+            unit_info_payload = {"id": i, "text": unit.text, "start": unit.start, "end": unit.end}
+            yield {"type": "unit", "data": unit_info_payload}
+            messages = []
+            if self.system_prompt:
+                messages.append({'role': 'system', 'content': self.system_prompt})
+            context_str = self.context_chunker.chunk(unit)
+            # Construct prompt input based on whether text_content was str or dict
+            if context_str:
+                yield {"type": "context", "data": context_str}
+                prompt_input_for_context = context_str
+                if isinstance(text_content, dict):
+                    context_content_dict = text_content.copy()
+                    context_content_dict[document_key] = context_str
+                    prompt_input_for_context = context_content_dict
+                messages.append({'role': 'user', 'content': self._get_user_prompt(prompt_input_for_context)})
+                messages.append({'role': 'assistant', 'content': 'Sure, please provide the unit text (e.g., sentence, line, chunk) of interest.'})
+                messages.append({'role': 'user', 'content': unit.text})
+            else: # No context
+                prompt_input_for_unit = unit.text
+                if isinstance(text_content, dict):
+                    unit_content_dict = text_content.copy()
+                    unit_content_dict[document_key] = unit.text
+                    prompt_input_for_unit = unit_content_dict
+                messages.append({'role': 'user', 'content': self._get_user_prompt(prompt_input_for_unit)})
+            current_gen_text = ""
+            response_stream = self.inference_engine.chat(
+                messages=messages,
+                stream=True
+            )
+            for chunk in response_stream:
+                yield chunk
+                if chunk["type"] == "response":
+                    current_gen_text += chunk["data"]
+            # Store the result for this unit
+            unit.set_generated_text(current_gen_text)
+            unit.set_status("success")
+        yield {"type": "info", "data": "All units processed by LLM."}
+        return units
+    async def extract_async(self, text_content:Union[str, Dict[str,str]], document_key:str=None,
+                            concurrent_batch_size:int=32, return_messages_log:bool=False) -> List[FrameExtractionUnit]:
+        """
+        This is the asynchronous version of the extract() method.
+        Parameters:
+        ----------
+        text_content : Union[str, Dict[str,str]]
+            the input text content to put in prompt template.
+            If str, the prompt template must have only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
+            If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
+        document_key : str, Optional
+            specify the key in text_content where document text is.
+            If text_content is str, this parameter will be ignored.
+        concurrent_batch_size : int, Optional
+            the batch size for concurrent processing.
+        return_messages_log : bool, Optional
+            if True, a list of messages will be returned.
+        Return : List[FrameExtractionUnit]
+            the output from LLM for each unit. Contains the start, end, text, and generated text.
+        """
+        if isinstance(text_content, str):
+            doc_text = text_content
+        elif isinstance(text_content, dict):
+            if document_key is None:
+                raise ValueError("document_key must be provided when text_content is dict.")
+            if document_key not in text_content:
+                 raise ValueError(f"document_key '{document_key}' not found in text_content dictionary.")
+            doc_text = text_content[document_key]
+        else:
+            raise TypeError("text_content must be a string or a dictionary.")
+        units = self.unit_chunker.chunk(doc_text)
+        # context chunker init
+        self.context_chunker.fit(doc_text, units)
+        # messages logger init
+        messages_logger = MessagesLogger() if return_messages_log else None
+        # Prepare inputs for all units first
+        tasks_input = []
+        for i, unit in enumerate(units):
+            # construct chat messages
+            messages = []
+            if self.system_prompt:
+                messages.append({'role': 'system', 'content': self.system_prompt})
+            context = self.context_chunker.chunk(unit)
+            if context == "":
+                 # no context, just place unit in user prompt
+                if isinstance(text_content, str):
+                    messages.append({'role': 'user', 'content': self._get_user_prompt(unit.text)})
+                else:
+                    unit_content = text_content.copy()
+                    unit_content[document_key] = unit.text
+                    messages.append({'role': 'user', 'content': self._get_user_prompt(unit_content)})
+            else:
+                # insert context to user prompt
+                if isinstance(text_content, str):
+                    messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
+                else:
+                    context_content = text_content.copy()
+                    context_content[document_key] = context
+                    messages.append({'role': 'user', 'content': self._get_user_prompt(context_content)})
+                # simulate conversation where assistant confirms
+                messages.append({'role': 'assistant', 'content': 'Sure, please provide the unit text (e.g., sentence, line, chunk) of interest.'})
+                # place unit of interest
+                messages.append({'role': 'user', 'content': unit.text})
+            # Store unit and messages together for the task
+            tasks_input.append({"unit": unit, "messages": messages, "original_index": i})
+        # Process units concurrently with asyncio.Semaphore
+        semaphore = asyncio.Semaphore(concurrent_batch_size)
+        async def semaphore_helper(task_data: Dict, **kwrs):
+            unit = task_data["unit"]
+            messages = task_data["messages"]
+            async with semaphore:
+                gen_text = await self.inference_engine.chat_async(
+                    messages=messages,
+                    messages_logger=messages_logger
+                )
+            unit.set_generated_text(gen_text["response"])
+            unit.set_status("success")
+        # Create and gather tasks
+        tasks = []
+        for task_inp in tasks_input:
+            task = asyncio.create_task(semaphore_helper(
+                task_inp
+            ))
+            tasks.append(task)
+        await asyncio.gather(*tasks)
+        # Return units
+        if return_messages_log:
+            return units, messages_logger.get_messages_log()
+        else:
+            return units
+    def _default_struct_aggregate(self, structs: List[Dict[str, Any]]) -> Dict[str, Any]:
+        """
+        Given a list of structured information (dict), aggregate them into a single dict by sequentially updating keys
+        and overwriting values.
+        """
+        aggregated_struct = {}
+        for struct in structs:
+            aggregated_struct.update(struct)
+        return aggregated_struct
+    def extract_struct(self, text_content:Union[str, Dict[str,str]], document_key:str=None,
+                       verbose:bool=False, concurrent:bool=False, concurrent_batch_size:int=32,
+                       return_messages_log:bool=False) -> List[Dict[str, Any]]:
+        """
+        This method inputs a document text and outputs a list of unanchored structured information (dict).
+        It uses the extract() method (or extract_async() when concurrent is True) and post-processes the outputs into JSON structures.
+        Parameters:
+        ----------
+        text_content : Union[str, Dict[str,str]]
+            the input text content to put in prompt template.
+            If str, the prompt template must have only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
+            If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
+        document_key : str, Optional
+            specify the key in text_content where document text is.
+            If text_content is str, this parameter will be ignored.
+        verbose : bool, Optional
+            if True, LLM generated text will be printed in terminal in real-time.
+        concurrent : bool, Optional
+            if True, the sentences will be extracted in concurrent.
+        concurrent_batch_size : int, Optional
+            the number of sentences to process in concurrent. Only used when `concurrent` is True.
+        return_messages_log : bool, Optional
+            if True, a list of messages will be returned.
+        Return : List[Dict[str, Any]]
+            a list of unanchored structured information.
+        """
+        if concurrent:
+            if verbose:
+                warnings.warn("verbose=True is not supported in concurrent mode.", RuntimeWarning)
+            nest_asyncio.apply() # For Jupyter notebook. Terminal does not need this.
+            extraction_results = asyncio.run(self.extract_async(text_content=text_content,
+                                                document_key=document_key,
+                                                concurrent_batch_size=concurrent_batch_size,
+                                                return_messages_log=return_messages_log)
+                                            )
+        else:
+            extraction_results = self.extract(text_content=text_content,
+                                                document_key=document_key,
+                                                verbose=verbose,
+                                                return_messages_log=return_messages_log)
+        units, messages_log = extraction_results if return_messages_log else (extraction_results, None)
+        struct_json = []
+        for unit in units:
+            if unit.status != "success":
+                continue
+            try:
+                unit_struct_json = extract_json(unit.get_generated_text())
+                struct_json.extend(unit_struct_json)
+            except Exception as e:
+                unit.set_status("fail")
+                warnings.warn(f"Struct extraction failed for unit ({unit.start}, {unit.end}): {e}", RuntimeWarning)
+        if self.aggregation_func is None:
+            struct = self._default_struct_aggregate(struct_json)
+        else:
+            struct = self.aggregation_func(struct_json)
+        if return_messages_log:
+            return struct, messages_log
+        return struct
+class BasicStructExtractor(StructExtractor):
+    def __init__(self, inference_engine:InferenceEngine, prompt_template:str,
+                 system_prompt:str=None, aggregation_func:Callable=None):
+        """
+        This class prompts the LLM with the whole document at once for structured information extraction.
+        Input LLM inference engine, system prompt (optional), prompt template (with instruction, few-shot examples).
+        Parameters:
+        ----------
+        inference_engine : InferenceEngine
+            the LLM inferencing engine object. Must implement the chat() method.
+        prompt_template : str
+            prompt template with "{{<placeholder name>}}" placeholder.
+        system_prompt : str, Optional
+            system prompt.
+        aggregation_func : Callable, Optional
+            a function that inputs a list of structured information (dict)
+            and outputs an aggregated structured information (dict).
+            if not specified, the default is to merge all dicts by updating keys and overwriting values sequentially.
+        """
+        super().__init__(inference_engine=inference_engine,
+                         unit_chunker=WholeDocumentUnitChunker(),
+                         prompt_template=prompt_template,
+                         system_prompt=system_prompt,
+                         context_chunker=WholeDocumentContextChunker())
+        # The caller's aggregation function was previously accepted but never stored, so it was
+        # silently ignored. Store it on the attribute that extract_struct() reads.
+        # None keeps the default sequential merge.
+        self.aggregation_func = aggregation_func
 class FrameExtractor(Extractor):
     from nltk.tokenize import RegexpTokenizer
     def __init__(self, inference_engine:InferenceEngine, unit_chunker:UnitChunker,
@@ -300,7 +720,7 @@ class FrameExtractor(Extractor):
         return_messages_log : bool, Optional
             if True, a list of messages will be returned.
-        Return : str
+        Return : List[LLMInformationExtractionFrame]
             a list of frames.
         """
         return NotImplemented
@@ -659,7 +1079,7 @@ class DirectFrameExtractor(FrameExtractor):
         return_messages_log : bool, Optional
             if True, a list of messages will be returned.
-        Return : str
+        Return : List[LLMInformationExtractionFrame]
             a list of frames.
         """
         ENTITY_KEY = "entity_text"

{llm_ie-1.2.4.dist-info → llm_ie-1.3.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: llm-ie
-Version: 1.2.4
+Version: 1.3.0
 Summary: A comprehensive toolkit that provides building blocks for LLM-based named entity recognition, attribute extraction, and relation extraction pipelines.
 License: MIT
 Author: Enshuo (David) Hsu

{llm_ie-1.2.4.dist-info → llm_ie-1.3.0.dist-info}/RECORD RENAMED Viewed

@@ -1,4 +1,4 @@
-llm_ie/__init__.py,sha256=9a0bTN2ol5k_rCEidhnqIwJCnVTfit7TbTtbWG4hj1s,1881
+llm_ie/__init__.py,sha256=Rtdra_fAGPXORFvTd2qjSG08q9LBLXX5J1C8tz2SMwk,1963
 llm_ie/asset/PromptEditor_prompts/chat.txt,sha256=Fq62voV0JQ8xBRcxS1Nmdd7DkHs1fGYb-tmNwctZZK0,118
 llm_ie/asset/PromptEditor_prompts/comment.txt,sha256=C_lxx-dlOlFJ__jkHKosZ8HsNAeV1aowh2B36nIipBY,159
 llm_ie/asset/PromptEditor_prompts/rewrite.txt,sha256=JAwY9vm1jSmKf2qcLBYUvrSmME2EJH36bALmkwZDWYQ,178
@@ -10,7 +10,7 @@ llm_ie/asset/default_prompts/ReviewFrameExtractor_addition_review_prompt.txt,sha
 llm_ie/asset/default_prompts/ReviewFrameExtractor_revision_review_prompt.txt,sha256=lGGjdeFpzZEc56w-EtQDMyYFs7A3DQAM32sT42Nf_08,293
 llm_ie/asset/default_prompts/SentenceReviewFrameExtractor_addition_review_prompt.txt,sha256=Of11LFuXLB249oekFelzlIeoAB0cATReqWgFTvhNz_8,329
 llm_ie/asset/default_prompts/SentenceReviewFrameExtractor_revision_review_prompt.txt,sha256=kNJQK7NdoCx13TXGY8HYGrW_v4SEaErK8j9qIzd70CM,291
-llm_ie/asset/prompt_guide/AttributeExtractor_prompt_guide.txt,sha256=w2amKipinuJtCiyPsgWsjaJRwTpS1qOBDuPPtPCMeQA,2120
+llm_ie/asset/prompt_guide/AttributeExtractor_prompt_guide.txt,sha256=blr_fx4RI8NRQvSKNenYZWApLeWtjIX2xFPJfz0Mb9k,2115
 llm_ie/asset/prompt_guide/BasicFrameExtractor_prompt_guide.txt,sha256=-Cli7rwu4wM4vSmkG0nInNkpStUhRqKESQ3oqD38pbE,10395
 llm_ie/asset/prompt_guide/BasicReviewFrameExtractor_prompt_guide.txt,sha256=-Cli7rwu4wM4vSmkG0nInNkpStUhRqKESQ3oqD38pbE,10395
 llm_ie/asset/prompt_guide/BinaryRelationExtractor_prompt_guide.txt,sha256=Z6Yc2_QRqroWcJ13owNJbo78I0wpS4XXDsOjXFR-aPk,2166
@@ -19,12 +19,13 @@ llm_ie/asset/prompt_guide/MultiClassRelationExtractor_prompt_guide.txt,sha256=EQ
 llm_ie/asset/prompt_guide/ReviewFrameExtractor_prompt_guide.txt,sha256=rBRIXg8JQWUHTRdoluTS0zkbTkBAacEtHHvr3lZaQCw,10437
 llm_ie/asset/prompt_guide/SentenceFrameExtractor_prompt_guide.txt,sha256=97_-y_vHMLG4Kb8fLsGgibLxB-3mest8k3LHfLo5h-I,10465
 llm_ie/asset/prompt_guide/SentenceReviewFrameExtractor_prompt_guide.txt,sha256=97_-y_vHMLG4Kb8fLsGgibLxB-3mest8k3LHfLo5h-I,10465
+llm_ie/asset/prompt_guide/StructExtractor_prompt_guide.txt,sha256=x8L4n_LVl6ofQu6cDE9YP4SB2FSQ4GrTee8y1XKwwwc,1922
 llm_ie/chunkers.py,sha256=b4APRwaLMU40QXVEhOK8m1DZi_jr-VCHAFwbMjqVBgA,11308
-llm_ie/data_types.py,sha256=6vefyGTgZcJBYgiuyfcbJN1ZKK4tNvOZf6HFpxFZngY,17792
+llm_ie/data_types.py,sha256=iG_jdqhpBi33xnsfFQYayCXNBK-2N-8u1xIhoKfJzRI,18294
 llm_ie/engines.py,sha256=K4Zgb1dYiuopBeTLcgSAseI-VXgwtTeWf9O4EK9SQqE,63901
-llm_ie/extractors.py,sha256=f-TUZFprJZ_ftrnKbi-g-au4KoJwtciCCawXHWzmDtU,100792
+llm_ie/extractors.py,sha256=Voexzc_sYQ3jBGkvLybazt9zVsLnnrMbsUswKciBS4I,120933
 llm_ie/prompt_editor.py,sha256=Hqukm2HMgsoGpXV3vZ__7CGgfMhd-UUIwTKGnfSDltM,12055
 llm_ie/utils.py,sha256=k6M4l8GsKOMcmO6UwONQ353Zk-TeoBj6HXGjlAn-JE0,3679
-llm_ie-1.2.4.dist-info/METADATA,sha256=dl0JyDkgjEbk12N5I1fZg-jh7gEvTpuJ1Ox1_mHo_6Q,728
-llm_ie-1.2.4.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-llm_ie-1.2.4.dist-info/RECORD,,
+llm_ie-1.3.0.dist-info/METADATA,sha256=GrgKPwzTXtHIBsEThNsJ6i7Z43Ghb2I5Y47mRYbSIAo,728
+llm_ie-1.3.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+llm_ie-1.3.0.dist-info/RECORD,,

{llm_ie-1.2.4.dist-info → llm_ie-1.3.0.dist-info}/WHEEL RENAMED Viewed

File without changes

llm-ie 1.2.4__py3-none-any.whl → 1.3.0__py3-none-any.whl

llm-ie 1.2.4py3-none-any.whl → 1.3.0py3-none-any.whl