llm-ie 1.2.4__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
llm_ie/__init__.py CHANGED
@@ -1,12 +1,12 @@
1
1
  from .data_types import LLMInformationExtractionFrame, LLMInformationExtractionDocument
2
2
  from .engines import BasicLLMConfig, ReasoningLLMConfig, Qwen3LLMConfig, OpenAIReasoningLLMConfig
3
3
  from .engines import LlamaCppInferenceEngine, OllamaInferenceEngine, HuggingFaceHubInferenceEngine, VLLMInferenceEngine, SGLangInferenceEngine, OpenRouterInferenceEngine, OpenAIInferenceEngine, AzureOpenAIInferenceEngine, LiteLLMInferenceEngine
4
- from .extractors import DirectFrameExtractor, ReviewFrameExtractor, BasicFrameExtractor, BasicReviewFrameExtractor, SentenceFrameExtractor, SentenceReviewFrameExtractor, AttributeExtractor, BinaryRelationExtractor, MultiClassRelationExtractor
4
+ from .extractors import StructExtractor, BasicStructExtractor, DirectFrameExtractor, ReviewFrameExtractor, BasicFrameExtractor, BasicReviewFrameExtractor, SentenceFrameExtractor, SentenceReviewFrameExtractor, AttributeExtractor, BinaryRelationExtractor, MultiClassRelationExtractor
5
5
  from .chunkers import UnitChunker, WholeDocumentUnitChunker, SentenceUnitChunker, TextLineUnitChunker, SeparatorUnitChunker, LLMUnitChunker, ContextChunker, NoContextChunker, WholeDocumentContextChunker, SlideWindowContextChunker
6
6
  from .prompt_editor import PromptEditor
7
7
 
8
8
  __all__ = ["LLMInformationExtractionFrame", "LLMInformationExtractionDocument",
9
9
  "BasicLLMConfig", "ReasoningLLMConfig", "Qwen3LLMConfig", "OpenAIReasoningLLMConfig", "LlamaCppInferenceEngine", "OllamaInferenceEngine", "HuggingFaceHubInferenceEngine", "VLLMInferenceEngine", "SGLangInferenceEngine", "OpenRouterInferenceEngine", "OpenAIInferenceEngine", "AzureOpenAIInferenceEngine", "LiteLLMInferenceEngine",
10
- "DirectFrameExtractor", "ReviewFrameExtractor", "BasicFrameExtractor", "BasicReviewFrameExtractor", "SentenceFrameExtractor", "SentenceReviewFrameExtractor", "AttributeExtractor", "BinaryRelationExtractor", "MultiClassRelationExtractor",
10
+ "StructExtractor", "BasicStructExtractor", "DirectFrameExtractor", "ReviewFrameExtractor", "BasicFrameExtractor", "BasicReviewFrameExtractor", "SentenceFrameExtractor", "SentenceReviewFrameExtractor", "AttributeExtractor", "BinaryRelationExtractor", "MultiClassRelationExtractor",
11
11
  "UnitChunker", "WholeDocumentUnitChunker", "SentenceUnitChunker", "TextLineUnitChunker", "SeparatorUnitChunker", "LLMUnitChunker", "ContextChunker", "NoContextChunker", "WholeDocumentContextChunker", "SlideWindowContextChunker",
12
12
  "PromptEditor"]
@@ -7,7 +7,7 @@ Prompt Template Design:
7
7
  List the attributes to extract, and provide clear definitions for each one.
8
8
 
9
9
  3. Output Format Definition:
10
- The output should be a JSON list, where each attribute be a key. The values could be any structure (e.g., str, int, List[str]).
10
+ The output should be a JSON object, where each attribute is a key. The values can be of any structure (e.g., str, int, List[str]).
11
11
 
12
12
  4. Optional: Hints:
13
13
  Provide itemized hints for the information extractors to guide the extraction process. Remind the prompted agent to be truthful. Emphasize that the prompted agent is supposed to perform the task instead of writing code or instructing other agents to do it.
@@ -37,7 +37,7 @@ Example:
37
37
  Your output should follow the JSON format:
38
38
  {"Date": "<MM/DD/YYYY>", "Status": "<status>"}
39
39
 
40
- I am only interested in the content between []. Do not explain your answer.
40
+ I am only interested in the content between {}. Do not explain your answer.
41
41
 
42
42
  ### Hints
43
43
  - If the date is not complete, use the first available date in the context. For example, if the date is 01/2023, you should return 01/01/2023.
@@ -0,0 +1,53 @@
1
+ Prompt Template Design:
2
+
3
+ 1. Task Description:
4
+ Provide a detailed description of the task, including the background and the type of task (e.g., structured data extraction task).
5
+
6
+ 2. Schema Definition:
7
+ List the key-value pairs to extract, and provide clear definitions for each one.
8
+
9
+ 3. Output Format Definition:
10
+ The output should be a JSON object. The values can be of any structure (e.g., str, int, List[str]).
11
+
12
+ 4. Optional: Hints:
13
+ Provide itemized hints for the information extractors to guide the extraction process. Remind the prompted agent to be truthful. Emphasize that the prompted agent is supposed to perform the task instead of writing code or instructing other agents to do it.
14
+
15
+ 5. Optional: Examples:
16
+ Include examples in the format:
17
+ Input: ...
18
+ Output: ...
19
+
20
+ 6. Context:
21
+ The template must include a placeholder {{input}} for the document or chunk.
22
+
23
+
24
+ Example:
25
+
26
+ ### Task description
27
+ This is a structured data extraction task. Given a medical report, you need to extract structured patient data from it.
28
+
29
+ ### Schema definition
30
+ "PatientName" which is the name of the patient,
31
+ "Age" which is the age of the patient in years,
32
+ "MRN" which is the medical record number of the patient.
33
+
34
+ ### Output format definition
35
+ Your output should follow the JSON format:
36
+ ```json
37
+ {
38
+ "PatientName": "<patient_name>",
39
+ "Age": <age_in_years>,
40
+ "MRN": "<medical_record_number>"
41
+ }
42
+ ```
43
+ I am only interested in the content between {}. Do not explain your answer.
44
+
45
+ ### Hints
46
+ - Make sure to extract the exact patient name as it appears in the report.
47
+ - You are supposed to perform the extraction task instead of writing code or instructing other agents to do it.
48
+ - If some values are not available, you should return "not specified".
49
+
50
+ ### Context
51
+ The text below is from the medical report:
52
+
53
+ "{{input}}"
llm_ie/data_types.py CHANGED
@@ -141,7 +141,7 @@ class LLMInformationExtractionFrame:
141
141
 
142
142
 
143
143
  class LLMInformationExtractionDocument:
144
- def __init__(self, doc_id:str=None, filename:str=None, text:str=None,
144
+ def __init__(self, doc_id:str=None, filename:str=None, text:str=None, struct:Dict=None,
145
145
  frames:List[LLMInformationExtractionFrame]=None, relations:List[Dict[str,str]]=None):
146
146
  """
147
147
  This class holds LLM-extracted frames, handles save/ load.
@@ -154,6 +154,8 @@ class LLMInformationExtractionDocument:
154
154
  the directory to a yaml file of a saved LLMInformationExtractionDocument
155
155
  text : str, Optional
156
156
  document text
157
+ struct : Dict, Optional
158
+ a dictionary of unanchored structure information
157
159
  frames : List[LLMInformationExtractionFrame], Optional
158
160
  a list of LLMInformationExtractionFrame
159
161
  relations : List[Dict[str,str]], Optional
@@ -168,12 +170,28 @@ class LLMInformationExtractionDocument:
168
170
  llm_ie = json.load(json_file)
169
171
  if 'doc_id' in llm_ie.keys():
170
172
  self.doc_id = llm_ie['doc_id']
173
+ else:
174
+ raise ValueError("doc_id key not found in the file.")
175
+
171
176
  if 'text' in llm_ie.keys():
172
177
  self.text = llm_ie['text']
178
+ else:
179
+ raise ValueError("text key not found in the file.")
180
+
181
+ if 'struct' in llm_ie.keys():
182
+ self.struct = llm_ie['struct']
183
+ else:
184
+ self.struct = {}
185
+
173
186
  if 'frames' in llm_ie.keys():
174
187
  self.frames = [LLMInformationExtractionFrame.from_dict(d) for d in llm_ie['frames']]
188
+ else:
189
+ self.frames = []
190
+
175
191
  if 'relations' in llm_ie.keys():
176
192
  self.relations = llm_ie['relations']
193
+ else:
194
+ self.relations = []
177
195
 
178
196
  # create object from raw inputs
179
197
  else:
@@ -181,9 +199,15 @@ class LLMInformationExtractionDocument:
181
199
  raise TypeError("doc_id must be a string.")
182
200
  self.doc_id = doc_id
183
201
  self.text = text
202
+ self.struct = struct.copy() if struct is not None else {}
184
203
  self.frames = frames.copy() if frames is not None else []
185
204
  self.relations = relations.copy() if relations is not None else []
186
205
 
206
def has_struct(self) -> bool:
    """
    Tell whether the document carries any unanchored structure information.

    Return : bool
        True when self.struct is non-empty, False otherwise.
    """
    if self.struct:
        return True
    return False
187
211
 
188
212
  def has_frame(self) -> bool:
189
213
  """
@@ -228,6 +252,18 @@ class LLMInformationExtractionDocument:
228
252
 
229
253
  return None
230
254
 
255
def set_struct(self, struct:Dict):
    """
    Replace the document's unanchored structure information.

    Parameters
    ----------
    struct : Dict
        a dictionary of unanchored structure information.
        A shallow copy is stored, so later changes to the caller's
        top-level keys do not affect the document.

    Raises
    ------
    TypeError
        if struct is not a dictionary.
    """
    if isinstance(struct, dict):
        self.struct = struct.copy()
        return
    raise TypeError("struct must be a dictionary.")
231
267
 
232
268
  def add_frame(self, frame:LLMInformationExtractionFrame, valid_mode:str=None, create_id:bool=False) -> bool:
233
269
  """
@@ -326,10 +362,12 @@ class LLMInformationExtractionDocument:
326
362
 
327
363
def __repr__(self, N_top_chars:int=100) -> str:
    """
    Build a short multi-line summary of the document: its id, the first
    N_top_chars characters of the text, and the number of struct keys,
    frames and relations.
    """
    preview = self.text[:N_top_chars]
    n_struct_keys = len(self.struct.keys())
    n_frames = len(self.frames)
    n_relations = len(self.relations)
    summary_lines = [
        f'LLMInformationExtractionDocument(doc_id: "{self.doc_id}"\n',
        f'text: "{preview}...",\n',
        f'struct keys: {n_struct_keys}\n',
        f'frames: {n_frames}\n',
        f'relations: {n_relations}',
    ]
    return ''.join(summary_lines)
335
373
 
@@ -338,6 +376,7 @@ class LLMInformationExtractionDocument:
338
376
  with open(filename, 'w') as json_file:
339
377
  json.dump({'doc_id': self.doc_id,
340
378
  'text': self.text,
379
+ 'struct': self.struct,
341
380
  'frames': [frame.to_dict() for frame in self.frames],
342
381
  'relations': self.relations},
343
382
  json_file, indent=4)
@@ -346,16 +385,22 @@ class LLMInformationExtractionDocument:
346
385
 
347
386
  def _viz_preprocess(self) -> Tuple:
348
387
  """
349
- This method preprocesses the entities and relations for visualization.
388
+ This method preprocesses the struct, entities and relations for visualization.
350
389
  """
351
390
  if importlib.util.find_spec("ie_viz") is None:
352
- raise ImportError("ie_viz not found. Please install ie_viz (```pip install ie-viz```).")
391
+ raise ImportError("ie_viz not found. Please install ie_viz (```pip install -U ie-viz```).")
353
392
 
393
+ # Struct
394
+ if self.has_struct():
395
+ struct = self.struct
396
+ else:
397
+ struct = {}
398
+ # Entities
354
399
  if self.has_frame():
355
400
  entities = [{"entity_id": frame.frame_id, "start": frame.start, "end": frame.end, "attr": frame.attr} for frame in self.frames]
356
401
  else:
357
- raise ValueError("No frames in the document.")
358
-
402
+ entities = None
403
+ # Relations
359
404
  if self.has_relation():
360
405
  relations = []
361
406
  for relation in self.relations:
@@ -364,7 +409,7 @@ class LLMInformationExtractionDocument:
364
409
  else:
365
410
  relations = None
366
411
 
367
- return entities, relations
412
+ return struct, entities, relations
368
413
 
369
414
 
370
415
  def viz_serve(self, host: str = '0.0.0.0', port: int = 5000, theme:str = "light", title:str="Frames Visualization",
@@ -388,29 +433,20 @@ class LLMInformationExtractionDocument:
388
433
  The function to be used for mapping the entity attributes to colors. When provided, the color_attr_key and
389
434
  theme will be overwritten. The function must take an entity dictionary as input and return a color string (hex).
390
435
  """
391
- entities, relations = self._viz_preprocess()
436
+ struct, entities, relations = self._viz_preprocess()
392
437
  from ie_viz import serve
393
438
 
394
- try:
395
- serve(text=self.text,
396
- entities=entities,
397
- relations=relations,
398
- host=host,
399
- port=port,
400
- theme=theme,
401
- title=title,
402
- color_attr_key=color_attr_key,
403
- color_map_func=color_map_func)
404
- except TypeError:
405
- warnings.warn("The version of ie_viz is not the latest. Please update to the latest version (pip install --upgrade ie-viz) for complete features.", UserWarning)
406
- serve(text=self.text,
407
- entities=entities,
408
- relations=relations,
409
- host=host,
410
- port=port,
411
- theme=theme,
412
- color_attr_key=color_attr_key,
413
- color_map_func=color_map_func)
439
+ serve(text=self.text,
440
+ struct=struct,
441
+ entities=entities,
442
+ relations=relations,
443
+ host=host,
444
+ port=port,
445
+ theme=theme,
446
+ title=title,
447
+ color_attr_key=color_attr_key,
448
+ color_map_func=color_map_func)
449
+
414
450
 
415
451
  def viz_render(self, theme:str = "light", color_attr_key:str=None, color_map_func:Callable=None,
416
452
  title:str="Frames Visualization") -> str:
@@ -429,22 +465,14 @@ class LLMInformationExtractionDocument:
429
465
  title : str, Optional
430
466
  the title of the HTML.
431
467
  """
432
- entities, relations = self._viz_preprocess()
468
+ struct, entities, relations = self._viz_preprocess()
433
469
  from ie_viz import render
434
470
 
435
- try:
436
- return render(text=self.text,
437
- entities=entities,
438
- relations=relations,
439
- theme=theme,
440
- title=title,
441
- color_attr_key=color_attr_key,
442
- color_map_func=color_map_func)
443
- except TypeError:
444
- warnings.warn("The version of ie_viz is not the latest. Please update to the latest version (pip install --upgrade ie-viz) for complete features.", UserWarning)
445
- return render(text=self.text,
446
- entities=entities,
447
- relations=relations,
448
- theme=theme,
449
- color_attr_key=color_attr_key,
450
- color_map_func=color_map_func)
471
+ return render(text=self.text,
472
+ struct=struct,
473
+ entities=entities,
474
+ relations=relations,
475
+ theme=theme,
476
+ title=title,
477
+ color_attr_key=color_attr_key,
478
+ color_map_func=color_map_func)
llm_ie/extractors.py CHANGED
@@ -98,6 +98,426 @@ class Extractor:
98
98
  return apply_prompt_template(self.prompt_template, text_content)
99
99
 
100
100
 
101
class StructExtractor(Extractor):
    def __init__(self, inference_engine:InferenceEngine, unit_chunker:UnitChunker, prompt_template:str,
                 system_prompt:str=None, context_chunker:ContextChunker=None, aggregation_func:Callable=None):
        """
        This class is for unanchored structured information extraction.
        Input LLM inference engine, system prompt (optional), prompt template (with instruction, few-shot examples).

        Parameters:
        ----------
        inference_engine : InferenceEngine
            the LLM inferencing engine object. Must implements the chat() method.
        unit_chunker : UnitChunker
            the unit chunker object that determines how to chunk the document text into units.
        prompt_template : str
            prompt template with "{{<placeholder name>}}" placeholder.
        system_prompt : str, Optional
            system prompt.
        context_chunker : ContextChunker, Optional
            the context chunker object that determines how to get context for each unit.
            If not specified, a NoContextChunker is used.
        aggregation_func : Callable, Optional
            a function that inputs a list of structured information (dict)
            and outputs an aggregated structured information (dict).
            if not specified, the default is to merge all dicts by updating keys and overwriting values sequentially.
        """
        super().__init__(inference_engine=inference_engine,
                         prompt_template=prompt_template,
                         system_prompt=system_prompt)

        self.unit_chunker = unit_chunker
        # extract()/stream()/extract_async() call self.context_chunker.fit() unconditionally,
        # so a missing context chunker must be replaced by a real object.
        if context_chunker is None:
            # NOTE(review): NoContextChunker is exported by llm_ie.chunkers; it is assumed to
            # yield an empty context for every unit -- confirm against chunkers.py.
            from llm_ie.chunkers import NoContextChunker
            context_chunker = NoContextChunker()
        self.context_chunker = context_chunker
        self.aggregation_func = aggregation_func


    def _build_messages(self, text_content:Union[str, Dict[str,str]], document_key:str,
                        unit:FrameExtractionUnit, context:str) -> List[Dict[str,str]]:
        """
        This method constructs the chat messages for one unit.

        Without context, the unit text fills the prompt template. With context, the context fills
        the prompt template, followed by a simulated assistant confirmation and the unit text as
        a second user message.

        Parameters:
        ----------
        text_content : Union[str, Dict[str,str]]
            the original input given to extract()/stream()/extract_async().
        document_key : str
            the key in text_content where document text is. Ignored when text_content is str.
        unit : FrameExtractionUnit
            the unit of interest.
        context : str
            the context for the unit. An empty value means no context.

        Return : List[Dict[str,str]]
            chat messages with 'role' and 'content' keys.
        """
        messages = []
        if self.system_prompt:
            messages.append({'role': 'system', 'content': self.system_prompt})

        # The prompt template receives the context when there is one, otherwise the unit itself.
        template_text = context if context else unit.text
        if isinstance(text_content, str):
            prompt_input = template_text
        else:
            # copy so the caller's dictionary is never mutated
            prompt_input = text_content.copy()
            prompt_input[document_key] = template_text
        messages.append({'role': 'user', 'content': self._get_user_prompt(prompt_input)})

        if context:
            # simulate conversation where assistant confirms
            messages.append({'role': 'assistant', 'content': 'Sure, please provide the unit text (e.g., sentence, line, chunk) of interest.'})
            # place unit of interest
            messages.append({'role': 'user', 'content': unit.text})

        return messages


    def extract(self, text_content:Union[str, Dict[str,str]], document_key:str=None,
                verbose:bool=False, return_messages_log:bool=False) -> List[FrameExtractionUnit]:
        """
        This method inputs text content, prompts the LLM unit by unit, and outputs the units
        with the LLM-generated text attached.

        Parameters:
        ----------
        text_content : Union[str, Dict[str,str]]
            the input text content to put in prompt template.
            If str, the prompt template must has only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
            If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
        document_key : str, Optional
            specify the key in text_content where document text is.
            If text_content is str, this parameter will be ignored.
        verbose : bool, Optional
            if True, the unit, its context and the LLM generated text will be printed in terminal.
        return_messages_log : bool, Optional
            if True, a list of messages will be returned.

        Return : List[FrameExtractionUnit]
            the output from LLM. Need post-processing.
            When return_messages_log is True, a tuple (units, messages log) is returned instead.
        """
        # validate input the same way stream() and extract_async() do
        if isinstance(text_content, str):
            doc_text = text_content
        elif isinstance(text_content, dict):
            if document_key is None:
                raise ValueError("document_key must be provided when text_content is dict.")
            if document_key not in text_content:
                raise ValueError(f"document_key '{document_key}' not found in text_content dictionary.")
            doc_text = text_content[document_key]
        else:
            raise TypeError("text_content must be a string or a dictionary.")

        # unit chunking
        units = self.unit_chunker.chunk(doc_text)
        # context chunker init
        self.context_chunker.fit(doc_text, units)

        # messages log
        messages_logger = MessagesLogger() if return_messages_log else None

        # generate unit by unit
        for i, unit in enumerate(units):
            # A failure on one unit is recorded on that unit and must not abort the other units.
            try:
                context = self.context_chunker.chunk(unit)
                messages = self._build_messages(text_content, document_key, unit, context)

                if verbose:
                    print(f"\n\n{Fore.GREEN}Unit {i + 1}/{len(units)}:{Style.RESET_ALL}\n{unit.text}\n")
                    if context:
                        print(f"{Fore.YELLOW}Context:{Style.RESET_ALL}\n{context}\n")

                    print(f"{Fore.BLUE}Extraction:{Style.RESET_ALL}")

                gen_text = self.inference_engine.chat(
                    messages=messages,
                    verbose=verbose,
                    stream=False,
                    messages_logger=messages_logger
                )

                # add generated text to unit
                unit.set_generated_text(gen_text["response"])
                unit.set_status("success")
            except Exception as e:
                unit.set_status("fail")
                warnings.warn(f"LLM inference failed for unit {i} ({unit.start}, {unit.end}): {e}", RuntimeWarning)

        if return_messages_log:
            return units, messages_logger.get_messages_log()

        return units


    def stream(self, text_content: Union[str, Dict[str, str]],
               document_key: str = None) -> Generator[Dict[str, Any], None, List[FrameExtractionUnit]]:
        """
        Streams LLM responses per unit with structured event types,
        and returns collected data for post-processing.

        Parameters:
        ----------
        text_content : Union[str, Dict[str,str]]
            the input text content to put in prompt template.
        document_key : str, Optional
            specify the key in text_content where document text is.
            If text_content is str, this parameter will be ignored.

        Yields:
        -------
        Dict[str, Any]: (type, data)
            - {"type": "info", "data": str_message}: General informational messages.
            - {"type": "unit", "data": dict_unit_info}: Signals start of a new unit. dict_unit_info contains {'id', 'text', 'start', 'end'}
            - {"type": "context", "data": str_context}: Context string for the current unit.
            - {"type": "reasoning", "data": str_chunk}: A reasoning model thinking chunk from the LLM.
            - {"type": "response", "data": str_chunk}: A response/answer chunk from the LLM.

        Returns:
        --------
        List[FrameExtractionUnit]:
            A list of FrameExtractionUnit objects, each containing the
            original unit details and the fully accumulated 'gen_text' from the LLM.
        """
        if isinstance(text_content, str):
            doc_text = text_content
        elif isinstance(text_content, dict):
            if document_key is None:
                raise ValueError("document_key must be provided when text_content is dict.")
            if document_key not in text_content:
                raise ValueError(f"document_key '{document_key}' not found in text_content.")
            doc_text = text_content[document_key]
        else:
            raise TypeError("text_content must be a string or a dictionary.")

        units: List[FrameExtractionUnit] = self.unit_chunker.chunk(doc_text)
        self.context_chunker.fit(doc_text, units)

        yield {"type": "info", "data": f"Starting LLM processing for {len(units)} units."}

        for i, unit in enumerate(units):
            unit_info_payload = {"id": i, "text": unit.text, "start": unit.start, "end": unit.end}
            yield {"type": "unit", "data": unit_info_payload}

            context_str = self.context_chunker.chunk(unit)
            if context_str:
                yield {"type": "context", "data": context_str}

            messages = self._build_messages(text_content, document_key, unit, context_str)

            current_gen_text = ""

            response_stream = self.inference_engine.chat(
                messages=messages,
                stream=True
            )
            for chunk in response_stream:
                yield chunk
                # only answer chunks are accumulated; other chunk types are forwarded but not stored
                if chunk["type"] == "response":
                    current_gen_text += chunk["data"]

            # Store the result for this unit
            unit.set_generated_text(current_gen_text)
            unit.set_status("success")

        yield {"type": "info", "data": "All units processed by LLM."}
        return units


    async def extract_async(self, text_content:Union[str, Dict[str,str]], document_key:str=None,
                            concurrent_batch_size:int=32, return_messages_log:bool=False) -> List[FrameExtractionUnit]:
        """
        This is the asynchronous version of the extract() method.

        Parameters:
        ----------
        text_content : Union[str, Dict[str,str]]
            the input text content to put in prompt template.
            If str, the prompt template must has only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
            If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
        document_key : str, Optional
            specify the key in text_content where document text is.
            If text_content is str, this parameter will be ignored.
        concurrent_batch_size : int, Optional
            the batch size for concurrent processing.
        return_messages_log : bool, Optional
            if True, a list of messages will be returned.

        Return : List[FrameExtractionUnit]
            the output from LLM for each unit. Contains the start, end, text, and generated text.
            When return_messages_log is True, a tuple (units, messages log) is returned instead.
        """
        if isinstance(text_content, str):
            doc_text = text_content
        elif isinstance(text_content, dict):
            if document_key is None:
                raise ValueError("document_key must be provided when text_content is dict.")
            if document_key not in text_content:
                raise ValueError(f"document_key '{document_key}' not found in text_content dictionary.")
            doc_text = text_content[document_key]
        else:
            raise TypeError("text_content must be a string or a dictionary.")

        units = self.unit_chunker.chunk(doc_text)

        # context chunker init
        self.context_chunker.fit(doc_text, units)

        # messages logger init
        messages_logger = MessagesLogger() if return_messages_log else None

        # Prepare inputs for all units first
        tasks_input = []
        for i, unit in enumerate(units):
            context = self.context_chunker.chunk(unit)
            messages = self._build_messages(text_content, document_key, unit, context)
            # Store unit and messages together for the task
            tasks_input.append({"unit": unit, "messages": messages, "original_index": i})

        # Process units concurrently; the semaphore caps the number of in-flight LLM calls
        semaphore = asyncio.Semaphore(concurrent_batch_size)

        async def semaphore_helper(task_data: Dict, **kwrs):
            unit = task_data["unit"]
            messages = task_data["messages"]

            async with semaphore:
                gen_text = await self.inference_engine.chat_async(
                    messages=messages,
                    messages_logger=messages_logger
                )

            unit.set_generated_text(gen_text["response"])
            unit.set_status("success")

        # Create and gather tasks
        tasks = [asyncio.create_task(semaphore_helper(task_inp)) for task_inp in tasks_input]
        await asyncio.gather(*tasks)

        # Return units
        if return_messages_log:
            return units, messages_logger.get_messages_log()
        return units


    def _default_struct_aggregate(self, structs: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Given a list of structured information (dict), aggregate them into a single dict by sequentially updating keys
        and overwriting values. When a key appears in several dicts, the value from the last one wins.
        """
        aggregated_struct = {}
        for struct in structs:
            aggregated_struct.update(struct)
        return aggregated_struct


    def extract_struct(self, text_content:Union[str, Dict[str,str]], document_key:str=None,
                       verbose:bool=False, concurrent:bool=False, concurrent_batch_size:int=32,
                       return_messages_log:bool=False) -> Dict[str, Any]:
        """
        This method inputs a document text and outputs the aggregated unanchored structured information.
        It uses the extract() (or extract_async()) method, parses the JSON generated for each unit,
        and aggregates the parsed dicts with aggregation_func (or the default sequential merge).

        Parameters:
        ----------
        text_content : Union[str, Dict[str,str]]
            the input text content to put in prompt template.
            If str, the prompt template must has only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
            If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
        document_key : str, Optional
            specify the key in text_content where document text is.
            If text_content is str, this parameter will be ignored.
        verbose : bool, Optional
            if True, LLM generated text will be printed in terminal in real-time.
        concurrent : bool, Optional
            if True, the units will be extracted in concurrent.
        concurrent_batch_size : int, Optional
            the number of units to process in concurrent. Only used when `concurrent` is True.
        return_messages_log : bool, Optional
            if True, a list of messages will be returned.

        Return : Dict[str, Any]
            the aggregated unanchored structured information.
            When return_messages_log is True, a tuple (struct, messages log) is returned instead.
        """
        if concurrent:
            if verbose:
                warnings.warn("verbose=True is not supported in concurrent mode.", RuntimeWarning)

            nest_asyncio.apply() # For Jupyter notebook. Terminal does not need this.
            extraction_results = asyncio.run(self.extract_async(text_content=text_content,
                                                                document_key=document_key,
                                                                concurrent_batch_size=concurrent_batch_size,
                                                                return_messages_log=return_messages_log)
                                            )
        else:
            extraction_results = self.extract(text_content=text_content,
                                              document_key=document_key,
                                              verbose=verbose,
                                              return_messages_log=return_messages_log)

        units, messages_log = extraction_results if return_messages_log else (extraction_results, None)

        struct_json = []
        for unit in units:
            # units whose LLM call failed carry no usable generated text
            if unit.status != "success":
                continue
            try:
                # NOTE(review): extract_json is assumed to return a list of dicts -- confirm in utils.
                unit_struct_json = extract_json(unit.get_generated_text())
                struct_json.extend(unit_struct_json)
            except Exception as e:
                unit.set_status("fail")
                warnings.warn(f"Struct extraction failed for unit ({unit.start}, {unit.end}): {e}", RuntimeWarning)

        if self.aggregation_func is None:
            struct = self._default_struct_aggregate(struct_json)
        else:
            struct = self.aggregation_func(struct_json)

        if return_messages_log:
            return struct, messages_log
        return struct
492
+
493
+
494
class BasicStructExtractor(StructExtractor):
    def __init__(self, inference_engine:InferenceEngine, prompt_template:str,
                 system_prompt:str=None, aggregation_func:Callable=None):
        """
        This class prompts the LLM with the whole document at once for structured information extraction.
        Input LLM inference engine, system prompt (optional), prompt template (with instruction, few-shot examples).

        Parameters:
        ----------
        inference_engine : InferenceEngine
            the LLM inferencing engine object. Must implements the chat() method.
        prompt_template : str
            prompt template with "{{<placeholder name>}}" placeholder.
        system_prompt : str, Optional
            system prompt.
        aggregation_func : Callable, Optional
            a function that inputs a list of structured information (dict)
            and outputs an aggregated structured information (dict).
            if not specified, the default is to merge all dicts by updating keys and overwriting values sequentially.
        """
        # aggregation_func must be forwarded; otherwise the parent stores None and a
        # user-supplied aggregator is silently replaced by the default merge.
        super().__init__(inference_engine=inference_engine,
                         unit_chunker=WholeDocumentUnitChunker(),
                         prompt_template=prompt_template,
                         system_prompt=system_prompt,
                         context_chunker=WholeDocumentContextChunker(),
                         aggregation_func=aggregation_func)
519
+
520
+
101
521
  class FrameExtractor(Extractor):
102
522
  from nltk.tokenize import RegexpTokenizer
103
523
  def __init__(self, inference_engine:InferenceEngine, unit_chunker:UnitChunker,
@@ -300,7 +720,7 @@ class FrameExtractor(Extractor):
300
720
  return_messages_log : bool, Optional
301
721
  if True, a list of messages will be returned.
302
722
 
303
- Return : str
723
+ Return : List[LLMInformationExtractionFrame]
304
724
  a list of frames.
305
725
  """
306
726
  return NotImplemented
@@ -659,7 +1079,7 @@ class DirectFrameExtractor(FrameExtractor):
659
1079
  return_messages_log : bool, Optional
660
1080
  if True, a list of messages will be returned.
661
1081
 
662
- Return : str
1082
+ Return : List[LLMInformationExtractionFrame]
663
1083
  a list of frames.
664
1084
  """
665
1085
  ENTITY_KEY = "entity_text"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: llm-ie
3
- Version: 1.2.4
3
+ Version: 1.3.0
4
4
  Summary: A comprehensive toolkit that provides building blocks for LLM-based named entity recognition, attribute extraction, and relation extraction pipelines.
5
5
  License: MIT
6
6
  Author: Enshuo (David) Hsu
@@ -1,4 +1,4 @@
1
- llm_ie/__init__.py,sha256=9a0bTN2ol5k_rCEidhnqIwJCnVTfit7TbTtbWG4hj1s,1881
1
+ llm_ie/__init__.py,sha256=Rtdra_fAGPXORFvTd2qjSG08q9LBLXX5J1C8tz2SMwk,1963
2
2
  llm_ie/asset/PromptEditor_prompts/chat.txt,sha256=Fq62voV0JQ8xBRcxS1Nmdd7DkHs1fGYb-tmNwctZZK0,118
3
3
  llm_ie/asset/PromptEditor_prompts/comment.txt,sha256=C_lxx-dlOlFJ__jkHKosZ8HsNAeV1aowh2B36nIipBY,159
4
4
  llm_ie/asset/PromptEditor_prompts/rewrite.txt,sha256=JAwY9vm1jSmKf2qcLBYUvrSmME2EJH36bALmkwZDWYQ,178
@@ -10,7 +10,7 @@ llm_ie/asset/default_prompts/ReviewFrameExtractor_addition_review_prompt.txt,sha
10
10
  llm_ie/asset/default_prompts/ReviewFrameExtractor_revision_review_prompt.txt,sha256=lGGjdeFpzZEc56w-EtQDMyYFs7A3DQAM32sT42Nf_08,293
11
11
  llm_ie/asset/default_prompts/SentenceReviewFrameExtractor_addition_review_prompt.txt,sha256=Of11LFuXLB249oekFelzlIeoAB0cATReqWgFTvhNz_8,329
12
12
  llm_ie/asset/default_prompts/SentenceReviewFrameExtractor_revision_review_prompt.txt,sha256=kNJQK7NdoCx13TXGY8HYGrW_v4SEaErK8j9qIzd70CM,291
13
- llm_ie/asset/prompt_guide/AttributeExtractor_prompt_guide.txt,sha256=w2amKipinuJtCiyPsgWsjaJRwTpS1qOBDuPPtPCMeQA,2120
13
+ llm_ie/asset/prompt_guide/AttributeExtractor_prompt_guide.txt,sha256=blr_fx4RI8NRQvSKNenYZWApLeWtjIX2xFPJfz0Mb9k,2115
14
14
  llm_ie/asset/prompt_guide/BasicFrameExtractor_prompt_guide.txt,sha256=-Cli7rwu4wM4vSmkG0nInNkpStUhRqKESQ3oqD38pbE,10395
15
15
  llm_ie/asset/prompt_guide/BasicReviewFrameExtractor_prompt_guide.txt,sha256=-Cli7rwu4wM4vSmkG0nInNkpStUhRqKESQ3oqD38pbE,10395
16
16
  llm_ie/asset/prompt_guide/BinaryRelationExtractor_prompt_guide.txt,sha256=Z6Yc2_QRqroWcJ13owNJbo78I0wpS4XXDsOjXFR-aPk,2166
@@ -19,12 +19,13 @@ llm_ie/asset/prompt_guide/MultiClassRelationExtractor_prompt_guide.txt,sha256=EQ
19
19
  llm_ie/asset/prompt_guide/ReviewFrameExtractor_prompt_guide.txt,sha256=rBRIXg8JQWUHTRdoluTS0zkbTkBAacEtHHvr3lZaQCw,10437
20
20
  llm_ie/asset/prompt_guide/SentenceFrameExtractor_prompt_guide.txt,sha256=97_-y_vHMLG4Kb8fLsGgibLxB-3mest8k3LHfLo5h-I,10465
21
21
  llm_ie/asset/prompt_guide/SentenceReviewFrameExtractor_prompt_guide.txt,sha256=97_-y_vHMLG4Kb8fLsGgibLxB-3mest8k3LHfLo5h-I,10465
22
+ llm_ie/asset/prompt_guide/StructExtractor_prompt_guide.txt,sha256=x8L4n_LVl6ofQu6cDE9YP4SB2FSQ4GrTee8y1XKwwwc,1922
22
23
  llm_ie/chunkers.py,sha256=b4APRwaLMU40QXVEhOK8m1DZi_jr-VCHAFwbMjqVBgA,11308
23
- llm_ie/data_types.py,sha256=6vefyGTgZcJBYgiuyfcbJN1ZKK4tNvOZf6HFpxFZngY,17792
24
+ llm_ie/data_types.py,sha256=iG_jdqhpBi33xnsfFQYayCXNBK-2N-8u1xIhoKfJzRI,18294
24
25
  llm_ie/engines.py,sha256=K4Zgb1dYiuopBeTLcgSAseI-VXgwtTeWf9O4EK9SQqE,63901
25
- llm_ie/extractors.py,sha256=f-TUZFprJZ_ftrnKbi-g-au4KoJwtciCCawXHWzmDtU,100792
26
+ llm_ie/extractors.py,sha256=Voexzc_sYQ3jBGkvLybazt9zVsLnnrMbsUswKciBS4I,120933
26
27
  llm_ie/prompt_editor.py,sha256=Hqukm2HMgsoGpXV3vZ__7CGgfMhd-UUIwTKGnfSDltM,12055
27
28
  llm_ie/utils.py,sha256=k6M4l8GsKOMcmO6UwONQ353Zk-TeoBj6HXGjlAn-JE0,3679
28
- llm_ie-1.2.4.dist-info/METADATA,sha256=dl0JyDkgjEbk12N5I1fZg-jh7gEvTpuJ1Ox1_mHo_6Q,728
29
- llm_ie-1.2.4.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
30
- llm_ie-1.2.4.dist-info/RECORD,,
29
+ llm_ie-1.3.0.dist-info/METADATA,sha256=GrgKPwzTXtHIBsEThNsJ6i7Z43Ghb2I5Y47mRYbSIAo,728
30
+ llm_ie-1.3.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
31
+ llm_ie-1.3.0.dist-info/RECORD,,
File without changes