llm-ie 1.2.4__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
llm_ie/__init__.py CHANGED
@@ -1,12 +1,12 @@
1
1
  from .data_types import LLMInformationExtractionFrame, LLMInformationExtractionDocument
2
2
  from .engines import BasicLLMConfig, ReasoningLLMConfig, Qwen3LLMConfig, OpenAIReasoningLLMConfig
3
3
  from .engines import LlamaCppInferenceEngine, OllamaInferenceEngine, HuggingFaceHubInferenceEngine, VLLMInferenceEngine, SGLangInferenceEngine, OpenRouterInferenceEngine, OpenAIInferenceEngine, AzureOpenAIInferenceEngine, LiteLLMInferenceEngine
4
- from .extractors import DirectFrameExtractor, ReviewFrameExtractor, BasicFrameExtractor, BasicReviewFrameExtractor, SentenceFrameExtractor, SentenceReviewFrameExtractor, AttributeExtractor, BinaryRelationExtractor, MultiClassRelationExtractor
4
+ from .extractors import StructExtractor, BasicStructExtractor, DirectFrameExtractor, ReviewFrameExtractor, BasicFrameExtractor, BasicReviewFrameExtractor, SentenceFrameExtractor, SentenceReviewFrameExtractor, AttributeExtractor, BinaryRelationExtractor, MultiClassRelationExtractor
5
5
  from .chunkers import UnitChunker, WholeDocumentUnitChunker, SentenceUnitChunker, TextLineUnitChunker, SeparatorUnitChunker, LLMUnitChunker, ContextChunker, NoContextChunker, WholeDocumentContextChunker, SlideWindowContextChunker
6
6
  from .prompt_editor import PromptEditor
7
7
 
8
8
  __all__ = ["LLMInformationExtractionFrame", "LLMInformationExtractionDocument",
9
9
  "BasicLLMConfig", "ReasoningLLMConfig", "Qwen3LLMConfig", "OpenAIReasoningLLMConfig", "LlamaCppInferenceEngine", "OllamaInferenceEngine", "HuggingFaceHubInferenceEngine", "VLLMInferenceEngine", "SGLangInferenceEngine", "OpenRouterInferenceEngine", "OpenAIInferenceEngine", "AzureOpenAIInferenceEngine", "LiteLLMInferenceEngine",
10
- "DirectFrameExtractor", "ReviewFrameExtractor", "BasicFrameExtractor", "BasicReviewFrameExtractor", "SentenceFrameExtractor", "SentenceReviewFrameExtractor", "AttributeExtractor", "BinaryRelationExtractor", "MultiClassRelationExtractor",
10
+ "StructExtractor", "BasicStructExtractor", "DirectFrameExtractor", "ReviewFrameExtractor", "BasicFrameExtractor", "BasicReviewFrameExtractor", "SentenceFrameExtractor", "SentenceReviewFrameExtractor", "AttributeExtractor", "BinaryRelationExtractor", "MultiClassRelationExtractor",
11
11
  "UnitChunker", "WholeDocumentUnitChunker", "SentenceUnitChunker", "TextLineUnitChunker", "SeparatorUnitChunker", "LLMUnitChunker", "ContextChunker", "NoContextChunker", "WholeDocumentContextChunker", "SlideWindowContextChunker",
12
12
  "PromptEditor"]
@@ -7,7 +7,7 @@ Prompt Template Design:
7
7
  List the attributes to extract, and provide clear definitions for each one.
8
8
 
9
9
  3. Output Format Definition:
10
- The output should be a JSON list, where each attribute be a key. The values could be any structure (e.g., str, int, List[str]).
10
+ The output should be a JSON object, where each attribute is a key. The values could be any structure (e.g., str, int, List[str]).
11
11
 
12
12
  4. Optional: Hints:
13
13
  Provide itemized hints for the information extractors to guide the extraction process. Remind the prompted agent to be truthful. Emphasize that the prompted agent is supposed to perform the task instead of writing code or instructing other agents to do it.
@@ -37,7 +37,7 @@ Example:
37
37
  Your output should follow the JSON format:
38
38
  {"Date": "<MM/DD/YYYY>", "Status": "<status>"}
39
39
 
40
- I am only interested in the content between []. Do not explain your answer.
40
+ I am only interested in the content between {}. Do not explain your answer.
41
41
 
42
42
  ### Hints
43
43
  - If the date is not complete, use the first available date in the context. For example, if the date is 01/2023, you should return 01/01/2023.
@@ -0,0 +1,53 @@
1
+ Prompt Template Design:
2
+
3
+ 1. Task Description:
4
+ Provide a detailed description of the task, including the background and the type of task (e.g., structured data extraction task).
5
+
6
+ 2. Schema Definition:
7
+ List the key-value pairs to extract, and provide clear definitions for each one.
8
+
9
+ 3. Output Format Definition:
10
+ The output should be a JSON object. The values could be any structure (e.g., str, int, List[str]).
11
+
12
+ 4. Optional: Hints:
13
+ Provide itemized hints for the information extractors to guide the extraction process. Remind the prompted agent to be truthful. Emphasize that the prompted agent is supposed to perform the task instead of writing code or instructing other agents to do it.
14
+
15
+ 5. Optional: Examples:
16
+ Include examples in the format:
17
+ Input: ...
18
+ Output: ...
19
+
20
+ 6. Context:
21
+ The template must include a placeholder {{input}} for the document or chunk.
22
+
23
+
24
+ Example:
25
+
26
+ ### Task description
27
+ This is a structured data extraction task. Given a medical report, you need to extract structured patient data from it.
28
+
29
+ ### Schema definition
30
+ "PatientName" which is the name of the patient,
31
+ "Age" which is the age of the patient in years,
32
+ "MRN" which is the medical record number of the patient.
33
+
34
+ ### Output format definition
35
+ Your output should follow the JSON format:
36
+ ```json
37
+ {
38
+ "PatientName": "<patient_name>",
39
+ "Age": <age_in_years>,
40
+ "MRN": "<medical_record_number>"
41
+ }
42
+ ```
43
+ I am only interested in the content between {}. Do not explain your answer.
44
+
45
+ ### Hints
46
+ - Make sure to extract the exact patient name as it appears in the report.
47
+ - You are supposed to perform the extraction task instead of writing code or instructing other agents to do it.
48
+ - If some values are not available, you should return "not specified".
49
+
50
+ ### Context
51
+ The text below is from the medical report:
52
+
53
+ "{{input}}"
llm_ie/data_types.py CHANGED
@@ -141,7 +141,7 @@ class LLMInformationExtractionFrame:
141
141
 
142
142
 
143
143
  class LLMInformationExtractionDocument:
144
- def __init__(self, doc_id:str=None, filename:str=None, text:str=None,
144
+ def __init__(self, doc_id:str=None, filename:str=None, text:str=None, struct:Dict=None,
145
145
  frames:List[LLMInformationExtractionFrame]=None, relations:List[Dict[str,str]]=None):
146
146
  """
147
147
  This class holds LLM-extracted frames, handles save/ load.
@@ -154,6 +154,8 @@ class LLMInformationExtractionDocument:
154
154
  the directory to a yaml file of a saved LLMInformationExtractionDocument
155
155
  text : str, Optional
156
156
  document text
157
+ struct : Dict, Optional
158
+ a dictionary of unanchored structure information
157
159
  frames : List[LLMInformationExtractionFrame], Optional
158
160
  a list of LLMInformationExtractionFrame
159
161
  relations : List[Dict[str,str]], Optional
@@ -168,12 +170,28 @@ class LLMInformationExtractionDocument:
168
170
  llm_ie = json.load(json_file)
169
171
  if 'doc_id' in llm_ie.keys():
170
172
  self.doc_id = llm_ie['doc_id']
173
+ else:
174
+ raise ValueError("doc_id key not found in the file.")
175
+
171
176
  if 'text' in llm_ie.keys():
172
177
  self.text = llm_ie['text']
178
+ else:
179
+ raise ValueError("text key not found in the file.")
180
+
181
+ if 'struct' in llm_ie.keys():
182
+ self.struct = llm_ie['struct']
183
+ else:
184
+ self.struct = {}
185
+
173
186
  if 'frames' in llm_ie.keys():
174
187
  self.frames = [LLMInformationExtractionFrame.from_dict(d) for d in llm_ie['frames']]
188
+ else:
189
+ self.frames = []
190
+
175
191
  if 'relations' in llm_ie.keys():
176
192
  self.relations = llm_ie['relations']
193
+ else:
194
+ self.relations = []
177
195
 
178
196
  # create object from raw inputs
179
197
  else:
@@ -181,9 +199,15 @@ class LLMInformationExtractionDocument:
181
199
  raise TypeError("doc_id must be a string.")
182
200
  self.doc_id = doc_id
183
201
  self.text = text
202
+ self.struct = struct.copy() if struct is not None else {}
184
203
  self.frames = frames.copy() if frames is not None else []
185
204
  self.relations = relations.copy() if relations is not None else []
186
205
 
206
+ def has_struct(self) -> bool:
207
+ """
208
+ This method checks if there is any unanchored structure information.
209
+ """
210
+ return bool(self.struct)
187
211
 
188
212
  def has_frame(self) -> bool:
189
213
  """
@@ -228,6 +252,18 @@ class LLMInformationExtractionDocument:
228
252
 
229
253
  return None
230
254
 
255
+ def set_struct(self, struct:Dict):
256
+ """
257
+ This method sets the unanchored structure information.
258
+
259
+ Parameters
260
+ ----------
261
+ struct : Dict
262
+ a dictionary of unanchored structure information
263
+ """
264
+ if not isinstance(struct, Dict):
265
+ raise TypeError("struct must be a dictionary.")
266
+ self.struct = struct.copy()
231
267
 
232
268
  def add_frame(self, frame:LLMInformationExtractionFrame, valid_mode:str=None, create_id:bool=False) -> bool:
233
269
  """
@@ -326,10 +362,12 @@ class LLMInformationExtractionDocument:
326
362
 
327
363
  def __repr__(self, N_top_chars:int=100) -> str:
328
364
  text_to_print = self.text[0:N_top_chars]
365
+ struct_key_count = len(self.struct.keys())
329
366
  frame_count = len(self.frames)
330
367
  relation_count = len(self.relations)
331
368
  return ''.join((f'LLMInformationExtractionDocument(doc_id: "{self.doc_id}"\n',
332
369
  f'text: "{text_to_print}...",\n',
370
+ f'struct keys: {struct_key_count}\n',
333
371
  f'frames: {frame_count}\n',
334
372
  f'relations: {relation_count}'))
335
373
 
@@ -338,6 +376,7 @@ class LLMInformationExtractionDocument:
338
376
  with open(filename, 'w') as json_file:
339
377
  json.dump({'doc_id': self.doc_id,
340
378
  'text': self.text,
379
+ 'struct': self.struct,
341
380
  'frames': [frame.to_dict() for frame in self.frames],
342
381
  'relations': self.relations},
343
382
  json_file, indent=4)
@@ -346,16 +385,22 @@ class LLMInformationExtractionDocument:
346
385
 
347
386
  def _viz_preprocess(self) -> Tuple:
348
387
  """
349
- This method preprocesses the entities and relations for visualization.
388
+ This method preprocesses the struct, entities and relations for visualization.
350
389
  """
351
390
  if importlib.util.find_spec("ie_viz") is None:
352
- raise ImportError("ie_viz not found. Please install ie_viz (```pip install ie-viz```).")
391
+ raise ImportError("ie_viz not found. Please install ie_viz (```pip install -U ie-viz```).")
353
392
 
393
+ # Struct
394
+ if self.has_struct():
395
+ struct = self.struct
396
+ else:
397
+ struct = {}
398
+ # Entities
354
399
  if self.has_frame():
355
400
  entities = [{"entity_id": frame.frame_id, "start": frame.start, "end": frame.end, "attr": frame.attr} for frame in self.frames]
356
401
  else:
357
- raise ValueError("No frames in the document.")
358
-
402
+ entities = None
403
+ # Relations
359
404
  if self.has_relation():
360
405
  relations = []
361
406
  for relation in self.relations:
@@ -364,7 +409,7 @@ class LLMInformationExtractionDocument:
364
409
  else:
365
410
  relations = None
366
411
 
367
- return entities, relations
412
+ return struct, entities, relations
368
413
 
369
414
 
370
415
  def viz_serve(self, host: str = '0.0.0.0', port: int = 5000, theme:str = "light", title:str="Frames Visualization",
@@ -388,29 +433,20 @@ class LLMInformationExtractionDocument:
388
433
  The function to be used for mapping the entity attributes to colors. When provided, the color_attr_key and
389
434
  theme will be overwritten. The function must take an entity dictionary as input and return a color string (hex).
390
435
  """
391
- entities, relations = self._viz_preprocess()
436
+ struct, entities, relations = self._viz_preprocess()
392
437
  from ie_viz import serve
393
438
 
394
- try:
395
- serve(text=self.text,
396
- entities=entities,
397
- relations=relations,
398
- host=host,
399
- port=port,
400
- theme=theme,
401
- title=title,
402
- color_attr_key=color_attr_key,
403
- color_map_func=color_map_func)
404
- except TypeError:
405
- warnings.warn("The version of ie_viz is not the latest. Please update to the latest version (pip install --upgrade ie-viz) for complete features.", UserWarning)
406
- serve(text=self.text,
407
- entities=entities,
408
- relations=relations,
409
- host=host,
410
- port=port,
411
- theme=theme,
412
- color_attr_key=color_attr_key,
413
- color_map_func=color_map_func)
439
+ serve(text=self.text,
440
+ struct=struct,
441
+ entities=entities,
442
+ relations=relations,
443
+ host=host,
444
+ port=port,
445
+ theme=theme,
446
+ title=title,
447
+ color_attr_key=color_attr_key,
448
+ color_map_func=color_map_func)
449
+
414
450
 
415
451
  def viz_render(self, theme:str = "light", color_attr_key:str=None, color_map_func:Callable=None,
416
452
  title:str="Frames Visualization") -> str:
@@ -429,22 +465,14 @@ class LLMInformationExtractionDocument:
429
465
  title : str, Optional
430
466
  the title of the HTML.
431
467
  """
432
- entities, relations = self._viz_preprocess()
468
+ struct, entities, relations = self._viz_preprocess()
433
469
  from ie_viz import render
434
470
 
435
- try:
436
- return render(text=self.text,
437
- entities=entities,
438
- relations=relations,
439
- theme=theme,
440
- title=title,
441
- color_attr_key=color_attr_key,
442
- color_map_func=color_map_func)
443
- except TypeError:
444
- warnings.warn("The version of ie_viz is not the latest. Please update to the latest version (pip install --upgrade ie-viz) for complete features.", UserWarning)
445
- return render(text=self.text,
446
- entities=entities,
447
- relations=relations,
448
- theme=theme,
449
- color_attr_key=color_attr_key,
450
- color_map_func=color_map_func)
471
+ return render(text=self.text,
472
+ struct=struct,
473
+ entities=entities,
474
+ relations=relations,
475
+ theme=theme,
476
+ title=title,
477
+ color_attr_key=color_attr_key,
478
+ color_map_func=color_map_func)