llm-ie 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
llm_ie/data_types.py CHANGED
@@ -1,5 +1,6 @@
1
1
  from typing import List, Dict, Tuple, Iterable, Callable
2
2
  import importlib.util
3
+ import warnings
3
4
  import json
4
5
 
5
6
 
@@ -203,7 +204,7 @@ class LLMInformationExtractionDocument:
203
204
  # Add frame
204
205
  frame_clone = frame.copy()
205
206
  if create_id:
206
- frame_clone.doc_id = f"{self.doc_id}_{len(self.frames)}"
207
+ frame_clone.frame_id = str(len(self.frames))
207
208
 
208
209
  self.frames.append(frame_clone)
209
210
  return True
@@ -306,7 +307,7 @@ class LLMInformationExtractionDocument:
306
307
  return entities, relations
307
308
 
308
309
 
309
- def viz_serve(self, host: str = '0.0.0.0', port: int = 5000, theme:str = "light",
310
+ def viz_serve(self, host: str = '0.0.0.0', port: int = 5000, theme:str = "light", title:str="Frames Visualization",
310
311
  color_attr_key:str=None, color_map_func:Callable=None):
311
312
  """
312
313
  This method serves a visualization App of the document.
@@ -319,6 +320,8 @@ class LLMInformationExtractionDocument:
319
320
  The port number to run the server on.
320
321
  theme : str, Optional
321
322
  The theme of the visualization. Must be either "light" or "dark".
323
+ title : str, Optional
324
+ The title of the HTML.
322
325
  color_attr_key : str, Optional
323
326
  The attribute key to be used for coloring the entities.
324
327
  color_map_func : Callable, Optional
@@ -328,17 +331,29 @@ class LLMInformationExtractionDocument:
328
331
  entities, relations = self._viz_preprocess()
329
332
  from ie_viz import serve
330
333
 
331
- serve(text=self.text,
332
- entities=entities,
333
- relations=relations,
334
- host=host,
335
- port=port,
336
- theme=theme,
337
- color_attr_key=color_attr_key,
338
- color_map_func=color_map_func)
339
-
334
+ try:
335
+ serve(text=self.text,
336
+ entities=entities,
337
+ relations=relations,
338
+ host=host,
339
+ port=port,
340
+ theme=theme,
341
+ title=title,
342
+ color_attr_key=color_attr_key,
343
+ color_map_func=color_map_func)
344
+ except TypeError:
345
+ warnings.warn("The version of ie_viz is not the latest. Please update to the latest version (pip install --upgrade ie-viz) for complete features.", UserWarning)
346
+ serve(text=self.text,
347
+ entities=entities,
348
+ relations=relations,
349
+ host=host,
350
+ port=port,
351
+ theme=theme,
352
+ color_attr_key=color_attr_key,
353
+ color_map_func=color_map_func)
340
354
 
341
- def viz_render(self, theme:str = "light", color_attr_key:str=None, color_map_func:Callable=None) -> str:
355
+ def viz_render(self, theme:str = "light", color_attr_key:str=None, color_map_func:Callable=None,
356
+ title:str="Frames Visualization") -> str:
342
357
  """
343
358
  This method renders visualization html of the document.
344
359
 
@@ -351,13 +366,25 @@ class LLMInformationExtractionDocument:
351
366
  color_map_func : Callable, Optional
352
367
  The function to be used for mapping the entity attributes to colors. When provided, the color_attr_key and
353
368
  theme will be overwritten. The function must take an entity dictionary as input and return a color string (hex).
369
+ title : str, Optional
370
+ The title of the HTML.
354
371
  """
355
372
  entities, relations = self._viz_preprocess()
356
373
  from ie_viz import render
357
374
 
358
- return render(text=self.text,
359
- entities=entities,
360
- relations=relations,
361
- theme=theme,
362
- color_attr_key=color_attr_key,
363
- color_map_func=color_map_func)
375
+ try:
376
+ return render(text=self.text,
377
+ entities=entities,
378
+ relations=relations,
379
+ theme=theme,
380
+ title=title,
381
+ color_attr_key=color_attr_key,
382
+ color_map_func=color_map_func)
383
+ except TypeError:
384
+ warnings.warn("The version of ie_viz is not the latest. Please update to the latest version (pip install --upgrade ie-viz) for complete features.", UserWarning)
385
+ return render(text=self.text,
386
+ entities=entities,
387
+ relations=relations,
388
+ theme=theme,
389
+ color_attr_key=color_attr_key,
390
+ color_map_func=color_map_func)
llm_ie/extractors.py CHANGED
@@ -59,7 +59,7 @@ class Extractor:
59
59
  text_content : Union[str, Dict[str,str]]
60
60
  the input text content to put in prompt template.
61
61
  If str, the prompt template must have only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
62
- If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
62
+ If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}. All values must be str.
63
63
 
64
64
  Returns : str
65
65
  a user prompt.
@@ -73,6 +73,10 @@ class Extractor:
73
73
  prompt = pattern.sub(text, self.prompt_template)
74
74
 
75
75
  elif isinstance(text_content, dict):
76
+ # Check if all values are str
77
+ if not all([isinstance(v, str) for v in text_content.values()]):
78
+ raise ValueError("All values in text_content must be str.")
79
+ # Check if all keys are in the prompt template
76
80
  placeholders = pattern.findall(self.prompt_template)
77
81
  if len(placeholders) != len(text_content):
78
82
  raise ValueError(f"Expect text_content ({len(text_content)}) and prompt template placeholder ({len(placeholders)}) to have equal size.")
@@ -422,6 +426,13 @@ class BasicFrameExtractor(FrameExtractor):
422
426
  Return : List[LLMInformationExtractionFrame]
423
427
  a list of frames.
424
428
  """
429
+ if isinstance(text_content, str):
430
+ text = text_content
431
+ elif isinstance(text_content, dict):
432
+ if document_key is None:
433
+ raise ValueError("document_key must be provided when text_content is dict.")
434
+ text = text_content[document_key]
435
+
425
436
  frame_list = []
426
437
  gen_text = self.extract(text_content=text_content,
427
438
  max_new_tokens=max_new_tokens,
@@ -435,11 +446,6 @@ class BasicFrameExtractor(FrameExtractor):
435
446
  entity_json.append(entity)
436
447
  else:
437
448
  warnings.warn(f'Extractor output "{entity}" does not have entity_key ("{entity_key}"). This frame will be dropped.', RuntimeWarning)
438
-
439
- if isinstance(text_content, str):
440
- text = text_content
441
- elif isinstance(text_content, dict):
442
- text = text_content[document_key]
443
449
 
444
450
  spans = self._find_entity_spans(text=text,
445
451
  entities=[e[entity_key] for e in entity_json],
@@ -645,6 +651,8 @@ class SentenceFrameExtractor(FrameExtractor):
645
651
  if isinstance(text_content, str):
646
652
  sentences = self._get_sentences(text_content)
647
653
  elif isinstance(text_content, dict):
654
+ if document_key is None:
655
+ raise ValueError("document_key must be provided when text_content is dict.")
648
656
  sentences = self._get_sentences(text_content[document_key])
649
657
  # construct chat messages
650
658
  messages = []
@@ -715,6 +723,8 @@ class SentenceFrameExtractor(FrameExtractor):
715
723
  if isinstance(text_content, str):
716
724
  sentences = self._get_sentences(text_content)
717
725
  elif isinstance(text_content, dict):
726
+ if document_key is None:
727
+ raise ValueError("document_key must be provided when text_content is dict.")
718
728
  sentences = self._get_sentences(text_content[document_key])
719
729
  # construct chat messages
720
730
  base_messages = []
@@ -933,6 +943,8 @@ class SentenceReviewFrameExtractor(SentenceFrameExtractor):
933
943
  if isinstance(text_content, str):
934
944
  sentences = self._get_sentences(text_content)
935
945
  elif isinstance(text_content, dict):
946
+ if document_key is None:
947
+ raise ValueError("document_key must be provided when text_content is dict.")
936
948
  sentences = self._get_sentences(text_content[document_key])
937
949
  # construct chat messages
938
950
  messages = []
@@ -1025,6 +1037,8 @@ class SentenceReviewFrameExtractor(SentenceFrameExtractor):
1025
1037
  if isinstance(text_content, str):
1026
1038
  sentences = self._get_sentences(text_content)
1027
1039
  elif isinstance(text_content, dict):
1040
+ if document_key is None:
1041
+ raise ValueError("document_key must be provided when text_content is dict.")
1028
1042
  sentences = self._get_sentences(text_content[document_key])
1029
1043
  # construct chat messages
1030
1044
  base_messages = []
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: llm-ie
3
- Version: 0.4.0
3
+ Version: 0.4.2
4
4
  Summary: An LLM-powered tool that transforms everyday language into robust information extraction pipelines.
5
5
  License: MIT
6
6
  Author: Enshuo (David) Hsu
@@ -35,9 +35,10 @@ An LLM-powered tool that transforms everyday language into robust information ex
35
35
  - [v0.3.1](https://github.com/daviden1013/llm-ie/releases/tag/v0.3.1) (Oct 26, 2024): Added Sentence Review Frame Extractor and Sentence CoT Frame Extractor
36
36
  - [v0.3.4](https://github.com/daviden1013/llm-ie/releases/tag/v0.3.4) (Nov 24, 2024): Added entity fuzzy search.
37
37
  - [v0.3.5](https://github.com/daviden1013/llm-ie/releases/tag/v0.3.5) (Nov 27, 2024): Adopted `json_repair` to fix broken JSON from LLM outputs.
38
- - v0.4.0:
38
+ - [v0.4.0](https://github.com/daviden1013/llm-ie/releases/tag/v0.4.0) (Jan 4, 2025):
39
39
  - Concurrent LLM inferencing to speed up frame and relation extraction.
40
40
  - Support for LiteLLM.
41
+ - [v0.4.1](https://github.com/daviden1013/llm-ie/releases/tag/v0.4.1) (Jan 25, 2025): Added filters, table view, and some new features to visualization tool (make sure to update [ie-viz](https://github.com/daviden1013/ie-viz)).
41
42
 
42
43
  ## Table of Contents
43
44
  - [Overview](#overview)
@@ -62,7 +63,7 @@ LLM-IE is a toolkit that provides robust information extraction utilities for na
62
63
  <div align="center"><img src="doc_asset/readme_img/LLM-IE flowchart.png" width=800 ></div>
63
64
 
64
65
  ## Prerequisite
65
- At least one LLM inference engine is required. There are built-in supports for 🚅 [LiteLLM](https://github.com/BerriAI/litellm), 🦙 [Llama-cpp-python](https://github.com/abetlen/llama-cpp-python), <img src="doc_asset/readme_img/ollama_icon_small.png" alt="Icon" width="18"/> [Ollama](https://github.com/ollama/ollama), 🤗 [Huggingface_hub](https://github.com/huggingface/huggingface_hub), <img src=doc_asset/readme_img/openai-logomark.png width=16 /> [OpenAI API](https://platform.openai.com/docs/api-reference/introduction), and <img src=doc_asset/readme_img/vllm-logo_small.png width=20 /> [vLLM](https://github.com/vllm-project/vllm). For installation guides, please refer to those projects. Other inference engines can be configured through the [InferenceEngine](src/llm_ie/engines.py) abstract class. See [LLM Inference Engine](#llm-inference-engine) section below.
66
+ At least one LLM inference engine is required. There is built-in support for 🚅 [LiteLLM](https://github.com/BerriAI/litellm), 🦙 [Llama-cpp-python](https://github.com/abetlen/llama-cpp-python), <img src="doc_asset/readme_img/ollama_icon.png" alt="Icon" width="22"/> [Ollama](https://github.com/ollama/ollama), 🤗 [Huggingface_hub](https://github.com/huggingface/huggingface_hub), <img src=doc_asset/readme_img/openai-logomark_white.png width=16 /> [OpenAI API](https://platform.openai.com/docs/api-reference/introduction), and <img src=doc_asset/readme_img/vllm-logo_small.png width=20 /> [vLLM](https://github.com/vllm-project/vllm). For installation guides, please refer to those projects. Other inference engines can be configured through the [InferenceEngine](src/llm_ie/engines.py) abstract class. See [LLM Inference Engine](#llm-inference-engine) section below.
66
67
 
67
68
  ## Installation
68
69
  The Python package is available on PyPI.
@@ -88,7 +89,7 @@ inference_engine = LiteLLMInferenceEngine(model="openai/Llama-3.3-70B-Instruct",
88
89
  </details>
89
90
 
90
91
  <details>
91
- <summary><img src=doc_asset/readme_img/openai-logomark.png width=16 /> OpenAI API</summary>
92
+ <summary><img src=doc_asset/readme_img/openai-logomark_white.png width=16 /> OpenAI API</summary>
92
93
 
93
94
  Follow the [Best Practices for API Key Safety](https://help.openai.com/en/articles/5112595-best-practices-for-api-key-safety) to set up API key.
94
95
  ```python
@@ -109,7 +110,7 @@ inference_engine = HuggingFaceHubInferenceEngine(model="meta-llama/Meta-Llama-3-
109
110
  </details>
110
111
 
111
112
  <details>
112
- <summary><img src="doc_asset/readme_img/ollama_icon_small.png" alt="Icon" width="18"/> Ollama</summary>
113
+ <summary><img src="doc_asset/readme_img/ollama_icon.png" alt="Icon" width="22"/> Ollama</summary>
113
114
 
114
115
  ```python
115
116
  from llm_ie.engines import OllamaInferenceEngine
@@ -157,12 +158,12 @@ We start with a casual description:
157
158
 
158
159
  Define the AI prompt editor.
159
160
  ```python
160
- from llm_ie import OllamaInferenceEngine, PromptEditor, BasicFrameExtractor
161
+ from llm_ie import OllamaInferenceEngine, PromptEditor, SentenceFrameExtractor
161
162
 
162
163
  # Define a LLM inference engine
163
164
  inference_engine = OllamaInferenceEngine(model_name="llama3.1:8b-instruct-q8_0")
164
165
  # Define LLM prompt editor
165
- editor = PromptEditor(inference_engine, BasicFrameExtractor)
166
+ editor = PromptEditor(inference_engine, SentenceFrameExtractor)
166
167
  # Start chat
167
168
  editor.chat()
168
169
  ```
@@ -171,7 +172,7 @@ This opens an interactive session:
171
172
  <div align="left"><img src=doc_asset/readme_img/terminal_chat.PNG width=1000 ></div>
172
173
 
173
174
 
174
- The ```PromptEditor``` drafts a prompt template following the schema required by the ```BasicFrameExtractor```:
175
+ The ```PromptEditor``` drafts a prompt template following the schema required by the ```SentenceFrameExtractor```:
175
176
 
176
177
  ```
177
178
  # Task description
@@ -209,10 +210,13 @@ with open("./demo/document/synthesized_note.txt", 'r') as f:
209
210
  note_text = f.read()
210
211
 
211
212
  # Define extractor
212
- extractor = BasicFrameExtractor(inference_engine, prompt_template)
213
+ extractor = SentenceFrameExtractor(inference_engine, prompt_template)
213
214
 
214
215
  # Extract
215
- frames = extractor.extract_frames(note_text, entity_key="Diagnosis", stream=True)
216
+ # To stream the extraction process, use concurrent=False, stream=True:
217
+ frames = extractor.extract_frames(note_text, entity_key="Diagnosis", concurrent=False, stream=True)
218
+ # For faster extraction, use concurrent=True to enable asynchronous prompting
219
+ frames = extractor.extract_frames(note_text, entity_key="Diagnosis", concurrent=True)
216
220
 
217
221
  # Check extractions
218
222
  for frame in frames:
@@ -221,10 +225,17 @@ for frame in frames:
221
225
  The output is a list of frames. Each frame has an ```entity_text```, ```start```, ```end```, and a dictionary of ```attr```.
222
226
 
223
227
  ```python
224
- {'frame_id': '0', 'start': 537, 'end': 549, 'entity_text': 'Hypertension', 'attr': {'Datetime': '2010', 'Status': 'history'}}
225
- {'frame_id': '1', 'start': 551, 'end': 565, 'entity_text': 'Hyperlipidemia', 'attr': {'Datetime': '2015', 'Status': 'history'}}
226
- {'frame_id': '2', 'start': 571, 'end': 595, 'entity_text': 'Type 2 Diabetes Mellitus', 'attr': {'Datetime': '2018', 'Status': 'history'}}
227
- {'frame_id': '3', 'start': 2402, 'end': 2431, 'entity_text': 'Acute Coronary Syndrome (ACS)', 'attr': {'Datetime': 'July 20, 2024', 'Status': 'present'}}
228
+ {'frame_id': '0', 'start': 537, 'end': 549, 'entity_text': 'hypertension', 'attr': {'Date': '2010-01-01', 'Status': 'Active'}}
229
+ {'frame_id': '1', 'start': 551, 'end': 565, 'entity_text': 'hyperlipidemia', 'attr': {'Date': '2015-01-01', 'Status': 'Active'}}
230
+ {'frame_id': '2', 'start': 571, 'end': 595, 'entity_text': 'Type 2 diabetes mellitus', 'attr': {'Date': '2018-01-01', 'Status': 'Active'}}
231
+ {'frame_id': '3', 'start': 660, 'end': 670, 'entity_text': 'chest pain', 'attr': {'Date': 'July 18, 2024'}}
232
+ {'frame_id': '4', 'start': 991, 'end': 1003, 'entity_text': 'Hypertension', 'attr': {'Date': '2010-01-01'}}
233
+ {'frame_id': '5', 'start': 1026, 'end': 1040, 'entity_text': 'Hyperlipidemia', 'attr': {'Date': '2015-01-01'}}
234
+ {'frame_id': '6', 'start': 1063, 'end': 1087, 'entity_text': 'Type 2 Diabetes Mellitus', 'attr': {'Date': '2018-01-01'}}
235
+ {'frame_id': '7', 'start': 1926, 'end': 1947, 'entity_text': 'ST-segment depression', 'attr': None}
236
+ {'frame_id': '8', 'start': 2049, 'end': 2066, 'entity_text': 'acute infiltrates', 'attr': None}
237
+ {'frame_id': '9', 'start': 2117, 'end': 2150, 'entity_text': 'Mild left ventricular hypertrophy', 'attr': None}
238
+ {'frame_id': '10', 'start': 2402, 'end': 2425, 'entity_text': 'acute coronary syndrome', 'attr': {'Date': 'July 20, 2024', 'Status': 'Active'}}
228
239
  ```
229
240
 
230
241
  We can save the frames to a document object for better management. The document holds ```text``` and ```frames```. The ```add_frame()``` method performs validation and (if passed) adds a frame to the document.
@@ -298,7 +309,7 @@ inference_engine = LiteLLMInferenceEngine(model="openai/Llama-3.1-8B-Instruct",
298
309
  inference_engine = LiteLLMInferenceEngine(model="ollama/llama3.1:8b-instruct-q8_0")
299
310
  ```
300
311
 
301
- #### <img src=doc_asset/readme_img/openai-logomark.png width=16 /> OpenAI API
312
+ #### <img src=doc_asset/readme_img/openai-logomark_white.png width=16 /> OpenAI API
302
313
  In bash, save API key to the environmental variable ```OPENAI_API_KEY```.
303
314
  ```
304
315
  export OPENAI_API_KEY=<your_API_key>
@@ -322,7 +333,7 @@ from llm_ie.engines import HuggingFaceHubInferenceEngine
322
333
  inference_engine = HuggingFaceHubInferenceEngine(model="meta-llama/Meta-Llama-3-8B-Instruct")
323
334
  ```
324
335
 
325
- #### <img src="doc_asset/readme_img/ollama_icon_small.png" alt="Icon" width="18"/> Ollama
336
+ #### <img src="doc_asset/readme_img/ollama_icon.png" alt="Icon" width="22"/> Ollama
326
337
  The ```model_name``` must match the names on the [Ollama library](https://ollama.com/library). Use the command line ```ollama ls``` to check your local model list. ```num_ctx``` determines the context length LLM will consider during text generation. Empirically, longer context length gives better performance, while consuming more memory and increases computation. ```keep_alive``` regulates the lifespan of LLM. It indicates a number of seconds to hold the LLM after the last API call. Default is 5 minutes (300 sec).
327
338
 
328
339
  ```python
@@ -1073,6 +1084,9 @@ relations = extractor.extract_relations(doc, concurrent=False, stream=True)
1073
1084
  </details>
1074
1085
 
1075
1086
  ### Visualization
1087
+
1088
+ <div align="center"><img src="doc_asset/readme_img/visualization.PNG" width=95% ></div>
1089
+
1076
1090
  The `LLMInformationExtractionDocument` class supports named entity, entity attributes, and relation visualization. The implementation is through our plug-in package [ie-viz](https://github.com/daviden1013/ie-viz). Check the example Jupyter Notebook [NER + RE for Drug, Strength, Frequency](demo/medication_relation_extraction.ipynb) for a working demo.
1077
1091
 
1078
1092
  ```cmd
@@ -14,10 +14,10 @@ llm_ie/asset/prompt_guide/ReviewFrameExtractor_prompt_guide.txt,sha256=m7iX4Qjsf
14
14
  llm_ie/asset/prompt_guide/SentenceCoTFrameExtractor_prompt_guide.txt,sha256=T4NsO33s3KSJml-klzXAJiYox0kiuxGo-ou2a2Ig2SY,14225
15
15
  llm_ie/asset/prompt_guide/SentenceFrameExtractor_prompt_guide.txt,sha256=oKH_QeDgpw771ZdHk3L7DYz2Jvfm7OolUoTiJyMJI30,9541
16
16
  llm_ie/asset/prompt_guide/SentenceReviewFrameExtractor_prompt_guide.txt,sha256=oKH_QeDgpw771ZdHk3L7DYz2Jvfm7OolUoTiJyMJI30,9541
17
- llm_ie/data_types.py,sha256=hPz3WOeAzfn2QKmb0CxHmRdQWZQ4G9zq8U-RJBVFdYk,14329
17
+ llm_ie/data_types.py,sha256=_Kt4Er1SMj1jg8U8TCXFJH_64prur-IbFngHKmZgWr8,15717
18
18
  llm_ie/engines.py,sha256=lz2HODoqlndgezdT76diXKN_wgb7mjl6hX3JuCwsH-g,15191
19
- llm_ie/extractors.py,sha256=CpEuSqzlYd3u8Qwiu7Qdd26iII2pci1nNKxGz8sv1ZU,84506
19
+ llm_ie/extractors.py,sha256=ueSt8jBKLnqOxu8FuqyYqEERugzd6FsI0r-pY8EboHw,85426
20
20
  llm_ie/prompt_editor.py,sha256=pw_FOsEeWxFJ1p5lYR93cTNMqKQ-YZHzgBmRbPm7aNE,9486
21
- llm_ie-0.4.0.dist-info/METADATA,sha256=o721Obb1copeoFz34bz_B7am2i1Vi1xMpK5QkWn4R6A,51119
22
- llm_ie-0.4.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
23
- llm_ie-0.4.0.dist-info/RECORD,,
21
+ llm_ie-0.4.2.dist-info/METADATA,sha256=DASy47RtSsT1d7s3nzncjUHLIyJEPV8rTrqr1jRTFnY,52527
22
+ llm_ie-0.4.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
23
+ llm_ie-0.4.2.dist-info/RECORD,,
File without changes