llm-ie 1.2.3__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
llm_ie/__init__.py CHANGED
@@ -1,12 +1,12 @@
1
1
  from .data_types import LLMInformationExtractionFrame, LLMInformationExtractionDocument
2
2
  from .engines import BasicLLMConfig, ReasoningLLMConfig, Qwen3LLMConfig, OpenAIReasoningLLMConfig
3
- from .engines import LlamaCppInferenceEngine, OllamaInferenceEngine, HuggingFaceHubInferenceEngine, VLLMInferenceEngine, OpenRouterInferenceEngine, OpenAIInferenceEngine, AzureOpenAIInferenceEngine, LiteLLMInferenceEngine
4
- from .extractors import DirectFrameExtractor, ReviewFrameExtractor, BasicFrameExtractor, BasicReviewFrameExtractor, SentenceFrameExtractor, SentenceReviewFrameExtractor, AttributeExtractor, BinaryRelationExtractor, MultiClassRelationExtractor
5
- from .chunkers import UnitChunker, WholeDocumentUnitChunker, SentenceUnitChunker, TextLineUnitChunker, SeparatorUnitChunker, ContextChunker, NoContextChunker, WholeDocumentContextChunker, SlideWindowContextChunker
3
+ from .engines import LlamaCppInferenceEngine, OllamaInferenceEngine, HuggingFaceHubInferenceEngine, VLLMInferenceEngine, SGLangInferenceEngine, OpenRouterInferenceEngine, OpenAIInferenceEngine, AzureOpenAIInferenceEngine, LiteLLMInferenceEngine
4
+ from .extractors import StructExtractor, BasicStructExtractor, DirectFrameExtractor, ReviewFrameExtractor, BasicFrameExtractor, BasicReviewFrameExtractor, SentenceFrameExtractor, SentenceReviewFrameExtractor, AttributeExtractor, BinaryRelationExtractor, MultiClassRelationExtractor
5
+ from .chunkers import UnitChunker, WholeDocumentUnitChunker, SentenceUnitChunker, TextLineUnitChunker, SeparatorUnitChunker, LLMUnitChunker, ContextChunker, NoContextChunker, WholeDocumentContextChunker, SlideWindowContextChunker
6
6
  from .prompt_editor import PromptEditor
7
7
 
8
8
  __all__ = ["LLMInformationExtractionFrame", "LLMInformationExtractionDocument",
9
- "BasicLLMConfig", "ReasoningLLMConfig", "Qwen3LLMConfig", "OpenAIReasoningLLMConfig", "LlamaCppInferenceEngine", "OllamaInferenceEngine", "HuggingFaceHubInferenceEngine", "VLLMInferenceEngine", "OpenRouterInferenceEngine", "OpenAIInferenceEngine", "AzureOpenAIInferenceEngine", "LiteLLMInferenceEngine",
10
- "DirectFrameExtractor", "ReviewFrameExtractor", "BasicFrameExtractor", "BasicReviewFrameExtractor", "SentenceFrameExtractor", "SentenceReviewFrameExtractor", "AttributeExtractor", "BinaryRelationExtractor", "MultiClassRelationExtractor",
11
- "UnitChunker", "WholeDocumentUnitChunker", "SentenceUnitChunker", "TextLineUnitChunker", "SeparatorUnitChunker", "ContextChunker", "NoContextChunker", "WholeDocumentContextChunker", "SlideWindowContextChunker",
9
+ "BasicLLMConfig", "ReasoningLLMConfig", "Qwen3LLMConfig", "OpenAIReasoningLLMConfig", "LlamaCppInferenceEngine", "OllamaInferenceEngine", "HuggingFaceHubInferenceEngine", "VLLMInferenceEngine", "SGLangInferenceEngine", "OpenRouterInferenceEngine", "OpenAIInferenceEngine", "AzureOpenAIInferenceEngine", "LiteLLMInferenceEngine",
10
+ "StructExtractor", "BasicStructExtractor", "DirectFrameExtractor", "ReviewFrameExtractor", "BasicFrameExtractor", "BasicReviewFrameExtractor", "SentenceFrameExtractor", "SentenceReviewFrameExtractor", "AttributeExtractor", "BinaryRelationExtractor", "MultiClassRelationExtractor",
11
+ "UnitChunker", "WholeDocumentUnitChunker", "SentenceUnitChunker", "TextLineUnitChunker", "SeparatorUnitChunker", "LLMUnitChunker", "ContextChunker", "NoContextChunker", "WholeDocumentContextChunker", "SlideWindowContextChunker",
12
12
  "PromptEditor"]
@@ -0,0 +1,129 @@
1
+ ### Task description
2
+ You are a helpful assistant that breaks down a text document into semantic units or chunks. Each chunk should represent a coherent section of the text, such as a paragraph, subsection, or topic.
3
+
4
+ ### Schema definition
5
+ You will output a JSON array of objects. Each object should have the following fields:
6
+ - "title": Generate a brief title summarizing the content of the chunk.
7
+ - "anchor_text": the first line of text in the chunk used to locate it in the original document. Must be an exact match.
8
+ - if there is a title or heading for the chunk, use that as the anchor_text.
9
+ - if there is no title or heading, use the first sentence of the chunk as the anchor_text.
10
+
11
+ ```JSON
12
+ [
13
+ {
14
+ "title": "<your title here>",
15
+ "anchor_text": "<the anchor text of the chunk here>"
16
+ },
17
+ {
18
+ "title": "<your title here>",
19
+ "anchor_text": "<the anchor text of the chunk here>"
20
+ }
21
+ ]
22
+ ```
23
+
24
+ ### Examples
25
+
26
+ **Input**:
27
+ "# Clinical Note
28
+
29
+ **Patient Name**: Michael Green
30
+ **Medical Record Number**: 1122334455
31
+ **Date of Visit**: January 5, 2025
32
+ **Provider**: Dr. Emily Carter, MD
33
+
34
+ ## Reason for Visit
35
+ Follow-up for poorly controlled type 2 diabetes and complaints of occasional dizziness and blurred vision.
36
+
37
+ ## Summary of Visit
38
+ Michael Green, a 62-year-old male with a known history of type 2 diabetes, hypertension, and obesity, presents for follow-up regarding his glycemic control. Despite recent adjustments to his treatment plan, his glucose readings have remained elevated, averaging 180-220 mg/dL. He reports occasional episodes of dizziness and blurred vision, particularly in the morning before meals. He denies chest pain, palpitations, or recent falls. He reports compliance with his medication regimen but admits to difficulty following a consistent low-carbohydrate diet.
39
+
40
+ Michael has been using a glucose meter to monitor his blood sugar levels and logs them daily. His last hemoglobin A1c, performed three months ago, was 9.2%. He reports no recent hospitalizations, infections, or significant stressors.
41
+
42
+ ## Notable History
43
+ - **Chronic Conditions**:
44
+ - Type 2 diabetes mellitus, diagnosed 10 years ago.
45
+ - Hypertension, well-controlled on medication.
46
+ - Hyperlipidemia, on statin therapy.
47
+ - **Past Surgical History**:
48
+ - Knee arthroscopy for a meniscal tear, age 50.
49
+ - **Family History**:
50
+ - Mother: Deceased at 75, complications from diabetes.
51
+ - Father: Deceased at 70, myocardial infarction."
52
+
53
+ **Output**:
54
+ ```JSON
55
+ [
56
+ {
57
+ "title": "Patient Information",
58
+ "anchor_text": "# Clinical Note"
59
+ },
60
+ {
61
+ "title": "Reason for Visit",
62
+ "anchor_text": "## Reason for Visit"
63
+ },
64
+ {
65
+ "title": "Summary of Visit",
66
+ "anchor_text": "## Summary of Visit"
67
+ },
68
+ {
69
+ "title": "Notable History",
70
+ "anchor_text": "## Notable History"
71
+ }
72
+ ]
73
+ ```
74
+
75
+ **Input**:
76
+ "In the [**Hospital1 18**] ED, 35.3 102 133/58 100%AC 500x20, 5, 1.0 with an
77
+ ABG 7.16/66/162. He had a CTH which was unremarkable. He then
78
+ had a CTA chest, afterwhich he went into PEA arrest.
79
+ Rescucitation last approximately 10-15 minutes with multiple
80
+ rounds of epi and bicarb, with ROSC. He was then admitted to the
81
+ MICU for further management.
82
+ .
83
+ Currently, the patient is intubated, sedated, and parlyzed.
84
+
85
+ Past Medical History:
86
+ Asthma
87
+ Dilated cardiomyopathy
88
+ Multiple admissions for dyspnea this winter ([**1-26**]).
89
+ Anxiety/depression
90
+ CKD
91
+ HLD
92
+ Obesity
93
+ HTN
94
+
95
+ Social History:
96
+ Unknown
97
+
98
+ Family History:
99
+ Unknown"
100
+
101
+ **Output**:
102
+ ```JSON
103
+ [
104
+ {
105
+ "title": "Patient Presentation and Initial Management",
106
+ "anchor_text": "In the [**Hospital1 18**] ED, 35.3 102 133/58 100%AC 500x20, 5, 1.0 with an"
107
+ },
108
+ {
109
+ "title": "Current Status of the Patient",
110
+ "anchor_text": "Currently, the patient is intubated, sedated, and parlyzed."
111
+ },
112
+ {
113
+ "title": "Past Medical History",
114
+ "anchor_text": "Past Medical History:"
115
+ },
116
+ {
117
+ "title": "Social History",
118
+ "anchor_text": "Social History:"
119
+ },
120
+ {
121
+ "title": "Family History",
122
+ "anchor_text": "Family History:"
123
+ }
124
+ ]
125
+ ```
126
+
127
+ ### Document text
128
+
129
+ "{{document_text}}"
@@ -7,7 +7,7 @@ Prompt Template Design:
7
7
  List the attributes to extract, and provide clear definitions for each one.
8
8
 
9
9
  3. Output Format Definition:
10
- The output should be a JSON list, where each attribute be a key. The values could be any structure (e.g., str, int, List[str]).
10
+ The output should be a JSON, where each attribute be a key. The values could be any structure (e.g., str, int, List[str]).
11
11
 
12
12
  4. Optional: Hints:
13
13
  Provide itemized hints for the information extractors to guide the extraction process. Remind the prompted agent to be truthful. Emphasize that the prompted agent is supposed to perform the task instead of writing code or instructing other agents to do it.
@@ -37,7 +37,7 @@ Example:
37
37
  Your output should follow the JSON format:
38
38
  {"Date": "<MM/DD/YYYY>", "Status": "<status>"}
39
39
 
40
- I am only interested in the content between []. Do not explain your answer.
40
+ I am only interested in the content between {}. Do not explain your answer.
41
41
 
42
42
  ### Hints
43
43
  - If the date is not complete, use the first available date in the context. For example, if the date is 01/2023, you should return 01/01/2023.
@@ -0,0 +1,53 @@
1
+ Prompt Template Design:
2
+
3
+ 1. Task Description:
4
+ Provide a detailed description of the task, including the background and the type of task (e.g., structured data extraction task).
5
+
6
+ 2. Schema Definition:
7
+ List the key-value pairs to extract, and provide clear definitions for each one.
8
+
9
+ 3. Output Format Definition:
10
+ The output should be a JSON. The values could be any structure (e.g., str, int, List[str]).
11
+
12
+ 4. Optional: Hints:
13
+ Provide itemized hints for the information extractors to guide the extraction process. Remind the prompted agent to be truthful. Emphasize that the prompted agent is supposed to perform the task instead of writing code or instructing other agents to do it.
14
+
15
+ 5. Optional: Examples:
16
+ Include examples in the format:
17
+ Input: ...
18
+ Output: ...
19
+
20
+ 6. Context:
21
+ The template must include a placeholder {{input}} for the document or chunk.
22
+
23
+
24
+ Example:
25
+
26
+ ### Task description
27
+ This is a structured data extraction task. Given a medical report, you need to extract structured patient data from it.
28
+
29
+ ### Schema definition
30
+ "PatientName" which is the name of the patient,
31
+ "Age" which is the age of the patient in years,
32
+ "MRN" which is the medical record number of the patient.
33
+
34
+ ### Output format definition
35
+ Your output should follow the JSON format:
36
+ ```json
37
+ {
38
+ "PatientName": "<patient_name>",
39
+ "Age": <age_in_years>,
40
+ "MRN": "<medical_record_number>"
41
+ }
42
+ ```
43
+ I am only interested in the content between {}. Do not explain your answer.
44
+
45
+ ### Hints
46
+ - Make sure to extract the exact patient name as it appears in the report.
47
+ - You are suppose to perform the extraction task instead of writting code or instruct other agents to do it.
48
+ - If some values are not available, you should return "not specified".
49
+
50
+ ### Context
51
+ The text below is from the medical report:
52
+
53
+ "{{input}}"
llm_ie/chunkers.py CHANGED
@@ -1,8 +1,11 @@
1
1
  import abc
2
- from typing import Set, List, Dict, Tuple, Union, Callable
2
+ from typing import List
3
3
  import asyncio
4
4
  import uuid
5
+ import importlib.resources
6
+ from llm_ie.utils import extract_json, apply_prompt_template
5
7
  from llm_ie.data_types import FrameExtractionUnit
8
+ from llm_ie.engines import InferenceEngine
6
9
 
7
10
 
8
11
  class UnitChunker(abc.ABC):
@@ -74,13 +77,14 @@ class SeparatorUnitChunker(UnitChunker):
74
77
  text : str
75
78
  The document text.
76
79
  """
80
+ doc_id = doc_id if doc_id is not None else str(uuid.uuid4())
77
81
  paragraphs = text.split(self.sep)
78
82
  paragraph_units = []
79
83
  start = 0
80
84
  for paragraph in paragraphs:
81
85
  end = start + len(paragraph)
82
86
  paragraph_units.append(FrameExtractionUnit(
83
- doc_id=doc_id if doc_id is not None else str(uuid.uuid4()),
87
+ doc_id=doc_id,
84
88
  start=start,
85
89
  end=end,
86
90
  text=paragraph
@@ -104,10 +108,11 @@ class SentenceUnitChunker(UnitChunker):
104
108
  text : str
105
109
  The document text.
106
110
  """
111
+ doc_id = doc_id if doc_id is not None else str(uuid.uuid4())
107
112
  sentences = []
108
113
  for start, end in self.PunktSentenceTokenizer().span_tokenize(text):
109
114
  sentences.append(FrameExtractionUnit(
110
- doc_id=doc_id if doc_id is not None else str(uuid.uuid4()),
115
+ doc_id=doc_id,
111
116
  start=start,
112
117
  end=end,
113
118
  text=text[start:end]
@@ -129,13 +134,14 @@ class TextLineUnitChunker(UnitChunker):
129
134
  text : str
130
135
  The document text.
131
136
  """
137
+ doc_id = doc_id if doc_id is not None else str(uuid.uuid4())
132
138
  lines = text.split('\n')
133
139
  line_units = []
134
140
  start = 0
135
141
  for line in lines:
136
142
  end = start + len(line)
137
143
  line_units.append(FrameExtractionUnit(
138
- doc_id=doc_id if doc_id is not None else str(uuid.uuid4()),
144
+ doc_id=doc_id,
139
145
  start=start,
140
146
  end=end,
141
147
  text=line
@@ -143,6 +149,100 @@ class TextLineUnitChunker(UnitChunker):
143
149
  start = end + 1
144
150
  return line_units
145
151
 
152
+ class LLMUnitChunker(UnitChunker):
153
+ def __init__(self, inference_engine:InferenceEngine, prompt_template:str=None, system_prompt:str=None):
154
+ """
155
+ This class prompts an LLM for document segmentation (e.g., sections, paragraphs).
156
+
157
+ Parameters:
158
+ ----------
159
+ inference_engine : InferenceEngine
160
+ the LLM inferencing engine object.
161
+ prompt_template : str
162
+ the prompt template that defines how to chunk the document. Must define a JSON schema with
163
+ ```json
164
+ [
165
+ {
166
+ "title": "<your title here>",
167
+ "anchor_text": "<the anchor text of the chunk here>"
168
+ },
169
+ {
170
+ "title": "<your title here>",
171
+ "anchor_text": "<the anchor text of the chunk here>"
172
+ }
173
+ ]
174
+ ```
175
+ system_prompt : str, optional
176
+ The system prompt.
177
+ """
178
+ self.inference_engine = inference_engine
179
+
180
+ if prompt_template is None:
181
+ file_path = importlib.resources.files('llm_ie.asset.default_prompts').joinpath("LLMUnitChunker_user_prompt.txt")
182
+ with open(file_path, 'r', encoding="utf-8") as f:
183
+ self.prompt_template = f.read()
184
+ else:
185
+ self.prompt_template = prompt_template
186
+
187
+ self.system_prompt = system_prompt
188
+
189
+ def chunk(self, text, doc_id=None) -> List[FrameExtractionUnit]:
190
+ """
191
+ Parameters:
192
+ -----------
193
+ text : str
194
+ the document text.
195
+ doc_id : str, optional
196
+ the document id.
197
+ """
198
+ doc_id = doc_id if doc_id is not None else str(uuid.uuid4())
199
+ user_prompt = apply_prompt_template(prompt_template=self.prompt_template, text_content=text)
200
+ messages = []
201
+ if self.system_prompt is not None:
202
+ messages.append({'role': 'system', 'content': self.system_prompt})
203
+ messages.append({'role': 'user', 'content': user_prompt})
204
+
205
+ gen_text = self.inference_engine.chat(messages=messages)
206
+
207
+ header_list = extract_json(gen_text=gen_text["response"])
208
+ units = []
209
+ start = 0
210
+ prev_end = 0
211
+ for header in header_list:
212
+ if "anchor_text" not in header:
213
+ Warning.warn(f"Missing anchor_text in header: {header}. Skipping this header.")
214
+ continue
215
+ if not isinstance(header["anchor_text"], str):
216
+ Warning.warn(f"Invalid anchor_text: {header['anchor_text']}. Skipping this header.")
217
+ continue
218
+
219
+ start = prev_end
220
+ # find the first instance of the anchor text in the rest of the text
221
+ end = text.find(header["anchor_text"], start)
222
+ # if not found, skip this header
223
+ if end == -1:
224
+ continue
225
+ # if start == end (empty text), skip this header
226
+ if start == end:
227
+ continue
228
+ # create a frame extraction unit
229
+ units.append(FrameExtractionUnit(
230
+ doc_id=doc_id,
231
+ start=start,
232
+ end=end,
233
+ text=text[start:end]
234
+ ))
235
+ prev_end = end
236
+ # add the last section
237
+ if prev_end < len(text):
238
+ units.append(FrameExtractionUnit(
239
+ doc_id=doc_id,
240
+ start=prev_end,
241
+ end=len(text),
242
+ text=text[prev_end:]
243
+ ))
244
+ return units
245
+
146
246
 
147
247
  class ContextChunker(abc.ABC):
148
248
  def __init__(self):
llm_ie/data_types.py CHANGED
@@ -141,7 +141,7 @@ class LLMInformationExtractionFrame:
141
141
 
142
142
 
143
143
  class LLMInformationExtractionDocument:
144
- def __init__(self, doc_id:str=None, filename:str=None, text:str=None,
144
+ def __init__(self, doc_id:str=None, filename:str=None, text:str=None, struct:Dict=None,
145
145
  frames:List[LLMInformationExtractionFrame]=None, relations:List[Dict[str,str]]=None):
146
146
  """
147
147
  This class holds LLM-extracted frames, handles save/ load.
@@ -154,6 +154,8 @@ class LLMInformationExtractionDocument:
154
154
  the directory to a yaml file of a saved LLMInformationExtractionDocument
155
155
  text : str, Optional
156
156
  document text
157
+ struct : Dict, Optional
158
+ a dictionary of unanchored structure information
157
159
  frames : List[LLMInformationExtractionFrame], Optional
158
160
  a list of LLMInformationExtractionFrame
159
161
  relations : List[Dict[str,str]], Optional
@@ -168,12 +170,28 @@ class LLMInformationExtractionDocument:
168
170
  llm_ie = json.load(json_file)
169
171
  if 'doc_id' in llm_ie.keys():
170
172
  self.doc_id = llm_ie['doc_id']
173
+ else:
174
+ raise ValueError("doc_id key not found in the file.")
175
+
171
176
  if 'text' in llm_ie.keys():
172
177
  self.text = llm_ie['text']
178
+ else:
179
+ raise ValueError("text key not found in the file.")
180
+
181
+ if 'struct' in llm_ie.keys():
182
+ self.struct = llm_ie['struct']
183
+ else:
184
+ self.struct = {}
185
+
173
186
  if 'frames' in llm_ie.keys():
174
187
  self.frames = [LLMInformationExtractionFrame.from_dict(d) for d in llm_ie['frames']]
188
+ else:
189
+ self.frames = []
190
+
175
191
  if 'relations' in llm_ie.keys():
176
192
  self.relations = llm_ie['relations']
193
+ else:
194
+ self.relations = []
177
195
 
178
196
  # create object from raw inputs
179
197
  else:
@@ -181,9 +199,15 @@ class LLMInformationExtractionDocument:
181
199
  raise TypeError("doc_id must be a string.")
182
200
  self.doc_id = doc_id
183
201
  self.text = text
202
+ self.struct = struct.copy() if struct is not None else {}
184
203
  self.frames = frames.copy() if frames is not None else []
185
204
  self.relations = relations.copy() if relations is not None else []
186
205
 
206
+ def has_struct(self) -> bool:
207
+ """
208
+ This method checks if there is any unanchored structure information.
209
+ """
210
+ return bool(self.struct)
187
211
 
188
212
  def has_frame(self) -> bool:
189
213
  """
@@ -228,6 +252,18 @@ class LLMInformationExtractionDocument:
228
252
 
229
253
  return None
230
254
 
255
+ def set_struct(self, struct:Dict):
256
+ """
257
+ This method sets the unanchored structure information.
258
+
259
+ Parameters
260
+ ----------
261
+ struct : Dict
262
+ a dictionary of unanchored structure information
263
+ """
264
+ if not isinstance(struct, Dict):
265
+ raise TypeError("struct must be a dictionary.")
266
+ self.struct = struct.copy()
231
267
 
232
268
  def add_frame(self, frame:LLMInformationExtractionFrame, valid_mode:str=None, create_id:bool=False) -> bool:
233
269
  """
@@ -326,10 +362,12 @@ class LLMInformationExtractionDocument:
326
362
 
327
363
  def __repr__(self, N_top_chars:int=100) -> str:
328
364
  text_to_print = self.text[0:N_top_chars]
365
+ struct_key_count = len(self.struct.keys())
329
366
  frame_count = len(self.frames)
330
367
  relation_count = len(self.relations)
331
368
  return ''.join((f'LLMInformationExtractionDocument(doc_id: "{self.doc_id}"\n',
332
369
  f'text: "{text_to_print}...",\n',
370
+ f'struct keys: {struct_key_count}\n',
333
371
  f'frames: {frame_count}\n',
334
372
  f'relations: {relation_count}'))
335
373
 
@@ -338,6 +376,7 @@ class LLMInformationExtractionDocument:
338
376
  with open(filename, 'w') as json_file:
339
377
  json.dump({'doc_id': self.doc_id,
340
378
  'text': self.text,
379
+ 'struct': self.struct,
341
380
  'frames': [frame.to_dict() for frame in self.frames],
342
381
  'relations': self.relations},
343
382
  json_file, indent=4)
@@ -346,16 +385,22 @@ class LLMInformationExtractionDocument:
346
385
 
347
386
  def _viz_preprocess(self) -> Tuple:
348
387
  """
349
- This method preprocesses the entities and relations for visualization.
388
+ This method preprocesses the struct, entities and relations for visualization.
350
389
  """
351
390
  if importlib.util.find_spec("ie_viz") is None:
352
- raise ImportError("ie_viz not found. Please install ie_viz (```pip install ie-viz```).")
391
+ raise ImportError("ie_viz not found. Please install ie_viz (```pip install -U ie-viz```).")
353
392
 
393
+ # Struct
394
+ if self.has_struct():
395
+ struct = self.struct
396
+ else:
397
+ struct = {}
398
+ # Entities
354
399
  if self.has_frame():
355
400
  entities = [{"entity_id": frame.frame_id, "start": frame.start, "end": frame.end, "attr": frame.attr} for frame in self.frames]
356
401
  else:
357
- raise ValueError("No frames in the document.")
358
-
402
+ entities = None
403
+ # Relations
359
404
  if self.has_relation():
360
405
  relations = []
361
406
  for relation in self.relations:
@@ -364,7 +409,7 @@ class LLMInformationExtractionDocument:
364
409
  else:
365
410
  relations = None
366
411
 
367
- return entities, relations
412
+ return struct, entities, relations
368
413
 
369
414
 
370
415
  def viz_serve(self, host: str = '0.0.0.0', port: int = 5000, theme:str = "light", title:str="Frames Visualization",
@@ -388,29 +433,20 @@ class LLMInformationExtractionDocument:
388
433
  The function to be used for mapping the entity attributes to colors. When provided, the color_attr_key and
389
434
  theme will be overwritten. The function must take an entity dictionary as input and return a color string (hex).
390
435
  """
391
- entities, relations = self._viz_preprocess()
436
+ struct, entities, relations = self._viz_preprocess()
392
437
  from ie_viz import serve
393
438
 
394
- try:
395
- serve(text=self.text,
396
- entities=entities,
397
- relations=relations,
398
- host=host,
399
- port=port,
400
- theme=theme,
401
- title=title,
402
- color_attr_key=color_attr_key,
403
- color_map_func=color_map_func)
404
- except TypeError:
405
- warnings.warn("The version of ie_viz is not the latest. Please update to the latest version (pip install --upgrade ie-viz) for complete features.", UserWarning)
406
- serve(text=self.text,
407
- entities=entities,
408
- relations=relations,
409
- host=host,
410
- port=port,
411
- theme=theme,
412
- color_attr_key=color_attr_key,
413
- color_map_func=color_map_func)
439
+ serve(text=self.text,
440
+ struct=struct,
441
+ entities=entities,
442
+ relations=relations,
443
+ host=host,
444
+ port=port,
445
+ theme=theme,
446
+ title=title,
447
+ color_attr_key=color_attr_key,
448
+ color_map_func=color_map_func)
449
+
414
450
 
415
451
  def viz_render(self, theme:str = "light", color_attr_key:str=None, color_map_func:Callable=None,
416
452
  title:str="Frames Visualization") -> str:
@@ -429,22 +465,14 @@ class LLMInformationExtractionDocument:
429
465
  title : str, Optional
430
466
  the title of the HTML.
431
467
  """
432
- entities, relations = self._viz_preprocess()
468
+ struct, entities, relations = self._viz_preprocess()
433
469
  from ie_viz import render
434
470
 
435
- try:
436
- return render(text=self.text,
437
- entities=entities,
438
- relations=relations,
439
- theme=theme,
440
- title=title,
441
- color_attr_key=color_attr_key,
442
- color_map_func=color_map_func)
443
- except TypeError:
444
- warnings.warn("The version of ie_viz is not the latest. Please update to the latest version (pip install --upgrade ie-viz) for complete features.", UserWarning)
445
- return render(text=self.text,
446
- entities=entities,
447
- relations=relations,
448
- theme=theme,
449
- color_attr_key=color_attr_key,
450
- color_map_func=color_map_func)
471
+ return render(text=self.text,
472
+ struct=struct,
473
+ entities=entities,
474
+ relations=relations,
475
+ theme=theme,
476
+ title=title,
477
+ color_attr_key=color_attr_key,
478
+ color_map_func=color_map_func)
llm_ie/engines.py CHANGED
@@ -1060,6 +1060,50 @@ class VLLMInferenceEngine(OpenAICompatibleInferenceEngine):
1060
1060
  return {"reasoning": getattr(response.choices[0].message, "reasoning_content", ""),
1061
1061
  "response": getattr(response.choices[0].message, "content", "")}
1062
1062
 
1063
+ class SGLangInferenceEngine(OpenAICompatibleInferenceEngine):
1064
+ def __init__(self, model:str, api_key:str="", base_url:str="http://localhost:30000/v1", config:LLMConfig=None, **kwrs):
1065
+ """
1066
+ SGLang OpenAI compatible API inference engine.
1067
+ https://docs.sglang.ai/basic_usage/openai_api.html
1068
+
1069
+ Parameters:
1070
+ ----------
1071
+ model : str
1072
+ model name as shown in the SGLang server
1073
+ api_key : str, Optional
1074
+ the API key for the SGLang server.
1075
+ base_url : str, Optional
1076
+ the base url for the SGLang server.
1077
+ config : LLMConfig
1078
+ the LLM configuration.
1079
+ """
1080
+ super().__init__(model, api_key, base_url, config, **kwrs)
1081
+
1082
+
1083
+ def _format_response(self, response: Any) -> Dict[str, str]:
1084
+ """
1085
+ This method format the response from OpenAI API to a dict with keys "type" and "data".
1086
+
1087
+ Parameters:
1088
+ ----------
1089
+ response : Any
1090
+ the response from OpenAI-compatible API. Could be a dict, generator, or object.
1091
+ """
1092
+ if isinstance(response, self.ChatCompletionChunk):
1093
+ if hasattr(response.choices[0].delta, "reasoning_content") and getattr(response.choices[0].delta, "reasoning_content") is not None:
1094
+ chunk_text = getattr(response.choices[0].delta, "reasoning_content", "")
1095
+ if chunk_text is None:
1096
+ chunk_text = ""
1097
+ return {"type": "reasoning", "data": chunk_text}
1098
+ else:
1099
+ chunk_text = getattr(response.choices[0].delta, "content", "")
1100
+ if chunk_text is None:
1101
+ chunk_text = ""
1102
+ return {"type": "response", "data": chunk_text}
1103
+
1104
+ return {"reasoning": getattr(response.choices[0].message, "reasoning_content", ""),
1105
+ "response": getattr(response.choices[0].message, "content", "")}
1106
+
1063
1107
 
1064
1108
  class OpenRouterInferenceEngine(OpenAICompatibleInferenceEngine):
1065
1109
  def __init__(self, model:str, api_key:str=None, base_url:str="https://openrouter.ai/api/v1", config:LLMConfig=None, **kwrs):