llm-ie 1.2.3__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llm_ie/__init__.py +6 -6
- llm_ie/asset/default_prompts/LLMUnitChunker_user_prompt.txt +129 -0
- llm_ie/asset/prompt_guide/AttributeExtractor_prompt_guide.txt +2 -2
- llm_ie/asset/prompt_guide/StructExtractor_prompt_guide.txt +53 -0
- llm_ie/chunkers.py +104 -4
- llm_ie/data_types.py +72 -44
- llm_ie/engines.py +44 -0
- llm_ie/extractors.py +421 -73
- llm_ie/prompt_editor.py +9 -32
- llm_ie/utils.py +95 -0
- {llm_ie-1.2.3.dist-info → llm_ie-1.3.0.dist-info}/METADATA +1 -1
- {llm_ie-1.2.3.dist-info → llm_ie-1.3.0.dist-info}/RECORD +13 -10
- {llm_ie-1.2.3.dist-info → llm_ie-1.3.0.dist-info}/WHEEL +0 -0
llm_ie/__init__.py
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
from .data_types import LLMInformationExtractionFrame, LLMInformationExtractionDocument
|
|
2
2
|
from .engines import BasicLLMConfig, ReasoningLLMConfig, Qwen3LLMConfig, OpenAIReasoningLLMConfig
|
|
3
|
-
from .engines import LlamaCppInferenceEngine, OllamaInferenceEngine, HuggingFaceHubInferenceEngine, VLLMInferenceEngine, OpenRouterInferenceEngine, OpenAIInferenceEngine, AzureOpenAIInferenceEngine, LiteLLMInferenceEngine
|
|
4
|
-
from .extractors import DirectFrameExtractor, ReviewFrameExtractor, BasicFrameExtractor, BasicReviewFrameExtractor, SentenceFrameExtractor, SentenceReviewFrameExtractor, AttributeExtractor, BinaryRelationExtractor, MultiClassRelationExtractor
|
|
5
|
-
from .chunkers import UnitChunker, WholeDocumentUnitChunker, SentenceUnitChunker, TextLineUnitChunker, SeparatorUnitChunker, ContextChunker, NoContextChunker, WholeDocumentContextChunker, SlideWindowContextChunker
|
|
3
|
+
from .engines import LlamaCppInferenceEngine, OllamaInferenceEngine, HuggingFaceHubInferenceEngine, VLLMInferenceEngine, SGLangInferenceEngine, OpenRouterInferenceEngine, OpenAIInferenceEngine, AzureOpenAIInferenceEngine, LiteLLMInferenceEngine
|
|
4
|
+
from .extractors import StructExtractor, BasicStructExtractor, DirectFrameExtractor, ReviewFrameExtractor, BasicFrameExtractor, BasicReviewFrameExtractor, SentenceFrameExtractor, SentenceReviewFrameExtractor, AttributeExtractor, BinaryRelationExtractor, MultiClassRelationExtractor
|
|
5
|
+
from .chunkers import UnitChunker, WholeDocumentUnitChunker, SentenceUnitChunker, TextLineUnitChunker, SeparatorUnitChunker, LLMUnitChunker, ContextChunker, NoContextChunker, WholeDocumentContextChunker, SlideWindowContextChunker
|
|
6
6
|
from .prompt_editor import PromptEditor
|
|
7
7
|
|
|
8
8
|
__all__ = ["LLMInformationExtractionFrame", "LLMInformationExtractionDocument",
|
|
9
|
-
"BasicLLMConfig", "ReasoningLLMConfig", "Qwen3LLMConfig", "OpenAIReasoningLLMConfig", "LlamaCppInferenceEngine", "OllamaInferenceEngine", "HuggingFaceHubInferenceEngine", "VLLMInferenceEngine", "OpenRouterInferenceEngine", "OpenAIInferenceEngine", "AzureOpenAIInferenceEngine", "LiteLLMInferenceEngine",
|
|
10
|
-
"DirectFrameExtractor", "ReviewFrameExtractor", "BasicFrameExtractor", "BasicReviewFrameExtractor", "SentenceFrameExtractor", "SentenceReviewFrameExtractor", "AttributeExtractor", "BinaryRelationExtractor", "MultiClassRelationExtractor",
|
|
11
|
-
"UnitChunker", "WholeDocumentUnitChunker", "SentenceUnitChunker", "TextLineUnitChunker", "SeparatorUnitChunker", "ContextChunker", "NoContextChunker", "WholeDocumentContextChunker", "SlideWindowContextChunker",
|
|
9
|
+
"BasicLLMConfig", "ReasoningLLMConfig", "Qwen3LLMConfig", "OpenAIReasoningLLMConfig", "LlamaCppInferenceEngine", "OllamaInferenceEngine", "HuggingFaceHubInferenceEngine", "VLLMInferenceEngine", "SGLangInferenceEngine", "OpenRouterInferenceEngine", "OpenAIInferenceEngine", "AzureOpenAIInferenceEngine", "LiteLLMInferenceEngine",
|
|
10
|
+
"StructExtractor", "BasicStructExtractor", "DirectFrameExtractor", "ReviewFrameExtractor", "BasicFrameExtractor", "BasicReviewFrameExtractor", "SentenceFrameExtractor", "SentenceReviewFrameExtractor", "AttributeExtractor", "BinaryRelationExtractor", "MultiClassRelationExtractor",
|
|
11
|
+
"UnitChunker", "WholeDocumentUnitChunker", "SentenceUnitChunker", "TextLineUnitChunker", "SeparatorUnitChunker", "LLMUnitChunker", "ContextChunker", "NoContextChunker", "WholeDocumentContextChunker", "SlideWindowContextChunker",
|
|
12
12
|
"PromptEditor"]
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
### Task description
|
|
2
|
+
You are a helpful assistant that breaks down a text document into semantic units or chunks. Each chunk should represent a coherent section of the text, such as a paragraph, subsection, or topic.
|
|
3
|
+
|
|
4
|
+
### Schema definition
|
|
5
|
+
You will output a JSON array of objects. Each object should have the following fields:
|
|
6
|
+
- "title": Generate a brief title summarizing the content of the chunk.
|
|
7
|
+
- "anchor_text": the first line of text in the chunk used to locate it in the original document. Must be an exact match.
|
|
8
|
+
- if there is a title or heading for the chunk, use that as the anchor_text.
|
|
9
|
+
- if there is no title or heading, use the first sentence of the chunk as the anchor_text.
|
|
10
|
+
|
|
11
|
+
```JSON
|
|
12
|
+
[
|
|
13
|
+
{
|
|
14
|
+
"title": "<your title here>",
|
|
15
|
+
"anchor_text": "<the anchor text of the chunk here>"
|
|
16
|
+
},
|
|
17
|
+
{
|
|
18
|
+
"title": "<your title here>",
|
|
19
|
+
"anchor_text": "<the anchor text of the chunk here>"
|
|
20
|
+
}
|
|
21
|
+
]
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
### Examples
|
|
25
|
+
|
|
26
|
+
**Input**:
|
|
27
|
+
"# Clinical Note
|
|
28
|
+
|
|
29
|
+
**Patient Name**: Michael Green
|
|
30
|
+
**Medical Record Number**: 1122334455
|
|
31
|
+
**Date of Visit**: January 5, 2025
|
|
32
|
+
**Provider**: Dr. Emily Carter, MD
|
|
33
|
+
|
|
34
|
+
## Reason for Visit
|
|
35
|
+
Follow-up for poorly controlled type 2 diabetes and complaints of occasional dizziness and blurred vision.
|
|
36
|
+
|
|
37
|
+
## Summary of Visit
|
|
38
|
+
Michael Green, a 62-year-old male with a known history of type 2 diabetes, hypertension, and obesity, presents for follow-up regarding his glycemic control. Despite recent adjustments to his treatment plan, his glucose readings have remained elevated, averaging 180-220 mg/dL. He reports occasional episodes of dizziness and blurred vision, particularly in the morning before meals. He denies chest pain, palpitations, or recent falls. He reports compliance with his medication regimen but admits to difficulty following a consistent low-carbohydrate diet.
|
|
39
|
+
|
|
40
|
+
Michael has been using a glucose meter to monitor his blood sugar levels and logs them daily. His last hemoglobin A1c, performed three months ago, was 9.2%. He reports no recent hospitalizations, infections, or significant stressors.
|
|
41
|
+
|
|
42
|
+
## Notable History
|
|
43
|
+
- **Chronic Conditions**:
|
|
44
|
+
- Type 2 diabetes mellitus, diagnosed 10 years ago.
|
|
45
|
+
- Hypertension, well-controlled on medication.
|
|
46
|
+
- Hyperlipidemia, on statin therapy.
|
|
47
|
+
- **Past Surgical History**:
|
|
48
|
+
- Knee arthroscopy for a meniscal tear, age 50.
|
|
49
|
+
- **Family History**:
|
|
50
|
+
- Mother: Deceased at 75, complications from diabetes.
|
|
51
|
+
- Father: Deceased at 70, myocardial infarction."
|
|
52
|
+
|
|
53
|
+
**Output**:
|
|
54
|
+
```JSON
|
|
55
|
+
[
|
|
56
|
+
{
|
|
57
|
+
"title": "Patient Information",
|
|
58
|
+
"anchor_text": "# Clinical Note"
|
|
59
|
+
},
|
|
60
|
+
{
|
|
61
|
+
"title": "Reason for Visit",
|
|
62
|
+
"anchor_text": "## Reason for Visit"
|
|
63
|
+
},
|
|
64
|
+
{
|
|
65
|
+
"title": "Summary of Visit",
|
|
66
|
+
"anchor_text": "## Summary of Visit"
|
|
67
|
+
},
|
|
68
|
+
{
|
|
69
|
+
"title": "Notable History",
|
|
70
|
+
"anchor_text": "## Notable History"
|
|
71
|
+
}
|
|
72
|
+
]
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
**Input**:
|
|
76
|
+
"In the [**Hospital1 18**] ED, 35.3 102 133/58 100%AC 500x20, 5, 1.0 with an
|
|
77
|
+
ABG 7.16/66/162. He had a CTH which was unremarkable. He then
|
|
78
|
+
had a CTA chest, afterwhich he went into PEA arrest.
|
|
79
|
+
Rescucitation last approximately 10-15 minutes with multiple
|
|
80
|
+
rounds of epi and bicarb, with ROSC. He was then admitted to the
|
|
81
|
+
MICU for further management.
|
|
82
|
+
.
|
|
83
|
+
Currently, the patient is intubated, sedated, and parlyzed.
|
|
84
|
+
|
|
85
|
+
Past Medical History:
|
|
86
|
+
Asthma
|
|
87
|
+
Dilated cardiomyopathy
|
|
88
|
+
Multiple admissions for dyspnea this winter ([**1-26**]).
|
|
89
|
+
Anxiety/depression
|
|
90
|
+
CKD
|
|
91
|
+
HLD
|
|
92
|
+
Obesity
|
|
93
|
+
HTN
|
|
94
|
+
|
|
95
|
+
Social History:
|
|
96
|
+
Unknown
|
|
97
|
+
|
|
98
|
+
Family History:
|
|
99
|
+
Unknown"
|
|
100
|
+
|
|
101
|
+
**Output**:
|
|
102
|
+
```JSON
|
|
103
|
+
[
|
|
104
|
+
{
|
|
105
|
+
"title": "Patient Presentation and Initial Management",
|
|
106
|
+
"anchor_text": "In the [**Hospital1 18**] ED, 35.3 102 133/58 100%AC 500x20, 5, 1.0 with an"
|
|
107
|
+
},
|
|
108
|
+
{
|
|
109
|
+
"title": "Current Status of the Patient",
|
|
110
|
+
"anchor_text": "Currently, the patient is intubated, sedated, and parlyzed."
|
|
111
|
+
},
|
|
112
|
+
{
|
|
113
|
+
"title": "Past Medical History",
|
|
114
|
+
"anchor_text": "Past Medical History:"
|
|
115
|
+
},
|
|
116
|
+
{
|
|
117
|
+
"title": "Social History",
|
|
118
|
+
"anchor_text": "Social History:"
|
|
119
|
+
},
|
|
120
|
+
{
|
|
121
|
+
"title": "Family History",
|
|
122
|
+
"anchor_text": "Family History:"
|
|
123
|
+
}
|
|
124
|
+
]
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
### Document text
|
|
128
|
+
|
|
129
|
+
"{{document_text}}"
|
|
@@ -7,7 +7,7 @@ Prompt Template Design:
|
|
|
7
7
|
List the attributes to extract, and provide clear definitions for each one.
|
|
8
8
|
|
|
9
9
|
3. Output Format Definition:
|
|
10
|
-
The output should be a JSON
|
|
10
|
+
The output should be a JSON, where each attribute be a key. The values could be any structure (e.g., str, int, List[str]).
|
|
11
11
|
|
|
12
12
|
4. Optional: Hints:
|
|
13
13
|
Provide itemized hints for the information extractors to guide the extraction process. Remind the prompted agent to be truthful. Emphasize that the prompted agent is supposed to perform the task instead of writing code or instructing other agents to do it.
|
|
@@ -37,7 +37,7 @@ Example:
|
|
|
37
37
|
Your output should follow the JSON format:
|
|
38
38
|
{"Date": "<MM/DD/YYYY>", "Status": "<status>"}
|
|
39
39
|
|
|
40
|
-
I am only interested in the content between
|
|
40
|
+
I am only interested in the content between {}. Do not explain your answer.
|
|
41
41
|
|
|
42
42
|
### Hints
|
|
43
43
|
- If the date is not complete, use the first available date in the context. For example, if the date is 01/2023, you should return 01/01/2023.
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
Prompt Template Design:
|
|
2
|
+
|
|
3
|
+
1. Task Description:
|
|
4
|
+
Provide a detailed description of the task, including the background and the type of task (e.g., structured data extraction task).
|
|
5
|
+
|
|
6
|
+
2. Schema Definition:
|
|
7
|
+
List the key-value pairs to extract, and provide clear definitions for each one.
|
|
8
|
+
|
|
9
|
+
3. Output Format Definition:
|
|
10
|
+
The output should be a JSON. The values could be any structure (e.g., str, int, List[str]).
|
|
11
|
+
|
|
12
|
+
4. Optional: Hints:
|
|
13
|
+
Provide itemized hints for the information extractors to guide the extraction process. Remind the prompted agent to be truthful. Emphasize that the prompted agent is supposed to perform the task instead of writing code or instructing other agents to do it.
|
|
14
|
+
|
|
15
|
+
5. Optional: Examples:
|
|
16
|
+
Include examples in the format:
|
|
17
|
+
Input: ...
|
|
18
|
+
Output: ...
|
|
19
|
+
|
|
20
|
+
6. Context:
|
|
21
|
+
The template must include a placeholder {{input}} for the document or chunk.
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
Example:
|
|
25
|
+
|
|
26
|
+
### Task description
|
|
27
|
+
This is a structured data extraction task. Given a medical report, you need to extract structured patient data from it.
|
|
28
|
+
|
|
29
|
+
### Schema definition
|
|
30
|
+
"PatientName" which is the name of the patient,
|
|
31
|
+
"Age" which is the age of the patient in years,
|
|
32
|
+
"MRN" which is the medical record number of the patient.
|
|
33
|
+
|
|
34
|
+
### Output format definition
|
|
35
|
+
Your output should follow the JSON format:
|
|
36
|
+
```json
|
|
37
|
+
{
|
|
38
|
+
"PatientName": "<patient_name>",
|
|
39
|
+
"Age": <age_in_years>,
|
|
40
|
+
"MRN": "<medical_record_number>"
|
|
41
|
+
}
|
|
42
|
+
```
|
|
43
|
+
I am only interested in the content between {}. Do not explain your answer.
|
|
44
|
+
|
|
45
|
+
### Hints
|
|
46
|
+
- Make sure to extract the exact patient name as it appears in the report.
|
|
47
|
+
- You are supposed to perform the extraction task instead of writing code or instructing other agents to do it.
|
|
48
|
+
- If some values are not available, you should return "not specified".
|
|
49
|
+
|
|
50
|
+
### Context
|
|
51
|
+
The text below is from the medical report:
|
|
52
|
+
|
|
53
|
+
"{{input}}"
|
llm_ie/chunkers.py
CHANGED
|
@@ -1,8 +1,11 @@
|
|
|
1
1
|
import abc
|
|
2
|
-
from typing import
|
|
2
|
+
from typing import List
|
|
3
3
|
import asyncio
|
|
4
4
|
import uuid
|
|
5
|
+
import importlib.resources
|
|
6
|
+
from llm_ie.utils import extract_json, apply_prompt_template
|
|
5
7
|
from llm_ie.data_types import FrameExtractionUnit
|
|
8
|
+
from llm_ie.engines import InferenceEngine
|
|
6
9
|
|
|
7
10
|
|
|
8
11
|
class UnitChunker(abc.ABC):
|
|
@@ -74,13 +77,14 @@ class SeparatorUnitChunker(UnitChunker):
|
|
|
74
77
|
text : str
|
|
75
78
|
The document text.
|
|
76
79
|
"""
|
|
80
|
+
doc_id = doc_id if doc_id is not None else str(uuid.uuid4())
|
|
77
81
|
paragraphs = text.split(self.sep)
|
|
78
82
|
paragraph_units = []
|
|
79
83
|
start = 0
|
|
80
84
|
for paragraph in paragraphs:
|
|
81
85
|
end = start + len(paragraph)
|
|
82
86
|
paragraph_units.append(FrameExtractionUnit(
|
|
83
|
-
doc_id=doc_id
|
|
87
|
+
doc_id=doc_id,
|
|
84
88
|
start=start,
|
|
85
89
|
end=end,
|
|
86
90
|
text=paragraph
|
|
@@ -104,10 +108,11 @@ class SentenceUnitChunker(UnitChunker):
|
|
|
104
108
|
text : str
|
|
105
109
|
The document text.
|
|
106
110
|
"""
|
|
111
|
+
doc_id = doc_id if doc_id is not None else str(uuid.uuid4())
|
|
107
112
|
sentences = []
|
|
108
113
|
for start, end in self.PunktSentenceTokenizer().span_tokenize(text):
|
|
109
114
|
sentences.append(FrameExtractionUnit(
|
|
110
|
-
doc_id=doc_id
|
|
115
|
+
doc_id=doc_id,
|
|
111
116
|
start=start,
|
|
112
117
|
end=end,
|
|
113
118
|
text=text[start:end]
|
|
@@ -129,13 +134,14 @@ class TextLineUnitChunker(UnitChunker):
|
|
|
129
134
|
text : str
|
|
130
135
|
The document text.
|
|
131
136
|
"""
|
|
137
|
+
doc_id = doc_id if doc_id is not None else str(uuid.uuid4())
|
|
132
138
|
lines = text.split('\n')
|
|
133
139
|
line_units = []
|
|
134
140
|
start = 0
|
|
135
141
|
for line in lines:
|
|
136
142
|
end = start + len(line)
|
|
137
143
|
line_units.append(FrameExtractionUnit(
|
|
138
|
-
doc_id=doc_id
|
|
144
|
+
doc_id=doc_id,
|
|
139
145
|
start=start,
|
|
140
146
|
end=end,
|
|
141
147
|
text=line
|
|
@@ -143,6 +149,100 @@ class TextLineUnitChunker(UnitChunker):
|
|
|
143
149
|
start = end + 1
|
|
144
150
|
return line_units
|
|
145
151
|
|
|
152
|
+
class LLMUnitChunker(UnitChunker):
|
|
153
|
+
def __init__(self, inference_engine:InferenceEngine, prompt_template:str=None, system_prompt:str=None):
|
|
154
|
+
"""
|
|
155
|
+
This class prompts an LLM for document segmentation (e.g., sections, paragraphs).
|
|
156
|
+
|
|
157
|
+
Parameters:
|
|
158
|
+
----------
|
|
159
|
+
inference_engine : InferenceEngine
|
|
160
|
+
the LLM inferencing engine object.
|
|
161
|
+
prompt_template : str
|
|
162
|
+
the prompt template that defines how to chunk the document. Must define a JSON schema with
|
|
163
|
+
```json
|
|
164
|
+
[
|
|
165
|
+
{
|
|
166
|
+
"title": "<your title here>",
|
|
167
|
+
"anchor_text": "<the anchor text of the chunk here>"
|
|
168
|
+
},
|
|
169
|
+
{
|
|
170
|
+
"title": "<your title here>",
|
|
171
|
+
"anchor_text": "<the anchor text of the chunk here>"
|
|
172
|
+
}
|
|
173
|
+
]
|
|
174
|
+
```
|
|
175
|
+
system_prompt : str, optional
|
|
176
|
+
The system prompt.
|
|
177
|
+
"""
|
|
178
|
+
self.inference_engine = inference_engine
|
|
179
|
+
|
|
180
|
+
if prompt_template is None:
|
|
181
|
+
file_path = importlib.resources.files('llm_ie.asset.default_prompts').joinpath("LLMUnitChunker_user_prompt.txt")
|
|
182
|
+
with open(file_path, 'r', encoding="utf-8") as f:
|
|
183
|
+
self.prompt_template = f.read()
|
|
184
|
+
else:
|
|
185
|
+
self.prompt_template = prompt_template
|
|
186
|
+
|
|
187
|
+
self.system_prompt = system_prompt
|
|
188
|
+
|
|
189
|
+
def chunk(self, text, doc_id=None) -> List[FrameExtractionUnit]:
|
|
190
|
+
"""
|
|
191
|
+
Parameters:
|
|
192
|
+
-----------
|
|
193
|
+
text : str
|
|
194
|
+
the document text.
|
|
195
|
+
doc_id : str, optional
|
|
196
|
+
the document id.
|
|
197
|
+
"""
|
|
198
|
+
doc_id = doc_id if doc_id is not None else str(uuid.uuid4())
|
|
199
|
+
user_prompt = apply_prompt_template(prompt_template=self.prompt_template, text_content=text)
|
|
200
|
+
messages = []
|
|
201
|
+
if self.system_prompt is not None:
|
|
202
|
+
messages.append({'role': 'system', 'content': self.system_prompt})
|
|
203
|
+
messages.append({'role': 'user', 'content': user_prompt})
|
|
204
|
+
|
|
205
|
+
gen_text = self.inference_engine.chat(messages=messages)
|
|
206
|
+
|
|
207
|
+
header_list = extract_json(gen_text=gen_text["response"])
|
|
208
|
+
units = []
|
|
209
|
+
start = 0
|
|
210
|
+
prev_end = 0
|
|
211
|
+
for header in header_list:
|
|
212
|
+
if "anchor_text" not in header:
|
|
213
|
+
Warning.warn(f"Missing anchor_text in header: {header}. Skipping this header.")
|
|
214
|
+
continue
|
|
215
|
+
if not isinstance(header["anchor_text"], str):
|
|
216
|
+
Warning.warn(f"Invalid anchor_text: {header['anchor_text']}. Skipping this header.")
|
|
217
|
+
continue
|
|
218
|
+
|
|
219
|
+
start = prev_end
|
|
220
|
+
# find the first instance of the anchor text in the rest of the text
|
|
221
|
+
end = text.find(header["anchor_text"], start)
|
|
222
|
+
# if not found, skip this header
|
|
223
|
+
if end == -1:
|
|
224
|
+
continue
|
|
225
|
+
# if start == end (empty text), skip this header
|
|
226
|
+
if start == end:
|
|
227
|
+
continue
|
|
228
|
+
# create a frame extraction unit
|
|
229
|
+
units.append(FrameExtractionUnit(
|
|
230
|
+
doc_id=doc_id,
|
|
231
|
+
start=start,
|
|
232
|
+
end=end,
|
|
233
|
+
text=text[start:end]
|
|
234
|
+
))
|
|
235
|
+
prev_end = end
|
|
236
|
+
# add the last section
|
|
237
|
+
if prev_end < len(text):
|
|
238
|
+
units.append(FrameExtractionUnit(
|
|
239
|
+
doc_id=doc_id,
|
|
240
|
+
start=prev_end,
|
|
241
|
+
end=len(text),
|
|
242
|
+
text=text[prev_end:]
|
|
243
|
+
))
|
|
244
|
+
return units
|
|
245
|
+
|
|
146
246
|
|
|
147
247
|
class ContextChunker(abc.ABC):
|
|
148
248
|
def __init__(self):
|
llm_ie/data_types.py
CHANGED
|
@@ -141,7 +141,7 @@ class LLMInformationExtractionFrame:
|
|
|
141
141
|
|
|
142
142
|
|
|
143
143
|
class LLMInformationExtractionDocument:
|
|
144
|
-
def __init__(self, doc_id:str=None, filename:str=None, text:str=None,
|
|
144
|
+
def __init__(self, doc_id:str=None, filename:str=None, text:str=None, struct:Dict=None,
|
|
145
145
|
frames:List[LLMInformationExtractionFrame]=None, relations:List[Dict[str,str]]=None):
|
|
146
146
|
"""
|
|
147
147
|
This class holds LLM-extracted frames, handles save/load.
|
|
@@ -154,6 +154,8 @@ class LLMInformationExtractionDocument:
|
|
|
154
154
|
the directory to a yaml file of a saved LLMInformationExtractionDocument
|
|
155
155
|
text : str, Optional
|
|
156
156
|
document text
|
|
157
|
+
struct : Dict, Optional
|
|
158
|
+
a dictionary of unanchored structure information
|
|
157
159
|
frames : List[LLMInformationExtractionFrame], Optional
|
|
158
160
|
a list of LLMInformationExtractionFrame
|
|
159
161
|
relations : List[Dict[str,str]], Optional
|
|
@@ -168,12 +170,28 @@ class LLMInformationExtractionDocument:
|
|
|
168
170
|
llm_ie = json.load(json_file)
|
|
169
171
|
if 'doc_id' in llm_ie.keys():
|
|
170
172
|
self.doc_id = llm_ie['doc_id']
|
|
173
|
+
else:
|
|
174
|
+
raise ValueError("doc_id key not found in the file.")
|
|
175
|
+
|
|
171
176
|
if 'text' in llm_ie.keys():
|
|
172
177
|
self.text = llm_ie['text']
|
|
178
|
+
else:
|
|
179
|
+
raise ValueError("text key not found in the file.")
|
|
180
|
+
|
|
181
|
+
if 'struct' in llm_ie.keys():
|
|
182
|
+
self.struct = llm_ie['struct']
|
|
183
|
+
else:
|
|
184
|
+
self.struct = {}
|
|
185
|
+
|
|
173
186
|
if 'frames' in llm_ie.keys():
|
|
174
187
|
self.frames = [LLMInformationExtractionFrame.from_dict(d) for d in llm_ie['frames']]
|
|
188
|
+
else:
|
|
189
|
+
self.frames = []
|
|
190
|
+
|
|
175
191
|
if 'relations' in llm_ie.keys():
|
|
176
192
|
self.relations = llm_ie['relations']
|
|
193
|
+
else:
|
|
194
|
+
self.relations = []
|
|
177
195
|
|
|
178
196
|
# create object from raw inputs
|
|
179
197
|
else:
|
|
@@ -181,9 +199,15 @@ class LLMInformationExtractionDocument:
|
|
|
181
199
|
raise TypeError("doc_id must be a string.")
|
|
182
200
|
self.doc_id = doc_id
|
|
183
201
|
self.text = text
|
|
202
|
+
self.struct = struct.copy() if struct is not None else {}
|
|
184
203
|
self.frames = frames.copy() if frames is not None else []
|
|
185
204
|
self.relations = relations.copy() if relations is not None else []
|
|
186
205
|
|
|
206
|
+
def has_struct(self) -> bool:
|
|
207
|
+
"""
|
|
208
|
+
This method checks if there is any unanchored structure information.
|
|
209
|
+
"""
|
|
210
|
+
return bool(self.struct)
|
|
187
211
|
|
|
188
212
|
def has_frame(self) -> bool:
|
|
189
213
|
"""
|
|
@@ -228,6 +252,18 @@ class LLMInformationExtractionDocument:
|
|
|
228
252
|
|
|
229
253
|
return None
|
|
230
254
|
|
|
255
|
+
def set_struct(self, struct:Dict):
|
|
256
|
+
"""
|
|
257
|
+
This method sets the unanchored structure information.
|
|
258
|
+
|
|
259
|
+
Parameters
|
|
260
|
+
----------
|
|
261
|
+
struct : Dict
|
|
262
|
+
a dictionary of unanchored structure information
|
|
263
|
+
"""
|
|
264
|
+
if not isinstance(struct, Dict):
|
|
265
|
+
raise TypeError("struct must be a dictionary.")
|
|
266
|
+
self.struct = struct.copy()
|
|
231
267
|
|
|
232
268
|
def add_frame(self, frame:LLMInformationExtractionFrame, valid_mode:str=None, create_id:bool=False) -> bool:
|
|
233
269
|
"""
|
|
@@ -326,10 +362,12 @@ class LLMInformationExtractionDocument:
|
|
|
326
362
|
|
|
327
363
|
def __repr__(self, N_top_chars:int=100) -> str:
|
|
328
364
|
text_to_print = self.text[0:N_top_chars]
|
|
365
|
+
struct_key_count = len(self.struct.keys())
|
|
329
366
|
frame_count = len(self.frames)
|
|
330
367
|
relation_count = len(self.relations)
|
|
331
368
|
return ''.join((f'LLMInformationExtractionDocument(doc_id: "{self.doc_id}"\n',
|
|
332
369
|
f'text: "{text_to_print}...",\n',
|
|
370
|
+
f'struct keys: {struct_key_count}\n',
|
|
333
371
|
f'frames: {frame_count}\n',
|
|
334
372
|
f'relations: {relation_count}'))
|
|
335
373
|
|
|
@@ -338,6 +376,7 @@ class LLMInformationExtractionDocument:
|
|
|
338
376
|
with open(filename, 'w') as json_file:
|
|
339
377
|
json.dump({'doc_id': self.doc_id,
|
|
340
378
|
'text': self.text,
|
|
379
|
+
'struct': self.struct,
|
|
341
380
|
'frames': [frame.to_dict() for frame in self.frames],
|
|
342
381
|
'relations': self.relations},
|
|
343
382
|
json_file, indent=4)
|
|
@@ -346,16 +385,22 @@ class LLMInformationExtractionDocument:
|
|
|
346
385
|
|
|
347
386
|
def _viz_preprocess(self) -> Tuple:
|
|
348
387
|
"""
|
|
349
|
-
This method preprocesses the entities and relations for visualization.
|
|
388
|
+
This method preprocesses the struct, entities and relations for visualization.
|
|
350
389
|
"""
|
|
351
390
|
if importlib.util.find_spec("ie_viz") is None:
|
|
352
|
-
raise ImportError("ie_viz not found. Please install ie_viz (```pip install ie-viz```).")
|
|
391
|
+
raise ImportError("ie_viz not found. Please install ie_viz (```pip install -U ie-viz```).")
|
|
353
392
|
|
|
393
|
+
# Struct
|
|
394
|
+
if self.has_struct():
|
|
395
|
+
struct = self.struct
|
|
396
|
+
else:
|
|
397
|
+
struct = {}
|
|
398
|
+
# Entities
|
|
354
399
|
if self.has_frame():
|
|
355
400
|
entities = [{"entity_id": frame.frame_id, "start": frame.start, "end": frame.end, "attr": frame.attr} for frame in self.frames]
|
|
356
401
|
else:
|
|
357
|
-
|
|
358
|
-
|
|
402
|
+
entities = None
|
|
403
|
+
# Relations
|
|
359
404
|
if self.has_relation():
|
|
360
405
|
relations = []
|
|
361
406
|
for relation in self.relations:
|
|
@@ -364,7 +409,7 @@ class LLMInformationExtractionDocument:
|
|
|
364
409
|
else:
|
|
365
410
|
relations = None
|
|
366
411
|
|
|
367
|
-
return entities, relations
|
|
412
|
+
return struct, entities, relations
|
|
368
413
|
|
|
369
414
|
|
|
370
415
|
def viz_serve(self, host: str = '0.0.0.0', port: int = 5000, theme:str = "light", title:str="Frames Visualization",
|
|
@@ -388,29 +433,20 @@ class LLMInformationExtractionDocument:
|
|
|
388
433
|
The function to be used for mapping the entity attributes to colors. When provided, the color_attr_key and
|
|
389
434
|
theme will be overwritten. The function must take an entity dictionary as input and return a color string (hex).
|
|
390
435
|
"""
|
|
391
|
-
entities, relations = self._viz_preprocess()
|
|
436
|
+
struct, entities, relations = self._viz_preprocess()
|
|
392
437
|
from ie_viz import serve
|
|
393
438
|
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
warnings.warn("The version of ie_viz is not the latest. Please update to the latest version (pip install --upgrade ie-viz) for complete features.", UserWarning)
|
|
406
|
-
serve(text=self.text,
|
|
407
|
-
entities=entities,
|
|
408
|
-
relations=relations,
|
|
409
|
-
host=host,
|
|
410
|
-
port=port,
|
|
411
|
-
theme=theme,
|
|
412
|
-
color_attr_key=color_attr_key,
|
|
413
|
-
color_map_func=color_map_func)
|
|
439
|
+
serve(text=self.text,
|
|
440
|
+
struct=struct,
|
|
441
|
+
entities=entities,
|
|
442
|
+
relations=relations,
|
|
443
|
+
host=host,
|
|
444
|
+
port=port,
|
|
445
|
+
theme=theme,
|
|
446
|
+
title=title,
|
|
447
|
+
color_attr_key=color_attr_key,
|
|
448
|
+
color_map_func=color_map_func)
|
|
449
|
+
|
|
414
450
|
|
|
415
451
|
def viz_render(self, theme:str = "light", color_attr_key:str=None, color_map_func:Callable=None,
|
|
416
452
|
title:str="Frames Visualization") -> str:
|
|
@@ -429,22 +465,14 @@ class LLMInformationExtractionDocument:
|
|
|
429
465
|
title : str, Optional
|
|
430
466
|
the title of the HTML.
|
|
431
467
|
"""
|
|
432
|
-
entities, relations = self._viz_preprocess()
|
|
468
|
+
struct, entities, relations = self._viz_preprocess()
|
|
433
469
|
from ie_viz import render
|
|
434
470
|
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
except TypeError:
|
|
444
|
-
warnings.warn("The version of ie_viz is not the latest. Please update to the latest version (pip install --upgrade ie-viz) for complete features.", UserWarning)
|
|
445
|
-
return render(text=self.text,
|
|
446
|
-
entities=entities,
|
|
447
|
-
relations=relations,
|
|
448
|
-
theme=theme,
|
|
449
|
-
color_attr_key=color_attr_key,
|
|
450
|
-
color_map_func=color_map_func)
|
|
471
|
+
return render(text=self.text,
|
|
472
|
+
struct=struct,
|
|
473
|
+
entities=entities,
|
|
474
|
+
relations=relations,
|
|
475
|
+
theme=theme,
|
|
476
|
+
title=title,
|
|
477
|
+
color_attr_key=color_attr_key,
|
|
478
|
+
color_map_func=color_map_func)
|
llm_ie/engines.py
CHANGED
|
@@ -1060,6 +1060,50 @@ class VLLMInferenceEngine(OpenAICompatibleInferenceEngine):
|
|
|
1060
1060
|
return {"reasoning": getattr(response.choices[0].message, "reasoning_content", ""),
|
|
1061
1061
|
"response": getattr(response.choices[0].message, "content", "")}
|
|
1062
1062
|
|
|
1063
|
+
class SGLangInferenceEngine(OpenAICompatibleInferenceEngine):
|
|
1064
|
+
def __init__(self, model:str, api_key:str="", base_url:str="http://localhost:30000/v1", config:LLMConfig=None, **kwrs):
|
|
1065
|
+
"""
|
|
1066
|
+
SGLang OpenAI compatible API inference engine.
|
|
1067
|
+
https://docs.sglang.ai/basic_usage/openai_api.html
|
|
1068
|
+
|
|
1069
|
+
Parameters:
|
|
1070
|
+
----------
|
|
1071
|
+
model : str
|
|
1072
|
+
model name as shown in the SGLang server
|
|
1073
|
+
api_key : str, Optional
|
|
1074
|
+
the API key for the SGLang server.
|
|
1075
|
+
base_url : str, Optional
|
|
1076
|
+
the base url for the SGLang server.
|
|
1077
|
+
config : LLMConfig
|
|
1078
|
+
the LLM configuration.
|
|
1079
|
+
"""
|
|
1080
|
+
super().__init__(model, api_key, base_url, config, **kwrs)
|
|
1081
|
+
|
|
1082
|
+
|
|
1083
|
+
def _format_response(self, response: Any) -> Dict[str, str]:
|
|
1084
|
+
"""
|
|
1085
|
+
This method formats the response from OpenAI API to a dict with keys "type" and "data".
|
|
1086
|
+
|
|
1087
|
+
Parameters:
|
|
1088
|
+
----------
|
|
1089
|
+
response : Any
|
|
1090
|
+
the response from OpenAI-compatible API. Could be a dict, generator, or object.
|
|
1091
|
+
"""
|
|
1092
|
+
if isinstance(response, self.ChatCompletionChunk):
|
|
1093
|
+
if hasattr(response.choices[0].delta, "reasoning_content") and getattr(response.choices[0].delta, "reasoning_content") is not None:
|
|
1094
|
+
chunk_text = getattr(response.choices[0].delta, "reasoning_content", "")
|
|
1095
|
+
if chunk_text is None:
|
|
1096
|
+
chunk_text = ""
|
|
1097
|
+
return {"type": "reasoning", "data": chunk_text}
|
|
1098
|
+
else:
|
|
1099
|
+
chunk_text = getattr(response.choices[0].delta, "content", "")
|
|
1100
|
+
if chunk_text is None:
|
|
1101
|
+
chunk_text = ""
|
|
1102
|
+
return {"type": "response", "data": chunk_text}
|
|
1103
|
+
|
|
1104
|
+
return {"reasoning": getattr(response.choices[0].message, "reasoning_content", ""),
|
|
1105
|
+
"response": getattr(response.choices[0].message, "content", "")}
|
|
1106
|
+
|
|
1063
1107
|
|
|
1064
1108
|
class OpenRouterInferenceEngine(OpenAICompatibleInferenceEngine):
|
|
1065
1109
|
def __init__(self, model:str, api_key:str=None, base_url:str="https://openrouter.ai/api/v1", config:LLMConfig=None, **kwrs):
|