llm-ie 1.2.3__tar.gz → 1.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {llm_ie-1.2.3 → llm_ie-1.2.4}/PKG-INFO +1 -1
- {llm_ie-1.2.3 → llm_ie-1.2.4}/pyproject.toml +1 -1
- {llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/__init__.py +4 -4
- llm_ie-1.2.4/src/llm_ie/asset/default_prompts/LLMUnitChunker_user_prompt.txt +129 -0
- {llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/chunkers.py +104 -4
- {llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/engines.py +44 -0
- {llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/extractors.py +8 -80
- {llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/prompt_editor.py +9 -32
- llm_ie-1.2.4/src/llm_ie/utils.py +95 -0
- {llm_ie-1.2.3 → llm_ie-1.2.4}/README.md +0 -0
- {llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/asset/PromptEditor_prompts/chat.txt +0 -0
- {llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/asset/PromptEditor_prompts/comment.txt +0 -0
- {llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/asset/PromptEditor_prompts/rewrite.txt +0 -0
- {llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/asset/PromptEditor_prompts/system.txt +0 -0
- {llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/asset/default_prompts/BasicReviewFrameExtractor_addition_review_prompt.txt +0 -0
- {llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/asset/default_prompts/BasicReviewFrameExtractor_revision_review_prompt.txt +0 -0
- {llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/asset/default_prompts/ReviewFrameExtractor_addition_review_prompt.txt +0 -0
- {llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/asset/default_prompts/ReviewFrameExtractor_revision_review_prompt.txt +0 -0
- {llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/asset/default_prompts/SentenceReviewFrameExtractor_addition_review_prompt.txt +0 -0
- {llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/asset/default_prompts/SentenceReviewFrameExtractor_revision_review_prompt.txt +0 -0
- {llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/asset/prompt_guide/AttributeExtractor_prompt_guide.txt +0 -0
- {llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/asset/prompt_guide/BasicFrameExtractor_prompt_guide.txt +0 -0
- {llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/asset/prompt_guide/BasicReviewFrameExtractor_prompt_guide.txt +0 -0
- {llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/asset/prompt_guide/BinaryRelationExtractor_prompt_guide.txt +0 -0
- {llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/asset/prompt_guide/DirectFrameExtractor_prompt_guide.txt +0 -0
- {llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/asset/prompt_guide/MultiClassRelationExtractor_prompt_guide.txt +0 -0
- {llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/asset/prompt_guide/ReviewFrameExtractor_prompt_guide.txt +0 -0
- {llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/asset/prompt_guide/SentenceFrameExtractor_prompt_guide.txt +0 -0
- {llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/asset/prompt_guide/SentenceReviewFrameExtractor_prompt_guide.txt +0 -0
- {llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/data_types.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: llm-ie
|
|
3
|
-
Version: 1.2.
|
|
3
|
+
Version: 1.2.4
|
|
4
4
|
Summary: A comprehensive toolkit that provides building blocks for LLM-based named entity recognition, attribute extraction, and relation extraction pipelines.
|
|
5
5
|
License: MIT
|
|
6
6
|
Author: Enshuo (David) Hsu
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "llm-ie"
|
|
3
|
-
version = "1.2.
|
|
3
|
+
version = "1.2.4"
|
|
4
4
|
description = "A comprehensive toolkit that provides building blocks for LLM-based named entity recognition, attribute extraction, and relation extraction pipelines."
|
|
5
5
|
authors = ["Enshuo (David) Hsu"]
|
|
6
6
|
license = "MIT"
|
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
from .data_types import LLMInformationExtractionFrame, LLMInformationExtractionDocument
|
|
2
2
|
from .engines import BasicLLMConfig, ReasoningLLMConfig, Qwen3LLMConfig, OpenAIReasoningLLMConfig
|
|
3
|
-
from .engines import LlamaCppInferenceEngine, OllamaInferenceEngine, HuggingFaceHubInferenceEngine, VLLMInferenceEngine, OpenRouterInferenceEngine, OpenAIInferenceEngine, AzureOpenAIInferenceEngine, LiteLLMInferenceEngine
|
|
3
|
+
from .engines import LlamaCppInferenceEngine, OllamaInferenceEngine, HuggingFaceHubInferenceEngine, VLLMInferenceEngine, SGLangInferenceEngine, OpenRouterInferenceEngine, OpenAIInferenceEngine, AzureOpenAIInferenceEngine, LiteLLMInferenceEngine
|
|
4
4
|
from .extractors import DirectFrameExtractor, ReviewFrameExtractor, BasicFrameExtractor, BasicReviewFrameExtractor, SentenceFrameExtractor, SentenceReviewFrameExtractor, AttributeExtractor, BinaryRelationExtractor, MultiClassRelationExtractor
|
|
5
|
-
from .chunkers import UnitChunker, WholeDocumentUnitChunker, SentenceUnitChunker, TextLineUnitChunker, SeparatorUnitChunker, ContextChunker, NoContextChunker, WholeDocumentContextChunker, SlideWindowContextChunker
|
|
5
|
+
from .chunkers import UnitChunker, WholeDocumentUnitChunker, SentenceUnitChunker, TextLineUnitChunker, SeparatorUnitChunker, LLMUnitChunker, ContextChunker, NoContextChunker, WholeDocumentContextChunker, SlideWindowContextChunker
|
|
6
6
|
from .prompt_editor import PromptEditor
|
|
7
7
|
|
|
8
8
|
__all__ = ["LLMInformationExtractionFrame", "LLMInformationExtractionDocument",
|
|
9
|
-
"BasicLLMConfig", "ReasoningLLMConfig", "Qwen3LLMConfig", "OpenAIReasoningLLMConfig", "LlamaCppInferenceEngine", "OllamaInferenceEngine", "HuggingFaceHubInferenceEngine", "VLLMInferenceEngine", "OpenRouterInferenceEngine", "OpenAIInferenceEngine", "AzureOpenAIInferenceEngine", "LiteLLMInferenceEngine",
|
|
9
|
+
"BasicLLMConfig", "ReasoningLLMConfig", "Qwen3LLMConfig", "OpenAIReasoningLLMConfig", "LlamaCppInferenceEngine", "OllamaInferenceEngine", "HuggingFaceHubInferenceEngine", "VLLMInferenceEngine", "SGLangInferenceEngine", "OpenRouterInferenceEngine", "OpenAIInferenceEngine", "AzureOpenAIInferenceEngine", "LiteLLMInferenceEngine",
|
|
10
10
|
"DirectFrameExtractor", "ReviewFrameExtractor", "BasicFrameExtractor", "BasicReviewFrameExtractor", "SentenceFrameExtractor", "SentenceReviewFrameExtractor", "AttributeExtractor", "BinaryRelationExtractor", "MultiClassRelationExtractor",
|
|
11
|
-
"UnitChunker", "WholeDocumentUnitChunker", "SentenceUnitChunker", "TextLineUnitChunker", "SeparatorUnitChunker", "ContextChunker", "NoContextChunker", "WholeDocumentContextChunker", "SlideWindowContextChunker",
|
|
11
|
+
"UnitChunker", "WholeDocumentUnitChunker", "SentenceUnitChunker", "TextLineUnitChunker", "SeparatorUnitChunker", "LLMUnitChunker", "ContextChunker", "NoContextChunker", "WholeDocumentContextChunker", "SlideWindowContextChunker",
|
|
12
12
|
"PromptEditor"]
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
### Task description
|
|
2
|
+
You are a helpful assistant that breaks down a text document into semantic units or chunks. Each chunk should represent a coherent section of the text, such as a paragraph, subsection, or topic.
|
|
3
|
+
|
|
4
|
+
### Schema definition
|
|
5
|
+
You will output a JSON array of objects. Each object should have the following fields:
|
|
6
|
+
- "title": Generate a brief title summarizing the content of the chunk.
|
|
7
|
+
- "anchor_text": the first line of text in the chunk used to locate it in the original document. Must be an exact match.
|
|
8
|
+
- if there is a title or heading for the chunk, use that as the anchor_text.
|
|
9
|
+
- if there is no title or heading, use the first sentence of the chunk as the anchor_text.
|
|
10
|
+
|
|
11
|
+
```JSON
|
|
12
|
+
[
|
|
13
|
+
{
|
|
14
|
+
"title": "<your title here>",
|
|
15
|
+
"anchor_text": "<the anchor text of the chunk here>"
|
|
16
|
+
},
|
|
17
|
+
{
|
|
18
|
+
"title": "<your title here>",
|
|
19
|
+
"anchor_text": "<the anchor text of the chunk here>"
|
|
20
|
+
}
|
|
21
|
+
]
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
### Examples
|
|
25
|
+
|
|
26
|
+
**Input**:
|
|
27
|
+
"# Clinical Note
|
|
28
|
+
|
|
29
|
+
**Patient Name**: Michael Green
|
|
30
|
+
**Medical Record Number**: 1122334455
|
|
31
|
+
**Date of Visit**: January 5, 2025
|
|
32
|
+
**Provider**: Dr. Emily Carter, MD
|
|
33
|
+
|
|
34
|
+
## Reason for Visit
|
|
35
|
+
Follow-up for poorly controlled type 2 diabetes and complaints of occasional dizziness and blurred vision.
|
|
36
|
+
|
|
37
|
+
## Summary of Visit
|
|
38
|
+
Michael Green, a 62-year-old male with a known history of type 2 diabetes, hypertension, and obesity, presents for follow-up regarding his glycemic control. Despite recent adjustments to his treatment plan, his glucose readings have remained elevated, averaging 180-220 mg/dL. He reports occasional episodes of dizziness and blurred vision, particularly in the morning before meals. He denies chest pain, palpitations, or recent falls. He reports compliance with his medication regimen but admits to difficulty following a consistent low-carbohydrate diet.
|
|
39
|
+
|
|
40
|
+
Michael has been using a glucose meter to monitor his blood sugar levels and logs them daily. His last hemoglobin A1c, performed three months ago, was 9.2%. He reports no recent hospitalizations, infections, or significant stressors.
|
|
41
|
+
|
|
42
|
+
## Notable History
|
|
43
|
+
- **Chronic Conditions**:
|
|
44
|
+
- Type 2 diabetes mellitus, diagnosed 10 years ago.
|
|
45
|
+
- Hypertension, well-controlled on medication.
|
|
46
|
+
- Hyperlipidemia, on statin therapy.
|
|
47
|
+
- **Past Surgical History**:
|
|
48
|
+
- Knee arthroscopy for a meniscal tear, age 50.
|
|
49
|
+
- **Family History**:
|
|
50
|
+
- Mother: Deceased at 75, complications from diabetes.
|
|
51
|
+
- Father: Deceased at 70, myocardial infarction."
|
|
52
|
+
|
|
53
|
+
**Output**:
|
|
54
|
+
```JSON
|
|
55
|
+
[
|
|
56
|
+
{
|
|
57
|
+
"title": "Patient Information",
|
|
58
|
+
"anchor_text": "# Clinical Note"
|
|
59
|
+
},
|
|
60
|
+
{
|
|
61
|
+
"title": "Reason for Visit",
|
|
62
|
+
"anchor_text": "## Reason for Visit"
|
|
63
|
+
},
|
|
64
|
+
{
|
|
65
|
+
"title": "Summary of Visit",
|
|
66
|
+
"anchor_text": "## Summary of Visit"
|
|
67
|
+
},
|
|
68
|
+
{
|
|
69
|
+
"title": "Notable History",
|
|
70
|
+
"anchor_text": "## Notable History"
|
|
71
|
+
}
|
|
72
|
+
]
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
**Input**:
|
|
76
|
+
"In the [**Hospital1 18**] ED, 35.3 102 133/58 100%AC 500x20, 5, 1.0 with an
|
|
77
|
+
ABG 7.16/66/162. He had a CTH which was unremarkable. He then
|
|
78
|
+
had a CTA chest, afterwhich he went into PEA arrest.
|
|
79
|
+
Rescucitation last approximately 10-15 minutes with multiple
|
|
80
|
+
rounds of epi and bicarb, with ROSC. He was then admitted to the
|
|
81
|
+
MICU for further management.
|
|
82
|
+
.
|
|
83
|
+
Currently, the patient is intubated, sedated, and parlyzed.
|
|
84
|
+
|
|
85
|
+
Past Medical History:
|
|
86
|
+
Asthma
|
|
87
|
+
Dilated cardiomyopathy
|
|
88
|
+
Multiple admissions for dyspnea this winter ([**1-26**]).
|
|
89
|
+
Anxiety/depression
|
|
90
|
+
CKD
|
|
91
|
+
HLD
|
|
92
|
+
Obesity
|
|
93
|
+
HTN
|
|
94
|
+
|
|
95
|
+
Social History:
|
|
96
|
+
Unknown
|
|
97
|
+
|
|
98
|
+
Family History:
|
|
99
|
+
Unknown"
|
|
100
|
+
|
|
101
|
+
**Output**:
|
|
102
|
+
```JSON
|
|
103
|
+
[
|
|
104
|
+
{
|
|
105
|
+
"title": "Patient Presentation and Initial Management",
|
|
106
|
+
"anchor_text": "In the [**Hospital1 18**] ED, 35.3 102 133/58 100%AC 500x20, 5, 1.0 with an"
|
|
107
|
+
},
|
|
108
|
+
{
|
|
109
|
+
"title": "Current Status of the Patient",
|
|
110
|
+
"anchor_text": "Currently, the patient is intubated, sedated, and parlyzed."
|
|
111
|
+
},
|
|
112
|
+
{
|
|
113
|
+
"title": "Past Medical History",
|
|
114
|
+
"anchor_text": "Past Medical History:"
|
|
115
|
+
},
|
|
116
|
+
{
|
|
117
|
+
"title": "Social History",
|
|
118
|
+
"anchor_text": "Social History:"
|
|
119
|
+
},
|
|
120
|
+
{
|
|
121
|
+
"title": "Family History",
|
|
122
|
+
"anchor_text": "Family History:"
|
|
123
|
+
}
|
|
124
|
+
]
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
### Document text
|
|
128
|
+
|
|
129
|
+
"{{document_text}}"
|
|
@@ -1,8 +1,11 @@
|
|
|
1
1
|
import abc
|
|
2
|
-
from typing import
|
|
2
|
+
from typing import List
|
|
3
3
|
import asyncio
|
|
4
4
|
import uuid
|
|
5
|
+
import importlib.resources
|
|
6
|
+
from llm_ie.utils import extract_json, apply_prompt_template
|
|
5
7
|
from llm_ie.data_types import FrameExtractionUnit
|
|
8
|
+
from llm_ie.engines import InferenceEngine
|
|
6
9
|
|
|
7
10
|
|
|
8
11
|
class UnitChunker(abc.ABC):
|
|
@@ -74,13 +77,14 @@ class SeparatorUnitChunker(UnitChunker):
|
|
|
74
77
|
text : str
|
|
75
78
|
The document text.
|
|
76
79
|
"""
|
|
80
|
+
doc_id = doc_id if doc_id is not None else str(uuid.uuid4())
|
|
77
81
|
paragraphs = text.split(self.sep)
|
|
78
82
|
paragraph_units = []
|
|
79
83
|
start = 0
|
|
80
84
|
for paragraph in paragraphs:
|
|
81
85
|
end = start + len(paragraph)
|
|
82
86
|
paragraph_units.append(FrameExtractionUnit(
|
|
83
|
-
doc_id=doc_id
|
|
87
|
+
doc_id=doc_id,
|
|
84
88
|
start=start,
|
|
85
89
|
end=end,
|
|
86
90
|
text=paragraph
|
|
@@ -104,10 +108,11 @@ class SentenceUnitChunker(UnitChunker):
|
|
|
104
108
|
text : str
|
|
105
109
|
The document text.
|
|
106
110
|
"""
|
|
111
|
+
doc_id = doc_id if doc_id is not None else str(uuid.uuid4())
|
|
107
112
|
sentences = []
|
|
108
113
|
for start, end in self.PunktSentenceTokenizer().span_tokenize(text):
|
|
109
114
|
sentences.append(FrameExtractionUnit(
|
|
110
|
-
doc_id=doc_id
|
|
115
|
+
doc_id=doc_id,
|
|
111
116
|
start=start,
|
|
112
117
|
end=end,
|
|
113
118
|
text=text[start:end]
|
|
@@ -129,13 +134,14 @@ class TextLineUnitChunker(UnitChunker):
|
|
|
129
134
|
text : str
|
|
130
135
|
The document text.
|
|
131
136
|
"""
|
|
137
|
+
doc_id = doc_id if doc_id is not None else str(uuid.uuid4())
|
|
132
138
|
lines = text.split('\n')
|
|
133
139
|
line_units = []
|
|
134
140
|
start = 0
|
|
135
141
|
for line in lines:
|
|
136
142
|
end = start + len(line)
|
|
137
143
|
line_units.append(FrameExtractionUnit(
|
|
138
|
-
doc_id=doc_id
|
|
144
|
+
doc_id=doc_id,
|
|
139
145
|
start=start,
|
|
140
146
|
end=end,
|
|
141
147
|
text=line
|
|
@@ -143,6 +149,100 @@ class TextLineUnitChunker(UnitChunker):
|
|
|
143
149
|
start = end + 1
|
|
144
150
|
return line_units
|
|
145
151
|
|
|
152
|
+
class LLMUnitChunker(UnitChunker):
|
|
153
|
+
def __init__(self, inference_engine:InferenceEngine, prompt_template:str=None, system_prompt:str=None):
|
|
154
|
+
"""
|
|
155
|
+
This class prompt an LLM for document segmentation (e.g., sections, paragraphs).
|
|
156
|
+
|
|
157
|
+
Parameters:
|
|
158
|
+
----------
|
|
159
|
+
inference_engine : InferenceEngine
|
|
160
|
+
the LLM inferencing engine object.
|
|
161
|
+
prompt_template : str
|
|
162
|
+
the prompt template that defines how to chunk the document. Must define a JSON schema with
|
|
163
|
+
```json
|
|
164
|
+
[
|
|
165
|
+
{
|
|
166
|
+
"title": "<your title here>",
|
|
167
|
+
"anchor_text": "<the anchor text of the chunk here>"
|
|
168
|
+
},
|
|
169
|
+
{
|
|
170
|
+
"title": "<your title here>",
|
|
171
|
+
"anchor_text": "<the anchor text of the chunk here>"
|
|
172
|
+
}
|
|
173
|
+
]
|
|
174
|
+
```
|
|
175
|
+
system_prompt : str, optional
|
|
176
|
+
The system prompt.
|
|
177
|
+
"""
|
|
178
|
+
self.inference_engine = inference_engine
|
|
179
|
+
|
|
180
|
+
if prompt_template is None:
|
|
181
|
+
file_path = importlib.resources.files('llm_ie.asset.default_prompts').joinpath("LLMUnitChunker_user_prompt.txt")
|
|
182
|
+
with open(file_path, 'r', encoding="utf-8") as f:
|
|
183
|
+
self.prompt_template = f.read()
|
|
184
|
+
else:
|
|
185
|
+
self.prompt_template = prompt_template
|
|
186
|
+
|
|
187
|
+
self.system_prompt = system_prompt
|
|
188
|
+
|
|
189
|
+
def chunk(self, text, doc_id=None) -> List[FrameExtractionUnit]:
|
|
190
|
+
"""
|
|
191
|
+
Parameters:
|
|
192
|
+
-----------
|
|
193
|
+
text : str
|
|
194
|
+
the document text.
|
|
195
|
+
doc_id : str, optional
|
|
196
|
+
the document id.
|
|
197
|
+
"""
|
|
198
|
+
doc_id = doc_id if doc_id is not None else str(uuid.uuid4())
|
|
199
|
+
user_prompt = apply_prompt_template(prompt_template=self.prompt_template, text_content=text)
|
|
200
|
+
messages = []
|
|
201
|
+
if self.system_prompt is not None:
|
|
202
|
+
messages.append({'role': 'system', 'content': self.system_prompt})
|
|
203
|
+
messages.append({'role': 'user', 'content': user_prompt})
|
|
204
|
+
|
|
205
|
+
gen_text = self.inference_engine.chat(messages=messages)
|
|
206
|
+
|
|
207
|
+
header_list = extract_json(gen_text=gen_text["response"])
|
|
208
|
+
units = []
|
|
209
|
+
start = 0
|
|
210
|
+
prev_end = 0
|
|
211
|
+
for header in header_list:
|
|
212
|
+
if "anchor_text" not in header:
|
|
213
|
+
Warning.warn(f"Missing anchor_text in header: {header}. Skipping this header.")
|
|
214
|
+
continue
|
|
215
|
+
if not isinstance(header["anchor_text"], str):
|
|
216
|
+
Warning.warn(f"Invalid anchor_text: {header['anchor_text']}. Skipping this header.")
|
|
217
|
+
continue
|
|
218
|
+
|
|
219
|
+
start = prev_end
|
|
220
|
+
# find the first instandce of the leading sentence in the rest of the text
|
|
221
|
+
end = text.find(header["anchor_text"], start)
|
|
222
|
+
# if not found, skip this header
|
|
223
|
+
if end == -1:
|
|
224
|
+
continue
|
|
225
|
+
# if start == end (empty text), skip this header
|
|
226
|
+
if start == end:
|
|
227
|
+
continue
|
|
228
|
+
# create a frame extraction unit
|
|
229
|
+
units.append(FrameExtractionUnit(
|
|
230
|
+
doc_id=doc_id,
|
|
231
|
+
start=start,
|
|
232
|
+
end=end,
|
|
233
|
+
text=text[start:end]
|
|
234
|
+
))
|
|
235
|
+
prev_end = end
|
|
236
|
+
# add the last section
|
|
237
|
+
if prev_end < len(text):
|
|
238
|
+
units.append(FrameExtractionUnit(
|
|
239
|
+
doc_id=doc_id,
|
|
240
|
+
start=prev_end,
|
|
241
|
+
end=len(text),
|
|
242
|
+
text=text[prev_end:]
|
|
243
|
+
))
|
|
244
|
+
return units
|
|
245
|
+
|
|
146
246
|
|
|
147
247
|
class ContextChunker(abc.ABC):
|
|
148
248
|
def __init__(self):
|
|
@@ -1060,6 +1060,50 @@ class VLLMInferenceEngine(OpenAICompatibleInferenceEngine):
|
|
|
1060
1060
|
return {"reasoning": getattr(response.choices[0].message, "reasoning_content", ""),
|
|
1061
1061
|
"response": getattr(response.choices[0].message, "content", "")}
|
|
1062
1062
|
|
|
1063
|
+
class SGLangInferenceEngine(OpenAICompatibleInferenceEngine):
|
|
1064
|
+
def __init__(self, model:str, api_key:str="", base_url:str="http://localhost:30000/v1", config:LLMConfig=None, **kwrs):
|
|
1065
|
+
"""
|
|
1066
|
+
SGLang OpenAI compatible API inference engine.
|
|
1067
|
+
https://docs.sglang.ai/basic_usage/openai_api.html
|
|
1068
|
+
|
|
1069
|
+
Parameters:
|
|
1070
|
+
----------
|
|
1071
|
+
model_name : str
|
|
1072
|
+
model name as shown in the vLLM server
|
|
1073
|
+
api_key : str, Optional
|
|
1074
|
+
the API key for the vLLM server.
|
|
1075
|
+
base_url : str, Optional
|
|
1076
|
+
the base url for the vLLM server.
|
|
1077
|
+
config : LLMConfig
|
|
1078
|
+
the LLM configuration.
|
|
1079
|
+
"""
|
|
1080
|
+
super().__init__(model, api_key, base_url, config, **kwrs)
|
|
1081
|
+
|
|
1082
|
+
|
|
1083
|
+
def _format_response(self, response: Any) -> Dict[str, str]:
|
|
1084
|
+
"""
|
|
1085
|
+
This method format the response from OpenAI API to a dict with keys "type" and "data".
|
|
1086
|
+
|
|
1087
|
+
Parameters:
|
|
1088
|
+
----------
|
|
1089
|
+
response : Any
|
|
1090
|
+
the response from OpenAI-compatible API. Could be a dict, generator, or object.
|
|
1091
|
+
"""
|
|
1092
|
+
if isinstance(response, self.ChatCompletionChunk):
|
|
1093
|
+
if hasattr(response.choices[0].delta, "reasoning_content") and getattr(response.choices[0].delta, "reasoning_content") is not None:
|
|
1094
|
+
chunk_text = getattr(response.choices[0].delta, "reasoning_content", "")
|
|
1095
|
+
if chunk_text is None:
|
|
1096
|
+
chunk_text = ""
|
|
1097
|
+
return {"type": "reasoning", "data": chunk_text}
|
|
1098
|
+
else:
|
|
1099
|
+
chunk_text = getattr(response.choices[0].delta, "content", "")
|
|
1100
|
+
if chunk_text is None:
|
|
1101
|
+
chunk_text = ""
|
|
1102
|
+
return {"type": "response", "data": chunk_text}
|
|
1103
|
+
|
|
1104
|
+
return {"reasoning": getattr(response.choices[0].message, "reasoning_content", ""),
|
|
1105
|
+
"response": getattr(response.choices[0].message, "content", "")}
|
|
1106
|
+
|
|
1063
1107
|
|
|
1064
1108
|
class OpenRouterInferenceEngine(OpenAICompatibleInferenceEngine):
|
|
1065
1109
|
def __init__(self, model:str, api_key:str=None, base_url:str="https://openrouter.ai/api/v1", config:LLMConfig=None, **kwrs):
|
|
@@ -1,7 +1,5 @@
|
|
|
1
1
|
import abc
|
|
2
2
|
import re
|
|
3
|
-
import json
|
|
4
|
-
import json_repair
|
|
5
3
|
import inspect
|
|
6
4
|
import importlib.resources
|
|
7
5
|
import warnings
|
|
@@ -10,6 +8,7 @@ import asyncio
|
|
|
10
8
|
import nest_asyncio
|
|
11
9
|
from concurrent.futures import ThreadPoolExecutor
|
|
12
10
|
from typing import Any, Set, List, Dict, Tuple, Union, Callable, Generator, Optional, AsyncGenerator
|
|
11
|
+
from llm_ie.utils import extract_json, apply_prompt_template
|
|
13
12
|
from llm_ie.data_types import FrameExtractionUnit, LLMInformationExtractionFrame, LLMInformationExtractionDocument
|
|
14
13
|
from llm_ie.chunkers import UnitChunker, WholeDocumentUnitChunker, SentenceUnitChunker
|
|
15
14
|
from llm_ie.chunkers import ContextChunker, NoContextChunker, WholeDocumentContextChunker, SlideWindowContextChunker
|
|
@@ -96,79 +95,8 @@ class Extractor:
|
|
|
96
95
|
Returns : str
|
|
97
96
|
a user prompt.
|
|
98
97
|
"""
|
|
99
|
-
|
|
100
|
-
if isinstance(text_content, str):
|
|
101
|
-
matches = pattern.findall(self.prompt_template)
|
|
102
|
-
if len(matches) != 1:
|
|
103
|
-
raise ValueError("When text_content is str, the prompt template must has exactly 1 placeholder {{<placeholder name>}}.")
|
|
104
|
-
text = re.sub(r'\\', r'\\\\', text_content)
|
|
105
|
-
prompt = pattern.sub(text, self.prompt_template)
|
|
98
|
+
return apply_prompt_template(self.prompt_template, text_content)
|
|
106
99
|
|
|
107
|
-
elif isinstance(text_content, dict):
|
|
108
|
-
# Check if all values are str
|
|
109
|
-
if not all([isinstance(v, str) for v in text_content.values()]):
|
|
110
|
-
raise ValueError("All values in text_content must be str.")
|
|
111
|
-
# Check if all keys are in the prompt template
|
|
112
|
-
placeholders = pattern.findall(self.prompt_template)
|
|
113
|
-
if len(placeholders) != len(text_content):
|
|
114
|
-
raise ValueError(f"Expect text_content ({len(text_content)}) and prompt template placeholder ({len(placeholders)}) to have equal size.")
|
|
115
|
-
if not all([k in placeholders for k, _ in text_content.items()]):
|
|
116
|
-
raise ValueError(f"All keys in text_content ({text_content.keys()}) must match placeholders in prompt template ({placeholders}).")
|
|
117
|
-
|
|
118
|
-
prompt = pattern.sub(lambda match: re.sub(r'\\', r'\\\\', text_content[match.group(1)]), self.prompt_template)
|
|
119
|
-
|
|
120
|
-
return prompt
|
|
121
|
-
|
|
122
|
-
def _find_dict_strings(self, text: str) -> List[str]:
|
|
123
|
-
"""
|
|
124
|
-
Extracts balanced JSON-like dictionaries from a string, even if nested.
|
|
125
|
-
|
|
126
|
-
Parameters:
|
|
127
|
-
-----------
|
|
128
|
-
text : str
|
|
129
|
-
the input text containing JSON-like structures.
|
|
130
|
-
|
|
131
|
-
Returns : List[str]
|
|
132
|
-
A list of valid JSON-like strings representing dictionaries.
|
|
133
|
-
"""
|
|
134
|
-
open_brace = 0
|
|
135
|
-
start = -1
|
|
136
|
-
json_objects = []
|
|
137
|
-
|
|
138
|
-
for i, char in enumerate(text):
|
|
139
|
-
if char == '{':
|
|
140
|
-
if open_brace == 0:
|
|
141
|
-
# start of a new JSON object
|
|
142
|
-
start = i
|
|
143
|
-
open_brace += 1
|
|
144
|
-
elif char == '}':
|
|
145
|
-
open_brace -= 1
|
|
146
|
-
if open_brace == 0 and start != -1:
|
|
147
|
-
json_objects.append(text[start:i + 1])
|
|
148
|
-
start = -1
|
|
149
|
-
|
|
150
|
-
return json_objects
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
def _extract_json(self, gen_text:str) -> List[Dict[str, str]]:
|
|
154
|
-
"""
|
|
155
|
-
This method inputs a generated text and output a JSON of information tuples
|
|
156
|
-
"""
|
|
157
|
-
out = []
|
|
158
|
-
dict_str_list = self._find_dict_strings(gen_text)
|
|
159
|
-
for dict_str in dict_str_list:
|
|
160
|
-
try:
|
|
161
|
-
dict_obj = json.loads(dict_str)
|
|
162
|
-
out.append(dict_obj)
|
|
163
|
-
except json.JSONDecodeError:
|
|
164
|
-
dict_obj = json_repair.repair_json(dict_str, skip_json_loads=True, return_objects=True)
|
|
165
|
-
if dict_obj:
|
|
166
|
-
warnings.warn(f'JSONDecodeError detected, fixed with repair_json:\n{dict_str}', RuntimeWarning)
|
|
167
|
-
out.append(dict_obj)
|
|
168
|
-
else:
|
|
169
|
-
warnings.warn(f'JSONDecodeError could not be fixed:\n{dict_str}', RuntimeWarning)
|
|
170
|
-
return out
|
|
171
|
-
|
|
172
100
|
|
|
173
101
|
class FrameExtractor(Extractor):
|
|
174
102
|
from nltk.tokenize import RegexpTokenizer
|
|
@@ -759,7 +687,7 @@ class DirectFrameExtractor(FrameExtractor):
|
|
|
759
687
|
if unit.status != "success":
|
|
760
688
|
warnings.warn(f"Skipping failed unit ({unit.start}, {unit.end}): {unit.text}", RuntimeWarning)
|
|
761
689
|
continue
|
|
762
|
-
for entity in
|
|
690
|
+
for entity in extract_json(gen_text=unit.gen_text):
|
|
763
691
|
if ENTITY_KEY in entity:
|
|
764
692
|
entity_json.append(entity)
|
|
765
693
|
else:
|
|
@@ -963,7 +891,7 @@ class DirectFrameExtractor(FrameExtractor):
|
|
|
963
891
|
frame_list = []
|
|
964
892
|
for res in sorted(doc_results['units'], key=lambda r: r.start):
|
|
965
893
|
entity_json = []
|
|
966
|
-
for entity in
|
|
894
|
+
for entity in extract_json(gen_text=res.gen_text):
|
|
967
895
|
if ENTITY_KEY in entity:
|
|
968
896
|
entity_json.append(entity)
|
|
969
897
|
else:
|
|
@@ -1712,7 +1640,7 @@ class AttributeExtractor(Extractor):
|
|
|
1712
1640
|
messages_logger=messages_logger
|
|
1713
1641
|
)
|
|
1714
1642
|
|
|
1715
|
-
attribute_list =
|
|
1643
|
+
attribute_list = extract_json(gen_text=gen_text["response"])
|
|
1716
1644
|
if isinstance(attribute_list, list) and len(attribute_list) > 0:
|
|
1717
1645
|
attributes = attribute_list[0]
|
|
1718
1646
|
if return_messages_log:
|
|
@@ -1822,7 +1750,7 @@ class AttributeExtractor(Extractor):
|
|
|
1822
1750
|
messages.append({'role': 'user', 'content': self._get_user_prompt({"context": context, "frame": str(frame.to_dict())})})
|
|
1823
1751
|
|
|
1824
1752
|
gen_text = await self.inference_engine.chat_async(messages=messages, messages_logger=messages_logger)
|
|
1825
|
-
attribute_list =
|
|
1753
|
+
attribute_list = extract_json(gen_text=gen_text["response"])
|
|
1826
1754
|
attributes = attribute_list[0] if isinstance(attribute_list, list) and len(attribute_list) > 0 else {}
|
|
1827
1755
|
return {"frame": frame, "attributes": attributes, "messages": messages}
|
|
1828
1756
|
|
|
@@ -2075,7 +2003,7 @@ class BinaryRelationExtractor(RelationExtractor):
|
|
|
2075
2003
|
return None
|
|
2076
2004
|
|
|
2077
2005
|
def _post_process_result(self, gen_text: str, pair_data: Dict[str, Any]) -> Optional[Dict[str, Any]]:
|
|
2078
|
-
rel_json =
|
|
2006
|
+
rel_json = extract_json(gen_text)
|
|
2079
2007
|
if len(rel_json) > 0 and "Relation" in rel_json[0]:
|
|
2080
2008
|
rel = rel_json[0]["Relation"]
|
|
2081
2009
|
if (isinstance(rel, bool) and rel) or (isinstance(rel, str) and rel.lower() == 'true'):
|
|
@@ -2141,7 +2069,7 @@ class MultiClassRelationExtractor(RelationExtractor):
|
|
|
2141
2069
|
return None
|
|
2142
2070
|
|
|
2143
2071
|
def _post_process_result(self, gen_text: str, pair_data: Dict[str, Any]) -> Optional[Dict[str, Any]]:
|
|
2144
|
-
rel_json =
|
|
2072
|
+
rel_json = extract_json(gen_text)
|
|
2145
2073
|
pos_rel_types = pair_data['pos_rel_types']
|
|
2146
2074
|
if len(rel_json) > 0 and "RelationType" in rel_json[0]:
|
|
2147
2075
|
rel_type = rel_json[0]["RelationType"]
|
|
@@ -2,6 +2,7 @@ import sys
|
|
|
2
2
|
import warnings
|
|
3
3
|
from typing import List, Dict, Generator
|
|
4
4
|
import importlib.resources
|
|
5
|
+
from llm_ie.utils import apply_prompt_template
|
|
5
6
|
from llm_ie.engines import InferenceEngine
|
|
6
7
|
from llm_ie.extractors import FrameExtractor
|
|
7
8
|
import re
|
|
@@ -45,30 +46,6 @@ class PromptEditor:
|
|
|
45
46
|
|
|
46
47
|
# internal memory (history messages) for the `chat` method
|
|
47
48
|
self.messages = []
|
|
48
|
-
|
|
49
|
-
def _apply_prompt_template(self, text_content:Dict[str,str], prompt_template:str) -> str:
|
|
50
|
-
"""
|
|
51
|
-
This method applies text_content to prompt_template and returns a prompt.
|
|
52
|
-
|
|
53
|
-
Parameters
|
|
54
|
-
----------
|
|
55
|
-
text_content : Dict[str,str]
|
|
56
|
-
the input text content to put in prompt template.
|
|
57
|
-
all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
|
|
58
|
-
|
|
59
|
-
Returns : str
|
|
60
|
-
a prompt.
|
|
61
|
-
"""
|
|
62
|
-
pattern = re.compile(r'{{(.*?)}}')
|
|
63
|
-
placeholders = pattern.findall(prompt_template)
|
|
64
|
-
if len(placeholders) != len(text_content):
|
|
65
|
-
raise ValueError(f"Expect text_content ({len(text_content)}) and prompt template placeholder ({len(placeholders)}) to have equal size.")
|
|
66
|
-
if not all([k in placeholders for k, _ in text_content.items()]):
|
|
67
|
-
raise ValueError(f"All keys in text_content ({text_content.keys()}) must match placeholders in prompt template ({placeholders}).")
|
|
68
|
-
|
|
69
|
-
prompt = pattern.sub(lambda match: re.sub(r'\\', r'\\\\', text_content[match.group(1)]), prompt_template)
|
|
70
|
-
|
|
71
|
-
return prompt
|
|
72
49
|
|
|
73
50
|
|
|
74
51
|
def rewrite(self, draft:str) -> str:
|
|
@@ -80,8 +57,8 @@ class PromptEditor:
|
|
|
80
57
|
with open(file_path, 'r') as f:
|
|
81
58
|
rewrite_prompt_template = f.read()
|
|
82
59
|
|
|
83
|
-
prompt =
|
|
84
|
-
|
|
60
|
+
prompt = apply_prompt_template(prompt_template=rewrite_prompt_template,
|
|
61
|
+
text_content={"draft": draft, "prompt_guideline": self.prompt_guide})
|
|
85
62
|
messages = [{"role": "system", "content": self.system_prompt},
|
|
86
63
|
{"role": "user", "content": prompt}]
|
|
87
64
|
res = self.inference_engine.chat(messages, verbose=True)
|
|
@@ -96,8 +73,8 @@ class PromptEditor:
|
|
|
96
73
|
with open(file_path, 'r') as f:
|
|
97
74
|
comment_prompt_template = f.read()
|
|
98
75
|
|
|
99
|
-
prompt =
|
|
100
|
-
|
|
76
|
+
prompt = apply_prompt_template(prompt_template=comment_prompt_template,
|
|
77
|
+
text_content={"draft": draft, "prompt_guideline": self.prompt_guide})
|
|
101
78
|
messages = [{"role": "system", "content": self.system_prompt},
|
|
102
79
|
{"role": "user", "content": prompt}]
|
|
103
80
|
res = self.inference_engine.chat(messages, verbose=True)
|
|
@@ -254,8 +231,8 @@ class PromptEditor:
|
|
|
254
231
|
with open(file_path, 'r') as f:
|
|
255
232
|
chat_prompt_template = f.read()
|
|
256
233
|
|
|
257
|
-
guideline =
|
|
258
|
-
|
|
234
|
+
guideline = apply_prompt_template(prompt_template=chat_prompt_template,
|
|
235
|
+
text_content={"prompt_guideline": self.prompt_guide})
|
|
259
236
|
|
|
260
237
|
self.messages = [{"role": "system", "content": self.system_prompt + guideline}]
|
|
261
238
|
|
|
@@ -288,8 +265,8 @@ class PromptEditor:
|
|
|
288
265
|
with open(file_path, 'r') as f:
|
|
289
266
|
chat_prompt_template = f.read()
|
|
290
267
|
|
|
291
|
-
guideline =
|
|
292
|
-
|
|
268
|
+
guideline = apply_prompt_template(prompt_template=chat_prompt_template,
|
|
269
|
+
text_content={"prompt_guideline": self.prompt_guide})
|
|
293
270
|
|
|
294
271
|
messages = [{"role": "system", "content": self.system_prompt + guideline}] + messages
|
|
295
272
|
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
from typing import List, Dict, Union
|
|
2
|
+
import re
|
|
3
|
+
import json
|
|
4
|
+
import warnings
|
|
5
|
+
import json_repair
|
|
6
|
+
|
|
7
|
+
def _find_dict_strings(text: str) -> List[str]:
|
|
8
|
+
"""
|
|
9
|
+
Extracts balanced JSON-like dictionaries from a string, even if nested.
|
|
10
|
+
|
|
11
|
+
Parameters:
|
|
12
|
+
-----------
|
|
13
|
+
text : str
|
|
14
|
+
the input text containing JSON-like structures.
|
|
15
|
+
|
|
16
|
+
Returns : List[str]
|
|
17
|
+
A list of valid JSON-like strings representing dictionaries.
|
|
18
|
+
"""
|
|
19
|
+
open_brace = 0
|
|
20
|
+
start = -1
|
|
21
|
+
json_objects = []
|
|
22
|
+
|
|
23
|
+
for i, char in enumerate(text):
|
|
24
|
+
if char == '{':
|
|
25
|
+
if open_brace == 0:
|
|
26
|
+
# start of a new JSON object
|
|
27
|
+
start = i
|
|
28
|
+
open_brace += 1
|
|
29
|
+
elif char == '}':
|
|
30
|
+
open_brace -= 1
|
|
31
|
+
if open_brace == 0 and start != -1:
|
|
32
|
+
json_objects.append(text[start:i + 1])
|
|
33
|
+
start = -1
|
|
34
|
+
|
|
35
|
+
return json_objects
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def extract_json(gen_text:str) -> List[Dict[str, str]]:
    """
    Parse every JSON dictionary found in LLM-generated text.

    Each brace-balanced candidate substring is parsed with the strict ``json``
    parser first; on failure, ``json_repair`` attempts to salvage it. A warning
    is emitted whenever a repair was needed or the candidate is unrecoverable.
    """
    results = []
    for candidate in _find_dict_strings(gen_text):
        try:
            results.append(json.loads(candidate))
        except json.JSONDecodeError:
            repaired = json_repair.repair_json(candidate, skip_json_loads=True, return_objects=True)
            if repaired:
                warnings.warn(f'JSONDecodeError detected, fixed with repair_json:\n{candidate}', RuntimeWarning)
                results.append(repaired)
            else:
                warnings.warn(f'JSONDecodeError could not be fixed:\n{candidate}', RuntimeWarning)
    return results
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def apply_prompt_template(prompt_template:str, text_content:Union[str, Dict[str,str]]) -> str:
    """
    Fill the ``{{<placeholder name>}}`` slots of a prompt template with text content.

    Parameters:
    ----------
    prompt_template : str
        the prompt template with placeholders {{<placeholder name>}}.
    text_content : Union[str, Dict[str,str]]
        the input text content to put in prompt template.
        If str, the prompt template must have exactly 1 distinct placeholder
        {{<placeholder name>}}, regardless of placeholder name; repeated
        occurrences of that placeholder are all filled.
        If dict, its keys must exactly match the distinct placeholder names in
        the prompt template. All values must be str.

    Returns : str
        a user prompt.

    Raises:
    ------
    ValueError
        if text_content does not match the template's placeholders, if any
        dict value is not a str, or if text_content is neither str nor dict.
    """
    pattern = re.compile(r'{{(.*?)}}')

    if isinstance(text_content, str):
        # Count *distinct* placeholder names so a template that repeats its
        # single placeholder is still accepted.
        if len(set(pattern.findall(prompt_template))) != 1:
            raise ValueError("When text_content is str, the prompt template must have exactly 1 placeholder {{<placeholder name>}}.")
        # A callable replacement is used verbatim by re.sub — no backslash
        # escape processing — so the content passes through unchanged.
        return pattern.sub(lambda _: text_content, prompt_template)

    if isinstance(text_content, dict):
        # Check if all values are str
        if not all(isinstance(v, str) for v in text_content.values()):
            raise ValueError("All values in text_content must be str.")
        # Distinct placeholder names must match the dict keys exactly; this
        # also allows a placeholder to appear more than once in the template.
        placeholders = set(pattern.findall(prompt_template))
        if placeholders != set(text_content.keys()):
            raise ValueError(f"All keys in text_content ({text_content.keys()}) must match placeholders in prompt template ({placeholders}).")

        # Callable replacement: returned strings are literal, so backslashes in
        # the content are NOT doubled (the original escaped them inside the
        # lambda, corrupting any content containing '\').
        return pattern.sub(lambda match: text_content[match.group(1)], prompt_template)

    # Original fell through to an unbound-variable NameError here.
    raise ValueError("text_content must be a str or a Dict[str, str].")
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/asset/prompt_guide/AttributeExtractor_prompt_guide.txt
RENAMED
|
File without changes
|
{llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/asset/prompt_guide/BasicFrameExtractor_prompt_guide.txt
RENAMED
|
File without changes
|
|
File without changes
|
{llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/asset/prompt_guide/BinaryRelationExtractor_prompt_guide.txt
RENAMED
|
File without changes
|
{llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/asset/prompt_guide/DirectFrameExtractor_prompt_guide.txt
RENAMED
|
File without changes
|
|
File without changes
|
{llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/asset/prompt_guide/ReviewFrameExtractor_prompt_guide.txt
RENAMED
|
File without changes
|
{llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/asset/prompt_guide/SentenceFrameExtractor_prompt_guide.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|