llm-ie 0.1.7__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llm_ie/asset/prompt_guide/BinaryRelationExtractor_prompt_guide.txt +38 -0
- llm_ie/asset/prompt_guide/MultiClassRelationExtractor_prompt_guide.txt +46 -0
- llm_ie/data_types.py +131 -18
- llm_ie/extractors.py +436 -13
- {llm_ie-0.1.7.dist-info → llm_ie-0.2.0.dist-info}/METADATA +212 -5
- {llm_ie-0.1.7.dist-info → llm_ie-0.2.0.dist-info}/RECORD +7 -5
- {llm_ie-0.1.7.dist-info → llm_ie-0.2.0.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
Prompt template design:
|
|
2
|
+
1. Task description (mention binary relation extraction and ROI)
|
|
3
|
+
2. Schema definition (defines relation)
|
|
4
|
+
3. Output format definition (must use the key "Relation")
|
|
5
|
+
4. Hints
|
|
6
|
+
5. Input placeholders (must include "roi_text", "frame_1", and "frame_2" placeholders)
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
Example:
|
|
10
|
+
|
|
11
|
+
# Task description
|
|
12
|
+
This is a binary relation extraction task. Given a region of interest (ROI) text and two entities from a medical note, indicate the relation existence between the two entities.
|
|
13
|
+
|
|
14
|
+
# Schema definition
|
|
15
|
+
True: if there is a relationship between a medication name (one of the entities) and its strength or frequency (the other entity).
|
|
16
|
+
False: Otherwise.
|
|
17
|
+
|
|
18
|
+
# Output format definition
|
|
19
|
+
Your output should follow the JSON format:
|
|
20
|
+
{"Relation": "<True or False>"}
|
|
21
|
+
|
|
22
|
+
I am only interested in the content between {}. Do not explain your answer.
|
|
23
|
+
|
|
24
|
+
# Hints
|
|
25
|
+
1. Your input always contains one medication entity and 1) one strength entity or 2) one frequency entity.
|
|
26
|
+
2. Pay attention to the medication entity and see if the strength or frequency is for it.
|
|
27
|
+
3. If the strength or frequency is for another medication, output False.
|
|
28
|
+
4. If the strength or frequency is for the same medication but at a different location (span), output False.
|
|
29
|
+
|
|
30
|
+
# Input placeholders
|
|
31
|
+
ROI Text with the two entities annotated with <entity_1> and <entity_2>:
|
|
32
|
+
"{{roi_text}}"
|
|
33
|
+
|
|
34
|
+
Entity 1 full information:
|
|
35
|
+
{{frame_1}}
|
|
36
|
+
|
|
37
|
+
Entity 2 full information:
|
|
38
|
+
{{frame_2}}
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
Prompt template design:
|
|
2
|
+
1. Task description (mention multi-class relation extraction and ROI)
|
|
3
|
+
2. Schema definition (defines relation types)
|
|
4
|
+
3. Output format definition (must use the key "RelationType" and include the "pos_rel_types" placeholder)
|
|
5
|
+
4. Input placeholders (must include "roi_text", "frame_1", and "frame_2" placeholders)
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
Example:
|
|
9
|
+
|
|
10
|
+
# Task description
|
|
11
|
+
This is a multi-class relation extraction task. Given a region of interest (ROI) text and two frames from a medical note, classify the relation types between the two frames.
|
|
12
|
+
|
|
13
|
+
# Schema definition
|
|
14
|
+
Strength-Drug: this is a relationship between the drug strength and its name.
|
|
15
|
+
Dosage-Drug: this is a relationship between the drug dosage and its name.
|
|
16
|
+
Duration-Drug: this is a relationship between a drug duration and its name.
|
|
17
|
+
Frequency-Drug: this is a relationship between a drug frequency and its name.
|
|
18
|
+
Form-Drug: this is a relationship between a drug form and its name.
|
|
19
|
+
Route-Drug: this is a relationship between the route of administration for a drug and its name.
|
|
20
|
+
Reason-Drug: this is a relationship between the reason for which a drug was administered (e.g., symptoms, diseases, etc.) and a drug name.
|
|
21
|
+
ADE-Drug: this is a relationship between an adverse drug event (ADE) and a drug name.
|
|
22
|
+
|
|
23
|
+
# Output format definition
|
|
24
|
+
Choose one of the relation types listed below or choose "No Relation":
|
|
25
|
+
{{pos_rel_types}}
|
|
26
|
+
|
|
27
|
+
Your output should follow the JSON format:
|
|
28
|
+
{"RelationType": "<relation type or No Relation>"}
|
|
29
|
+
|
|
30
|
+
I am only interested in the content between {}. Do not explain your answer.
|
|
31
|
+
|
|
32
|
+
# Hints
|
|
33
|
+
1. Your input always contains one medication entity and 1) one strength entity or 2) one frequency entity.
|
|
34
|
+
2. Pay attention to the medication entity and see if the strength or frequency is for it.
|
|
35
|
+
3. If the strength or frequency is for another medication, output "No Relation".
|
|
36
|
+
4. If the strength or frequency is for the same medication but at a different location (span), output "No Relation".
|
|
37
|
+
|
|
38
|
+
# Input placeholders
|
|
39
|
+
ROI Text with the two entities annotated with <entity_1> and <entity_2>:
|
|
40
|
+
"{{roi_text}}"
|
|
41
|
+
|
|
42
|
+
Entity 1 full information:
|
|
43
|
+
{{frame_1}}
|
|
44
|
+
|
|
45
|
+
Entity 2 full information:
|
|
46
|
+
{{frame_2}}
|
llm_ie/data_types.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
from typing import List, Dict
|
|
2
|
-
import
|
|
1
|
+
from typing import List, Dict, Iterable
|
|
2
|
+
import json
|
|
3
3
|
|
|
4
4
|
|
|
5
5
|
class LLMInformationExtractionFrame:
|
|
@@ -22,7 +22,8 @@ class LLMInformationExtractionFrame:
|
|
|
22
22
|
attr : Dict[str,str], Optional
|
|
23
23
|
dict of attributes
|
|
24
24
|
"""
|
|
25
|
-
|
|
25
|
+
if not isinstance(frame_id, str):
|
|
26
|
+
raise TypeError("frame_id must be a string.")
|
|
26
27
|
self.frame_id = frame_id
|
|
27
28
|
self.start = start
|
|
28
29
|
self.end = end
|
|
@@ -78,7 +79,8 @@ class LLMInformationExtractionFrame:
|
|
|
78
79
|
|
|
79
80
|
|
|
80
81
|
class LLMInformationExtractionDocument:
|
|
81
|
-
def __init__(self, doc_id:str=None, filename:str=None, text:str=None,
|
|
82
|
+
def __init__(self, doc_id:str=None, filename:str=None, text:str=None,
|
|
83
|
+
frames:List[LLMInformationExtractionFrame]=None, relations:List[Dict[str,str]]=None):
|
|
82
84
|
"""
|
|
83
85
|
This class holds LLM-extracted frames, handles save/ load.
|
|
84
86
|
|
|
@@ -92,30 +94,79 @@ class LLMInformationExtractionDocument:
|
|
|
92
94
|
document text
|
|
93
95
|
frames : List[LLMInformationExtractionFrame], Optional
|
|
94
96
|
a list of LLMInformationExtractionFrame
|
|
97
|
+
relations : List[Dict[str,str]], Optional
|
|
98
|
+
a list of dictionary of {"frame_1", "frame_2", "relation"}.
|
|
99
|
+
If binary relation (no relation type), there is no "relation" key.
|
|
95
100
|
"""
|
|
96
|
-
|
|
101
|
+
if doc_id is None and filename is None:
|
|
102
|
+
raise ValueError("Either doc_id (create from raw inputs) or filename (create from file) must be provided.")
|
|
97
103
|
# if create object from file
|
|
98
104
|
if filename:
|
|
99
|
-
with open(filename) as
|
|
100
|
-
llm_ie =
|
|
105
|
+
with open(filename) as json_file:
|
|
106
|
+
llm_ie = json.load(json_file)
|
|
101
107
|
if 'doc_id' in llm_ie.keys():
|
|
102
108
|
self.doc_id = llm_ie['doc_id']
|
|
103
109
|
if 'text' in llm_ie.keys():
|
|
104
110
|
self.text = llm_ie['text']
|
|
105
111
|
if 'frames' in llm_ie.keys():
|
|
106
112
|
self.frames = [LLMInformationExtractionFrame.from_dict(d) for d in llm_ie['frames']]
|
|
113
|
+
if 'relations' in llm_ie.keys():
|
|
114
|
+
self.relations = llm_ie['relations']
|
|
107
115
|
|
|
108
116
|
# create object from raw inputs
|
|
109
117
|
else:
|
|
110
|
-
|
|
118
|
+
if not isinstance(doc_id, str):
|
|
119
|
+
raise TypeError("doc_id must be a string.")
|
|
111
120
|
self.doc_id = doc_id
|
|
112
121
|
self.text = text
|
|
113
122
|
self.frames = frames.copy() if frames is not None else []
|
|
123
|
+
self.relations = relations.copy() if relations is not None else []
|
|
114
124
|
|
|
115
125
|
|
|
116
126
|
def has_frame(self) -> bool:
    """
    Report whether the document currently holds any frames.

    Returns : bool
        True when self.frames is non-empty, False otherwise.
    """
    frames_present = bool(self.frames)
    return frames_present
|
|
118
131
|
|
|
132
|
+
def has_relation(self) -> bool:
    """
    Report whether the document currently holds any relations.

    Returns : bool
        True when self.relations is non-empty, False otherwise.
    """
    relations_present = bool(self.relations)
    return relations_present
|
|
137
|
+
|
|
138
|
+
def has_duplicate_frame_ids(self) -> bool:
    """
    Check whether two or more frames share the same frame_id.

    Returns : bool
        True as soon as a repeated frame_id is encountered, False if every
        frame_id in self.frames is unique (or there are no frames).
    """
    seen_ids = set()
    for frame in self.frames:
        size_before = len(seen_ids)
        seen_ids.add(frame.frame_id)
        # A set only grows when the id is new; an unchanged size means a repeat.
        if len(seen_ids) == size_before:
            return True

    return False
|
|
149
|
+
|
|
150
|
+
def get_frame_by_id(self, frame_id:str) -> LLMInformationExtractionFrame:
    """
    Look up a frame by its frame_id.
    If several frames share the same frame_id, the first one in self.frames is returned.

    Parameters:
    -----------
    frame_id : str
        frame id to retrieve

    Returns : LLMInformationExtractionFrame
        the matching frame, or None when no frame has that id.
    """
    candidates = (frame for frame in self.frames if frame.frame_id == frame_id)
    return next(candidates, None)
|
|
168
|
+
|
|
169
|
+
|
|
119
170
|
def add_frame(self, frame:LLMInformationExtractionFrame, valid_mode:str=None, create_id:bool=False) -> bool:
|
|
120
171
|
"""
|
|
121
172
|
This method add a new frame to the frames (list).
|
|
@@ -132,7 +183,11 @@ class LLMInformationExtractionDocument:
|
|
|
132
183
|
create_id : bool, Optional
|
|
133
184
|
Assign a sequential frame ID.
|
|
134
185
|
"""
|
|
135
|
-
|
|
186
|
+
if not isinstance(frame, LLMInformationExtractionFrame):
|
|
187
|
+
raise TypeError(f"Expect frame to be LLMInformationExtractionFrame, received {type(frame)} instead.")
|
|
188
|
+
|
|
189
|
+
if valid_mode not in {None, "span", "attr"}:
|
|
190
|
+
raise ValueError(f'Expect valid_mode to be one of {{None, "span", "attr"}}, received {valid_mode}')
|
|
136
191
|
|
|
137
192
|
if valid_mode == "span":
|
|
138
193
|
for exist_frame in self.frames:
|
|
@@ -153,18 +208,76 @@ class LLMInformationExtractionDocument:
|
|
|
153
208
|
return True
|
|
154
209
|
|
|
155
210
|
|
|
211
|
+
def add_frames(self, frames:List[LLMInformationExtractionFrame], valid_mode:str=None, create_id:bool=False):
    """
    This method adds several frames by calling add_frame() on each one in order.

    Parameters:
    -----------
    frames : List[LLMInformationExtractionFrame]
        the frames to add. Any iterable is accepted.
    valid_mode : str, Optional
        forwarded unchanged to add_frame().
    create_id : bool, Optional
        forwarded unchanged to add_frame().

    Raises:
    -------
    TypeError
        if frames is not iterable.
    """
    if not isinstance(frames, Iterable):
        raise TypeError("frames must be a list or Iterable.")

    for frame in frames:
        self.add_frame(frame=frame, valid_mode=valid_mode, create_id=create_id)
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def add_relation(self, relation:Dict[str,str]) -> bool:
    """
    Validate a relation and append it to self.relations.

    Parameters:
    -----------
    relation : Dict[str,str]
        the relation to add. Must contain the keys "frame_1" and "frame_2"
        (frame ids present in this document) and may contain an optional
        "relation" key for the relation type. No other keys are allowed.

    Returns : bool
        True once the relation has been appended.

    Raises:
    -------
    TypeError
        if relation is not a Dict.
    ValueError
        if a required key is missing, an unexpected key is present, or one of
        the referenced frame ids is not found by get_frame_by_id().
    """
    if not isinstance(relation, Dict):
        raise TypeError(f"Expect relation to be a Dict, received {type(relation)} instead.")

    present_keys = set(relation.keys())
    if not {"frame_1", "frame_2"} <= present_keys:
        raise ValueError('relation missing "frame_1" or "frame_2" keys.')

    if not present_keys <= {"frame_1", "frame_2", "relation"}:
        raise ValueError('Only keys {"frame_1", "frame_2", "relation"} are allowed.')

    # Both endpoints must refer to frames that already exist in the document.
    for endpoint in ("frame_1", "frame_2"):
        if not self.get_frame_by_id(relation[endpoint]):
            raise ValueError(f'frame_id: {relation[endpoint]} not found in frames.')

    self.relations.append(relation)
    return True
|
|
254
|
+
|
|
255
|
+
def add_relations(self, relations:List[Dict[str,str]]):
    """
    This method adds several relations by calling add_relation() on each one in order.

    Parameters:
    -----------
    relations : List[Dict[str,str]]
        the relations to add. Any iterable is accepted; each item is
        validated by add_relation().

    Raises:
    -------
    TypeError
        if relations is not iterable.
    """
    if not isinstance(relations, Iterable):
        raise TypeError("relations must be a list or Iterable.")

    for relation in relations:
        self.add_relation(relation)
|
|
263
|
+
|
|
264
|
+
|
|
156
265
|
def __repr__(self, N_top_chars:int=100) -> str:
|
|
157
266
|
text_to_print = self.text[0:N_top_chars]
|
|
158
267
|
frame_count = len(self.frames)
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
f'
|
|
268
|
+
relation_count = len(self.relations)
|
|
269
|
+
return ''.join((f'LLMInformationExtractionDocument(doc_id: "{self.doc_id}"\n',
|
|
270
|
+
f'text: "{text_to_print}...",\n',
|
|
271
|
+
f'frames: {frame_count}\n',
|
|
272
|
+
f'relations: {relation_count}'))
|
|
273
|
+
|
|
162
274
|
|
|
163
275
|
def save(self, filename:str):
    """
    Write the document to a JSON file.

    The file holds the keys 'doc_id', 'text', 'frames' (each frame converted
    with to_dict()) and 'relations', indented by 4 spaces.

    Parameters:
    -----------
    filename : str
        path of the file to write; an existing file is overwritten.
    """
    with open(filename, 'w') as json_file:
        payload = {
            'doc_id': self.doc_id,
            'text': self.text,
            'frames': [frame.to_dict() for frame in self.frames],
            'relations': self.relations,
        }
        json.dump(payload, json_file, indent=4)
        json_file.flush()
|
|
170
283
|
|
llm_ie/extractors.py
CHANGED
|
@@ -1,16 +1,19 @@
|
|
|
1
1
|
import abc
|
|
2
2
|
import re
|
|
3
3
|
import json
|
|
4
|
+
import inspect
|
|
4
5
|
import importlib.resources
|
|
5
|
-
|
|
6
|
-
|
|
6
|
+
import warnings
|
|
7
|
+
import itertools
|
|
8
|
+
from typing import List, Dict, Tuple, Union, Callable
|
|
9
|
+
from llm_ie.data_types import LLMInformationExtractionFrame, LLMInformationExtractionDocument
|
|
7
10
|
from llm_ie.engines import InferenceEngine
|
|
8
11
|
|
|
9
12
|
|
|
10
|
-
class
|
|
13
|
+
class Extractor:
|
|
11
14
|
def __init__(self, inference_engine:InferenceEngine, prompt_template:str, system_prompt:str=None, **kwrs):
|
|
12
15
|
"""
|
|
13
|
-
This is the abstract class for frame
|
|
16
|
+
This is the abstract class for (frame and relation) extractors.
|
|
14
17
|
Input LLM inference engine, system prompt (optional), prompt template (with instruction, few-shot examples).
|
|
15
18
|
|
|
16
19
|
Parameters
|
|
@@ -26,12 +29,17 @@ class FrameExtractor:
|
|
|
26
29
|
self.prompt_template = prompt_template
|
|
27
30
|
self.system_prompt = system_prompt
|
|
28
31
|
|
|
32
|
+
|
|
29
33
|
@classmethod
def get_prompt_guide(cls) -> str:
    """
    This method returns the pre-defined prompt guideline for the extractor from the package asset.

    The guide is looked up in the 'llm_ie.asset.prompt_guide' package under the
    name "<class name>_prompt_guide.txt".

    Returns : str
        the content of the prompt guide file.
    """
    resource = importlib.resources.files('llm_ie.asset.prompt_guide').joinpath(f"{cls.__name__}_prompt_guide.txt")
    # files() returns a Traversable, which is not guaranteed to be a real
    # filesystem path, so read through the Traversable API instead of open().
    return resource.read_text(encoding="utf-8")
|
|
34
41
|
|
|
42
|
+
|
|
35
43
|
def _get_user_prompt(self, text_content:Union[str, Dict[str,str]]) -> str:
|
|
36
44
|
"""
|
|
37
45
|
This method applies text_content to prompt_template and returns a prompt.
|
|
@@ -49,18 +57,19 @@ class FrameExtractor:
|
|
|
49
57
|
pattern = re.compile(r'{{(.*?)}}')
|
|
50
58
|
if isinstance(text_content, str):
|
|
51
59
|
matches = pattern.findall(self.prompt_template)
|
|
52
|
-
|
|
53
|
-
"When text_content is str, the prompt template must has
|
|
54
|
-
|
|
60
|
+
if len(matches) != 1:
|
|
61
|
+
raise ValueError("When text_content is str, the prompt template must has exactly 1 placeholder {{<placeholder name>}}.")
|
|
62
|
+
text = re.sub(r'\\', r'\\\\', text_content)
|
|
63
|
+
prompt = pattern.sub(text, self.prompt_template)
|
|
55
64
|
|
|
56
65
|
elif isinstance(text_content, dict):
|
|
57
66
|
placeholders = pattern.findall(self.prompt_template)
|
|
58
|
-
|
|
59
|
-
f"Expect text_content ({len(text_content)}) and prompt template placeholder ({len(placeholders)}) to have equal size."
|
|
60
|
-
|
|
61
|
-
f"All keys in text_content ({text_content.keys()}) must match placeholders in prompt template ({placeholders})."
|
|
67
|
+
if len(placeholders) != len(text_content):
|
|
68
|
+
raise ValueError(f"Expect text_content ({len(text_content)}) and prompt template placeholder ({len(placeholders)}) to have equal size.")
|
|
69
|
+
if not all([k in placeholders for k, _ in text_content.items()]):
|
|
70
|
+
raise ValueError(f"All keys in text_content ({text_content.keys()}) must match placeholders in prompt template ({placeholders}).")
|
|
62
71
|
|
|
63
|
-
prompt = pattern.sub(lambda match: text_content[match.group(1)], self.prompt_template)
|
|
72
|
+
prompt = pattern.sub(lambda match: re.sub(r'\\', r'\\\\', text_content[match.group(1)]), self.prompt_template)
|
|
64
73
|
|
|
65
74
|
return prompt
|
|
66
75
|
|
|
@@ -79,6 +88,27 @@ class FrameExtractor:
|
|
|
79
88
|
return out
|
|
80
89
|
|
|
81
90
|
|
|
91
|
+
class FrameExtractor(Extractor):
|
|
92
|
+
def __init__(self, inference_engine:InferenceEngine, prompt_template:str, system_prompt:str=None, **kwrs):
    """
    Base class for frame extraction.
    All arguments are forwarded unchanged to the parent constructor.

    Parameters
    ----------
    inference_engine : InferenceEngine
        the LLM inferencing engine object. Must implement the chat() method.
    prompt_template : str
        prompt template with "{{<placeholder name>}}" placeholder.
    system_prompt : str, Optional
        system prompt.
    """
    parent_kwargs = dict(inference_engine=inference_engine,
                         prompt_template=prompt_template,
                         system_prompt=system_prompt)
    super().__init__(**parent_kwargs, **kwrs)
|
|
110
|
+
|
|
111
|
+
|
|
82
112
|
def _find_entity_spans(self, text: str, entities: List[str], case_sensitive:bool=False) -> List[Tuple[int]]:
|
|
83
113
|
"""
|
|
84
114
|
This function inputs a text and a list of entity text,
|
|
@@ -290,7 +320,8 @@ class ReviewFrameExtractor(BasicFrameExtractor):
|
|
|
290
320
|
super().__init__(inference_engine=inference_engine, prompt_template=prompt_template,
|
|
291
321
|
system_prompt=system_prompt, **kwrs)
|
|
292
322
|
self.review_prompt = review_prompt
|
|
293
|
-
|
|
323
|
+
if review_mode not in {"addition", "revision"}:
|
|
324
|
+
raise ValueError('review_mode must be one of {"addition", "revision"}.')
|
|
294
325
|
self.review_mode = review_mode
|
|
295
326
|
|
|
296
327
|
|
|
@@ -528,3 +559,395 @@ class SentenceFrameExtractor(FrameExtractor):
|
|
|
528
559
|
attr={k: v for k, v in ent.items() if k != entity_key and v != ""})
|
|
529
560
|
frame_list.append(frame)
|
|
530
561
|
return frame_list
|
|
562
|
+
|
|
563
|
+
|
|
564
|
+
class RelationExtractor(Extractor):
    def __init__(self, inference_engine:InferenceEngine, prompt_template:str, system_prompt:str=None, **kwrs):
        """
        This is the abstract class for relation extraction.
        Input LLM inference engine, system prompt (optional), prompt template (with instruction, few-shot examples).

        Parameters
        ----------
        inference_engine : InferenceEngine
            the LLM inferencing engine object. Must implement the chat() method.
        prompt_template : str
            prompt template with "{{<placeholder name>}}" placeholder.
        system_prompt : str, Optional
            system prompt.
        """
        super().__init__(inference_engine=inference_engine,
                         prompt_template=prompt_template,
                         system_prompt=system_prompt,
                         **kwrs)

    def _get_ROI(self, frame_1:LLMInformationExtractionFrame, frame_2:LLMInformationExtractionFrame,
                 text:str, buffer_size:int=100) -> str:
        """
        This method returns the Region of Interest (ROI) that covers the two frames. Leaves a buffer_size of characters before and after.
        The returned text has the two frames inline annotated with <entity_1>, <entity_2>.

        Parameters:
        -----------
        frame_1 : LLMInformationExtractionFrame
            a frame
        frame_2 : LLMInformationExtractionFrame
            the other frame
        text : str
            the entire document text
        buffer_size : int, Optional
            the number of characters before and after the two frames in the ROI text.

        Return : str
            the ROI text with the two frames inline annotated with <entity_1>, <entity_2>.
            "..." is prepended/appended when the ROI does not reach the start/end of text.
        """
        # Order the frames by position so the tags can be inserted left to right.
        left_frame, right_frame = sorted([frame_1, frame_2], key=lambda f: f.start)
        # Identity (not equality) decides the tag, so two distinct frames are
        # always labelled entity_1 and entity_2 respectively.
        left_frame_name = "entity_1" if left_frame is frame_1 else "entity_2"
        right_frame_name = "entity_1" if right_frame is frame_1 else "entity_2"

        start = max(left_frame.start - buffer_size, 0)
        end = min(right_frame.end + buffer_size, len(text))
        roi = text[start:end]

        # Frame offsets are document-level; subtract start to index into roi.
        roi_annotated = roi[0:left_frame.start - start] + \
                        f'<{left_frame_name}>' + \
                        roi[left_frame.start - start:left_frame.end - start] + \
                        f"</{left_frame_name}>" + \
                        roi[left_frame.end - start:right_frame.start - start] + \
                        f'<{right_frame_name}>' + \
                        roi[right_frame.start - start:right_frame.end - start] + \
                        f"</{right_frame_name}>" + \
                        roi[right_frame.end - start:end - start]

        if start > 0:
            roi_annotated = "..." + roi_annotated
        if end < len(text):
            roi_annotated = roi_annotated + "..."
        return roi_annotated


    @abc.abstractmethod
    def extract_relations(self, doc:LLMInformationExtractionDocument, buffer_size:int=100, max_new_tokens:int=128,
                          temperature:float=0.0, stream:bool=False, **kwrs) -> List[Dict]:
        """
        This method considers all combinations of two frames.
        Subclasses must override it.

        Parameters:
        -----------
        doc : LLMInformationExtractionDocument
            a document with frames.
        buffer_size : int, Optional
            the number of characters before and after the two frames in the ROI text.
        max_new_tokens : int, Optional
            the max number of new tokens LLM should generate.
        temperature : float, Optional
            the temperature for token sampling.
        stream : bool, Optional
            if True, LLM generated text will be printed in terminal in real-time.

        Return : List[Dict]
            a list of dict with {"frame_1", "frame_2"} for all relations.

        Raises:
        -------
        NotImplementedError
            always, when the base implementation is called.
        """
        # NotImplemented is the sentinel for binary-operator dispatch; an
        # un-overridden method must raise instead of returning it silently.
        raise NotImplementedError("extract_relations() must be implemented by subclasses.")
|
|
652
|
+
|
|
653
|
+
|
|
654
|
+
class BinaryRelationExtractor(RelationExtractor):
    def __init__(self, inference_engine:InferenceEngine, prompt_template:str, possible_relation_func: Callable,
                 system_prompt:str=None, **kwrs):
        """
        This class extracts binary (yes/no) relations between two entities.
        Input LLM inference engine, system prompt (optional), prompt template (with instruction, few-shot examples).

        Parameters
        ----------
        inference_engine : InferenceEngine
            the LLM inferencing engine object. Must implement the chat() method.
        prompt_template : str
            prompt template with "{{<placeholder name>}}" placeholder.
        possible_relation_func : Callable, Optional
            a function that inputs 2 frames and returns a bool indicating possible relations between them.
            It must declare exactly the parameters frame_1 and frame_2 and a "-> bool" return annotation.
            If None, no pre-filtering is applied and every pair of frames is sent to the LLM.
        system_prompt : str, Optional
            system prompt.

        Raises
        ------
        TypeError
            if possible_relation_func is given but is not callable.
        ValueError
            if possible_relation_func does not have the expected signature.
        """
        super().__init__(inference_engine=inference_engine,
                         prompt_template=prompt_template,
                         system_prompt=system_prompt,
                         **kwrs)

        if possible_relation_func is not None:
            # Check if possible_relation_func is a function
            if not callable(possible_relation_func):
                raise TypeError(f"Expect possible_relation_func as a function, received {type(possible_relation_func)} instead.")

            sig = inspect.signature(possible_relation_func)
            # Check if frame_1, frame_2 are in input parameters
            if len(sig.parameters) != 2:
                raise ValueError("The possible_relation_func must have exactly frame_1 and frame_2 as parameters.")
            if "frame_1" not in sig.parameters.keys():
                raise ValueError("The possible_relation_func is missing frame_1 as a parameter.")
            if "frame_2" not in sig.parameters.keys():
                raise ValueError("The possible_relation_func is missing frame_2 as a parameter.")
            # Check if output is a bool
            if sig.return_annotation != bool:
                raise ValueError(f"Expect possible_relation_func to output a bool, current type hint suggests {sig.return_annotation} instead.")

        # Always define the attribute so extract_relations() never hits an
        # AttributeError; None means "do not pre-filter pairs".
        self.possible_relation_func = possible_relation_func


    def _extract_relation(self, frame_1:LLMInformationExtractionFrame, frame_2:LLMInformationExtractionFrame,
                          text:str, buffer_size:int=100, max_new_tokens:int=128, temperature:float=0.0, stream:bool=False, **kwrs) -> bool:
        """
        This method inputs two frames and a ROI text, extracts the binary relation.

        Parameters:
        -----------
        frame_1 : LLMInformationExtractionFrame
            a frame
        frame_2 : LLMInformationExtractionFrame
            the other frame
        text : str
            the entire document text
        buffer_size : int, Optional
            the number of characters before and after the two frames in the ROI text.
        max_new_tokens : int, Optional
            the max number of new tokens LLM should generate.
        temperature : float, Optional
            the temperature for token sampling.
        stream : bool, Optional
            if True, LLM generated text will be printed in terminal in real-time.

        Return : bool
            a relation indicator. Falls back to False (with a RuntimeWarning)
            when the LLM output cannot be interpreted.
        """
        roi_text = self._get_ROI(frame_1, frame_2, text, buffer_size=buffer_size)
        if stream:
            print(f"\n\nROI text: \n{roi_text}\n")
            print("Extraction:")

        messages = []
        if self.system_prompt:
            messages.append({'role': 'system', 'content': self.system_prompt})

        messages.append({'role': 'user', 'content': self._get_user_prompt(text_content={"roi_text":roi_text,
                                                                                        "frame_1": str(frame_1.to_dict()),
                                                                                        "frame_2": str(frame_2.to_dict())}
                                                                          )})
        response = self.inference_engine.chat(
            messages=messages,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            stream=stream,
            **kwrs
        )

        rel_json = self._extract_json(response)
        if len(rel_json) > 0:
            if "Relation" in rel_json[0]:
                rel = rel_json[0]["Relation"]
                if isinstance(rel, bool):
                    return rel
                elif isinstance(rel, str) and rel in {"True", "False"}:
                    # Plain comparison instead of eval(): never execute text
                    # that originates from model output.
                    return rel == "True"
                else:
                    warnings.warn('Extractor output JSON "Relation" key does not have bool or {"True", "False"} as value. ' + \
                                  'Following default, relation = False.', RuntimeWarning)
            else:
                warnings.warn('Extractor output JSON without "Relation" key. Following default, relation = False.', RuntimeWarning)
        else:
            warnings.warn("Extractor did not output a JSON. Following default, relation = False.", RuntimeWarning)

        return False


    def extract_relations(self, doc:LLMInformationExtractionDocument, buffer_size:int=100, max_new_tokens:int=128,
                          temperature:float=0.0, stream:bool=False, **kwrs) -> List[Dict]:
        """
        This method considers all combinations of two frames. Use the possible_relation_func to filter impossible pairs.

        Parameters:
        -----------
        doc : LLMInformationExtractionDocument
            a document with frames.
        buffer_size : int, Optional
            the number of characters before and after the two frames in the ROI text.
        max_new_tokens : int, Optional
            the max number of new tokens LLM should generate.
        temperature : float, Optional
            the temperature for token sampling.
        stream : bool, Optional
            if True, LLM generated text will be printed in terminal in real-time.

        Return : List[Dict]
            a list of dict with {"frame_1", "frame_2"} for all relations.

        Raises:
        -------
        ValueError
            if the document has no frames or contains duplicate frame_ids.
        """
        if not doc.has_frame():
            raise ValueError("Input document must have frames.")

        if doc.has_duplicate_frame_ids():
            raise ValueError("All frame_ids in the input document must be unique.")

        pairs = itertools.combinations(doc.frames, 2)
        rel_pair_list = []
        for frame_1, frame_2 in pairs:
            # Skip pairs the user-supplied filter rules out; without a filter
            # every pair is evaluated.
            if self.possible_relation_func is not None and not self.possible_relation_func(frame_1, frame_2):
                continue

            rel = self._extract_relation(frame_1=frame_1, frame_2=frame_2, text=doc.text, buffer_size=buffer_size,
                                         max_new_tokens=max_new_tokens, temperature=temperature, stream=stream, **kwrs)
            if rel:
                rel_pair_list.append({'frame_1':frame_1.frame_id, 'frame_2':frame_2.frame_id})

        return rel_pair_list
|
|
800
|
+
|
|
801
|
+
|
|
802
|
+
|
|
803
|
+
class MultiClassRelationExtractor(RelationExtractor):
|
|
804
|
+
def __init__(self, inference_engine:InferenceEngine, prompt_template:str, possible_relation_types_func: Callable,
             system_prompt:str=None, **kwrs):
    """
    This class extracts relations with relation types.
    Input LLM inference engine, system prompt (optional), prompt template (with instruction, few-shot examples).

    Parameters
    ----------
    inference_engine : InferenceEngine
        the LLM inferencing engine object. Must implement the chat() method.
    prompt_template : str
        prompt template with "{{<placeholder name>}}" placeholder.
    possible_relation_types_func : Callable
        a function that inputs 2 frames and returns a List of possible relation types between them.
        If the two frames must not have relations, this function should return an empty list [].
        It must declare exactly the parameters frame_1 and frame_2; its return
        annotation may be omitted or be List, List[str], list or list[str].
    system_prompt : str, Optional
        system prompt.

    Raises
    ------
    TypeError
        if possible_relation_types_func is given but is not callable.
    ValueError
        if possible_relation_types_func does not have the expected signature.
    """
    super().__init__(inference_engine=inference_engine,
                     prompt_template=prompt_template,
                     system_prompt=system_prompt,
                     **kwrs)

    if possible_relation_types_func:
        # Check if possible_relation_types_func is a function
        if not callable(possible_relation_types_func):
            raise TypeError(f"Expect possible_relation_types_func as a function, received {type(possible_relation_types_func)} instead.")

        sig = inspect.signature(possible_relation_types_func)
        # Check if frame_1, frame_2 are in input parameters
        if len(sig.parameters) != 2:
            raise ValueError("The possible_relation_types_func must have exactly frame_1 and frame_2 as parameters.")
        if "frame_1" not in sig.parameters.keys():
            raise ValueError("The possible_relation_types_func is missing frame_1 as a parameter.")
        if "frame_2" not in sig.parameters.keys():
            raise ValueError("The possible_relation_types_func is missing frame_2 as a parameter.")
        # Check if output is a List. Accept both the typing aliases and the
        # builtin generics; inspect.Signature.empty marks "no annotation".
        accepted_annotations = {inspect.Signature.empty, List, List[str], list, list[str]}
        if sig.return_annotation not in accepted_annotations:
            raise ValueError(f"Expect possible_relation_types_func to output a List of string, current type hint suggests {sig.return_annotation} instead.")

    # NOTE(review): the source view has lost indentation, so it is unclear
    # whether this assignment was originally inside the guard above; assigning
    # unconditionally keeps the attribute defined in every case.
    self.possible_relation_types_func = possible_relation_types_func
|
|
845
|
+
|
|
846
|
+
|
|
847
|
+
def _extract_relation(self, frame_1:LLMInformationExtractionFrame, frame_2:LLMInformationExtractionFrame,
|
|
848
|
+
pos_rel_types:List[str], text:str, buffer_size:int=100, max_new_tokens:int=128, temperature:float=0.0, stream:bool=False, **kwrs) -> str:
|
|
849
|
+
"""
|
|
850
|
+
This method inputs two frames and a ROI text, extracts the relation.
|
|
851
|
+
|
|
852
|
+
Parameters:
|
|
853
|
+
-----------
|
|
854
|
+
frame_1 : LLMInformationExtractionFrame
|
|
855
|
+
a frame
|
|
856
|
+
frame_2 : LLMInformationExtractionFrame
|
|
857
|
+
the other frame
|
|
858
|
+
pos_rel_types : List[str]
|
|
859
|
+
possible relation types.
|
|
860
|
+
text : str
|
|
861
|
+
the entire document text
|
|
862
|
+
buffer_size : int, Optional
|
|
863
|
+
the number of characters before and after the two frames in the ROI text.
|
|
864
|
+
max_new_tokens : int, Optional
|
|
865
|
+
the max number of new tokens LLM should generate.
|
|
866
|
+
temperature : float, Optional
|
|
867
|
+
the temperature for token sampling.
|
|
868
|
+
stream : bool, Optional
|
|
869
|
+
if True, LLM generated text will be printed in terminal in real-time.
|
|
870
|
+
|
|
871
|
+
Return : str
|
|
872
|
+
a relation type
|
|
873
|
+
"""
|
|
874
|
+
roi_text = self._get_ROI(frame_1, frame_2, text, buffer_size=buffer_size)
|
|
875
|
+
if stream:
|
|
876
|
+
print(f"\n\nROI text: \n{roi_text}\n")
|
|
877
|
+
print("Extraction:")
|
|
878
|
+
|
|
879
|
+
messages = []
|
|
880
|
+
if self.system_prompt:
|
|
881
|
+
messages.append({'role': 'system', 'content': self.system_prompt})
|
|
882
|
+
|
|
883
|
+
messages.append({'role': 'user', 'content': self._get_user_prompt(text_content={"roi_text":roi_text,
|
|
884
|
+
"frame_1": str(frame_1.to_dict()),
|
|
885
|
+
"frame_2": str(frame_2.to_dict()),
|
|
886
|
+
"pos_rel_types":str(pos_rel_types)})})
|
|
887
|
+
response = self.inference_engine.chat(
|
|
888
|
+
messages=messages,
|
|
889
|
+
max_new_tokens=max_new_tokens,
|
|
890
|
+
temperature=temperature,
|
|
891
|
+
stream=stream,
|
|
892
|
+
**kwrs
|
|
893
|
+
)
|
|
894
|
+
|
|
895
|
+
rel_json = self._extract_json(response)
|
|
896
|
+
if len(rel_json) > 0:
|
|
897
|
+
if "RelationType" in rel_json[0]:
|
|
898
|
+
rel = rel_json[0]["RelationType"]
|
|
899
|
+
if rel in pos_rel_types:
|
|
900
|
+
return rel_json[0]["RelationType"]
|
|
901
|
+
else:
|
|
902
|
+
warnings.warn(f'Extracted relation type "{rel}", which is not in the return of possible_relation_types_func: {pos_rel_types}.'+ \
|
|
903
|
+
'Following default, relation = "No Relation".', RuntimeWarning)
|
|
904
|
+
|
|
905
|
+
else:
|
|
906
|
+
warnings.warn('Extractor output JSON without "RelationType" key. Following default, relation = "No Relation".', RuntimeWarning)
|
|
907
|
+
|
|
908
|
+
else:
|
|
909
|
+
warnings.warn('Extractor did not output a JSON. Following default, relation = "No Relation".', RuntimeWarning)
|
|
910
|
+
|
|
911
|
+
return "No Relation"
|
|
912
|
+
|
|
913
|
+
|
|
914
|
+
def extract_relations(self, doc:LLMInformationExtractionDocument, buffer_size:int=100, max_new_tokens:int=128,
|
|
915
|
+
temperature:float=0.0, stream:bool=False, **kwrs) -> List[Dict]:
|
|
916
|
+
"""
|
|
917
|
+
This method considers all combinations of two frames. It uses the possible_relation_types_func to filter impossible pairs
|
|
918
|
+
and to provide possible relation types between two frames.
|
|
919
|
+
|
|
920
|
+
Parameters:
|
|
921
|
+
-----------
|
|
922
|
+
doc : LLMInformationExtractionDocument
|
|
923
|
+
a document with frames.
|
|
924
|
+
buffer_size : int, Optional
|
|
925
|
+
the number of characters before and after the two frames in the ROI text.
|
|
926
|
+
max_new_tokens : int, Optional
|
|
927
|
+
the max number of new tokens LLM should generate.
|
|
928
|
+
temperature : float, Optional
|
|
929
|
+
the temperature for token sampling.
|
|
930
|
+
stream : bool, Optional
|
|
931
|
+
if True, LLM generated text will be printed in terminal in real-time.
|
|
932
|
+
|
|
933
|
+
Return : List[Dict]
|
|
934
|
+
a list of dict with {"frame_1", "frame_2", "relation"} for all relations.
|
|
935
|
+
"""
|
|
936
|
+
if not doc.has_frame():
|
|
937
|
+
raise ValueError("Input document must have frames.")
|
|
938
|
+
|
|
939
|
+
if doc.has_duplicate_frame_ids():
|
|
940
|
+
raise ValueError("All frame_ids in the input document must be unique.")
|
|
941
|
+
|
|
942
|
+
pairs = itertools.combinations(doc.frames, 2)
|
|
943
|
+
rel_pair_list = []
|
|
944
|
+
for frame_1, frame_2 in pairs:
|
|
945
|
+
pos_rel_types = self.possible_relation_types_func(frame_1, frame_2)
|
|
946
|
+
if pos_rel_types:
|
|
947
|
+
rel = self._extract_relation(frame_1=frame_1, frame_2=frame_2, pos_rel_types=pos_rel_types, text=doc.text,
|
|
948
|
+
buffer_size=buffer_size, max_new_tokens=max_new_tokens, temperature=temperature, stream=stream, **kwrs)
|
|
949
|
+
|
|
950
|
+
if rel != "No Relation":
|
|
951
|
+
rel_pair_list.append({'frame_1':frame_1.frame_id, 'frame_2':frame_2.frame_id, "relation":rel})
|
|
952
|
+
|
|
953
|
+
return rel_pair_list
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: llm-ie
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: An LLM-powered tool that transforms everyday language into robust information extraction pipelines.
|
|
5
5
|
License: MIT
|
|
6
6
|
Author: Enshuo (David) Hsu
|
|
@@ -25,11 +25,14 @@ An LLM-powered tool that transforms everyday language into robust information ex
|
|
|
25
25
|
- [Prerequisite](#prerequisite)
|
|
26
26
|
- [Installation](#installation)
|
|
27
27
|
- [Quick Start](#quick-start)
|
|
28
|
+
- [Examples](#examples)
|
|
28
29
|
- [User Guide](#user-guide)
|
|
29
30
|
- [LLM Inference Engine](#llm-inference-engine)
|
|
30
31
|
- [Prompt Template](#prompt-template)
|
|
31
32
|
- [Prompt Editor](#prompt-editor)
|
|
32
33
|
- [Extractor](#extractor)
|
|
34
|
+
- [FrameExtractor](#frameextractor)
|
|
35
|
+
- [RelationExtractor](#relationextractor)
|
|
33
36
|
|
|
34
37
|
## Overview
|
|
35
38
|
LLM-IE is a toolkit that provides robust information extraction utilities for frame-based information extraction. Since prompt design has a significant impact on generative information extraction with LLMs, it also provides a built-in LLM editor to help with prompt writing. The flowchart below demonstrates the workflow starting from a casual language request.
|
|
@@ -206,6 +209,10 @@ for frame in frames:
|
|
|
206
209
|
doc.save("<your filename>.llmie")
|
|
207
210
|
```
|
|
208
211
|
|
|
212
|
+
## Examples
|
|
213
|
+
- [Write prompt templates with AI editors](demo/prompt_template_writing.ipynb)
|
|
214
|
+
- [NER + RE for Drug, Strength, Frequency](demo/medication_relation_extraction.ipynb)
|
|
215
|
+
|
|
209
216
|
## User Guide
|
|
210
217
|
This package is comprised of some key classes:
|
|
211
218
|
- LLM Inference Engine
|
|
@@ -547,12 +554,25 @@ Recommendations:
|
|
|
547
554
|
After a few iterations of revision, we will have a high-quality prompt template for the information extraction pipeline.
|
|
548
555
|
|
|
549
556
|
### Extractor
|
|
550
|
-
An extractor implements a prompting method for information extraction.
|
|
557
|
+
An extractor implements a prompting method for information extraction. There are two extractor families: ```FrameExtractor``` and ```RelationExtractor```.
|
|
558
|
+
The ```FrameExtractor``` extracts named entities and entity attributes ("frame"). The ```RelationExtractor``` extracts the relation (and relation types) between frames.
|
|
559
|
+
|
|
560
|
+
#### FrameExtractor
|
|
561
|
+
The ```BasicFrameExtractor``` directly prompts LLM to generate a list of dictionaries. Each dictionary is then post-processed into a frame. The ```ReviewFrameExtractor``` is based on the ```BasicFrameExtractor``` but adds a review step after the initial extraction to boost sensitivity and improve performance. ```SentenceFrameExtractor``` gives LLM the entire document upfront as a reference, then prompts LLM sentence by sentence and collects per-sentence outputs. To learn about an extractor, use the class method ```get_prompt_guide()``` to print out the prompt guide.
|
|
551
562
|
|
|
552
563
|
<details>
|
|
553
564
|
<summary>BasicFrameExtractor</summary>
|
|
554
565
|
|
|
555
|
-
The ```BasicFrameExtractor``` directly prompts LLM to generate a list of dictionaries. Each dictionary is then post-processed into a frame.
|
|
566
|
+
The ```BasicFrameExtractor``` directly prompts LLM to generate a list of dictionaries. Each dictionary is then post-processed into a frame. The ```text_content``` holds the input text as a string, or as a dictionary (if the prompt template has multiple input placeholders). The ```entity_key``` defines which JSON key should be used as entity text. It must be consistent with the prompt template.
|
|
567
|
+
|
|
568
|
+
```python
|
|
569
|
+
from llm_ie.extractors import BasicFrameExtractor
|
|
570
|
+
|
|
571
|
+
extractor = BasicFrameExtractor(llm, prompt_temp)
|
|
572
|
+
frames = extractor.extract_frames(text_content=text, entity_key="Diagnosis", stream=True)
|
|
573
|
+
```
|
|
574
|
+
|
|
575
|
+
Use the ```get_prompt_guide()``` method to inspect the prompt template guideline for ```BasicFrameExtractor```.
|
|
556
576
|
|
|
557
577
|
```python
|
|
558
578
|
from llm_ie.extractors import BasicFrameExtractor
|
|
@@ -630,15 +650,202 @@ frames = extractor.extract_frames(text_content=text, entity_key="Diagnosis", str
|
|
|
630
650
|
<details>
|
|
631
651
|
<summary>SentenceFrameExtractor</summary>
|
|
632
652
|
|
|
633
|
-
The ```SentenceFrameExtractor``` instructs the LLM to extract sentence by sentence. The reason is to ensure the accuracy of frame spans. It also prevents LLMs from overseeing sections/ sentences. Empirically, this extractor results in better
|
|
653
|
+
The ```SentenceFrameExtractor``` instructs the LLM to extract sentence by sentence. The reason is to ensure the accuracy of frame spans. It also prevents LLMs from overlooking sections/sentences. Empirically, this extractor results in better recall than the ```BasicFrameExtractor``` in complex tasks.
|
|
654
|
+
|
|
655
|
+
The ```multi_turn``` parameter specifies multi-turn conversation for prompting. If True, sentences and LLM outputs will be appended to the input message and carried over. If False, only the current sentence is prompted. For LLM inference engines that support prompt caching (e.g., Llama.Cpp, Ollama), multi-turn conversation prompting can better utilize the KV cache and result in faster inference. But for vLLM with [Automatic Prefix Caching (APC)](https://docs.vllm.ai/en/latest/automatic_prefix_caching/apc.html), multi-turn conversation is not necessary.
|
|
634
656
|
|
|
635
657
|
```python
|
|
636
658
|
from llm_ie.extractors import SentenceFrameExtractor
|
|
637
659
|
|
|
638
660
|
extractor = SentenceFrameExtractor(llm, prompt_temp)
|
|
639
|
-
frames = extractor.extract_frames(text_content=text, entity_key="Diagnosis", stream=True)
|
|
661
|
+
frames = extractor.extract_frames(text_content=text, entity_key="Diagnosis", multi_turn=True, stream=True)
|
|
640
662
|
```
|
|
641
663
|
</details>
|
|
642
664
|
|
|
665
|
+
#### RelationExtractor
|
|
666
|
+
Relation extractors prompt LLM with combinations of two frames from a document (```LLMInformationExtractionDocument```) and extract relations.
|
|
667
|
+
The ```BinaryRelationExtractor``` extracts binary relations (yes/no) between two frames. The ```MultiClassRelationExtractor``` extracts relations and assigns relation types ("multi-class").
|
|
668
|
+
|
|
669
|
+
An important feature of the relation extractors is that users are required to define a ```possible_relation_func``` or ```possible_relation_types_func``` function for the extractors. The reason is that there are too many possible combinations of two frames (N choose 2 combinations). The ```possible_relation_func``` helps rule out impossible combinations and therefore reduces the LLM inferencing burden.
|
|
670
|
+
|
|
671
|
+
<details>
|
|
672
|
+
<summary>BinaryRelationExtractor</summary>
|
|
673
|
+
|
|
674
|
+
Use the get_prompt_guide() method to inspect the prompt template guideline for BinaryRelationExtractor.
|
|
675
|
+
```python
|
|
676
|
+
from llm_ie.extractors import BinaryRelationExtractor
|
|
677
|
+
|
|
678
|
+
print(BinaryRelationExtractor.get_prompt_guide())
|
|
679
|
+
```
|
|
680
|
+
|
|
681
|
+
```
|
|
682
|
+
Prompt template design:
|
|
683
|
+
1. Task description (mention binary relation extraction and ROI)
|
|
684
|
+
2. Schema definition (defines relation)
|
|
685
|
+
3. Output format definition (must use the key "Relation")
|
|
686
|
+
4. Hints
|
|
687
|
+
5. Input placeholders (must include "roi_text", "frame_1", and "frame_2" placeholders)
|
|
643
688
|
|
|
644
689
|
|
|
690
|
+
Example:
|
|
691
|
+
|
|
692
|
+
# Task description
|
|
693
|
+
This is a binary relation extraction task. Given a region of interest (ROI) text and two entities from a medical note, indicate the relation existence between the two entities.
|
|
694
|
+
|
|
695
|
+
# Schema definition
|
|
696
|
+
True: if there is a relationship between a medication name (one of the entities) and its strength or frequency (the other entity).
|
|
697
|
+
False: Otherwise.
|
|
698
|
+
|
|
699
|
+
# Output format definition
|
|
700
|
+
Your output should follow the JSON format:
|
|
701
|
+
{"Relation": "<True or False>"}
|
|
702
|
+
|
|
703
|
+
I am only interested in the content between []. Do not explain your answer.
|
|
704
|
+
|
|
705
|
+
# Hints
|
|
706
|
+
1. Your input always contains one medication entity and 1) one strength entity or 2) one frequency entity.
|
|
707
|
+
2. Pay attention to the medication entity and see if the strength or frequency is for it.
|
|
708
|
+
3. If the strength or frequency is for another medication, output False.
|
|
709
|
+
4. If the strength or frequency is for the same medication but at a different location (span), output False.
|
|
710
|
+
|
|
711
|
+
# Input placeholders
|
|
712
|
+
ROI Text with the two entities annotated with <entity_1> and <entity_2>:
|
|
713
|
+
"{{roi_text}}"
|
|
714
|
+
|
|
715
|
+
Entity 1 full information:
|
|
716
|
+
{{frame_1}}
|
|
717
|
+
|
|
718
|
+
Entity 2 full information:
|
|
719
|
+
{{frame_2}}
|
|
720
|
+
```
|
|
721
|
+
|
|
722
|
+
As an example, we define the ```possible_relation_func``` function:
|
|
723
|
+
- if the two frames are > 500 characters apart, we assume no relation (False)
|
|
724
|
+
- if the two frames are "Medication" and "Strength", or "Medication" and "Frequency", there could be relations (True)
|
|
725
|
+
|
|
726
|
+
```python
|
|
727
|
+
def possible_relation_func(frame_1, frame_2) -> bool:
|
|
728
|
+
"""
|
|
729
|
+
This function pre-processes two frames and outputs a bool indicating whether the two frames could be related.
|
|
730
|
+
"""
|
|
731
|
+
# if the distance between the two frames is > 500 characters, assume no relation.
|
|
732
|
+
if abs(frame_1.start - frame_2.start) > 500:
|
|
733
|
+
return False
|
|
734
|
+
|
|
735
|
+
# if the entity types are "Medication" and "Strength", there could be relations.
|
|
736
|
+
if (frame_1.attr["entity_type"] == "Medication" and frame_2.attr["entity_type"] == "Strength") or \
|
|
737
|
+
(frame_2.attr["entity_type"] == "Medication" and frame_1.attr["entity_type"] == "Strength"):
|
|
738
|
+
return True
|
|
739
|
+
|
|
740
|
+
# if the entity types are "Medication" and "Frequency", there could be relations.
|
|
741
|
+
if (frame_1.attr["entity_type"] == "Medication" and frame_2.attr["entity_type"] == "Frequency") or \
|
|
742
|
+
(frame_2.attr["entity_type"] == "Medication" and frame_1.attr["entity_type"] == "Frequency"):
|
|
743
|
+
return True
|
|
744
|
+
|
|
745
|
+
# Otherwise, no relation.
|
|
746
|
+
return False
|
|
747
|
+
```
|
|
748
|
+
|
|
749
|
+
In the ```BinaryRelationExtractor``` constructor, we pass in the prompt template and ```possible_relation_func```.
|
|
750
|
+
|
|
751
|
+
```python
|
|
752
|
+
from llm_ie.extractors import BinaryRelationExtractor
|
|
753
|
+
|
|
754
|
+
extractor = BinaryRelationExtractor(llm, prompt_template=prompt_template, possible_relation_func=possible_relation_func)
|
|
755
|
+
relations = extractor.extract_relations(doc, stream=True)
|
|
756
|
+
```
|
|
757
|
+
|
|
758
|
+
</details>
|
|
759
|
+
|
|
760
|
+
|
|
761
|
+
<details>
|
|
762
|
+
<summary>MultiClassRelationExtractor</summary>
|
|
763
|
+
|
|
764
|
+
The main difference from ```BinaryRelationExtractor``` is that the ```MultiClassRelationExtractor``` allows specifying relation types. The prompt template guideline has an additional placeholder for possible relation types ```{{pos_rel_types}}```.
|
|
765
|
+
|
|
766
|
+
```python
|
|
767
|
+
print(MultiClassRelationExtractor.get_prompt_guide())
|
|
768
|
+
```
|
|
769
|
+
|
|
770
|
+
```
|
|
771
|
+
Prompt template design:
|
|
772
|
+
1. Task description (mention multi-class relation extraction and ROI)
|
|
773
|
+
2. Schema definition (defines relation types)
|
|
774
|
+
3. Output format definition (must use the key "RelationType")
|
|
775
|
+
4. Input placeholders (must include "roi_text", "frame_1", and "frame_2" placeholders)
|
|
776
|
+
|
|
777
|
+
|
|
778
|
+
Example:
|
|
779
|
+
|
|
780
|
+
# Task description
|
|
781
|
+
This is a multi-class relation extraction task. Given a region of interest (ROI) text and two frames from a medical note, classify the relation types between the two frames.
|
|
782
|
+
|
|
783
|
+
# Schema definition
|
|
784
|
+
Strength-Drug: this is a relationship between the drug strength and its name.
|
|
785
|
+
Dosage-Drug: this is a relationship between the drug dosage and its name.
|
|
786
|
+
Duration-Drug: this is a relationship between a drug duration and its name.
|
|
787
|
+
Frequency-Drug: this is a relationship between a drug frequency and its name.
|
|
788
|
+
Form-Drug: this is a relationship between a drug form and its name.
|
|
789
|
+
Route-Drug: this is a relationship between the route of administration for a drug and its name.
|
|
790
|
+
Reason-Drug: this is a relationship between the reason for which a drug was administered (e.g., symptoms, diseases, etc.) and a drug name.
|
|
791
|
+
ADE-Drug: this is a relationship between an adverse drug event (ADE) and a drug name.
|
|
792
|
+
|
|
793
|
+
# Output format definition
|
|
794
|
+
Choose one of the relation types listed below or choose "No Relation":
|
|
795
|
+
{{pos_rel_types}}
|
|
796
|
+
|
|
797
|
+
Your output should follow the JSON format:
|
|
798
|
+
{"RelationType": "<relation type or No Relation>"}
|
|
799
|
+
|
|
800
|
+
I am only interested in the content between []. Do not explain your answer.
|
|
801
|
+
|
|
802
|
+
# Hints
|
|
803
|
+
1. Your input always contains one medication entity and 1) one strength entity or 2) one frequency entity.
|
|
804
|
+
2. Pay attention to the medication entity and see if the strength or frequency is for it.
|
|
805
|
+
3. If the strength or frequency is for another medication, output "No Relation".
|
|
806
|
+
4. If the strength or frequency is for the same medication but at a different location (span), output "No Relation".
|
|
807
|
+
|
|
808
|
+
# Input placeholders
|
|
809
|
+
ROI Text with the two entities annotated with <entity_1> and <entity_2>:
|
|
810
|
+
"{{roi_text}}"
|
|
811
|
+
|
|
812
|
+
Entity 1 full information:
|
|
813
|
+
{{frame_1}}
|
|
814
|
+
|
|
815
|
+
Entity 2 full information:
|
|
816
|
+
{{frame_2}}
|
|
817
|
+
```
|
|
818
|
+
|
|
819
|
+
As an example, we define the ```possible_relation_types_func```:
|
|
820
|
+
- if the two frames are > 500 characters apart, we assume "No Relation" (output [])
|
|
821
|
+
- if the two frames are "Medication" and "Strength", the only possible relation types are "Strength-Drug" or "No Relation"
|
|
822
|
+
- if the two frames are "Medication" and "Frequency", the only possible relation types are "Frequency-Drug" or "No Relation"
|
|
823
|
+
|
|
824
|
+
```python
|
|
825
|
+
def possible_relation_types_func(frame_1, frame_2) -> List[str]:
|
|
826
|
+
# If the two frames are > 500 characters apart, we assume "No Relation"
|
|
827
|
+
if abs(frame_1.start - frame_2.start) > 500:
|
|
828
|
+
return []
|
|
829
|
+
|
|
830
|
+
# If the two frames are "Medication" and "Strength", the only possible relation types are "Strength-Drug" or "No Relation"
|
|
831
|
+
if (frame_1.attr["entity_type"] == "Medication" and frame_2.attr["entity_type"] == "Strength") or \
|
|
832
|
+
(frame_2.attr["entity_type"] == "Medication" and frame_1.attr["entity_type"] == "Strength"):
|
|
833
|
+
return ['Strength-Drug']
|
|
834
|
+
|
|
835
|
+
# If the two frames are "Medication" and "Frequency", the only possible relation types are "Frequency-Drug" or "No Relation"
|
|
836
|
+
if (frame_1.attr["entity_type"] == "Medication" and frame_2.attr["entity_type"] == "Frequency") or \
|
|
837
|
+
(frame_2.attr["entity_type"] == "Medication" and frame_1.attr["entity_type"] == "Frequency"):
|
|
838
|
+
return ['Frequency-Drug']
|
|
839
|
+
|
|
840
|
+
return []
|
|
841
|
+
```
|
|
842
|
+
|
|
843
|
+
|
|
844
|
+
```python
|
|
845
|
+
from llm_ie.extractors import MultiClassRelationExtractor
|
|
846
|
+
|
|
847
|
+
extractor = MultiClassRelationExtractor(llm, prompt_template=re_prompt_template, possible_relation_types_func=possible_relation_types_func)
|
|
848
|
+
relations = extractor.extract_relations(doc, stream=True)
|
|
849
|
+
```
|
|
850
|
+
|
|
851
|
+
</details>
|
|
@@ -2,12 +2,14 @@ llm_ie/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
|
2
2
|
llm_ie/asset/PromptEditor_prompts/comment.txt,sha256=C_lxx-dlOlFJ__jkHKosZ8HsNAeV1aowh2B36nIipBY,159
|
|
3
3
|
llm_ie/asset/PromptEditor_prompts/rewrite.txt,sha256=bYLOix7DUBlcWv-Q0JZ5kDnZ9OEXBt_AGDN0TydLB8o,191
|
|
4
4
|
llm_ie/asset/prompt_guide/BasicFrameExtractor_prompt_guide.txt,sha256=XbnU8byLGGUA3A3lT0bb2Hw-ggzhcqD3ZuKzduod2ww,1944
|
|
5
|
+
llm_ie/asset/prompt_guide/BinaryRelationExtractor_prompt_guide.txt,sha256=z9Xg0fdFbVVwnTYcUTcAUvEIWhF075W8qGxN-Vj7xdo,1548
|
|
6
|
+
llm_ie/asset/prompt_guide/MultiClassRelationExtractor_prompt_guide.txt,sha256=D5DphUHw8SUERUVdcIjUynuTmYJa6-PwBlF7FzxNsvQ,2276
|
|
5
7
|
llm_ie/asset/prompt_guide/ReviewFrameExtractor_prompt_guide.txt,sha256=XbnU8byLGGUA3A3lT0bb2Hw-ggzhcqD3ZuKzduod2ww,1944
|
|
6
8
|
llm_ie/asset/prompt_guide/SentenceFrameExtractor_prompt_guide.txt,sha256=8nj9OLPJMtr9Soi5JU3Xk-HC7pKNoI54xA_A4u7I5j4,2620
|
|
7
|
-
llm_ie/data_types.py,sha256=
|
|
9
|
+
llm_ie/data_types.py,sha256=2RKP4wXDuku-Tn4s8uzzUFavG1fZ2e47SaY8oL57LsI,10923
|
|
8
10
|
llm_ie/engines.py,sha256=m9ytGUX61jEy9SmVHbb90mrfGMAwC6dV-v7Jke1U7Ho,9296
|
|
9
|
-
llm_ie/extractors.py,sha256=
|
|
11
|
+
llm_ie/extractors.py,sha256=i0m8uFaKXiVY1ucjvzbUFbV1slPYfZ3EGOZrolnFVHA,44079
|
|
10
12
|
llm_ie/prompt_editor.py,sha256=dbu7A3O7O7Iw2v-xCgrTFH1-wTLAGf4SHDqdeS-He2Q,1869
|
|
11
|
-
llm_ie-0.
|
|
12
|
-
llm_ie-0.
|
|
13
|
-
llm_ie-0.
|
|
13
|
+
llm_ie-0.2.0.dist-info/METADATA,sha256=9CPC3OAd2J0nROZ7z8DI7lvGKOO2H2uAnVQv_YDFItg,40052
|
|
14
|
+
llm_ie-0.2.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
15
|
+
llm_ie-0.2.0.dist-info/RECORD,,
|
|
File without changes
|