llm-ie 0.3.0__tar.gz → 0.3.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {llm_ie-0.3.0 → llm_ie-0.3.2}/PKG-INFO +2 -1
- {llm_ie-0.3.0 → llm_ie-0.3.2}/pyproject.toml +2 -1
- llm_ie-0.3.2/src/llm_ie/asset/default_prompts/ReviewFrameExtractor_addition_review_prompt.txt +3 -0
- llm_ie-0.3.2/src/llm_ie/asset/default_prompts/ReviewFrameExtractor_revision_review_prompt.txt +2 -0
- llm_ie-0.3.2/src/llm_ie/asset/default_prompts/SentenceReviewFrameExtractor_addition_review_prompt.txt +4 -0
- llm_ie-0.3.2/src/llm_ie/asset/default_prompts/SentenceReviewFrameExtractor_revision_review_prompt.txt +3 -0
- llm_ie-0.3.2/src/llm_ie/asset/prompt_guide/SentenceCoTFrameExtractor_prompt_guide.txt +217 -0
- llm_ie-0.3.2/src/llm_ie/asset/prompt_guide/SentenceReviewFrameExtractor_prompt_guide.txt +145 -0
- {llm_ie-0.3.0 → llm_ie-0.3.2}/src/llm_ie/extractors.py +423 -31
- {llm_ie-0.3.0 → llm_ie-0.3.2}/src/llm_ie/prompt_editor.py +11 -5
- {llm_ie-0.3.0 → llm_ie-0.3.2}/README.md +0 -0
- {llm_ie-0.3.0 → llm_ie-0.3.2}/src/llm_ie/__init__.py +0 -0
- {llm_ie-0.3.0 → llm_ie-0.3.2}/src/llm_ie/asset/PromptEditor_prompts/chat.txt +0 -0
- {llm_ie-0.3.0 → llm_ie-0.3.2}/src/llm_ie/asset/PromptEditor_prompts/comment.txt +0 -0
- {llm_ie-0.3.0 → llm_ie-0.3.2}/src/llm_ie/asset/PromptEditor_prompts/rewrite.txt +0 -0
- {llm_ie-0.3.0 → llm_ie-0.3.2}/src/llm_ie/asset/PromptEditor_prompts/system.txt +0 -0
- {llm_ie-0.3.0 → llm_ie-0.3.2}/src/llm_ie/asset/prompt_guide/BasicFrameExtractor_prompt_guide.txt +0 -0
- {llm_ie-0.3.0 → llm_ie-0.3.2}/src/llm_ie/asset/prompt_guide/BinaryRelationExtractor_prompt_guide.txt +0 -0
- {llm_ie-0.3.0 → llm_ie-0.3.2}/src/llm_ie/asset/prompt_guide/MultiClassRelationExtractor_prompt_guide.txt +0 -0
- {llm_ie-0.3.0 → llm_ie-0.3.2}/src/llm_ie/asset/prompt_guide/ReviewFrameExtractor_prompt_guide.txt +0 -0
- {llm_ie-0.3.0 → llm_ie-0.3.2}/src/llm_ie/asset/prompt_guide/SentenceFrameExtractor_prompt_guide.txt +0 -0
- {llm_ie-0.3.0 → llm_ie-0.3.2}/src/llm_ie/data_types.py +0 -0
- {llm_ie-0.3.0 → llm_ie-0.3.2}/src/llm_ie/engines.py +0 -0
{llm_ie-0.3.0 → llm_ie-0.3.2}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: llm-ie
-Version: 0.3.0
+Version: 0.3.2
 Summary: An LLM-powered tool that transforms everyday language into robust information extraction pipelines.
 License: MIT
 Author: Enshuo (David) Hsu
@@ -9,6 +9,7 @@ Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
+Requires-Dist: colorama (>=0.4.6,<0.5.0)
 Requires-Dist: nltk (>=3.8,<4.0)
 Description-Content-Type: text/markdown
 
{llm_ie-0.3.0 → llm_ie-0.3.2}/pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "llm-ie"
-version = "0.3.0"
+version = "0.3.2"
 description = "An LLM-powered tool that transforms everyday language into robust information extraction pipelines."
 authors = ["Enshuo (David) Hsu"]
 license = "MIT"
@@ -14,6 +14,7 @@ exclude = [
 [tool.poetry.dependencies]
 python = "^3.11"
 nltk = "^3.8"
+colorama = "^0.4.6"
 
 
 [build-system]
llm_ie-0.3.2/src/llm_ie/asset/default_prompts/SentenceReviewFrameExtractor_addition_review_prompt.txt (new file)

@@ -0,0 +1,4 @@
+Review the input sentence and your output carefully. If anything was missed, add it to your output following the defined output formats.
+You should ONLY add new items. Do NOT re-generate the entire answer.
+Your output should be based on the input sentence.
+Your output should strictly adhere to the defined output formats.
llm_ie-0.3.2/src/llm_ie/asset/default_prompts/SentenceReviewFrameExtractor_revision_review_prompt.txt (new file)

@@ -0,0 +1,3 @@
+Review the input sentence and your output carefully. If you find any omissions or errors, correct them by generating a revised output following the defined output formats.
+Your output should be based on the input sentence.
+Your output should strictly adhere to the defined output formats.
llm_ie-0.3.2/src/llm_ie/asset/prompt_guide/SentenceCoTFrameExtractor_prompt_guide.txt (new file)

@@ -0,0 +1,217 @@
+Prompt Template Design:
+
+1. Task Description:
+Provide a detailed description of the task, including the background and the type of task (e.g., named entity recognition).
+
+2. Schema Definition:
+List the key concepts that should be extracted, and provide clear definitions for each one.
+
+3. Thinking process:
+Provide clear step-by-step instructions for analyzing the input text. Typically, this process should begin with an analysis section and proceed to the output generation. Each section should have a specific purpose:
+
+Optional: Recall Section (<Recall>... </Recall>):
+Write a brief recall of the task description and schema definition for better understanding of the task.
+
+Analysis Section (<Analysis>... </Analysis>):
+Break down the input text to identify important medical contents and clarify ambiguous concepts.
+
+Output Section (<Outputs>... </Outputs>):
+Based on the analysis, generate the required output in the defined format. Ensure that the extracted information adheres to the schema and task description.
+
+4. Output Format Definition:
+The output should be a JSON list, where each element is a dictionary representing a frame (an entity along with its attributes). Each dictionary must include a key that holds the entity text. This key can be named "entity_text" or anything else depending on the context. The attributes can either be flat (e.g., {"entity_text": "<entity_text>", "attr1": "<attr1>", "attr2": "<attr2>"}) or nested (e.g., {"entity_text": "<entity_text>", "attributes": {"attr1": "<attr1>", "attr2": "<attr2>"}}).
+
+5. Optional: Hints:
+Provide itemized hints for the information extractors to guide the extraction process.
+
+6. Optional: Examples:
+Include examples in the format:
+Input: ...
+Output: ...
+
+7. Input Placeholder:
+The template must include a placeholder in the format {{<placeholder_name>}} for the input text. The placeholder name can be customized as needed.
+
+
+Example 1 (single entity type with attributes):
+
+# Task description
+The paragraph below is from the Food and Drug Administration (FDA) Clinical Pharmacology Section of Labeling for Human Prescription Drug and Biological Products, Adverse reactions section. Please carefully review it and extract the adverse reactions and percentages. Note that each adverse reaction is nested under a clinical trial and potentially an arm. Your output should take that into consideration.
+
+# Schema definition
+Your output should contain:
+"ClinicalTrial" which is the name of the trial,
+If applicable, "Arm" which is the arm within the clinical trial,
+"AdverseReaction" which is the name of the adverse reaction,
+If applicable, "Percentage" which is the occurrence of the adverse reaction within the trial and arm,
+"Evidence" which is the EXACT sentence in the text where you found the AdverseReaction from
+
+# Thinking process
+Approach this task step by step. Start with a recall section (<Recall>... </Recall>) that briefly summarizes the task description and schema definition for better understanding of the task. Then write an analysis section (<Analysis>... </Analysis>) to analyze the input sentence. Identify important pharmacology contents and clarify ambiguous concepts. Finally, write the output section (<Outputs>... </Outputs>) that lists your final outputs following the defined format.
+
+# Output format definition
+Your output should follow JSON format, for example:
+[
+{"ClinicalTrial": "<Clinical trial name or number>", "Arm": "<name of arm>", "AdverseReaction": "<Adverse reaction text>", "Percentage": "<a percent>", "Evidence": "<exact sentence from the text>"},
+{"ClinicalTrial": "<Clinical trial name or number>", "Arm": "<name of arm>", "AdverseReaction": "<Adverse reaction text>", "Percentage": "<a percent>", "Evidence": "<exact sentence from the text>"}
+]
+
+# Additional hints
+Your output should be 100% based on the provided content. DO NOT output fake numbers.
+If there is no specific arm, just omit the "Arm" key. If the percentage is not reported, just omit the "Percentage" key. The "Evidence" should always be provided.
+
+# Input placeholder
+Below is the Adverse reactions section for your reference. I will feed you with sentences from it one by one.
+{{input}}
+
+
+Example 2 (multiple entity types):
+
+# Task description
+This is a named entity recognition task. Given a sentence from a medical note, annotate the Drug, Form, Strength, Frequency, Route, Dosage, Reason, ADE, and Duration.
+
+# Schema definition
+Your output should contain:
+"entity_text": the exact wording as mentioned in the note.
+"entity_type": type of the entity. It should be one of the "Drug", "Form", "Strength", "Frequency", "Route", "Dosage", "Reason", "ADE", or "Duration".
+
+# Thinking process
+Approach this task step by step. Start with an analysis section (<Analysis>... </Analysis>) to analyze the input sentence. Identify important medical contents and clarify ambiguous concepts. Then write the output section (<Outputs>... </Outputs>) that lists your final outputs following the defined format.
+
+# Output format definition
+Your output should follow JSON format,
+if there are one of the entity mentions: Drug, Form, Strength, Frequency, Route, Dosage, Reason, ADE, or Duration:
+[{"entity_text": "<Exact entity mentions as in the note>", "entity_type": "<entity type as listed above>"},
+{"entity_text": "<Exact entity mentions as in the note>", "entity_type": "<entity type as listed above>"}]
+if there is no entity mentioned in the given note, just output an empty list:
+[]
+
+# Examples
+Below are some examples:
+
+Input: Acetaminophen 650 mg PO BID 5.
+Output:
+<Analysis>
+The sentence "Acetaminophen 650 mg PO BID 5." contains several potential medical entities.
+
+"Acetaminophen" is a Drug.
+"650 mg" represents the Strength.
+"PO" is the Route (meaning by mouth).
+"BID" stands for a dosing frequency, which represents Frequency (meaning twice a day).
+</Analysis>
+
+<Outputs>
+[{"entity_text": "Acetaminophen", "entity_type": "Drug"}, {"entity_text": "650 mg", "entity_type": "Strength"}, {"entity_text": "PO", "entity_type": "Route"}, {"entity_text": "BID", "entity_type": "Frequency"}]
+</Outputs>
+
+Input: Mesalamine DR 1200 mg PO BID 2.
+Output:
+<Analysis>
+The sentence "Mesalamine DR 1200 mg PO BID 2." contains the following medical entities:
+
+"Mesalamine" is a Drug.
+"DR" stands for Form (delayed-release).
+"1200 mg" represents the Strength.
+"PO" is the Route (by mouth).
+"BID" is the Frequency (twice a day).
+</Analysis>
+
+<Outputs>
+[{"entity_text": "Mesalamine DR", "entity_type": "Drug"}, {"entity_text": "1200 mg", "entity_type": "Strength"}, {"entity_text": "BID", "entity_type": "Frequency"}, {"entity_text": "PO", "entity_type": "Route"}]
+</Outputs>
+
+# Input placeholder
+Below is the medical note for your reference. I will feed you with sentences from it one by one.
+"{{input}}"
+
+
+Example 3 (multiple entity types with corresponding attributes):
+
+# Task description
+This is a named entity recognition task. Given a sentence from a medical note, annotate the events (EVENT) and time expressions (TIMEX3):
+
+# Schema definition
+Your output should contain:
+"entity_text": the exact wording as mentioned in the note.
+"entity_type": type of the entity. It should be one of the "EVENT" or "TIMEX3".
+if entity_type is "EVENT",
+"type": the event type as one of the "TEST", "PROBLEM", "TREATMENT", "CLINICAL_DEPT", "EVIDENTIAL", or "OCCURRENCE".
+"polarity": whether an EVENT is positive ("POS") or negative ("NEG"). For example, in "the patient reports headache, and denies chills", the EVENT [headache] is positive in its polarity, and the EVENT [chills] is negative in its polarity.
+"modality": whether an EVENT actually occurred or not. Must be one of the "FACTUAL", "CONDITIONAL", "POSSIBLE", or "PROPOSED".
+
+if entity_type is "TIMEX3",
+"type": the type as one of the "DATE", "TIME", "DURATION", or "FREQUENCY".
+"val": the numeric value 1) DATE: [YYYY]-[MM]-[DD], 2) TIME: [hh]:[mm]:[ss], 3) DURATION: P[n][Y/M/W/D]. So, "for eleven days" will be
+represented as "P11D", meaning a period of 11 days. 4) R[n][duration], where n denotes the number of repeats. When the n is omitted, the expression denotes an unspecified amount of repeats. For example, "once a day for 3 days" is "R3P1D" (repeat the time interval of 1 day (P1D) for 3 times (R3)); twice every day is "RP12H" (repeat every 12 hours).
+"mod": additional information regarding the temporal value of a time expression. Must be one of the:
+"NA": the default value, no relevant modifier is present;
+"MORE", means "more than", e.g. over 2 days (val = P2D, mod = MORE);
+"LESS", means "less than", e.g. almost 2 months (val = P2M, mod = LESS);
+"APPROX", means "approximate", e.g. nearly a week (val = P1W, mod = APPROX);
+"START", describes the beginning of a period of time, e.g. Christmas morning, 2005 (val = 2005-12-25, mod = START);
+"END", describes the end of a period of time, e.g. late last year (val = 2010, mod = END);
+"MIDDLE", describes the middle of a period of time, e.g. mid-September 2001 (val = 2001-09, mod = MIDDLE).
+
+# Thinking process
+Approach this task step by step. Start with a recall section (<Recall>... </Recall>) that briefly summarizes the task description and schema definition for better understanding of the task. Follow with an analysis section (<Analysis>... </Analysis>) to analyze the input sentence. Identify important medical contents and clarify ambiguous concepts. Then write the output section (<Outputs>... </Outputs>) that lists your final outputs following the defined format.
+
+# Output format definition
+Your output should follow JSON format,
+if there are one of the EVENT or TIMEX3 entity mentions:
+[
+{"entity_text": "<Exact entity mentions as in the note>", "entity_type": "EVENT", "type": "<event type>", "polarity": "<event polarity>", "modality": "<event modality>"},
+{"entity_text": "<Exact entity mentions as in the note>", "entity_type": "TIMEX3", "type": "<TIMEX3 type>", "val": "<time value>", "mod": "<additional information>"}
+...
+]
+if there is no entity mentioned in the given note, just output an empty list:
+[]
+
+
+# Examples
+Below are some examples:
+
+Input: At 9/7/93 , 1:00 a.m. , intravenous fluids rate was decreased to 50 cc's per hour , total fluids given during the first 24 hours were 140 to 150 cc's per kilo per day .
+Output:
+<Recall>
+This is a named entity recognition task that focuses on extracting medical events (EVENT) and time expressions (TIMEX3). Events are categorized by their type (TEST, PROBLEM, TREATMENT, etc.), polarity (POS or NEG), and modality (FACTUAL, CONDITIONAL, POSSIBLE, or PROPOSED). Time expressions are identified as either DATE, TIME, DURATION, or FREQUENCY and include specific values or modifiers where applicable.
+</Recall>
+
+<Analysis>
+In this sentence:
+
+"9/7/93" represents a TIMEX3 entity for the date.
+"1:00 a.m." is a TIMEX3 entity representing the time.
+"first 24 hours" refers to a TIMEX3 entity of duration.
+"intravenous fluids rate was decreased" is an EVENT referring to a TREATMENT event with a negative polarity (as it was "decreased") and a FACTUAL modality (it actually happened).
+"total fluids given during the first 24 hours" is another EVENT representing a TREATMENT that is FACTUAL in its modality.
+</Analysis>
+
+<Outputs>
+[{"entity_text": "intravenous fluids", "entity_type": "EVENT", "type": "TREATMENT", "polarity": "POS", "modality": "FACTUAL"},
+{"entity_text": "decreased", "entity_type": "EVENT", "type": "OCCURRENCE", "polarity": "POS", "modality": "FACTUAL"},
+{"entity_text": "total fluids", "entity_type": "EVENT", "type": "TREATMENT", "polarity": "POS", "modality": "FACTUAL"},
+{"entity_text": "9/7/93 , 1:00 a.m.", "entity_type": "TIMEX3", "type": "TIME", "val": "1993-09-07T01:00", "mod": "NA"},
+{"entity_text": "24 hours", "entity_type": "TIMEX3", "type": "DURATION", "val": "PT24H", "mod": "NA"}]
+</Outputs>
+
+Input: At that time it appeared well adhered to the underlying skin .
+Output:
+<Recall>
+This is a named entity recognition task focused on extracting medical events (EVENT) and time expressions (TIMEX3). Events are categorized by their type (e.g., TEST, PROBLEM, TREATMENT), polarity (POS or NEG), and modality (FACTUAL, CONDITIONAL, POSSIBLE, or PROPOSED). Time expressions are categorized as DATE, TIME, DURATION, or FREQUENCY, and include values or modifiers where applicable.
+</Recall>
+
+<Analysis>
+In this sentence:
+
+"At that time" refers to a TIMEX3 entity that is vague, so it can be considered as a TIME with an unspecified value.
+"appeared well adhered to the underlying skin" describes an EVENT that likely indicates a PROBLEM (the condition of the skin) and has a POS polarity (since it is "well adhered") with a FACTUAL modality (it actually occurred).
+</Analysis>
+
+<Outputs>
+[{"entity_text": "it", "entity_type": "EVENT", "type": "TREATMENT", "polarity": "POS", "modality": "FACTUAL"},
+{"entity_text": "well adhered", "entity_type": "EVENT", "type": "OCCURRENCE", "polarity": "POS", "modality": "FACTUAL"}]
+</Outputs>
+
+# Input placeholder
+Below is the entire medical note for your reference. I will feed you with sentences from it one by one.
+"{{input}}"
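For orientation, here is a minimal usage sketch of the new SentenceCoTFrameExtractor driven by a template written per the guide above. The engine object, template string, and input document are assumed placeholders, not values from this release.

# Hypothetical sketch: `engine`, `my_cot_template`, and `note_text` are placeholders.
from llm_ie.extractors import SentenceCoTFrameExtractor

extractor = SentenceCoTFrameExtractor(inference_engine=engine, prompt_template=my_cot_template)
# One generation per sentence; each item's 'gen_text' carries the <Analysis>/<Outputs> sections.
outputs = extractor.extract(text_content=note_text, stream=True)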
llm_ie-0.3.2/src/llm_ie/asset/prompt_guide/SentenceReviewFrameExtractor_prompt_guide.txt (new file)

@@ -0,0 +1,145 @@
+Prompt Template Design:
+
+1. Task Description:
+Provide a detailed description of the task, including the background and the type of task (e.g., named entity recognition).
+
+2. Schema Definition:
+List the key concepts that should be extracted, and provide clear definitions for each one.
+
+3. Output Format Definition:
+The output should be a JSON list, where each element is a dictionary representing a frame (an entity along with its attributes). Each dictionary must include a key that holds the entity text. This key can be named "entity_text" or anything else depending on the context. The attributes can either be flat (e.g., {"entity_text": "<entity_text>", "attr1": "<attr1>", "attr2": "<attr2>"}) or nested (e.g., {"entity_text": "<entity_text>", "attributes": {"attr1": "<attr1>", "attr2": "<attr2>"}}).
+
+4. Optional: Hints:
+Provide itemized hints for the information extractors to guide the extraction process.
+
+5. Optional: Examples:
+Include examples in the format:
+Input: ...
+Output: ...
+
+6. Input Placeholder:
+The template must include a placeholder in the format {{<placeholder_name>}} for the input text. The placeholder name can be customized as needed.
+
+
+Example 1 (single entity type with attributes):
+
+# Task description
+The paragraph below is from the Food and Drug Administration (FDA) Clinical Pharmacology Section of Labeling for Human Prescription Drug and Biological Products, Adverse reactions section. Please carefully review it and extract the adverse reactions and percentages. Note that each adverse reaction is nested under a clinical trial and potentially an arm. Your output should take that into consideration.
+
+# Schema definition
+Your output should contain:
+"ClinicalTrial" which is the name of the trial,
+If applicable, "Arm" which is the arm within the clinical trial,
+"AdverseReaction" which is the name of the adverse reaction,
+If applicable, "Percentage" which is the occurrence of the adverse reaction within the trial and arm,
+"Evidence" which is the EXACT sentence in the text where you found the AdverseReaction from
+
+# Output format definition
+Your output should follow JSON format, for example:
+[
+{"ClinicalTrial": "<Clinical trial name or number>", "Arm": "<name of arm>", "AdverseReaction": "<Adverse reaction text>", "Percentage": "<a percent>", "Evidence": "<exact sentence from the text>"},
+{"ClinicalTrial": "<Clinical trial name or number>", "Arm": "<name of arm>", "AdverseReaction": "<Adverse reaction text>", "Percentage": "<a percent>", "Evidence": "<exact sentence from the text>"}
+]
+
+# Additional hints
+Your output should be 100% based on the provided content. DO NOT output fake numbers.
+If there is no specific arm, just omit the "Arm" key. If the percentage is not reported, just omit the "Percentage" key. The "Evidence" should always be provided.
+
+# Input placeholder
+Below is the Adverse reactions section for your reference. I will feed you with sentences from it one by one.
+{{input}}
+
+
+Example 2 (multiple entity types):
+
+# Task description
+This is a named entity recognition task. Given a sentence from a medical note, annotate the Drug, Form, Strength, Frequency, Route, Dosage, Reason, ADE, and Duration.
+
+# Schema definition
+Your output should contain:
+"entity_text": the exact wording as mentioned in the note.
+"entity_type": type of the entity. It should be one of the "Drug", "Form", "Strength", "Frequency", "Route", "Dosage", "Reason", "ADE", or "Duration".
+
+# Output format definition
+Your output should follow JSON format,
+if there are one of the entity mentions: Drug, Form, Strength, Frequency, Route, Dosage, Reason, ADE, or Duration:
+[{"entity_text": "<Exact entity mentions as in the note>", "entity_type": "<entity type as listed above>"},
+{"entity_text": "<Exact entity mentions as in the note>", "entity_type": "<entity type as listed above>"}]
+if there is no entity mentioned in the given note, just output an empty list:
+[]
+
+I am only interested in the extracted contents in []. Do NOT explain your answer.
+
+# Examples
+Below are some examples:
+
+Input: Acetaminophen 650 mg PO BID 5.
+Output: [{"entity_text": "Acetaminophen", "entity_type": "Drug"}, {"entity_text": "650 mg", "entity_type": "Strength"}, {"entity_text": "PO", "entity_type": "Route"}, {"entity_text": "BID", "entity_type": "Frequency"}]
+
+Input: Mesalamine DR 1200 mg PO BID 2.
+Output: [{"entity_text": "Mesalamine DR", "entity_type": "Drug"}, {"entity_text": "1200 mg", "entity_type": "Strength"}, {"entity_text": "BID", "entity_type": "Frequency"}, {"entity_text": "PO", "entity_type": "Route"}]
+
+
+# Input placeholder
+Below is the medical note for your reference. I will feed you with sentences from it one by one.
+"{{input}}"
+
+
+Example 3 (multiple entity types with corresponding attributes):
+
+# Task description
+This is a named entity recognition task. Given a sentence from a medical note, annotate the events (EVENT) and time expressions (TIMEX3):
+
+# Schema definition
+Your output should contain:
+"entity_text": the exact wording as mentioned in the note.
+"entity_type": type of the entity. It should be one of the "EVENT" or "TIMEX3".
+if entity_type is "EVENT",
+"type": the event type as one of the "TEST", "PROBLEM", "TREATMENT", "CLINICAL_DEPT", "EVIDENTIAL", or "OCCURRENCE".
+"polarity": whether an EVENT is positive ("POS") or negative ("NEG"). For example, in "the patient reports headache, and denies chills", the EVENT [headache] is positive in its polarity, and the EVENT [chills] is negative in its polarity.
+"modality": whether an EVENT actually occurred or not. Must be one of the "FACTUAL", "CONDITIONAL", "POSSIBLE", or "PROPOSED".
+
+if entity_type is "TIMEX3",
+"type": the type as one of the "DATE", "TIME", "DURATION", or "FREQUENCY".
+"val": the numeric value 1) DATE: [YYYY]-[MM]-[DD], 2) TIME: [hh]:[mm]:[ss], 3) DURATION: P[n][Y/M/W/D]. So, "for eleven days" will be
+represented as "P11D", meaning a period of 11 days. 4) R[n][duration], where n denotes the number of repeats. When the n is omitted, the expression denotes an unspecified amount of repeats. For example, "once a day for 3 days" is "R3P1D" (repeat the time interval of 1 day (P1D) for 3 times (R3)); twice every day is "RP12H" (repeat every 12 hours).
+"mod": additional information regarding the temporal value of a time expression. Must be one of the:
+"NA": the default value, no relevant modifier is present;
+"MORE", means "more than", e.g. over 2 days (val = P2D, mod = MORE);
+"LESS", means "less than", e.g. almost 2 months (val = P2M, mod = LESS);
+"APPROX", means "approximate", e.g. nearly a week (val = P1W, mod = APPROX);
+"START", describes the beginning of a period of time, e.g. Christmas morning, 2005 (val = 2005-12-25, mod = START);
+"END", describes the end of a period of time, e.g. late last year (val = 2010, mod = END);
+"MIDDLE", describes the middle of a period of time, e.g. mid-September 2001 (val = 2001-09, mod = MIDDLE).
+
+# Output format definition
+Your output should follow JSON format,
+if there are one of the EVENT or TIMEX3 entity mentions:
+[
+{"entity_text": "<Exact entity mentions as in the note>", "entity_type": "EVENT", "type": "<event type>", "polarity": "<event polarity>", "modality": "<event modality>"},
+{"entity_text": "<Exact entity mentions as in the note>", "entity_type": "TIMEX3", "type": "<TIMEX3 type>", "val": "<time value>", "mod": "<additional information>"}
+...
+]
+if there is no entity mentioned in the given note, just output an empty list:
+[]
+
+I am only interested in the extracted contents in []. Do NOT explain your answer.
+
+# Examples
+Below are some examples:
+
+Input: At 9/7/93 , 1:00 a.m. , intravenous fluids rate was decreased to 50 cc's per hour , total fluids given during the first 24 hours were 140 to 150 cc's per kilo per day .
+Output: [{"entity_text": "intravenous fluids", "entity_type": "EVENT", "type": "TREATMENT", "polarity": "POS", "modality": "FACTUAL"},
+{"entity_text": "decreased", "entity_type": "EVENT", "type": "OCCURRENCE", "polarity": "POS", "modality": "FACTUAL"},
+{"entity_text": "total fluids", "entity_type": "EVENT", "type": "TREATMENT", "polarity": "POS", "modality": "FACTUAL"},
+{"entity_text": "9/7/93 , 1:00 a.m.", "entity_type": "TIMEX3", "type": "TIME", "val": "1993-09-07T01:00", "mod": "NA"},
+{"entity_text": "24 hours", "entity_type": "TIMEX3", "type": "DURATION", "val": "PT24H", "mod": "NA"}]
+
+Input: At that time it appeared well adhered to the underlying skin .
+Output: [{"entity_text": "it", "entity_type": "EVENT", "type": "TREATMENT", "polarity": "POS", "modality": "FACTUAL"},
+{"entity_text": "well adhered", "entity_type": "EVENT", "type": "OCCURRENCE", "polarity": "POS", "modality": "FACTUAL"}]
+
+
+# Input placeholder
+Below is the entire medical note for your reference. I will feed you with sentences from it one by one.
+"{{input}}"
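A comparable sketch for the new SentenceReviewFrameExtractor, whose templates follow this guide. The engine, template, and input text are assumed placeholders; the constructor and extract_frames() arguments are the ones shown in the extractors.py diff below.

# Hypothetical sketch: `engine`, `review_template`, and `note_text` are placeholders.
from llm_ie.extractors import SentenceReviewFrameExtractor

extractor = SentenceReviewFrameExtractor(
    inference_engine=engine,          # any InferenceEngine implementation
    prompt_template=review_template,  # a template written per this guide
    review_mode="addition",           # keep the initial frames and only ask for per-sentence additions
)
frames = extractor.extract_frames(text_content=note_text, entity_key="entity_text")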
{llm_ie-0.3.0 → llm_ie-0.3.2}/src/llm_ie/extractors.py

@@ -5,9 +5,11 @@ import inspect
 import importlib.resources
 import warnings
 import itertools
-from typing import List, Dict, Tuple, Union, Callable
+from typing import Set, List, Dict, Tuple, Union, Callable
 from llm_ie.data_types import LLMInformationExtractionFrame, LLMInformationExtractionDocument
 from llm_ie.engines import InferenceEngine
+from colorama import Fore, Style
+from nltk.tokenize import RegexpTokenizer
 
 
 class Extractor:
@@ -36,7 +38,7 @@ class Extractor:
         This method returns the pre-defined prompt guideline for the extractor from the package asset.
         """
         file_path = importlib.resources.files('llm_ie.asset.prompt_guide').joinpath(f"{cls.__name__}_prompt_guide.txt")
-        with open(file_path, 'r') as f:
+        with open(file_path, 'r', encoding="utf-8") as f:
             return f.read()
 
 
@@ -115,7 +117,7 @@ class Extractor:
                 dict_obj = json.loads(dict_str)
                 out.append(dict_obj)
             except json.JSONDecodeError:
-
+                warnings.warn(f'Post-processing failed:\n{dict_str}', RuntimeWarning)
         return out
 
 
@@ -138,9 +140,68 @@ class FrameExtractor(Extractor):
                          prompt_template=prompt_template,
                          system_prompt=system_prompt,
                          **kwrs)
-
+        self.tokenizer = RegexpTokenizer(r'\w+|[^\w\s]')
+
+
+    def _jaccard_score(self, s1:set, s2:set) -> float:
+        """
+        This method calculates the Jaccard score between two sets of word tokens.
+        """
+        return len(s1.intersection(s2)) / len(s1.union(s2))
+
+
+    def _get_word_tokens(self, text) -> Tuple[List[str], List[Tuple[int]]]:
+        """
+        This method tokenizes the input text into a list of word tokens and their spans.
+        """
+        tokens = []
+        spans = []
+        for span in self.tokenizer.span_tokenize(text):
+            spans.append(span)
+            start, end = span
+            tokens.append(text[start:end])
+        return tokens, spans
+
+
+    def _get_closest_substring(self, text:str, pattern:str, buffer_size:float=0.2) -> Tuple[Tuple[int, int], float]:
+        """
+        This method finds the closest (highest Jaccard score) substring in text that matches the pattern.
+
+        Parameters
+        ----------
+        text : str
+            the input text.
+        pattern : str
+            the pattern to match.
+        buffer_size : float, Optional
+            the buffer size for the matching window. Default is 20% of pattern length.
+
+        Returns : Tuple[Tuple[int, int], float]
+            a tuple of 2-tuple span and Jaccard score.
+        """
+        text_tokens, text_spans = self._get_word_tokens(text)
+        pattern_tokens, _ = self._get_word_tokens(pattern)
+        pattern_tokens = set(pattern_tokens)
+        window_size = len(pattern_tokens)
+        window_size_min = int(window_size * (1 - buffer_size))
+        window_size_max = int(window_size * (1 + buffer_size))
+        closest_substring_spans = None
+        best_score = 0
+
+        for i in range(len(text_tokens) - window_size_max):
+            for w in range(window_size_min, window_size_max):
+                sub_str_tokens = set(text_tokens[i:i + w])
+                score = self._jaccard_score(sub_str_tokens, pattern_tokens)
+                if score > best_score:
+                    best_score = score
+                    sub_string_word_spans = text_spans[i:i + w]
+                    closest_substring_spans = (sub_string_word_spans[0][0], sub_string_word_spans[-1][-1])
+
+        return closest_substring_spans, best_score
 
-
+
+    def _find_entity_spans(self, text: str, entities: List[str], case_sensitive:bool=False,
+                           fuzzy_match:bool=True, fuzzy_buffer_size:float=0.2, fuzzy_score_cutoff:float=0.8) -> List[Tuple[int]]:
         """
         This function inputs a text and a list of entity text,
         outputs a list of spans (2-tuple) for each entity.
@@ -150,19 +211,46 @@ class FrameExtractor(Extractor):
         ----------
         text : str
             text that contains entities
+        entities : List[str]
+            a list of entity text to find in the text
+        case_sensitive : bool, Optional
+            if True, entity text matching will be case-sensitive.
+        fuzzy_match : bool, Optional
+            if True, fuzzy matching will be applied to find entity text.
+        fuzzy_buffer_size : float, Optional
+            the buffer size for fuzzy matching. Default is 20% of entity text length.
+        fuzzy_score_cutoff : float, Optional
+            the Jaccard score cutoff for fuzzy matching.
+            Matched entity text must have a score higher than this value or a None will be returned.
         """
+        # Handle case sensitivity
+        if not case_sensitive:
+            text = text.lower()
+
+        # Match entities
         entity_spans = []
-        for entity in entities:
-            if case_sensitive:
-
-
-
-
+        for entity in entities:
+            if not case_sensitive:
+                entity = entity.lower()
+
+            # Exact match
+            match = re.search(re.escape(entity), text)
             if match:
                 start, end = match.span()
                 entity_spans.append((start, end))
                 # Replace the found entity with spaces to avoid finding the same instance again
                 text = text[:start] + ' ' * (end - start) + text[end:]
+            # Fuzzy match
+            elif fuzzy_match:
+                closest_substring_span, best_score = self._get_closest_substring(text, entity, buffer_size=fuzzy_buffer_size)
+                if best_score >= fuzzy_score_cutoff:
+                    entity_spans.append(closest_substring_span)
+                    # Replace the found entity with spaces to avoid finding the same instance again
+                    text = text[:closest_substring_span[0]] + ' ' * (closest_substring_span[1] - closest_substring_span[0]) + text[closest_substring_span[1]:]
+                else:
+                    entity_spans.append(None)
+
+            # No match
             else:
                 entity_spans.append(None)
 
@@ -275,7 +363,9 @@ class BasicFrameExtractor(FrameExtractor):
 
 
     def extract_frames(self, text_content:Union[str, Dict[str,str]], entity_key:str, max_new_tokens:int=2048,
-                       temperature:float=0.0, document_key:str=None,
+                       temperature:float=0.0, document_key:str=None, stream:bool=False,
+                       case_sensitive:bool=False, fuzzy_match:bool=True, fuzzy_buffer_size:float=0.2,
+                       fuzzy_score_cutoff:float=0.8, **kwrs) -> List[LLMInformationExtractionFrame]:
         """
         This method inputs a text and outputs a list of LLMInformationExtractionFrame
         It use the extract() method and post-process outputs into frames.
@@ -295,14 +385,35 @@ class BasicFrameExtractor(FrameExtractor):
         document_key : str, Optional
             specify the key in text_content where document text is.
             If text_content is str, this parameter will be ignored.
+        stream : bool, Optional
+            if True, LLM generated text will be printed in terminal in real-time.
+        case_sensitive : bool, Optional
+            if True, entity text matching will be case-sensitive.
+        fuzzy_match : bool, Optional
+            if True, fuzzy matching will be applied to find entity text.
+        fuzzy_buffer_size : float, Optional
+            the buffer size for fuzzy matching. Default is 20% of entity text length.
+        fuzzy_score_cutoff : float, Optional
+            the Jaccard score cutoff for fuzzy matching.
+            Matched entity text must have a score higher than this value or a None will be returned.
 
         Return : str
             a list of frames.
         """
         frame_list = []
         gen_text = self.extract(text_content=text_content,
-                                max_new_tokens=max_new_tokens,
-
+                                max_new_tokens=max_new_tokens,
+                                temperature=temperature,
+                                stream=stream,
+                                **kwrs)
+
+        entity_json = []
+        for entity in self._extract_json(gen_text=gen_text):
+            if entity_key in entity:
+                entity_json.append(entity)
+            else:
+                warnings.warn(f'Extractor output "{entity}" does not have entity_key ("{entity_key}"). This frame will be dropped.', RuntimeWarning)
+
         if isinstance(text_content, str):
             text = text_content
         elif isinstance(text_content, dict):
@@ -310,7 +421,10 @@ class BasicFrameExtractor(FrameExtractor):
 
         spans = self._find_entity_spans(text=text,
                                         entities=[e[entity_key] for e in entity_json],
-                                        case_sensitive=
+                                        case_sensitive=case_sensitive,
+                                        fuzzy_match=fuzzy_match,
+                                        fuzzy_buffer_size=fuzzy_buffer_size,
+                                        fuzzy_score_cutoff=fuzzy_score_cutoff)
 
         for i, (ent, span) in enumerate(zip(entity_json, spans)):
             if span is not None:
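Putting the new keyword arguments together, a minimal call sketch for BasicFrameExtractor.extract_frames(); `engine`, `template`, and `note_text` are placeholders, and the values shown are simply the documented defaults.

# Hypothetical call; only the keyword names and defaults come from this diff.
from llm_ie.extractors import BasicFrameExtractor

extractor = BasicFrameExtractor(inference_engine=engine, prompt_template=template)
frames = extractor.extract_frames(
    text_content=note_text,
    entity_key="entity_text",     # key in the LLM JSON output that holds the entity string
    stream=True,                  # echo LLM generation to the terminal in real time
    case_sensitive=False,         # lowercase text and entities before span search
    fuzzy_match=True,             # fall back to Jaccard matching when exact search fails
    fuzzy_buffer_size=0.2,        # candidate window may differ from the entity length by 20%
    fuzzy_score_cutoff=0.8,       # matches scoring below 0.8 yield a None span
)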
@@ -325,8 +439,8 @@ class BasicFrameExtractor(FrameExtractor):
 
 
 class ReviewFrameExtractor(BasicFrameExtractor):
-    def __init__(self, inference_engine:InferenceEngine, prompt_template:str,
-                 review_mode:str, system_prompt:str=None, **kwrs):
+    def __init__(self, inference_engine:InferenceEngine, prompt_template:str,
+                 review_mode:str, review_prompt:str=None,system_prompt:str=None, **kwrs):
         """
         This class add a review step after the BasicFrameExtractor.
         The Review process asks LLM to review its output and:
@@ -340,8 +454,9 @@ class ReviewFrameExtractor(BasicFrameExtractor):
             the LLM inferencing engine object. Must implements the chat() method.
         prompt_template : str
             prompt template with "{{<placeholder name>}}" placeholder.
-        review_prompt : str
-            the prompt text that ask LLM to review. Specify addition or revision in the instruction.
+        review_prompt : str: Optional
+            the prompt text that ask LLM to review. Specify addition or revision in the instruction.
+            if not provided, a default review prompt will be used.
         review_mode : str
             review mode. Must be one of {"addition", "revision"}
             addition mode only ask LLM to add new frames, while revision mode ask LLM to regenerate.
@@ -350,11 +465,20 @@ class ReviewFrameExtractor(BasicFrameExtractor):
         """
         super().__init__(inference_engine=inference_engine, prompt_template=prompt_template,
                          system_prompt=system_prompt, **kwrs)
-        self.review_prompt = review_prompt
         if review_mode not in {"addition", "revision"}:
             raise ValueError('review_mode must be one of {"addition", "revision"}.')
         self.review_mode = review_mode
 
+        if review_prompt:
+            self.review_prompt = review_prompt
+        else:
+            file_path = importlib.resources.files('llm_ie.asset.default_prompts').\
+                joinpath(f"{self.__class__.__name__}_{self.review_mode}_review_prompt.txt")
+            with open(file_path, 'r', encoding="utf-8") as f:
+                self.review_prompt = f.read()
+
+            warnings.warn(f'Custom review prompt not provided. The default review prompt is used:\n"{self.review_prompt}"', UserWarning)
+
 
     def extract(self, text_content:Union[str, Dict[str,str]],
                 max_new_tokens:int=4096, temperature:float=0.0, stream:bool=False, **kwrs) -> str:
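With this change, review_prompt becomes optional. A minimal sketch of the fallback behavior, with placeholder engine and template:

from llm_ie.extractors import ReviewFrameExtractor

# No review_prompt given: the packaged default
# ReviewFrameExtractor_addition_review_prompt.txt is loaded and a UserWarning is emitted.
extractor = ReviewFrameExtractor(
    inference_engine=engine,       # placeholder InferenceEngine
    prompt_template=template,      # placeholder prompt template
    review_mode="addition",
)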
@@ -377,12 +501,15 @@ class ReviewFrameExtractor(BasicFrameExtractor):
         Return : str
             the output from LLM. Need post-processing.
         """
-        # Pormpt extraction
         messages = []
         if self.system_prompt:
             messages.append({'role': 'system', 'content': self.system_prompt})
 
         messages.append({'role': 'user', 'content': self._get_user_prompt(text_content)})
+        # Initial output
+        if stream:
+            print(f"{Fore.BLUE}Initial Output:{Style.RESET_ALL}")
+
         initial = self.inference_engine.chat(
             messages=messages,
             max_new_tokens=max_new_tokens,
@@ -395,6 +522,8 @@ class ReviewFrameExtractor(BasicFrameExtractor):
         messages.append({'role': 'assistant', 'content': initial})
         messages.append({'role': 'user', 'content': self.review_prompt})
 
+        if stream:
+            print(f"\n{Fore.YELLOW}Review:{Style.RESET_ALL}")
         review = self.inference_engine.chat(
             messages=messages,
             max_new_tokens=max_new_tokens,
@@ -459,7 +588,7 @@ class SentenceFrameExtractor(FrameExtractor):
 
 
     def extract(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=512,
-                document_key:str=None, multi_turn:bool=
+                document_key:str=None, multi_turn:bool=False, temperature:float=0.0, stream:bool=False, **kwrs) -> List[Dict[str,str]]:
         """
         This method inputs a text and outputs a list of outputs per sentence.
 
@@ -507,8 +636,8 @@ class SentenceFrameExtractor(FrameExtractor):
         for sent in sentences:
             messages.append({'role': 'user', 'content': sent['sentence_text']})
             if stream:
-                print(f"\n\
-                print("Extraction:")
+                print(f"\n\n{Fore.GREEN}Sentence: {Style.RESET_ALL}\n{sent['sentence_text']}\n")
+                print(f"{Fore.BLUE}Extraction:{Style.RESET_ALL}")
 
             gen_text = self.inference_engine.chat(
                 messages=messages,
@@ -534,7 +663,9 @@ class SentenceFrameExtractor(FrameExtractor):
 
 
     def extract_frames(self, text_content:Union[str, Dict[str,str]], entity_key:str, max_new_tokens:int=512,
-                       document_key:str=None, multi_turn:bool=
+                       document_key:str=None, multi_turn:bool=False, temperature:float=0.0, stream:bool=False,
+                       case_sensitive:bool=False, fuzzy_match:bool=True, fuzzy_buffer_size:float=0.2, fuzzy_score_cutoff:float=0.8,
+                       **kwrs) -> List[LLMInformationExtractionFrame]:
         """
         This method inputs a text and outputs a list of LLMInformationExtractionFrame
         It use the extract() method and post-process outputs into frames.
@@ -562,6 +693,15 @@ class SentenceFrameExtractor(FrameExtractor):
             the temperature for token sampling.
         stream : bool, Optional
             if True, LLM generated text will be printed in terminal in real-time.
+        case_sensitive : bool, Optional
+            if True, entity text matching will be case-sensitive.
+        fuzzy_match : bool, Optional
+            if True, fuzzy matching will be applied to find entity text.
+        fuzzy_buffer_size : float, Optional
+            the buffer size for fuzzy matching. Default is 20% of entity text length.
+        fuzzy_score_cutoff : float, Optional
+            the Jaccard score cutoff for fuzzy matching.
+            Matched entity text must have a score higher than this value or a None will be returned.
 
         Return : str
             a list of frames.
@@ -575,9 +715,19 @@ class SentenceFrameExtractor(FrameExtractor):
                                             **kwrs)
         frame_list = []
         for sent in llm_output_sentence:
-            entity_json =
+            entity_json = []
+            for entity in self._extract_json(gen_text=sent['gen_text']):
+                if entity_key in entity:
+                    entity_json.append(entity)
+                else:
+                    warnings.warn(f'Extractor output "{entity}" does not have entity_key ("{entity_key}"). This frame will be dropped.', RuntimeWarning)
+
             spans = self._find_entity_spans(text=sent['sentence_text'],
-
+                                            entities=[e[entity_key] for e in entity_json],
+                                            case_sensitive=case_sensitive,
+                                            fuzzy_match=fuzzy_match,
+                                            fuzzy_buffer_size=fuzzy_buffer_size,
+                                            fuzzy_score_cutoff=fuzzy_score_cutoff)
             for ent, span in zip(entity_json, spans):
                 if span is not None:
                     start, end = span
@@ -592,6 +742,248 @@ class SentenceFrameExtractor(FrameExtractor):
         return frame_list
 
 
+class SentenceReviewFrameExtractor(SentenceFrameExtractor):
+    def __init__(self, inference_engine:InferenceEngine, prompt_template:str,
+                 review_mode:str, review_prompt:str=None, system_prompt:str=None, **kwrs):
+        """
+        This class adds a review step after the SentenceFrameExtractor.
+        For each sentence, the review process asks LLM to review its output and:
+            1. add more frames while keeping current. This is efficient for boosting recall.
+            2. or, regenerate frames (add new and delete existing).
+        Use the review_mode parameter to specify. Note that the review_prompt should instruct LLM accordingly.
+
+        Parameters:
+        ----------
+        inference_engine : InferenceEngine
+            the LLM inferencing engine object. Must implements the chat() method.
+        prompt_template : str
+            prompt template with "{{<placeholder name>}}" placeholder.
+        review_prompt : str: Optional
+            the prompt text that ask LLM to review. Specify addition or revision in the instruction.
+            if not provided, a default review prompt will be used.
+        review_mode : str
+            review mode. Must be one of {"addition", "revision"}
+            addition mode only ask LLM to add new frames, while revision mode ask LLM to regenerate.
+        system_prompt : str, Optional
+            system prompt.
+        """
+        super().__init__(inference_engine=inference_engine, prompt_template=prompt_template,
+                         system_prompt=system_prompt, **kwrs)
+
+        if review_mode not in {"addition", "revision"}:
+            raise ValueError('review_mode must be one of {"addition", "revision"}.')
+        self.review_mode = review_mode
+
+        if review_prompt:
+            self.review_prompt = review_prompt
+        else:
+            file_path = importlib.resources.files('llm_ie.asset.default_prompts').\
+                joinpath(f"{self.__class__.__name__}_{self.review_mode}_review_prompt.txt")
+            with open(file_path, 'r', encoding="utf-8") as f:
+                self.review_prompt = f.read()
+
+            warnings.warn(f'Custom review prompt not provided. The default review prompt is used:\n"{self.review_prompt}"', UserWarning)
+
+
+    def extract(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=512,
+                document_key:str=None, multi_turn:bool=False, temperature:float=0.0, stream:bool=False, **kwrs) -> List[Dict[str,str]]:
+        """
+        This method inputs a text and outputs a list of outputs per sentence.
+
+        Parameters:
+        ----------
+        text_content : Union[str, Dict[str,str]]
+            the input text content to put in prompt template.
+            If str, the prompt template must has only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
+            If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
+        max_new_tokens : str, Optional
+            the max number of new tokens LLM should generate.
+        document_key : str, Optional
+            specify the key in text_content where document text is.
+            If text_content is str, this parameter will be ignored.
+        multi_turn : bool, Optional
+            multi-turn conversation prompting.
+            If True, sentences and LLM outputs will be appended to the input message and carry-over.
+            If False, only the current sentence is prompted.
+            For LLM inference engines that supports prompt cache (e.g., Llama.Cpp, Ollama), use multi-turn conversation prompting
+            can better utilize the KV caching.
+        temperature : float, Optional
+            the temperature for token sampling.
+        stream : bool, Optional
+            if True, LLM generated text will be printed in terminal in real-time.
+
+        Return : str
+            the output from LLM. Need post-processing.
+        """
+        # define output
+        output = []
+        # sentence tokenization
+        if isinstance(text_content, str):
+            sentences = self._get_sentences(text_content)
+        elif isinstance(text_content, dict):
+            sentences = self._get_sentences(text_content[document_key])
+        # construct chat messages
+        messages = []
+        if self.system_prompt:
+            messages.append({'role': 'system', 'content': self.system_prompt})
+
+        messages.append({'role': 'user', 'content': self._get_user_prompt(text_content)})
+        messages.append({'role': 'assistant', 'content': 'Sure, please start with the first sentence.'})
+
+        # generate sentence by sentence
+        for sent in sentences:
+            messages.append({'role': 'user', 'content': sent['sentence_text']})
+            if stream:
+                print(f"\n\n{Fore.GREEN}Sentence: {Style.RESET_ALL}\n{sent['sentence_text']}\n")
+                print(f"{Fore.BLUE}Initial Output:{Style.RESET_ALL}")
+
+            initial = self.inference_engine.chat(
+                messages=messages,
+                max_new_tokens=max_new_tokens,
+                temperature=temperature,
+                stream=stream,
+                **kwrs
+            )
+
+            # Review
+            if stream:
+                print(f"\n{Fore.YELLOW}Review:{Style.RESET_ALL}")
+            messages.append({'role': 'assistant', 'content': initial})
+            messages.append({'role': 'user', 'content': self.review_prompt})
+
+            review = self.inference_engine.chat(
+                messages=messages,
+                max_new_tokens=max_new_tokens,
+                temperature=temperature,
+                stream=stream,
+                **kwrs
+            )
+
+            # Output
+            if self.review_mode == "revision":
+                gen_text = review
+            elif self.review_mode == "addition":
+                gen_text = initial + '\n' + review
+
+            if multi_turn:
+                # update chat messages with LLM outputs
+                messages.append({'role': 'assistant', 'content': review})
+            else:
+                # delete sentence and review so that message is reset
+                del messages[-3:]
+
+            # add to output
+            output.append({'sentence_start': sent['start'],
+                           'sentence_end': sent['end'],
+                           'sentence_text': sent['sentence_text'],
+                           'gen_text': gen_text})
+        return output
+
+
+class SentenceCoTFrameExtractor(SentenceFrameExtractor):
+    from nltk.tokenize.punkt import PunktSentenceTokenizer
+    def __init__(self, inference_engine:InferenceEngine, prompt_template:str, system_prompt:str=None, **kwrs):
+        """
+        This class performs sentence-based Chain-of-thoughts (CoT) information extraction.
+        A simulated chat follows this process:
+            1. system prompt (optional)
+            2. user instructions (schema, background, full text, few-shot example...)
+            3. user input first sentence
+            4. assistant analyze the sentence
+            5. assistant extract outputs
+            6. repeat #3, #4, #5
+
+        Input system prompt (optional), prompt template (with user instructions),
+        and specify a LLM.
+
+        Parameters
+        ----------
+        inference_engine : InferenceEngine
+            the LLM inferencing engine object. Must implements the chat() method.
+        prompt_template : str
+            prompt template with "{{<placeholder name>}}" placeholder.
+        system_prompt : str, Optional
+            system prompt.
+        """
+        super().__init__(inference_engine=inference_engine, prompt_template=prompt_template,
+                         system_prompt=system_prompt, **kwrs)
+
+
+    def extract(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=512,
+                document_key:str=None, multi_turn:bool=False, temperature:float=0.0, stream:bool=False, **kwrs) -> List[Dict[str,str]]:
+        """
+        This method inputs a text and outputs a list of outputs per sentence.
+
+        Parameters:
+        ----------
+        text_content : Union[str, Dict[str,str]]
+            the input text content to put in prompt template.
+            If str, the prompt template must has only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
+            If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
+        max_new_tokens : str, Optional
+            the max number of new tokens LLM should generate.
+        document_key : str, Optional
+            specify the key in text_content where document text is.
+            If text_content is str, this parameter will be ignored.
+        multi_turn : bool, Optional
+            multi-turn conversation prompting.
+            If True, sentences and LLM outputs will be appended to the input message and carry-over.
+            If False, only the current sentence is prompted.
+            For LLM inference engines that supports prompt cache (e.g., Llama.Cpp, Ollama), use multi-turn conversation prompting
+            can better utilize the KV caching.
+        temperature : float, Optional
+            the temperature for token sampling.
+        stream : bool, Optional
+            if True, LLM generated text will be printed in terminal in real-time.
+
+        Return : str
+            the output from LLM. Need post-processing.
+        """
+        # define output
+        output = []
+        # sentence tokenization
+        if isinstance(text_content, str):
+            sentences = self._get_sentences(text_content)
+        elif isinstance(text_content, dict):
+            sentences = self._get_sentences(text_content[document_key])
+        # construct chat messages
+        messages = []
+        if self.system_prompt:
+            messages.append({'role': 'system', 'content': self.system_prompt})
+
+        messages.append({'role': 'user', 'content': self._get_user_prompt(text_content)})
+        messages.append({'role': 'assistant', 'content': 'Sure, please start with the first sentence.'})
+
+        # generate sentence by sentence
+        for sent in sentences:
+            messages.append({'role': 'user', 'content': sent['sentence_text']})
+            if stream:
+                print(f"\n\n{Fore.GREEN}Sentence: {Style.RESET_ALL}\n{sent['sentence_text']}\n")
+                print(f"{Fore.BLUE}CoT:{Style.RESET_ALL}")
+
+            gen_text = self.inference_engine.chat(
+                messages=messages,
+                max_new_tokens=max_new_tokens,
+                temperature=temperature,
+                stream=stream,
+                **kwrs
+            )
+
+            if multi_turn:
+                # update chat messages with LLM outputs
+                messages.append({'role': 'assistant', 'content': gen_text})
+            else:
+                # delete sentence so that message is reset
+                del messages[-1]
+
+            # add to output
+            output.append({'sentence_start': sent['start'],
+                           'sentence_end': sent['end'],
+                           'sentence_text': sent['sentence_text'],
+                           'gen_text': gen_text})
+        return output
+
+
 class RelationExtractor(Extractor):
     def __init__(self, inference_engine:InferenceEngine, prompt_template:str, system_prompt:str=None, **kwrs):
         """
@@ -752,8 +1144,8 @@ class BinaryRelationExtractor(RelationExtractor):
         """
         roi_text = self._get_ROI(frame_1, frame_2, text, buffer_size=buffer_size)
         if stream:
-            print(f"\n\
-            print("Extraction:")
+            print(f"\n\n{Fore.GREEN}ROI text:{Style.RESET_ALL} \n{roi_text}\n")
+            print(f"{Fore.BLUE}Extraction:{Style.RESET_ALL}")
 
         messages = []
         if self.system_prompt:
@@ -904,8 +1296,8 @@ class MultiClassRelationExtractor(RelationExtractor):
         """
         roi_text = self._get_ROI(frame_1, frame_2, text, buffer_size=buffer_size)
         if stream:
-            print(f"\n\
-            print("Extraction:")
+            print(f"\n\n{Fore.GREEN}ROI text:{Style.RESET_ALL} \n{roi_text}\n")
+            print(f"{Fore.BLUE}Extraction:{Style.RESET_ALL}")
 
         messages = []
         if self.system_prompt:
{llm_ie-0.3.0 → llm_ie-0.3.2}/src/llm_ie/prompt_editor.py

@@ -1,13 +1,10 @@
 import sys
-from typing import Dict
+from typing import Dict
 import importlib.resources
 from llm_ie.engines import InferenceEngine
 from llm_ie.extractors import FrameExtractor
 import re
-import colorama
 from colorama import Fore, Style
-import ipywidgets as widgets
-from IPython.display import display, HTML
 
 
 class PromptEditor:
@@ -90,7 +87,6 @@ class PromptEditor:
         """
         This method runs an interactive chat session in the terminal to help users write prompt templates.
         """
-        colorama.init(autoreset=True)
         file_path = importlib.resources.files('llm_ie.asset.PromptEditor_prompts').joinpath('chat.txt')
         with open(file_path, 'r') as f:
             chat_prompt_template = f.read()
@@ -123,6 +119,16 @@ class PromptEditor:
         """
         This method runs an interactive chat session in Jupyter/IPython using ipywidgets to help users write prompt templates.
         """
+        # Check if ipywidgets is installed
+        if importlib.util.find_spec("ipywidgets") is None:
+            raise ImportError("ipywidgets not found. Please install ipywidgets (```pip install ipywidgets```).")
+        import ipywidgets as widgets
+
+        # Check if IPython is installed
+        if importlib.util.find_spec("IPython") is None:
+            raise ImportError("IPython not found. Please install IPython (```pip install ipython```).")
+        from IPython.display import display, HTML
+
         # Load the chat prompt template from the resources
         file_path = importlib.resources.files('llm_ie.asset.PromptEditor_prompts').joinpath('chat.txt')
         with open(file_path, 'r') as f:
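The same lazy optional-dependency check, shown as a standalone sketch with a hypothetical helper name; ipywidgets and IPython are only required when the Jupyter chat is actually invoked.

import importlib.util

def require(package: str) -> None:
    # Raise a helpful error only when the optional dependency is actually needed.
    if importlib.util.find_spec(package) is None:
        raise ImportError(f"{package} not found. Please install it (pip install {package}).")

require("ipywidgets")   # needed for the Jupyter chat widgets
require("IPython")      # needed for display()/HTML()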
Files renamed (llm_ie-0.3.0 → llm_ie-0.3.2) with no content changes:

- {llm_ie-0.3.0 → llm_ie-0.3.2}/README.md
- {llm_ie-0.3.0 → llm_ie-0.3.2}/src/llm_ie/__init__.py
- {llm_ie-0.3.0 → llm_ie-0.3.2}/src/llm_ie/asset/PromptEditor_prompts/chat.txt
- {llm_ie-0.3.0 → llm_ie-0.3.2}/src/llm_ie/asset/PromptEditor_prompts/comment.txt
- {llm_ie-0.3.0 → llm_ie-0.3.2}/src/llm_ie/asset/PromptEditor_prompts/rewrite.txt
- {llm_ie-0.3.0 → llm_ie-0.3.2}/src/llm_ie/asset/PromptEditor_prompts/system.txt
- {llm_ie-0.3.0 → llm_ie-0.3.2}/src/llm_ie/asset/prompt_guide/BasicFrameExtractor_prompt_guide.txt
- {llm_ie-0.3.0 → llm_ie-0.3.2}/src/llm_ie/asset/prompt_guide/BinaryRelationExtractor_prompt_guide.txt
- {llm_ie-0.3.0 → llm_ie-0.3.2}/src/llm_ie/asset/prompt_guide/MultiClassRelationExtractor_prompt_guide.txt
- {llm_ie-0.3.0 → llm_ie-0.3.2}/src/llm_ie/asset/prompt_guide/ReviewFrameExtractor_prompt_guide.txt
- {llm_ie-0.3.0 → llm_ie-0.3.2}/src/llm_ie/asset/prompt_guide/SentenceFrameExtractor_prompt_guide.txt
- {llm_ie-0.3.0 → llm_ie-0.3.2}/src/llm_ie/data_types.py
- {llm_ie-0.3.0 → llm_ie-0.3.2}/src/llm_ie/engines.py