llm-ie 1.2.2__tar.gz → 1.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {llm_ie-1.2.2 → llm_ie-1.2.4}/PKG-INFO +1 -1
- {llm_ie-1.2.2 → llm_ie-1.2.4}/pyproject.toml +1 -1
- {llm_ie-1.2.2 → llm_ie-1.2.4}/src/llm_ie/__init__.py +5 -4
- llm_ie-1.2.4/src/llm_ie/asset/default_prompts/LLMUnitChunker_user_prompt.txt +129 -0
- {llm_ie-1.2.2 → llm_ie-1.2.4}/src/llm_ie/chunkers.py +145 -6
- {llm_ie-1.2.2 → llm_ie-1.2.4}/src/llm_ie/data_types.py +23 -37
- {llm_ie-1.2.2 → llm_ie-1.2.4}/src/llm_ie/engines.py +621 -61
- {llm_ie-1.2.2 → llm_ie-1.2.4}/src/llm_ie/extractors.py +341 -297
- {llm_ie-1.2.2 → llm_ie-1.2.4}/src/llm_ie/prompt_editor.py +9 -32
- llm_ie-1.2.4/src/llm_ie/utils.py +95 -0
- {llm_ie-1.2.2 → llm_ie-1.2.4}/README.md +0 -0
- {llm_ie-1.2.2 → llm_ie-1.2.4}/src/llm_ie/asset/PromptEditor_prompts/chat.txt +0 -0
- {llm_ie-1.2.2 → llm_ie-1.2.4}/src/llm_ie/asset/PromptEditor_prompts/comment.txt +0 -0
- {llm_ie-1.2.2 → llm_ie-1.2.4}/src/llm_ie/asset/PromptEditor_prompts/rewrite.txt +0 -0
- {llm_ie-1.2.2 → llm_ie-1.2.4}/src/llm_ie/asset/PromptEditor_prompts/system.txt +0 -0
- {llm_ie-1.2.2 → llm_ie-1.2.4}/src/llm_ie/asset/default_prompts/BasicReviewFrameExtractor_addition_review_prompt.txt +0 -0
- {llm_ie-1.2.2 → llm_ie-1.2.4}/src/llm_ie/asset/default_prompts/BasicReviewFrameExtractor_revision_review_prompt.txt +0 -0
- {llm_ie-1.2.2 → llm_ie-1.2.4}/src/llm_ie/asset/default_prompts/ReviewFrameExtractor_addition_review_prompt.txt +0 -0
- {llm_ie-1.2.2 → llm_ie-1.2.4}/src/llm_ie/asset/default_prompts/ReviewFrameExtractor_revision_review_prompt.txt +0 -0
- {llm_ie-1.2.2 → llm_ie-1.2.4}/src/llm_ie/asset/default_prompts/SentenceReviewFrameExtractor_addition_review_prompt.txt +0 -0
- {llm_ie-1.2.2 → llm_ie-1.2.4}/src/llm_ie/asset/default_prompts/SentenceReviewFrameExtractor_revision_review_prompt.txt +0 -0
- {llm_ie-1.2.2 → llm_ie-1.2.4}/src/llm_ie/asset/prompt_guide/AttributeExtractor_prompt_guide.txt +0 -0
- {llm_ie-1.2.2 → llm_ie-1.2.4}/src/llm_ie/asset/prompt_guide/BasicFrameExtractor_prompt_guide.txt +0 -0
- {llm_ie-1.2.2 → llm_ie-1.2.4}/src/llm_ie/asset/prompt_guide/BasicReviewFrameExtractor_prompt_guide.txt +0 -0
- {llm_ie-1.2.2 → llm_ie-1.2.4}/src/llm_ie/asset/prompt_guide/BinaryRelationExtractor_prompt_guide.txt +0 -0
- {llm_ie-1.2.2 → llm_ie-1.2.4}/src/llm_ie/asset/prompt_guide/DirectFrameExtractor_prompt_guide.txt +0 -0
- {llm_ie-1.2.2 → llm_ie-1.2.4}/src/llm_ie/asset/prompt_guide/MultiClassRelationExtractor_prompt_guide.txt +0 -0
- {llm_ie-1.2.2 → llm_ie-1.2.4}/src/llm_ie/asset/prompt_guide/ReviewFrameExtractor_prompt_guide.txt +0 -0
- {llm_ie-1.2.2 → llm_ie-1.2.4}/src/llm_ie/asset/prompt_guide/SentenceFrameExtractor_prompt_guide.txt +0 -0
- {llm_ie-1.2.2 → llm_ie-1.2.4}/src/llm_ie/asset/prompt_guide/SentenceReviewFrameExtractor_prompt_guide.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: llm-ie
|
|
3
|
-
Version: 1.2.2
|
|
3
|
+
Version: 1.2.4
|
|
4
4
|
Summary: A comprehensive toolkit that provides building blocks for LLM-based named entity recognition, attribute extraction, and relation extraction pipelines.
|
|
5
5
|
License: MIT
|
|
6
6
|
Author: Enshuo (David) Hsu
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "llm-ie"
|
|
3
|
-
version = "1.2.2"
|
|
3
|
+
version = "1.2.4"
|
|
4
4
|
description = "A comprehensive toolkit that provides building blocks for LLM-based named entity recognition, attribute extraction, and relation extraction pipelines."
|
|
5
5
|
authors = ["Enshuo (David) Hsu"]
|
|
6
6
|
license = "MIT"
|
|
@@ -1,11 +1,12 @@
|
|
|
1
1
|
from .data_types import LLMInformationExtractionFrame, LLMInformationExtractionDocument
|
|
2
|
-
from .engines import BasicLLMConfig, ReasoningLLMConfig, Qwen3LLMConfig, OpenAIReasoningLLMConfig
|
|
2
|
+
from .engines import BasicLLMConfig, ReasoningLLMConfig, Qwen3LLMConfig, OpenAIReasoningLLMConfig
|
|
3
|
+
from .engines import LlamaCppInferenceEngine, OllamaInferenceEngine, HuggingFaceHubInferenceEngine, VLLMInferenceEngine, SGLangInferenceEngine, OpenRouterInferenceEngine, OpenAIInferenceEngine, AzureOpenAIInferenceEngine, LiteLLMInferenceEngine
|
|
3
4
|
from .extractors import DirectFrameExtractor, ReviewFrameExtractor, BasicFrameExtractor, BasicReviewFrameExtractor, SentenceFrameExtractor, SentenceReviewFrameExtractor, AttributeExtractor, BinaryRelationExtractor, MultiClassRelationExtractor
|
|
4
|
-
from .chunkers import UnitChunker, WholeDocumentUnitChunker, SentenceUnitChunker, TextLineUnitChunker, ContextChunker, NoContextChunker, WholeDocumentContextChunker, SlideWindowContextChunker
|
|
5
|
+
from .chunkers import UnitChunker, WholeDocumentUnitChunker, SentenceUnitChunker, TextLineUnitChunker, SeparatorUnitChunker, LLMUnitChunker, ContextChunker, NoContextChunker, WholeDocumentContextChunker, SlideWindowContextChunker
|
|
5
6
|
from .prompt_editor import PromptEditor
|
|
6
7
|
|
|
7
8
|
__all__ = ["LLMInformationExtractionFrame", "LLMInformationExtractionDocument",
|
|
8
|
-
"BasicLLMConfig", "ReasoningLLMConfig", "Qwen3LLMConfig", "OpenAIReasoningLLMConfig", "LlamaCppInferenceEngine", "OllamaInferenceEngine", "HuggingFaceHubInferenceEngine", "OpenAIInferenceEngine", "AzureOpenAIInferenceEngine", "LiteLLMInferenceEngine",
|
|
9
|
+
"BasicLLMConfig", "ReasoningLLMConfig", "Qwen3LLMConfig", "OpenAIReasoningLLMConfig", "LlamaCppInferenceEngine", "OllamaInferenceEngine", "HuggingFaceHubInferenceEngine", "VLLMInferenceEngine", "SGLangInferenceEngine", "OpenRouterInferenceEngine", "OpenAIInferenceEngine", "AzureOpenAIInferenceEngine", "LiteLLMInferenceEngine",
|
|
9
10
|
"DirectFrameExtractor", "ReviewFrameExtractor", "BasicFrameExtractor", "BasicReviewFrameExtractor", "SentenceFrameExtractor", "SentenceReviewFrameExtractor", "AttributeExtractor", "BinaryRelationExtractor", "MultiClassRelationExtractor",
|
|
10
|
-
"UnitChunker", "WholeDocumentUnitChunker", "SentenceUnitChunker", "TextLineUnitChunker", "ContextChunker", "NoContextChunker", "WholeDocumentContextChunker", "SlideWindowContextChunker",
|
|
11
|
+
"UnitChunker", "WholeDocumentUnitChunker", "SentenceUnitChunker", "TextLineUnitChunker", "SeparatorUnitChunker", "LLMUnitChunker", "ContextChunker", "NoContextChunker", "WholeDocumentContextChunker", "SlideWindowContextChunker",
|
|
11
12
|
"PromptEditor"]
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
### Task description
|
|
2
|
+
You are a helpful assistant that breaks down a text document into semantic units or chunks. Each chunk should represent a coherent section of the text, such as a paragraph, subsection, or topic.
|
|
3
|
+
|
|
4
|
+
### Schema definition
|
|
5
|
+
You will output a JSON array of objects. Each object should have the following fields:
|
|
6
|
+
- "title": Generate a brief title summarizing the content of the chunk.
|
|
7
|
+
- "anchor_text": the first line of text in the chunk used to locate it in the original document. Must be an exact match.
|
|
8
|
+
- if there is a title or heading for the chunk, use that as the anchor_text.
|
|
9
|
+
- if there is no title or heading, use the first sentence of the chunk as the anchor_text.
|
|
10
|
+
|
|
11
|
+
```JSON
|
|
12
|
+
[
|
|
13
|
+
{
|
|
14
|
+
"title": "<your title here>",
|
|
15
|
+
"anchor_text": "<the anchor text of the chunk here>"
|
|
16
|
+
},
|
|
17
|
+
{
|
|
18
|
+
"title": "<your title here>",
|
|
19
|
+
"anchor_text": "<the anchor text of the chunk here>"
|
|
20
|
+
}
|
|
21
|
+
]
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
### Examples
|
|
25
|
+
|
|
26
|
+
**Input**:
|
|
27
|
+
"# Clinical Note
|
|
28
|
+
|
|
29
|
+
**Patient Name**: Michael Green
|
|
30
|
+
**Medical Record Number**: 1122334455
|
|
31
|
+
**Date of Visit**: January 5, 2025
|
|
32
|
+
**Provider**: Dr. Emily Carter, MD
|
|
33
|
+
|
|
34
|
+
## Reason for Visit
|
|
35
|
+
Follow-up for poorly controlled type 2 diabetes and complaints of occasional dizziness and blurred vision.
|
|
36
|
+
|
|
37
|
+
## Summary of Visit
|
|
38
|
+
Michael Green, a 62-year-old male with a known history of type 2 diabetes, hypertension, and obesity, presents for follow-up regarding his glycemic control. Despite recent adjustments to his treatment plan, his glucose readings have remained elevated, averaging 180-220 mg/dL. He reports occasional episodes of dizziness and blurred vision, particularly in the morning before meals. He denies chest pain, palpitations, or recent falls. He reports compliance with his medication regimen but admits to difficulty following a consistent low-carbohydrate diet.
|
|
39
|
+
|
|
40
|
+
Michael has been using a glucose meter to monitor his blood sugar levels and logs them daily. His last hemoglobin A1c, performed three months ago, was 9.2%. He reports no recent hospitalizations, infections, or significant stressors.
|
|
41
|
+
|
|
42
|
+
## Notable History
|
|
43
|
+
- **Chronic Conditions**:
|
|
44
|
+
- Type 2 diabetes mellitus, diagnosed 10 years ago.
|
|
45
|
+
- Hypertension, well-controlled on medication.
|
|
46
|
+
- Hyperlipidemia, on statin therapy.
|
|
47
|
+
- **Past Surgical History**:
|
|
48
|
+
- Knee arthroscopy for a meniscal tear, age 50.
|
|
49
|
+
- **Family History**:
|
|
50
|
+
- Mother: Deceased at 75, complications from diabetes.
|
|
51
|
+
- Father: Deceased at 70, myocardial infarction."
|
|
52
|
+
|
|
53
|
+
**Output**:
|
|
54
|
+
```JSON
|
|
55
|
+
[
|
|
56
|
+
{
|
|
57
|
+
"title": "Patient Information",
|
|
58
|
+
"anchor_text": "# Clinical Note"
|
|
59
|
+
},
|
|
60
|
+
{
|
|
61
|
+
"title": "Reason for Visit",
|
|
62
|
+
"anchor_text": "## Reason for Visit"
|
|
63
|
+
},
|
|
64
|
+
{
|
|
65
|
+
"title": "Summary of Visit",
|
|
66
|
+
"anchor_text": "## Summary of Visit"
|
|
67
|
+
},
|
|
68
|
+
{
|
|
69
|
+
"title": "Notable History",
|
|
70
|
+
"anchor_text": "## Notable History"
|
|
71
|
+
}
|
|
72
|
+
]
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
**Input**:
|
|
76
|
+
"In the [**Hospital1 18**] ED, 35.3 102 133/58 100%AC 500x20, 5, 1.0 with an
|
|
77
|
+
ABG 7.16/66/162. He had a CTH which was unremarkable. He then
|
|
78
|
+
had a CTA chest, afterwhich he went into PEA arrest.
|
|
79
|
+
Rescucitation last approximately 10-15 minutes with multiple
|
|
80
|
+
rounds of epi and bicarb, with ROSC. He was then admitted to the
|
|
81
|
+
MICU for further management.
|
|
82
|
+
.
|
|
83
|
+
Currently, the patient is intubated, sedated, and parlyzed.
|
|
84
|
+
|
|
85
|
+
Past Medical History:
|
|
86
|
+
Asthma
|
|
87
|
+
Dilated cardiomyopathy
|
|
88
|
+
Multiple admissions for dyspnea this winter ([**1-26**]).
|
|
89
|
+
Anxiety/depression
|
|
90
|
+
CKD
|
|
91
|
+
HLD
|
|
92
|
+
Obesity
|
|
93
|
+
HTN
|
|
94
|
+
|
|
95
|
+
Social History:
|
|
96
|
+
Unknown
|
|
97
|
+
|
|
98
|
+
Family History:
|
|
99
|
+
Unknown"
|
|
100
|
+
|
|
101
|
+
**Output**:
|
|
102
|
+
```JSON
|
|
103
|
+
[
|
|
104
|
+
{
|
|
105
|
+
"title": "Patient Presentation and Initial Management",
|
|
106
|
+
"anchor_text": "In the [**Hospital1 18**] ED, 35.3 102 133/58 100%AC 500x20, 5, 1.0 with an"
|
|
107
|
+
},
|
|
108
|
+
{
|
|
109
|
+
"title": "Current Status of the Patient",
|
|
110
|
+
"anchor_text": "Currently, the patient is intubated, sedated, and parlyzed."
|
|
111
|
+
},
|
|
112
|
+
{
|
|
113
|
+
"title": "Past Medical History",
|
|
114
|
+
"anchor_text": "Past Medical History:"
|
|
115
|
+
},
|
|
116
|
+
{
|
|
117
|
+
"title": "Social History",
|
|
118
|
+
"anchor_text": "Social History:"
|
|
119
|
+
},
|
|
120
|
+
{
|
|
121
|
+
"title": "Family History",
|
|
122
|
+
"anchor_text": "Family History:"
|
|
123
|
+
}
|
|
124
|
+
]
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
### Document text
|
|
128
|
+
|
|
129
|
+
"{{document_text}}"
|
|
@@ -1,6 +1,11 @@
|
|
|
1
1
|
import abc
|
|
2
|
-
from typing import
|
|
2
|
+
from typing import List
|
|
3
|
+
import asyncio
|
|
4
|
+
import uuid
|
|
5
|
+
import importlib.resources
|
|
6
|
+
from llm_ie.utils import extract_json, apply_prompt_template
|
|
3
7
|
from llm_ie.data_types import FrameExtractionUnit
|
|
8
|
+
from llm_ie.engines import InferenceEngine
|
|
4
9
|
|
|
5
10
|
|
|
6
11
|
class UnitChunker(abc.ABC):
|
|
@@ -11,7 +16,8 @@ class UnitChunker(abc.ABC):
|
|
|
11
16
|
"""
|
|
12
17
|
pass
|
|
13
18
|
|
|
14
|
-
|
|
19
|
+
@abc.abstractmethod
|
|
20
|
+
def chunk(self, text:str, doc_id:str=None) -> List[FrameExtractionUnit]:
|
|
15
21
|
"""
|
|
16
22
|
Parameters:
|
|
17
23
|
----------
|
|
@@ -20,6 +26,12 @@ class UnitChunker(abc.ABC):
|
|
|
20
26
|
"""
|
|
21
27
|
return NotImplemented
|
|
22
28
|
|
|
29
|
+
async def chunk_async(self, text:str, doc_id:str=None, executor=None) -> List[FrameExtractionUnit]:
    """
    Awaitable counterpart of chunk.

    Hands self.chunk(text, doc_id) to the running event loop's
    run_in_executor and awaits whatever it returns.

    Parameters:
    ----------
    text : str
        The document text.
    doc_id : str, optional
        The document id, forwarded unchanged to chunk.
    executor : optional
        Forwarded as the first argument of loop.run_in_executor.
    """
    return await asyncio.get_running_loop().run_in_executor(executor, self.chunk, text, doc_id)
|
|
23
35
|
|
|
24
36
|
class WholeDocumentUnitChunker(UnitChunker):
|
|
25
37
|
def __init__(self):
|
|
@@ -28,7 +40,7 @@ class WholeDocumentUnitChunker(UnitChunker):
|
|
|
28
40
|
"""
|
|
29
41
|
super().__init__()
|
|
30
42
|
|
|
31
|
-
def chunk(self, text:str) -> List[FrameExtractionUnit]:
|
|
43
|
+
def chunk(self, text:str, doc_id:str=None) -> List[FrameExtractionUnit]:
|
|
32
44
|
"""
|
|
33
45
|
Parameters:
|
|
34
46
|
----------
|
|
@@ -36,6 +48,7 @@ class WholeDocumentUnitChunker(UnitChunker):
|
|
|
36
48
|
The document text.
|
|
37
49
|
"""
|
|
38
50
|
return [FrameExtractionUnit(
|
|
51
|
+
doc_id=doc_id if doc_id is not None else str(uuid.uuid4()),
|
|
39
52
|
start=0,
|
|
40
53
|
end=len(text),
|
|
41
54
|
text=text
|
|
@@ -57,19 +70,21 @@ class SeparatorUnitChunker(UnitChunker):
|
|
|
57
70
|
|
|
58
71
|
self.sep = sep
|
|
59
72
|
|
|
60
|
-
def chunk(self, text:str) -> List[FrameExtractionUnit]:
|
|
73
|
+
def chunk(self, text:str, doc_id:str=None) -> List[FrameExtractionUnit]:
|
|
61
74
|
"""
|
|
62
75
|
Parameters:
|
|
63
76
|
----------
|
|
64
77
|
text : str
|
|
65
78
|
The document text.
|
|
66
79
|
"""
|
|
80
|
+
doc_id = doc_id if doc_id is not None else str(uuid.uuid4())
|
|
67
81
|
paragraphs = text.split(self.sep)
|
|
68
82
|
paragraph_units = []
|
|
69
83
|
start = 0
|
|
70
84
|
for paragraph in paragraphs:
|
|
71
85
|
end = start + len(paragraph)
|
|
72
86
|
paragraph_units.append(FrameExtractionUnit(
|
|
87
|
+
doc_id=doc_id,
|
|
73
88
|
start=start,
|
|
74
89
|
end=end,
|
|
75
90
|
text=paragraph
|
|
@@ -77,6 +92,7 @@ class SeparatorUnitChunker(UnitChunker):
|
|
|
77
92
|
start = end + len(self.sep)
|
|
78
93
|
return paragraph_units
|
|
79
94
|
|
|
95
|
+
|
|
80
96
|
class SentenceUnitChunker(UnitChunker):
|
|
81
97
|
from nltk.tokenize.punkt import PunktSentenceTokenizer
|
|
82
98
|
def __init__(self):
|
|
@@ -85,16 +101,18 @@ class SentenceUnitChunker(UnitChunker):
|
|
|
85
101
|
"""
|
|
86
102
|
super().__init__()
|
|
87
103
|
|
|
88
|
-
def chunk(self, text:str) -> List[FrameExtractionUnit]:
|
|
104
|
+
def chunk(self, text:str, doc_id:str=None) -> List[FrameExtractionUnit]:
|
|
89
105
|
"""
|
|
90
106
|
Parameters:
|
|
91
107
|
----------
|
|
92
108
|
text : str
|
|
93
109
|
The document text.
|
|
94
110
|
"""
|
|
111
|
+
doc_id = doc_id if doc_id is not None else str(uuid.uuid4())
|
|
95
112
|
sentences = []
|
|
96
113
|
for start, end in self.PunktSentenceTokenizer().span_tokenize(text):
|
|
97
114
|
sentences.append(FrameExtractionUnit(
|
|
115
|
+
doc_id=doc_id,
|
|
98
116
|
start=start,
|
|
99
117
|
end=end,
|
|
100
118
|
text=text[start:end]
|
|
@@ -109,19 +127,21 @@ class TextLineUnitChunker(UnitChunker):
|
|
|
109
127
|
"""
|
|
110
128
|
super().__init__()
|
|
111
129
|
|
|
112
|
-
def chunk(self, text:str) -> List[FrameExtractionUnit]:
|
|
130
|
+
def chunk(self, text:str, doc_id:str=None) -> List[FrameExtractionUnit]:
|
|
113
131
|
"""
|
|
114
132
|
Parameters:
|
|
115
133
|
----------
|
|
116
134
|
text : str
|
|
117
135
|
The document text.
|
|
118
136
|
"""
|
|
137
|
+
doc_id = doc_id if doc_id is not None else str(uuid.uuid4())
|
|
119
138
|
lines = text.split('\n')
|
|
120
139
|
line_units = []
|
|
121
140
|
start = 0
|
|
122
141
|
for line in lines:
|
|
123
142
|
end = start + len(line)
|
|
124
143
|
line_units.append(FrameExtractionUnit(
|
|
144
|
+
doc_id=doc_id,
|
|
125
145
|
start=start,
|
|
126
146
|
end=end,
|
|
127
147
|
text=line
|
|
@@ -129,6 +149,100 @@ class TextLineUnitChunker(UnitChunker):
|
|
|
129
149
|
start = end + 1
|
|
130
150
|
return line_units
|
|
131
151
|
|
|
152
|
+
class LLMUnitChunker(UnitChunker):
    def __init__(self, inference_engine:InferenceEngine, prompt_template:str=None, system_prompt:str=None):
        """
        This class prompts an LLM for document segmentation (e.g., sections, paragraphs).

        Parameters:
        ----------
        inference_engine : InferenceEngine
            the LLM inferencing engine object.
        prompt_template : str, optional
            the prompt template that defines how to chunk the document. Must define a JSON schema with
            ```json
            [
                {
                    "title": "<your title here>",
                    "anchor_text": "<the anchor text of the chunk here>"
                },
                {
                    "title": "<your title here>",
                    "anchor_text": "<the anchor text of the chunk here>"
                }
            ]
            ```
            When None, the packaged "LLMUnitChunker_user_prompt.txt" is loaded.
        system_prompt : str, optional
            The system prompt.
        """
        # Same base-class initialization as the other unit chunkers.
        super().__init__()
        self.inference_engine = inference_engine

        if prompt_template is None:
            # Read through the Traversable API instead of open(): the resource
            # is not guaranteed to be a plain file-system path.
            resource = importlib.resources.files('llm_ie.asset.default_prompts').joinpath("LLMUnitChunker_user_prompt.txt")
            self.prompt_template = resource.read_text(encoding="utf-8")
        else:
            self.prompt_template = prompt_template

        self.system_prompt = system_prompt

    def chunk(self, text, doc_id=None) -> List[FrameExtractionUnit]:
        """
        Ask the LLM for section anchors and cut the document at each anchor.

        The LLM response is parsed into a list of headers. For every header
        whose "anchor_text" is found in the remaining text, a unit is emitted
        that spans from the end of the previous unit up to the position where
        that anchor starts. A final unit covers whatever remains after the
        last cut. Headers that are malformed, whose anchor is not found, or
        whose anchor sits exactly at the current cut position are skipped.

        Parameters:
        -----------
        text : str
            the document text.
        doc_id : str, optional
            the document id. A random UUID4 string is used when None.

        Returns : List[FrameExtractionUnit]
            the units in document order, all sharing the same doc_id.
        """
        # Local import: the module-level import block is outside this class.
        import warnings

        doc_id = doc_id if doc_id is not None else str(uuid.uuid4())
        user_prompt = apply_prompt_template(prompt_template=self.prompt_template, text_content=text)
        messages = []
        if self.system_prompt is not None:
            messages.append({'role': 'system', 'content': self.system_prompt})
        messages.append({'role': 'user', 'content': user_prompt})

        gen_text = self.inference_engine.chat(messages=messages)

        header_list = extract_json(gen_text=gen_text["response"])
        units = []
        prev_end = 0
        for header in header_list:
            # Warning.warn does not exist on the builtin Warning class;
            # warnings.warn is the call that actually emits the warning.
            if not isinstance(header, dict) or "anchor_text" not in header:
                warnings.warn(f"Missing anchor_text in header: {header}. Skipping this header.")
                continue
            if not isinstance(header["anchor_text"], str):
                warnings.warn(f"Invalid anchor_text: {header['anchor_text']}. Skipping this header.")
                continue

            start = prev_end
            # find the first instance of the anchor text in the rest of the text
            end = text.find(header["anchor_text"], start)
            # if not found, skip this header
            if end == -1:
                continue
            # if start == end (empty text), skip this header
            if start == end:
                continue
            # create a frame extraction unit
            units.append(FrameExtractionUnit(
                doc_id=doc_id,
                start=start,
                end=end,
                text=text[start:end]
            ))
            prev_end = end
        # add the last section
        if prev_end < len(text):
            units.append(FrameExtractionUnit(
                doc_id=doc_id,
                start=prev_end,
                end=len(text),
                text=text[prev_end:]
            ))
        return units
|
|
245
|
+
|
|
132
246
|
|
|
133
247
|
class ContextChunker(abc.ABC):
|
|
134
248
|
def __init__(self):
|
|
@@ -138,6 +252,24 @@ class ContextChunker(abc.ABC):
|
|
|
138
252
|
"""
|
|
139
253
|
pass
|
|
140
254
|
|
|
255
|
+
@abc.abstractmethod
def fit(self, text:str, units:List[FrameExtractionUnit]):
    """
    Abstract hook that receives the document and its units.

    The base implementation does nothing and returns None; concrete
    context chunkers must override it.

    Parameters:
    ----------
    text : str
        The document text.
    units : List[FrameExtractionUnit]
        The units of the document.
    """
    return None
|
|
264
|
+
|
|
265
|
+
async def fit_async(self, text:str, units:List[FrameExtractionUnit], executor=None):
    """
    Awaitable counterpart of fit.

    Hands self.fit(text, units) to the running event loop's
    run_in_executor and awaits whatever it returns.

    Parameters:
    ----------
    text : str
        The document text.
    units : List[FrameExtractionUnit]
        The units of the document, forwarded unchanged to fit.
    executor : optional
        Forwarded as the first argument of loop.run_in_executor.
    """
    return await asyncio.get_running_loop().run_in_executor(executor, self.fit, text, units)
|
|
271
|
+
|
|
272
|
+
@abc.abstractmethod
|
|
141
273
|
def chunk(self, unit:FrameExtractionUnit) -> str:
|
|
142
274
|
"""
|
|
143
275
|
Parameters:
|
|
@@ -150,6 +282,13 @@ class ContextChunker(abc.ABC):
|
|
|
150
282
|
"""
|
|
151
283
|
return NotImplemented
|
|
152
284
|
|
|
285
|
+
async def chunk_async(self, unit:FrameExtractionUnit, executor=None) -> str:
    """
    Awaitable counterpart of chunk.

    Hands self.chunk(unit) to the running event loop's run_in_executor
    and awaits whatever it returns.

    Parameters:
    ----------
    unit : FrameExtractionUnit
        The unit forwarded unchanged to chunk.
    executor : optional
        Forwarded as the first argument of loop.run_in_executor.
    """
    return await asyncio.get_running_loop().run_in_executor(executor, self.chunk, unit)
|
|
291
|
+
|
|
153
292
|
|
|
154
293
|
class NoContextChunker(ContextChunker):
|
|
155
294
|
def __init__(self):
|
|
@@ -7,13 +7,15 @@ import json
|
|
|
7
7
|
|
|
8
8
|
@dataclass
|
|
9
9
|
class FrameExtractionUnit:
|
|
10
|
-
def __init__(self, start:int, end:int, text:str):
|
|
10
|
+
def __init__(self, doc_id:str, start:int, end:int, text:str):
|
|
11
11
|
"""
|
|
12
12
|
This class holds the unit text for frame extraction, for example, a sentence.
|
|
13
13
|
FrameExtractor prompt it one at a time to extract frames.
|
|
14
14
|
|
|
15
15
|
Parameters
|
|
16
16
|
----------
|
|
17
|
+
doc_id : str, Optional
|
|
18
|
+
document ID.
|
|
17
19
|
start : int
|
|
18
20
|
start character position of the unit text, relative to the whole document
|
|
19
21
|
end : int
|
|
@@ -21,9 +23,28 @@ class FrameExtractionUnit:
|
|
|
21
23
|
text : str
|
|
22
24
|
the unit text. Should be the exact string by [start:end]
|
|
23
25
|
"""
|
|
26
|
+
self.doc_id = doc_id
|
|
24
27
|
self.start = start
|
|
25
28
|
self.end = end
|
|
26
29
|
self.text = text
|
|
30
|
+
# status: "pending", "success", "fail"
|
|
31
|
+
self.status = "pending"
|
|
32
|
+
# generated text by LLM
|
|
33
|
+
self.gen_text = None
|
|
34
|
+
|
|
35
|
+
def get_status(self) -> str:
    """
    Return the current value of the status attribute.
    """
    current = self.status
    return current
|
|
37
|
+
|
|
38
|
+
def set_status(self, status:str):
    """
    Store a new status after validating it.

    Parameters:
    ----------
    status : str
        must be one of "pending", "success", "fail".

    Raises ValueError for any other value; the stored status is left
    unchanged in that case.
    """
    valid_states = {"pending", "success", "fail"}
    if status in valid_states:
        self.status = status
        return
    raise ValueError('status must be one of {"pending", "success", "fail"}.')
|
|
42
|
+
|
|
43
|
+
def get_generated_text(self) -> str:
    """
    Return the current value of the gen_text attribute.
    """
    generated = self.gen_text
    return generated
|
|
45
|
+
|
|
46
|
+
def set_generated_text(self, gen_text:str):
    """
    Store gen_text on the unit, replacing any previous value.

    Parameters:
    ----------
    gen_text : str
        the generated text to store. No validation is applied.
    """
    setattr(self, "gen_text", gen_text)
|
|
27
48
|
|
|
28
49
|
def __eq__(self, other):
|
|
29
50
|
if not isinstance(other, FrameExtractionUnit):
|
|
@@ -39,43 +60,8 @@ class FrameExtractionUnit:
|
|
|
39
60
|
return self.start < other.start
|
|
40
61
|
|
|
41
62
|
def __repr__(self):
|
|
42
|
-
return f"FrameExtractionUnit(start={self.start}, end={self.end}, text='{self.text[:100]}...')"
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
@dataclass
|
|
46
|
-
class FrameExtractionUnitResult:
|
|
47
|
-
def __init__(self, start:int, end:int, text:str, gen_text:str):
|
|
48
|
-
"""
|
|
49
|
-
This class holds the unit text for frame extraction, for example, a sentence.
|
|
50
|
-
FrameExtractor prompt it one at a time to extract frames.
|
|
51
|
-
|
|
52
|
-
Parameters
|
|
53
|
-
----------
|
|
54
|
-
start : int
|
|
55
|
-
start character position of the unit text, relative to the whole document
|
|
56
|
-
end : int
|
|
57
|
-
end character position of the unit text, relative to the whole document
|
|
58
|
-
text : str
|
|
59
|
-
the unit text. Should be the exact string by [start:end]
|
|
60
|
-
gen_text : str
|
|
61
|
-
the generated text by LLM (ideally) following '[{"entity_text": "xxx", "attr": {"key": "value"}}]' format. Does not contain spans (start/end).
|
|
62
|
-
"""
|
|
63
|
-
self.start = start
|
|
64
|
-
self.end = end
|
|
65
|
-
self.text = text
|
|
66
|
-
self.gen_text = gen_text
|
|
67
|
-
|
|
68
|
-
def __eq__(self, other):
|
|
69
|
-
if not isinstance(other, FrameExtractionUnit):
|
|
70
|
-
return NotImplemented
|
|
71
|
-
return (self.start == other.start and self.end == other.end and self.text == other.text and self.gen_text == other.gen_text)
|
|
63
|
+
return f"FrameExtractionUnit(doc_id={self.doc_id}, start={self.start}, end={self.end}, status={self.status}, text='{self.text[:100]}...')"
|
|
72
64
|
|
|
73
|
-
def __hash__(self):
|
|
74
|
-
return hash((self.start, self.end, self.text, self.gen_text))
|
|
75
|
-
|
|
76
|
-
def __repr__(self):
|
|
77
|
-
return f"FrameExtractionUnitResult(start={self.start}, end={self.end}, text='{self.text[:100]}...', gen_text='{self.gen_text[:100]}...')"
|
|
78
|
-
|
|
79
65
|
|
|
80
66
|
@dataclass
|
|
81
67
|
class LLMInformationExtractionFrame:
|