llm-ie 0.4.7__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llm_ie/__init__.py +6 -4
- llm_ie/asset/default_prompts/BasicReviewFrameExtractor_addition_review_prompt.txt +3 -0
- llm_ie/asset/default_prompts/BasicReviewFrameExtractor_revision_review_prompt.txt +2 -0
- llm_ie/asset/default_prompts/ReviewFrameExtractor_addition_review_prompt.txt +2 -1
- llm_ie/asset/default_prompts/ReviewFrameExtractor_revision_review_prompt.txt +2 -1
- llm_ie/asset/prompt_guide/BasicFrameExtractor_prompt_guide.txt +104 -86
- llm_ie/asset/prompt_guide/BasicReviewFrameExtractor_prompt_guide.txt +163 -0
- llm_ie/asset/prompt_guide/DirectFrameExtractor_prompt_guide.txt +163 -0
- llm_ie/asset/prompt_guide/ReviewFrameExtractor_prompt_guide.txt +103 -85
- llm_ie/asset/prompt_guide/SentenceFrameExtractor_prompt_guide.txt +103 -86
- llm_ie/asset/prompt_guide/SentenceReviewFrameExtractor_prompt_guide.txt +103 -86
- llm_ie/chunkers.py +191 -0
- llm_ie/data_types.py +75 -1
- llm_ie/engines.py +600 -262
- llm_ie/extractors.py +859 -899
- llm_ie/prompt_editor.py +45 -12
- llm_ie-1.1.0.dist-info/METADATA +18 -0
- llm_ie-1.1.0.dist-info/RECORD +27 -0
- llm_ie/asset/prompt_guide/SentenceCoTFrameExtractor_prompt_guide.txt +0 -217
- llm_ie-0.4.7.dist-info/METADATA +0 -1219
- llm_ie-0.4.7.dist-info/RECORD +0 -23
- {llm_ie-0.4.7.dist-info → llm_ie-1.1.0.dist-info}/WHEEL +0 -0
llm_ie/chunkers.py
ADDED
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
import abc
|
|
2
|
+
from typing import Set, List, Dict, Tuple, Union, Callable
|
|
3
|
+
from llm_ie.data_types import FrameExtractionUnit
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class UnitChunker(abc.ABC):
    def __init__(self):
        """
        This is the abstract class for frame extraction unit chunker.
        It chunks a document into units (e.g., sentences). LLMs process unit by unit.
        """
        pass

    @abc.abstractmethod
    def chunk(self, text:str) -> List["FrameExtractionUnit"]:
        """
        Split a document into frame extraction units. Every concrete chunker
        must override this method.

        Parameters:
        ----------
        text : str
            The document text.

        Return : List[FrameExtractionUnit]
            The units the document was split into.

        Raises:
        ------
        NotImplementedError
            If a subclass delegates to this base implementation.
        """
        # NotImplemented is the sentinel for binary-operator dunders; returning it here
        # would hand callers a non-list. Fail loudly instead.
        raise NotImplementedError(f"{type(self).__name__} must implement chunk().")
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class WholeDocumentUnitChunker(UnitChunker):
    def __init__(self):
        """
        Unit chunker that performs no splitting: the entire document is one unit.
        """
        super().__init__()

    def chunk(self, text:str) -> List[FrameExtractionUnit]:
        """
        Wrap the whole document in a single frame extraction unit.

        Parameters:
        ----------
        text : str
            The document text.

        Return : List[FrameExtractionUnit]
            A one-element list whose unit spans [0, len(text)).
        """
        whole_document = FrameExtractionUnit(start=0, end=len(text), text=text)
        return [whole_document]
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class SentenceUnitChunker(UnitChunker):
    # Imported in the class body, so the tokenizer class is reachable as
    # self.PunktSentenceTokenizer.
    from nltk.tokenize.punkt import PunktSentenceTokenizer

    def __init__(self):
        """
        Unit chunker that splits a document into sentences with the
        NLTK PunktSentenceTokenizer.
        """
        super().__init__()

    def chunk(self, text:str) -> List[FrameExtractionUnit]:
        """
        Split the document into one unit per tokenizer-reported sentence span.

        Parameters:
        ----------
        text : str
            The document text.

        Return : List[FrameExtractionUnit]
            One unit per (start, end) span, with text equal to text[start:end].
        """
        tokenizer = self.PunktSentenceTokenizer()
        return [
            FrameExtractionUnit(start=begin, end=stop, text=text[begin:stop])
            for begin, stop in tokenizer.span_tokenize(text)
        ]
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class TextLineUnitChunker(UnitChunker):
    def __init__(self):
        """
        Unit chunker that treats every line of the document as one unit.
        """
        super().__init__()

    def chunk(self, text:str) -> List[FrameExtractionUnit]:
        """
        Split the document on '\\n' and emit one unit per line.

        Parameters:
        ----------
        text : str
            The document text.

        Return : List[FrameExtractionUnit]
            One unit per line (empty lines included); the newline character
            itself is not part of any unit.
        """
        units = []
        offset = 0
        for segment in text.split('\n'):
            stop = offset + len(segment)
            unit = FrameExtractionUnit(start=offset, end=stop, text=segment)
            units.append(unit)
            # Step over the newline that separated this line from the next.
            offset = stop + 1
        return units
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
class ContextChunker(abc.ABC):
    def __init__(self):
        """
        This is the abstract class for context chunker. Given a frame extraction unit,
        it returns the context for it.
        """
        pass

    def fit(self, text:str, units:List["FrameExtractionUnit"]):
        """
        Prepare the chunker for a document before chunk() is called.
        The default implementation keeps no state and does nothing;
        subclasses that need the document or its units override it.

        Parameters:
        ----------
        text : str
            The document text.
        units : List[FrameExtractionUnit]
            The frame extraction units of the document.
        """
        pass

    @abc.abstractmethod
    def chunk(self, unit:"FrameExtractionUnit") -> str:
        """
        Build the context for one frame extraction unit. Every concrete
        context chunker must override this method.

        Parameters:
        ----------
        unit : FrameExtractionUnit
            The frame extraction unit.

        Return : str
            The context for the frame extraction unit.

        Raises:
        ------
        NotImplementedError
            If a subclass delegates to this base implementation.
        """
        # NotImplemented is the sentinel for binary-operator dunders, not a way to
        # signal a missing override. Fail loudly instead of returning a non-string.
        raise NotImplementedError(f"{type(self).__name__} must implement chunk().")
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
class NoContextChunker(ContextChunker):
    def __init__(self):
        """
        Context chunker that supplies an empty context for every unit.
        """
        super().__init__()

    def fit(self, text:str, units:List[FrameExtractionUnit]):
        """
        Nothing is stored; both arguments are ignored.

        Parameters:
        ----------
        text : str
            The document text.
        units : List[FrameExtractionUnit]
            The list of frame extraction units.
        """
        return None

    def chunk(self, unit:FrameExtractionUnit) -> str:
        """
        Parameters:
        ----------
        unit : FrameExtractionUnit
            The frame extraction unit (ignored).

        Return : str
            Always the empty string.
        """
        empty_context = ""
        return empty_context
|
|
137
|
+
|
|
138
|
+
class WholeDocumentContextChunker(ContextChunker):
    def __init__(self):
        """
        Context chunker that returns the full document as the context of every unit.
        """
        super().__init__()
        # Stays None until fit() stores the document.
        self.text = None

    def fit(self, text:str, units:List[FrameExtractionUnit]):
        """
        Remember the document so chunk() can return it.

        Parameters:
        ----------
        text : str
            The document text.
        units : List[FrameExtractionUnit]
            The list of frame extraction units (not used here).
        """
        self.text = text

    def chunk(self, unit:FrameExtractionUnit) -> str:
        """
        Parameters:
        ----------
        unit : FrameExtractionUnit
            The frame extraction unit (the same context is returned for any unit).

        Return : str
            The document text stored by fit().

        Raises:
        ------
        ValueError
            If fit() has not stored a document yet.
        """
        if self.text is not None:
            return self.text
        raise ValueError("The context chunker has not been fitted yet. Please call fit() before chunk().")
|
|
159
|
+
|
|
160
|
+
class SlideWindowContextChunker(ContextChunker):
    def __init__(self, window_size:int):
        """
        Context chunker that returns a sliding window of neighbouring units,
        for example +-2 sentences around a unit sentence.

        Parameters:
        ----------
        window_size : int
            Number of units taken on each side of the target unit.
        """
        super().__init__()
        self.window_size = window_size
        # Stays None until fit() stores the sorted units.
        self.units = None

    def fit(self, text:str, units:List[FrameExtractionUnit]):
        """
        Store a sorted copy of the units; the window is taken over this order.

        Parameters:
        ----------
        text : str
            The document text (not used here).
        units : List[FrameExtractionUnit]
            The list of frame extraction units.
        """
        ordered_units = list(units)
        ordered_units.sort()
        self.units = ordered_units

    def chunk(self, unit:FrameExtractionUnit) -> str:
        """
        Parameters:
        ----------
        unit : FrameExtractionUnit
            The frame extraction unit to build the context for.

        Return : str
            The texts of the units inside the window (target unit included),
            joined by single spaces.

        Raises:
        ------
        ValueError
            If fit() has not been called, or if the unit is not among the fitted units.
        """
        if self.units is None:
            raise ValueError("The context chunker has not been fitted yet. Please call fit() before chunk().")

        position = self.units.index(unit)
        # Clamp the window to the bounds of the fitted units.
        first = max(0, position - self.window_size)
        past_last = min(len(self.units), position + self.window_size + 1)
        return " ".join(self.units[i].text for i in range(first, past_last))
|
|
191
|
+
|
llm_ie/data_types.py
CHANGED
|
@@ -1,9 +1,83 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
1
2
|
from typing import List, Dict, Tuple, Iterable, Callable
|
|
2
3
|
import importlib.util
|
|
3
4
|
import warnings
|
|
4
5
|
import json
|
|
5
6
|
|
|
6
7
|
|
|
8
|
+
@dataclass
class FrameExtractionUnit:
    def __init__(self, start:int, end:int, text:str):
        """
        A unit of text for frame extraction, for example, a sentence.
        FrameExtractor prompts one unit at a time to extract frames.

        Parameters
        ----------
        start : int
            start character position of the unit text, relative to the whole document
        end : int
            end character position of the unit text, relative to the whole document
        text : str
            the unit text. Should be the exact string by [start:end]
        """
        self.start, self.end, self.text = start, end, text

    def __eq__(self, other):
        # Units are identified by their span only; the text is not compared.
        if isinstance(other, FrameExtractionUnit):
            return self.start == other.start and self.end == other.end
        return NotImplemented

    def __hash__(self):
        # Hash over the same fields __eq__ compares.
        span = (self.start, self.end)
        return hash(span)

    def __lt__(self, other):
        # Order units by start position.
        if isinstance(other, FrameExtractionUnit):
            return self.start < other.start
        return NotImplemented

    def __repr__(self):
        # Show at most the first 100 characters of the text.
        preview = self.text[:100]
        return f"FrameExtractionUnit(start={self.start}, end={self.end}, text='{preview}...')"
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@dataclass
class FrameExtractionUnitResult:
    def __init__(self, start:int, end:int, text:str, gen_text:str):
        """
        This class holds a frame extraction unit (for example, a sentence)
        together with the text the LLM generated for it.

        Parameters
        ----------
        start : int
            start character position of the unit text, relative to the whole document
        end : int
            end character position of the unit text, relative to the whole document
        text : str
            the unit text. Should be the exact string by [start:end]
        gen_text : str
            the generated text by LLM (ideally) following '[{"entity_text": "xxx", "attr": {"key": "value"}}]' format. Does not contain spans (start/end).
        """
        self.start = start
        self.end = end
        self.text = text
        self.gen_text = gen_text

    def __eq__(self, other):
        # Compare against this class. Checking for FrameExtractionUnit made two results
        # never compare equal, since both sides returned NotImplemented.
        if not isinstance(other, FrameExtractionUnitResult):
            return NotImplemented
        return (self.start == other.start and self.end == other.end and self.text == other.text and self.gen_text == other.gen_text)

    def __hash__(self):
        # Hash over the same fields __eq__ compares.
        return hash((self.start, self.end, self.text, self.gen_text))

    def __repr__(self):
        return f"FrameExtractionUnitResult(start={self.start}, end={self.end}, text='{self.text[:100]}...', gen_text='{self.gen_text[:100]}...')"
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
@dataclass
|
|
7
81
|
class LLMInformationExtractionFrame:
|
|
8
82
|
def __init__(self, frame_id:str, start:int, end:int, entity_text:str, attr:Dict[str,str]=None):
|
|
9
83
|
"""
|
|
@@ -33,7 +107,7 @@ class LLMInformationExtractionFrame:
|
|
|
33
107
|
if attr:
|
|
34
108
|
self.attr = attr.copy()
|
|
35
109
|
else:
|
|
36
|
-
self.attr =
|
|
110
|
+
self.attr = {}
|
|
37
111
|
|
|
38
112
|
def is_equal(self, frame:"LLMInformationExtractionFrame") -> bool:
|
|
39
113
|
"""
|