llm-ie 1.2.1__py3-none-any.whl → 1.2.3__py3-none-any.whl
This diff shows the content of publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- llm_ie/__init__.py +5 -4
- llm_ie/chunkers.py +78 -4
- llm_ie/data_types.py +23 -37
- llm_ie/engines.py +663 -112
- llm_ie/extractors.py +357 -206
- llm_ie/prompt_editor.py +4 -4
- {llm_ie-1.2.1.dist-info → llm_ie-1.2.3.dist-info}/METADATA +1 -1
- {llm_ie-1.2.1.dist-info → llm_ie-1.2.3.dist-info}/RECORD +9 -9
- {llm_ie-1.2.1.dist-info → llm_ie-1.2.3.dist-info}/WHEEL +0 -0
llm_ie/__init__.py
CHANGED
@@ -1,11 +1,12 @@
 from .data_types import LLMInformationExtractionFrame, LLMInformationExtractionDocument
-from .engines import BasicLLMConfig, Qwen3LLMConfig, OpenAIReasoningLLMConfig
+from .engines import BasicLLMConfig, ReasoningLLMConfig, Qwen3LLMConfig, OpenAIReasoningLLMConfig
+from .engines import LlamaCppInferenceEngine, OllamaInferenceEngine, HuggingFaceHubInferenceEngine, VLLMInferenceEngine, OpenRouterInferenceEngine, OpenAIInferenceEngine, AzureOpenAIInferenceEngine, LiteLLMInferenceEngine
 from .extractors import DirectFrameExtractor, ReviewFrameExtractor, BasicFrameExtractor, BasicReviewFrameExtractor, SentenceFrameExtractor, SentenceReviewFrameExtractor, AttributeExtractor, BinaryRelationExtractor, MultiClassRelationExtractor
-from .chunkers import UnitChunker, WholeDocumentUnitChunker, SentenceUnitChunker, TextLineUnitChunker, ContextChunker, NoContextChunker, WholeDocumentContextChunker, SlideWindowContextChunker
+from .chunkers import UnitChunker, WholeDocumentUnitChunker, SentenceUnitChunker, TextLineUnitChunker, SeparatorUnitChunker, ContextChunker, NoContextChunker, WholeDocumentContextChunker, SlideWindowContextChunker
 from .prompt_editor import PromptEditor

 __all__ = ["LLMInformationExtractionFrame", "LLMInformationExtractionDocument",
-           "BasicLLMConfig", "Qwen3LLMConfig", "OpenAIReasoningLLMConfig", "LlamaCppInferenceEngine", "OllamaInferenceEngine", "HuggingFaceHubInferenceEngine", "OpenAIInferenceEngine", "AzureOpenAIInferenceEngine", "LiteLLMInferenceEngine",
+           "BasicLLMConfig", "ReasoningLLMConfig", "Qwen3LLMConfig", "OpenAIReasoningLLMConfig", "LlamaCppInferenceEngine", "OllamaInferenceEngine", "HuggingFaceHubInferenceEngine", "VLLMInferenceEngine", "OpenRouterInferenceEngine", "OpenAIInferenceEngine", "AzureOpenAIInferenceEngine", "LiteLLMInferenceEngine",
            "DirectFrameExtractor", "ReviewFrameExtractor", "BasicFrameExtractor", "BasicReviewFrameExtractor", "SentenceFrameExtractor", "SentenceReviewFrameExtractor", "AttributeExtractor", "BinaryRelationExtractor", "MultiClassRelationExtractor",
-           "UnitChunker", "WholeDocumentUnitChunker", "SentenceUnitChunker", "TextLineUnitChunker", "ContextChunker", "NoContextChunker", "WholeDocumentContextChunker", "SlideWindowContextChunker",
+           "UnitChunker", "WholeDocumentUnitChunker", "SentenceUnitChunker", "TextLineUnitChunker", "SeparatorUnitChunker", "ContextChunker", "NoContextChunker", "WholeDocumentContextChunker", "SlideWindowContextChunker",
            "PromptEditor"]
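
Note: with the expanded imports above, the new symbols are exposed from the package root. A minimal sketch, assuming llm-ie 1.2.3 is installed; only names newly added to __all__ in this release are shown:

    # Newly exported in 1.2.3 (sketch, assuming the 1.2.3 wheel above is installed)
    from llm_ie import (
        ReasoningLLMConfig,
        VLLMInferenceEngine,
        OpenRouterInferenceEngine,
        SeparatorUnitChunker,
    )
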
llm_ie/chunkers.py
CHANGED
@@ -1,5 +1,7 @@
 import abc
 from typing import Set, List, Dict, Tuple, Union, Callable
+import asyncio
+import uuid
 from llm_ie.data_types import FrameExtractionUnit


@@ -11,7 +13,8 @@ class UnitChunker(abc.ABC):
         """
         pass

-    def chunk(self, text:str) -> List[FrameExtractionUnit]:
+    @abc.abstractmethod
+    def chunk(self, text:str, doc_id:str=None) -> List[FrameExtractionUnit]:
         """
         Parameters:
         ----------
@@ -20,6 +23,12 @@ class UnitChunker(abc.ABC):
         """
         return NotImplemented

+    async def chunk_async(self, text:str, doc_id:str=None, executor=None) -> List[FrameExtractionUnit]:
+        """
+        asynchronous version of chunk method.
+        """
+        loop = asyncio.get_running_loop()
+        return await loop.run_in_executor(executor, self.chunk, text, doc_id)

 class WholeDocumentUnitChunker(UnitChunker):
     def __init__(self):
@@ -28,7 +37,7 @@ class WholeDocumentUnitChunker(UnitChunker):
         """
         super().__init__()

-    def chunk(self, text:str) -> List[FrameExtractionUnit]:
+    def chunk(self, text:str, doc_id:str=None) -> List[FrameExtractionUnit]:
         """
         Parameters:
         ----------
@@ -36,11 +45,49 @@ class WholeDocumentUnitChunker(UnitChunker):
             The document text.
         """
         return [FrameExtractionUnit(
+            doc_id=doc_id if doc_id is not None else str(uuid.uuid4()),
             start=0,
             end=len(text),
             text=text
         )]

+class SeparatorUnitChunker(UnitChunker):
+    def __init__(self, sep:str):
+        """
+        This class chunks a document by separator provided.
+
+        Parameters:
+        ----------
+        sep : str
+            a separator string.
+        """
+        super().__init__()
+        if not isinstance(sep, str):
+            raise ValueError("sep must be a string")
+
+        self.sep = sep
+
+    def chunk(self, text:str, doc_id:str=None) -> List[FrameExtractionUnit]:
+        """
+        Parameters:
+        ----------
+        text : str
+            The document text.
+        """
+        paragraphs = text.split(self.sep)
+        paragraph_units = []
+        start = 0
+        for paragraph in paragraphs:
+            end = start + len(paragraph)
+            paragraph_units.append(FrameExtractionUnit(
+                doc_id=doc_id if doc_id is not None else str(uuid.uuid4()),
+                start=start,
+                end=end,
+                text=paragraph
+            ))
+            start = end + len(self.sep)
+        return paragraph_units
+

 class SentenceUnitChunker(UnitChunker):
     from nltk.tokenize.punkt import PunktSentenceTokenizer
@@ -50,7 +97,7 @@ class SentenceUnitChunker(UnitChunker):
         """
         super().__init__()

-    def chunk(self, text:str) -> List[FrameExtractionUnit]:
+    def chunk(self, text:str, doc_id:str=None) -> List[FrameExtractionUnit]:
         """
         Parameters:
         ----------
@@ -60,6 +107,7 @@ class SentenceUnitChunker(UnitChunker):
         sentences = []
         for start, end in self.PunktSentenceTokenizer().span_tokenize(text):
             sentences.append(FrameExtractionUnit(
+                doc_id=doc_id if doc_id is not None else str(uuid.uuid4()),
                 start=start,
                 end=end,
                 text=text[start:end]
@@ -74,7 +122,7 @@ class TextLineUnitChunker(UnitChunker):
         """
         super().__init__()

-    def chunk(self, text:str) -> List[FrameExtractionUnit]:
+    def chunk(self, text:str, doc_id:str=None) -> List[FrameExtractionUnit]:
         """
         Parameters:
         ----------
@@ -87,6 +135,7 @@ class TextLineUnitChunker(UnitChunker):
         for line in lines:
             end = start + len(line)
             line_units.append(FrameExtractionUnit(
+                doc_id=doc_id if doc_id is not None else str(uuid.uuid4()),
                 start=start,
                 end=end,
                 text=line
@@ -103,6 +152,24 @@ class ContextChunker(abc.ABC):
         """
         pass

+    @abc.abstractmethod
+    def fit(self, text:str, units:List[FrameExtractionUnit]):
+        """
+        Parameters:
+        ----------
+        text : str
+            The document text.
+        """
+        pass
+
+    async def fit_async(self, text:str, units:List[FrameExtractionUnit], executor=None):
+        """
+        asynchronous version of fit method.
+        """
+        loop = asyncio.get_running_loop()
+        return await loop.run_in_executor(executor, self.fit, text, units)
+
+    @abc.abstractmethod
     def chunk(self, unit:FrameExtractionUnit) -> str:
         """
         Parameters:
@@ -115,6 +182,13 @@ class ContextChunker(abc.ABC):
         """
         return NotImplemented

+    async def chunk_async(self, unit:FrameExtractionUnit, executor=None) -> str:
+        """
+        asynchronous version of chunk method.
+        """
+        loop = asyncio.get_running_loop()
+        return await loop.run_in_executor(executor, self.chunk, unit)
+

 class NoContextChunker(ContextChunker):
     def __init__(self):
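
In summary, chunkers.py gains a SeparatorUnitChunker, threads doc_id through every chunk() implementation (falling back to a random UUID when none is given), and adds default chunk_async/fit_async wrappers that run the synchronous methods in an executor. A minimal usage sketch; the sample text, separator, and doc_id below are illustrative, not taken from the package:

    import asyncio
    from llm_ie import SeparatorUnitChunker

    # Illustrative document: paragraphs separated by blank lines.
    text = "First paragraph.\n\nSecond paragraph.\n\nThird paragraph."
    chunker = SeparatorUnitChunker(sep="\n\n")

    # Synchronous chunking; every unit carries the same doc_id.
    units = chunker.chunk(text, doc_id="doc-001")
    for unit in units:
        # unit.text matches text[unit.start:unit.end]
        print(unit.doc_id, unit.start, unit.end, repr(unit.text))

    # chunk_async simply runs chunk() in an executor (None = default executor).
    units_async = asyncio.run(chunker.chunk_async(text, doc_id="doc-001"))
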
llm_ie/data_types.py
CHANGED
@@ -7,13 +7,15 @@ import json

 @dataclass
 class FrameExtractionUnit:
-    def __init__(self, start:int, end:int, text:str):
+    def __init__(self, doc_id:str, start:int, end:int, text:str):
         """
         This class holds the unit text for frame extraction, for example, a sentence.
         FrameExtractor prompt it one at a time to extract frames.

         Parameters
         ----------
+        doc_id : str, Optional
+            document ID.
         start : int
             start character position of the unit text, relative to the whole document
         end : int
@@ -21,9 +23,28 @@ class FrameExtractionUnit:
         text : str
             the unit text. Should be the exact string by [start:end]
         """
+        self.doc_id = doc_id
         self.start = start
         self.end = end
         self.text = text
+        # status: "pending", "success", "fail"
+        self.status = "pending"
+        # generated text by LLM
+        self.gen_text = None
+
+    def get_status(self) -> str:
+        return self.status
+
+    def set_status(self, status:str):
+        if status not in {"pending", "success", "fail"}:
+            raise ValueError('status must be one of {"pending", "success", "fail"}.')
+        self.status = status
+
+    def get_generated_text(self) -> str:
+        return self.gen_text
+
+    def set_generated_text(self, gen_text:str):
+        self.gen_text = gen_text

     def __eq__(self, other):
         if not isinstance(other, FrameExtractionUnit):
@@ -39,43 +60,8 @@ class FrameExtractionUnit:
         return self.start < other.start

     def __repr__(self):
-        return f"FrameExtractionUnit(start={self.start}, end={self.end}, text='{self.text[:100]}...')"
-
-
-@dataclass
-class FrameExtractionUnitResult:
-    def __init__(self, start:int, end:int, text:str, gen_text:str):
-        """
-        This class holds the unit text for frame extraction, for example, a sentence.
-        FrameExtractor prompt it one at a time to extract frames.
-
-        Parameters
-        ----------
-        start : int
-            start character position of the unit text, relative to the whole document
-        end : int
-            end character position of the unit text, relative to the whole document
-        text : str
-            the unit text. Should be the exact string by [start:end]
-        gen_text : str
-            the generated text by LLM (ideally) following '[{"entity_text": "xxx", "attr": {"key": "value"}}]' format. Does not contain spans (start/end).
-        """
-        self.start = start
-        self.end = end
-        self.text = text
-        self.gen_text = gen_text
-
-    def __eq__(self, other):
-        if not isinstance(other, FrameExtractionUnit):
-            return NotImplemented
-        return (self.start == other.start and self.end == other.end and self.text == other.text and self.gen_text == other.gen_text)
+        return f"FrameExtractionUnit(doc_id={self.doc_id}, start={self.start}, end={self.end}, status={self.status}, text='{self.text[:100]}...')"

-    def __hash__(self):
-        return hash((self.start, self.end, self.text, self.gen_text))
-
-    def __repr__(self):
-        return f"FrameExtractionUnitResult(start={self.start}, end={self.end}, text='{self.text[:100]}...', gen_text='{self.gen_text[:100]}...')"
-

 @dataclass
 class LLMInformationExtractionFrame: