llm-ie 1.2.2-py3-none-any.whl → 1.2.3-py3-none-any.whl
This diff is provided for informational purposes only; it reflects the changes between publicly released package versions as they appear in their respective public registries.
- llm_ie/__init__.py +5 -4
- llm_ie/chunkers.py +44 -5
- llm_ie/data_types.py +23 -37
- llm_ie/engines.py +577 -61
- llm_ie/extractors.py +335 -219
- {llm_ie-1.2.2.dist-info → llm_ie-1.2.3.dist-info}/METADATA +1 -1
- {llm_ie-1.2.2.dist-info → llm_ie-1.2.3.dist-info}/RECORD +8 -8
- {llm_ie-1.2.2.dist-info → llm_ie-1.2.3.dist-info}/WHEEL +0 -0
llm_ie/__init__.py
CHANGED
@@ -1,11 +1,12 @@
 from .data_types import LLMInformationExtractionFrame, LLMInformationExtractionDocument
-from .engines import BasicLLMConfig, ReasoningLLMConfig, Qwen3LLMConfig, OpenAIReasoningLLMConfig
+from .engines import BasicLLMConfig, ReasoningLLMConfig, Qwen3LLMConfig, OpenAIReasoningLLMConfig
+from .engines import LlamaCppInferenceEngine, OllamaInferenceEngine, HuggingFaceHubInferenceEngine, VLLMInferenceEngine, OpenRouterInferenceEngine, OpenAIInferenceEngine, AzureOpenAIInferenceEngine, LiteLLMInferenceEngine
 from .extractors import DirectFrameExtractor, ReviewFrameExtractor, BasicFrameExtractor, BasicReviewFrameExtractor, SentenceFrameExtractor, SentenceReviewFrameExtractor, AttributeExtractor, BinaryRelationExtractor, MultiClassRelationExtractor
-from .chunkers import UnitChunker, WholeDocumentUnitChunker, SentenceUnitChunker, TextLineUnitChunker, ContextChunker, NoContextChunker, WholeDocumentContextChunker, SlideWindowContextChunker
+from .chunkers import UnitChunker, WholeDocumentUnitChunker, SentenceUnitChunker, TextLineUnitChunker, SeparatorUnitChunker, ContextChunker, NoContextChunker, WholeDocumentContextChunker, SlideWindowContextChunker
 from .prompt_editor import PromptEditor
 
 __all__ = ["LLMInformationExtractionFrame", "LLMInformationExtractionDocument",
-           "BasicLLMConfig", "ReasoningLLMConfig", "Qwen3LLMConfig", "OpenAIReasoningLLMConfig", "LlamaCppInferenceEngine", "OllamaInferenceEngine", "HuggingFaceHubInferenceEngine", "OpenAIInferenceEngine", "AzureOpenAIInferenceEngine", "LiteLLMInferenceEngine",
+           "BasicLLMConfig", "ReasoningLLMConfig", "Qwen3LLMConfig", "OpenAIReasoningLLMConfig", "LlamaCppInferenceEngine", "OllamaInferenceEngine", "HuggingFaceHubInferenceEngine", "VLLMInferenceEngine", "OpenRouterInferenceEngine", "OpenAIInferenceEngine", "AzureOpenAIInferenceEngine", "LiteLLMInferenceEngine",
            "DirectFrameExtractor", "ReviewFrameExtractor", "BasicFrameExtractor", "BasicReviewFrameExtractor", "SentenceFrameExtractor", "SentenceReviewFrameExtractor", "AttributeExtractor", "BinaryRelationExtractor", "MultiClassRelationExtractor",
-           "UnitChunker", "WholeDocumentUnitChunker", "SentenceUnitChunker", "TextLineUnitChunker", "ContextChunker", "NoContextChunker", "WholeDocumentContextChunker", "SlideWindowContextChunker",
+           "UnitChunker", "WholeDocumentUnitChunker", "SentenceUnitChunker", "TextLineUnitChunker", "SeparatorUnitChunker", "ContextChunker", "NoContextChunker", "WholeDocumentContextChunker", "SlideWindowContextChunker",
            "PromptEditor"]
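Net effect of this file: SeparatorUnitChunker, VLLMInferenceEngine, and OpenRouterInferenceEngine are now re-exported from the package root. A minimal usage sketch, assuming llm-ie 1.2.3 is installed; the sample text, separator value, and doc_id below are illustrative and not taken from the package docs:

    # Newly exported names can be imported directly from the package root.
    from llm_ie import SeparatorUnitChunker, VLLMInferenceEngine, OpenRouterInferenceEngine

    # SeparatorUnitChunker splits a document on a separator string (stored in self.sep);
    # per the chunkers.py diff below, chunk() also accepts an optional doc_id.
    chunker = SeparatorUnitChunker(sep="\n\n")
    units = chunker.chunk("First paragraph.\n\nSecond paragraph.", doc_id="doc-001")
    for unit in units:
        print(unit.doc_id, unit.start, unit.end, unit.text)

The two inference engines are only imported here for illustration; their constructor arguments are not shown in this diff.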
llm_ie/chunkers.py
CHANGED
@@ -1,5 +1,7 @@
 import abc
 from typing import Set, List, Dict, Tuple, Union, Callable
+import asyncio
+import uuid
 from llm_ie.data_types import FrameExtractionUnit
 
 
@@ -11,7 +13,8 @@ class UnitChunker(abc.ABC):
         """
         pass
 
-    def chunk(self, text:str) -> List[FrameExtractionUnit]:
+    @abc.abstractmethod
+    def chunk(self, text:str, doc_id:str=None) -> List[FrameExtractionUnit]:
         """
         Parameters:
         ----------
@@ -20,6 +23,12 @@ class UnitChunker(abc.ABC):
         """
         return NotImplemented
 
+    async def chunk_async(self, text:str, doc_id:str=None, executor=None) -> List[FrameExtractionUnit]:
+        """
+        asynchronous version of chunk method.
+        """
+        loop = asyncio.get_running_loop()
+        return await loop.run_in_executor(executor, self.chunk, text, doc_id)
 
 class WholeDocumentUnitChunker(UnitChunker):
     def __init__(self):
@@ -28,7 +37,7 @@ class WholeDocumentUnitChunker(UnitChunker):
         """
         super().__init__()
 
-    def chunk(self, text:str) -> List[FrameExtractionUnit]:
+    def chunk(self, text:str, doc_id:str=None) -> List[FrameExtractionUnit]:
         """
         Parameters:
         ----------
@@ -36,6 +45,7 @@ class WholeDocumentUnitChunker(UnitChunker):
             The document text.
         """
         return [FrameExtractionUnit(
+            doc_id=doc_id if doc_id is not None else str(uuid.uuid4()),
             start=0,
             end=len(text),
             text=text
@@ -57,7 +67,7 @@ class SeparatorUnitChunker(UnitChunker):
 
         self.sep = sep
 
-    def chunk(self, text:str) -> List[FrameExtractionUnit]:
+    def chunk(self, text:str, doc_id:str=None) -> List[FrameExtractionUnit]:
         """
         Parameters:
         ----------
@@ -70,6 +80,7 @@ class SeparatorUnitChunker(UnitChunker):
         for paragraph in paragraphs:
             end = start + len(paragraph)
             paragraph_units.append(FrameExtractionUnit(
+                doc_id=doc_id if doc_id is not None else str(uuid.uuid4()),
                 start=start,
                 end=end,
                 text=paragraph
@@ -77,6 +88,7 @@ class SeparatorUnitChunker(UnitChunker):
             start = end + len(self.sep)
         return paragraph_units
 
+
 class SentenceUnitChunker(UnitChunker):
     from nltk.tokenize.punkt import PunktSentenceTokenizer
     def __init__(self):
@@ -85,7 +97,7 @@ class SentenceUnitChunker(UnitChunker):
         """
         super().__init__()
 
-    def chunk(self, text:str) -> List[FrameExtractionUnit]:
+    def chunk(self, text:str, doc_id:str=None) -> List[FrameExtractionUnit]:
         """
         Parameters:
         ----------
@@ -95,6 +107,7 @@ class SentenceUnitChunker(UnitChunker):
         sentences = []
         for start, end in self.PunktSentenceTokenizer().span_tokenize(text):
             sentences.append(FrameExtractionUnit(
+                doc_id=doc_id if doc_id is not None else str(uuid.uuid4()),
                 start=start,
                 end=end,
                 text=text[start:end]
@@ -109,7 +122,7 @@ class TextLineUnitChunker(UnitChunker):
         """
         super().__init__()
 
-    def chunk(self, text:str) -> List[FrameExtractionUnit]:
+    def chunk(self, text:str, doc_id:str=None) -> List[FrameExtractionUnit]:
         """
         Parameters:
         ----------
@@ -122,6 +135,7 @@ class TextLineUnitChunker(UnitChunker):
         for line in lines:
             end = start + len(line)
             line_units.append(FrameExtractionUnit(
+                doc_id=doc_id if doc_id is not None else str(uuid.uuid4()),
                 start=start,
                 end=end,
                 text=line
@@ -138,6 +152,24 @@ class ContextChunker(abc.ABC):
         """
         pass
 
+    @abc.abstractmethod
+    def fit(self, text:str, units:List[FrameExtractionUnit]):
+        """
+        Parameters:
+        ----------
+        text : str
+            The document text.
+        """
+        pass
+
+    async def fit_async(self, text:str, units:List[FrameExtractionUnit], executor=None):
+        """
+        asynchronous version of fit method.
+        """
+        loop = asyncio.get_running_loop()
+        return await loop.run_in_executor(executor, self.fit, text, units)
+
+    @abc.abstractmethod
     def chunk(self, unit:FrameExtractionUnit) -> str:
         """
         Parameters:
@@ -150,6 +182,13 @@ class ContextChunker(abc.ABC):
         """
         return NotImplemented
 
+    async def chunk_async(self, unit:FrameExtractionUnit, executor=None) -> str:
+        """
+        asynchronous version of chunk method.
+        """
+        loop = asyncio.get_running_loop()
+        return await loop.run_in_executor(executor, self.chunk, unit)
+
 
 class NoContextChunker(ContextChunker):
     def __init__(self):
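The async methods added to UnitChunker and ContextChunker are thin wrappers: chunk_async and fit_async run the existing synchronous chunk/fit on an executor via loop.run_in_executor rather than reimplementing the chunking logic. A caller-side sketch of that pattern, assuming llm-ie 1.2.3; the sample text, doc_id, and thread-pool choice are illustrative:

    import asyncio
    from concurrent.futures import ThreadPoolExecutor

    from llm_ie import WholeDocumentUnitChunker

    async def main():
        text = "Patient denies chest pain. Reports mild headache since Monday."
        chunker = WholeDocumentUnitChunker()
        with ThreadPoolExecutor() as pool:
            # chunk_async dispatches the synchronous chunk() call to the executor,
            # keeping the event loop free for concurrent LLM requests.
            units = await chunker.chunk_async(text, doc_id="doc-001", executor=pool)
        print(units)

    asyncio.run(main())

Passing executor=None falls back to the event loop's default executor.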
llm_ie/data_types.py
CHANGED
@@ -7,13 +7,15 @@ import json
 
 @dataclass
 class FrameExtractionUnit:
-    def __init__(self, start:int, end:int, text:str):
+    def __init__(self, doc_id:str, start:int, end:int, text:str):
         """
         This class holds the unit text for frame extraction, for example, a sentence.
         FrameExtractor prompt it one at a time to extract frames.
 
         Parameters
         ----------
+        doc_id : str, Optional
+            document ID.
         start : int
             start character position of the unit text, relative to the whole document
         end : int
@@ -21,9 +23,28 @@ class FrameExtractionUnit:
         text : str
             the unit text. Should be the exact string by [start:end]
         """
+        self.doc_id = doc_id
         self.start = start
         self.end = end
         self.text = text
+        # status: "pending", "success", "fail"
+        self.status = "pending"
+        # generated text by LLM
+        self.gen_text = None
+
+    def get_status(self) -> str:
+        return self.status
+
+    def set_status(self, status:str):
+        if status not in {"pending", "success", "fail"}:
+            raise ValueError('status must be one of {"pending", "success", "fail"}.')
+        self.status = status
+
+    def get_generated_text(self) -> str:
+        return self.gen_text
+
+    def set_generated_text(self, gen_text:str):
+        self.gen_text = gen_text
 
     def __eq__(self, other):
         if not isinstance(other, FrameExtractionUnit):
@@ -39,43 +60,8 @@ class FrameExtractionUnit:
         return self.start < other.start
 
     def __repr__(self):
-        return f"FrameExtractionUnit(start={self.start}, end={self.end}, text='{self.text[:100]}...')"
-
-
-@dataclass
-class FrameExtractionUnitResult:
-    def __init__(self, start:int, end:int, text:str, gen_text:str):
-        """
-        This class holds the unit text for frame extraction, for example, a sentence.
-        FrameExtractor prompt it one at a time to extract frames.
-
-        Parameters
-        ----------
-        start : int
-            start character position of the unit text, relative to the whole document
-        end : int
-            end character position of the unit text, relative to the whole document
-        text : str
-            the unit text. Should be the exact string by [start:end]
-        gen_text : str
-            the generated text by LLM (ideally) following '[{"entity_text": "xxx", "attr": {"key": "value"}}]' format. Does not contain spans (start/end).
-        """
-        self.start = start
-        self.end = end
-        self.text = text
-        self.gen_text = gen_text
-
-    def __eq__(self, other):
-        if not isinstance(other, FrameExtractionUnit):
-            return NotImplemented
-        return (self.start == other.start and self.end == other.end and self.text == other.text and self.gen_text == other.gen_text)
+        return f"FrameExtractionUnit(doc_id={self.doc_id}, start={self.start}, end={self.end}, status={self.status}, text='{self.text[:100]}...')"
 
-    def __hash__(self):
-        return hash((self.start, self.end, self.text, self.gen_text))
-
-    def __repr__(self):
-        return f"FrameExtractionUnitResult(start={self.start}, end={self.end}, text='{self.text[:100]}...', gen_text='{self.gen_text[:100]}...')"
-
 
 @dataclass
 class LLMInformationExtractionFrame: