llm-ie 0.4.7__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
llm_ie/chunkers.py ADDED
@@ -0,0 +1,191 @@
1
+ import abc
2
+ from typing import Set, List, Dict, Tuple, Union, Callable
3
+ from llm_ie.data_types import FrameExtractionUnit
4
+
5
+
6
class UnitChunker(abc.ABC):
    def __init__(self):
        """
        Abstract base class for frame extraction unit chunkers.
        A unit chunker splits a document into units (e.g., sentences); LLMs
        then process the document unit by unit.
        """
        pass

    def chunk(self, text:str) -> List[FrameExtractionUnit]:
        """
        Split a document into frame extraction units. Subclasses must override this.

        Parameters:
        ----------
        text : str
            The document text.

        Return : List[FrameExtractionUnit]
            The units produced from the document.

        Raises:
        ------
        NotImplementedError
            Always, when called on a class that does not override this method.
        """
        # `NotImplemented` is a sentinel reserved for binary special methods
        # (e.g. __eq__); returning it here would hand callers a non-list value.
        # Raising makes a missing override fail loudly at the call site.
        # The method is deliberately not marked @abc.abstractmethod so that code
        # which instantiates the base class keeps working.
        raise NotImplementedError(f"{type(self).__name__} must implement chunk().")
22
+
23
+
24
class WholeDocumentUnitChunker(UnitChunker):
    def __init__(self):
        """
        Unit chunker that performs no splitting: the entire document is one unit.
        """
        super().__init__()

    def chunk(self, text:str) -> List[FrameExtractionUnit]:
        """
        Wrap the full document in a single unit spanning [0, len(text)).

        Parameters:
        ----------
        text : str
            The document text.

        Return : List[FrameExtractionUnit]
            A one-element list holding the unit that covers the whole document.
        """
        whole_document = FrameExtractionUnit(start=0, end=len(text), text=text)
        return [whole_document]
43
+
44
+
45
class SentenceUnitChunker(UnitChunker):
    # Imported inside the class body, so the tokenizer class is bound as a class
    # attribute and reached through `self.PunktSentenceTokenizer`.
    from nltk.tokenize.punkt import PunktSentenceTokenizer

    def __init__(self):
        """
        Unit chunker that splits a document into sentences with the NLTK PunktSentenceTokenizer.
        """
        super().__init__()

    def chunk(self, text:str) -> List[FrameExtractionUnit]:
        """
        Produce one unit per sentence span reported by the tokenizer.

        Parameters:
        ----------
        text : str
            The document text.

        Return : List[FrameExtractionUnit]
            One unit per sentence; each unit's text is text[start:end].
        """
        # A fresh tokenizer is created on every call.
        tokenizer = self.PunktSentenceTokenizer()
        return [
            FrameExtractionUnit(start=begin, end=stop, text=text[begin:stop])
            for begin, stop in tokenizer.span_tokenize(text)
        ]
68
+
69
+
70
class TextLineUnitChunker(UnitChunker):
    def __init__(self):
        """
        Unit chunker that treats every line of the document as one unit.
        """
        super().__init__()

    def chunk(self, text:str) -> List[FrameExtractionUnit]:
        """
        Split the document on '\\n' and emit one unit per line, with character
        offsets relative to the whole document.

        Parameters:
        ----------
        text : str
            The document text.

        Return : List[FrameExtractionUnit]
            One unit per line (empty lines included); the newline characters
            themselves are not part of any unit.
        """
        units = []
        offset = 0
        for content in text.split('\n'):
            stop = offset + len(content)
            unit = FrameExtractionUnit(start=offset, end=stop, text=content)
            units.append(unit)
            # Step over the single '\n' that separated this line from the next.
            offset = stop + 1
        return units
96
+
97
+
98
class ContextChunker(abc.ABC):
    def __init__(self):
        """
        Abstract base class for context chunkers. Given a frame extraction unit,
        a context chunker returns the context for it.
        """
        pass

    def fit(self, text:str, units:List[FrameExtractionUnit]):
        """
        Prepare the chunker for a document. The default implementation does
        nothing; subclasses that need the document text or the unit list
        override it.

        Declared on the base class so every context chunker shares the same
        fit(text, units) interface that the concrete chunkers in this module
        already implement.

        Parameters:
        ----------
        text : str
            The document text.
        units : List[FrameExtractionUnit]
            The list of frame extraction units.
        """
        pass

    def chunk(self, unit:FrameExtractionUnit) -> str:
        """
        Return the context for a unit. Subclasses must override this.

        Parameters:
        ----------
        unit : FrameExtractionUnit
            The frame extraction unit.

        Return : str
            The context for the frame extraction unit.

        Raises:
        ------
        NotImplementedError
            Always, when called on a class that does not override this method.
        """
        # `NotImplemented` is a sentinel for binary special methods, not a valid
        # return value for a method declared to return str; raise instead so a
        # missing override is reported where it happens.
        raise NotImplementedError(f"{type(self).__name__} must implement chunk().")
117
+
118
+
119
class NoContextChunker(ContextChunker):
    def __init__(self):
        """
        Context chunker that supplies no context at all.
        """
        super().__init__()

    def fit(self, text:str, units:List[FrameExtractionUnit]):
        """
        Accepts the document and its units but stores nothing.

        Parameters:
        ----------
        text : str
            The document text.
        units : List[FrameExtractionUnit]
            The list of frame extraction units.
        """
        return None

    def chunk(self, unit:FrameExtractionUnit) -> str:
        """
        Return an empty string as the context, regardless of the unit.
        """
        return ""
137
+
138
class WholeDocumentContextChunker(ContextChunker):
    def __init__(self):
        """
        Context chunker that uses the entire document as the context of every unit.
        """
        super().__init__()
        # Set by fit(); None means the chunker has not been fitted.
        self.text = None

    def fit(self, text:str, units:List[FrameExtractionUnit]):
        """
        Remember the document text so chunk() can return it.

        Parameters:
        ----------
        text : str
            The document text.
        units : List[FrameExtractionUnit]
            The list of frame extraction units (not used by this chunker).
        """
        self.text = text

    def chunk(self, unit:FrameExtractionUnit) -> str:
        """
        Return the stored document text.

        Raises:
        ------
        ValueError
            If fit() has not been called yet.
        """
        if self.text is not None:
            return self.text
        raise ValueError("The context chunker has not been fitted yet. Please call fit() before chunk().")
159
+
160
class SlideWindowContextChunker(ContextChunker):
    def __init__(self, window_size:int):
        """
        Context chunker that returns a sliding window of neighboring units,
        e.g. +-2 sentences around a unit sentence.

        Parameters:
        ----------
        window_size : int
            Number of units to include on each side of the target unit.
        """
        super().__init__()
        self.window_size = window_size
        # Set by fit(); None means the chunker has not been fitted.
        self.units = None

    def fit(self, text:str, units:List[FrameExtractionUnit]):
        """
        Store a sorted copy of the units; the caller's list is left untouched.

        Parameters:
        ----------
        text : str
            The document text (not used by this chunker).
        units : List[FrameExtractionUnit]
            The list of frame extraction units.
        """
        ordered = list(units)
        ordered.sort()
        self.units = ordered

    def chunk(self, unit:FrameExtractionUnit) -> str:
        """
        Join the texts of the units inside the window around `unit` with single spaces.
        The window includes the unit itself and is clipped at both ends of the unit list.

        Raises:
        ------
        ValueError
            If fit() has not been called, or if `unit` is not among the fitted units.
        """
        if self.units is None:
            raise ValueError("The context chunker has not been fitted yet. Please call fit() before chunk().")

        position = self.units.index(unit)
        lower = max(0, position - self.window_size)
        upper = min(len(self.units), position + self.window_size + 1)
        # Iterate with range() rather than slicing: a slice would reinterpret a
        # negative upper bound as an offset from the end of the list.
        return " ".join(self.units[i].text for i in range(lower, upper))
191
+
llm_ie/data_types.py CHANGED
@@ -1,9 +1,83 @@
1
+ from dataclasses import dataclass
1
2
  from typing import List, Dict, Tuple, Iterable, Callable
2
3
  import importlib.util
3
4
  import warnings
4
5
  import json
5
6
 
6
7
 
8
@dataclass
class FrameExtractionUnit:
    def __init__(self, start:int, end:int, text:str):
        """
        A unit of text for frame extraction, for example, a sentence.
        FrameExtractor prompts one unit at a time to extract frames.

        Parameters
        ----------
        start : int
            start character position of the unit text, relative to the whole document
        end : int
            end character position of the unit text, relative to the whole document
        text : str
            the unit text. Should be the exact string by [start:end]
        """
        self.text = text
        self.start, self.end = start, end

    def __eq__(self, other):
        # Equality is decided by the span alone; the text is not compared.
        if isinstance(other, FrameExtractionUnit):
            return (self.start, self.end) == (other.start, other.end)
        return NotImplemented

    def __hash__(self):
        # Hashes the same fields __eq__ compares.
        span = (self.start, self.end)
        return hash(span)

    def __lt__(self, other):
        # Units order by start position only.
        if isinstance(other, FrameExtractionUnit):
            return self.start < other.start
        return NotImplemented

    def __repr__(self):
        # Shows at most the first 100 characters of the text; the trailing
        # ellipsis is appended unconditionally.
        preview = self.text[:100]
        return f"FrameExtractionUnit(start={self.start}, end={self.end}, text='{preview}...')"
43
+
44
+
45
@dataclass
class FrameExtractionUnitResult:
    def __init__(self, start:int, end:int, text:str, gen_text:str):
        """
        This class holds a frame extraction unit (for example, a sentence)
        together with the text the LLM generated for it.

        Parameters
        ----------
        start : int
            start character position of the unit text, relative to the whole document
        end : int
            end character position of the unit text, relative to the whole document
        text : str
            the unit text. Should be the exact string by [start:end]
        gen_text : str
            the generated text by LLM (ideally) following '[{"entity_text": "xxx", "attr": {"key": "value"}}]' format. Does not contain spans (start/end).
        """
        self.start = start
        self.end = end
        self.text = text
        self.gen_text = gen_text

    def __eq__(self, other):
        # Compare against this class. Checking for a different class made every
        # comparison between two results return NotImplemented, so equal results
        # fell back to identity comparison and disagreed with __hash__.
        if not isinstance(other, FrameExtractionUnitResult):
            return NotImplemented
        return (self.start == other.start and self.end == other.end and self.text == other.text and self.gen_text == other.gen_text)

    def __hash__(self):
        # Hashes exactly the fields __eq__ compares, so equal results hash alike.
        return hash((self.start, self.end, self.text, self.gen_text))

    def __repr__(self):
        # Both text fields are truncated to their first 100 characters.
        return f"FrameExtractionUnitResult(start={self.start}, end={self.end}, text='{self.text[:100]}...', gen_text='{self.gen_text[:100]}...')"
78
+
79
+
80
+ @dataclass
7
81
  class LLMInformationExtractionFrame:
8
82
  def __init__(self, frame_id:str, start:int, end:int, entity_text:str, attr:Dict[str,str]=None):
9
83
  """
@@ -33,7 +107,7 @@ class LLMInformationExtractionFrame:
33
107
  if attr:
34
108
  self.attr = attr.copy()
35
109
  else:
36
- self.attr = None
110
+ self.attr = {}
37
111
 
38
112
  def is_equal(self, frame:"LLMInformationExtractionFrame") -> bool:
39
113
  """