llm-ie 0.3.2__py3-none-any.whl → 0.3.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
llm_ie/extractors.py
CHANGED
|
@@ -143,7 +143,7 @@ class FrameExtractor(Extractor):
|
|
|
143
143
|
self.tokenizer = RegexpTokenizer(r'\w+|[^\w\s]')
|
|
144
144
|
|
|
145
145
|
|
|
146
|
-
def _jaccard_score(self, s1:
|
|
146
|
+
def _jaccard_score(self, s1:Set[str], s2:Set[str]) -> float:
|
|
147
147
|
"""
|
|
148
148
|
This method calculates the Jaccard score between two sets of word tokens.
|
|
149
149
|
"""
|
|
@@ -166,6 +166,8 @@ class FrameExtractor(Extractor):
|
|
|
166
166
|
def _get_closest_substring(self, text:str, pattern:str, buffer_size:float=0.2) -> Tuple[Tuple[int, int], float]:
|
|
167
167
|
"""
|
|
168
168
|
This method finds the closest (highest Jaccard score) substring in text that matches the pattern.
|
|
169
|
+
The substring must start with the same word token as the pattern. This is due to the observation that
|
|
170
|
+
LLMs often generate the first few words consistently.
|
|
169
171
|
|
|
170
172
|
Parameters
|
|
171
173
|
----------
|
|
@@ -179,25 +181,29 @@ class FrameExtractor(Extractor):
|
|
|
179
181
|
Returns : Tuple[Tuple[int, int], float]
|
|
180
182
|
a tuple of 2-tuple span and Jaccard score.
|
|
181
183
|
"""
|
|
184
|
+
if not text or not pattern:
|
|
185
|
+
return None, 0
|
|
186
|
+
|
|
182
187
|
text_tokens, text_spans = self._get_word_tokens(text)
|
|
183
188
|
pattern_tokens, _ = self._get_word_tokens(pattern)
|
|
184
|
-
|
|
189
|
+
pattern_tokens_set = set(pattern_tokens)
|
|
185
190
|
window_size = len(pattern_tokens)
|
|
186
|
-
window_size_min = int(window_size * (1 - buffer_size))
|
|
187
|
-
window_size_max = int(window_size * (1 + buffer_size))
|
|
188
|
-
|
|
191
|
+
window_size_min = max(1, int(window_size * (1 - buffer_size)))
|
|
192
|
+
window_size_max = int(window_size * (1 + buffer_size)) + 1
|
|
193
|
+
closest_substring_span = None
|
|
189
194
|
best_score = 0
|
|
190
195
|
|
|
191
196
|
for i in range(len(text_tokens) - window_size_max):
|
|
192
197
|
for w in range(window_size_min, window_size_max):
|
|
193
|
-
sub_str_tokens =
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
198
|
+
sub_str_tokens = text_tokens[i:i + w]
|
|
199
|
+
if len(sub_str_tokens) > 0 and sub_str_tokens[0] == pattern_tokens[0]:
|
|
200
|
+
score = self._jaccard_score(set(sub_str_tokens), pattern_tokens_set)
|
|
201
|
+
if score > best_score:
|
|
202
|
+
best_score = score
|
|
203
|
+
sub_string_word_spans = text_spans[i:i + w]
|
|
204
|
+
closest_substring_span = (sub_string_word_spans[0][0], sub_string_word_spans[-1][-1])
|
|
199
205
|
|
|
200
|
-
return
|
|
206
|
+
return closest_substring_span, best_score
|
|
201
207
|
|
|
202
208
|
|
|
203
209
|
def _find_entity_spans(self, text: str, entities: List[str], case_sensitive:bool=False,
|
|
@@ -235,7 +241,7 @@ class FrameExtractor(Extractor):
|
|
|
235
241
|
|
|
236
242
|
# Exact match
|
|
237
243
|
match = re.search(re.escape(entity), text)
|
|
238
|
-
if match:
|
|
244
|
+
if match and entity:
|
|
239
245
|
start, end = match.span()
|
|
240
246
|
entity_spans.append((start, end))
|
|
241
247
|
# Replace the found entity with spaces to avoid finding the same instance again
|
|
@@ -243,7 +249,7 @@ class FrameExtractor(Extractor):
|
|
|
243
249
|
# Fuzzy match
|
|
244
250
|
elif fuzzy_match:
|
|
245
251
|
closest_substring_span, best_score = self._get_closest_substring(text, entity, buffer_size=fuzzy_buffer_size)
|
|
246
|
-
if best_score >= fuzzy_score_cutoff:
|
|
252
|
+
if closest_substring_span and best_score >= fuzzy_score_cutoff:
|
|
247
253
|
entity_spans.append(closest_substring_span)
|
|
248
254
|
# Replace the found entity with spaces to avoid finding the same instance again
|
|
249
255
|
text = text[:closest_substring_span[0]] + ' ' * (closest_substring_span[1] - closest_substring_span[0]) + text[closest_substring_span[1]:]
|
|
@@ -432,7 +438,7 @@ class BasicFrameExtractor(FrameExtractor):
|
|
|
432
438
|
frame = LLMInformationExtractionFrame(frame_id=f"{i}",
|
|
433
439
|
start=start,
|
|
434
440
|
end=end,
|
|
435
|
-
entity_text=
|
|
441
|
+
entity_text=text[start:end],
|
|
436
442
|
attr={k: v for k, v in ent.items() if k != entity_key and v != ""})
|
|
437
443
|
frame_list.append(frame)
|
|
438
444
|
return frame_list
|
|
@@ -731,12 +737,13 @@ class SentenceFrameExtractor(FrameExtractor):
|
|
|
731
737
|
for ent, span in zip(entity_json, spans):
|
|
732
738
|
if span is not None:
|
|
733
739
|
start, end = span
|
|
740
|
+
entity_text = sent['sentence_text'][start:end]
|
|
734
741
|
start += sent['sentence_start']
|
|
735
742
|
end += sent['sentence_start']
|
|
736
743
|
frame = LLMInformationExtractionFrame(frame_id=f"{len(frame_list)}",
|
|
737
744
|
start=start,
|
|
738
745
|
end=end,
|
|
739
|
-
entity_text=
|
|
746
|
+
entity_text=entity_text,
|
|
740
747
|
attr={k: v for k, v in ent.items() if k != entity_key and v != ""})
|
|
741
748
|
frame_list.append(frame)
|
|
742
749
|
return frame_list
|
|
@@ -16,8 +16,8 @@ llm_ie/asset/prompt_guide/SentenceFrameExtractor_prompt_guide.txt,sha256=oKH_QeD
|
|
|
16
16
|
llm_ie/asset/prompt_guide/SentenceReviewFrameExtractor_prompt_guide.txt,sha256=oKH_QeDgpw771ZdHk3L7DYz2Jvfm7OolUoTiJyMJI30,9541
|
|
17
17
|
llm_ie/data_types.py,sha256=hPz3WOeAzfn2QKmb0CxHmRdQWZQ4G9zq8U-RJBVFdYk,14329
|
|
18
18
|
llm_ie/engines.py,sha256=PTYs7s_iCPmI-yFUCVCPY_cMGS77ma2VGoz4rdNkODI,9308
|
|
19
|
-
llm_ie/extractors.py,sha256=
|
|
19
|
+
llm_ie/extractors.py,sha256=j9L9USybJBmYZM4RAjGO6DR4StYBgzhqCN6nEFZZQVQ,64523
|
|
20
20
|
llm_ie/prompt_editor.py,sha256=Xc5ZHsEnM8-YYITokIsM6BVsf2Ec_8ajJDaldPf-P8U,8577
|
|
21
|
-
llm_ie-0.3.
|
|
22
|
-
llm_ie-0.3.
|
|
23
|
-
llm_ie-0.3.
|
|
21
|
+
llm_ie-0.3.4.dist-info/METADATA,sha256=-5Tf9TCWczCVOsdMavkBZ-KnYPGnbNrV1rsU-pMHfPA,41266
|
|
22
|
+
llm_ie-0.3.4.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
23
|
+
llm_ie-0.3.4.dist-info/RECORD,,
|
|
File without changes
|