llm-ie 0.3.2__py3-none-any.whl → 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
llm_ie/extractors.py
CHANGED
@@ -143,7 +143,7 @@ class FrameExtractor(Extractor):
         self.tokenizer = RegexpTokenizer(r'\w+|[^\w\s]')


-    def _jaccard_score(self, s1:
+    def _jaccard_score(self, s1:Set[str], s2:Set[str]) -> float:
         """
         This method calculates the Jaccard score between two sets of word tokens.
         """
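For reference, the Jaccard score computed by this method is the size of the intersection of the two token sets divided by the size of their union. A minimal standalone sketch of that calculation (illustrative only, not the package's implementation):

    from typing import Set

    def jaccard_score(s1: Set[str], s2: Set[str]) -> float:
        # |s1 ∩ s2| / |s1 ∪ s2|, defined as 0.0 when both sets are empty
        union = s1 | s2
        return len(s1 & s2) / len(union) if union else 0.0

    # Example: {"chest", "pain"} vs {"chest", "pain", "severe"} -> 2 / 3 ≈ 0.67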
@@ -166,6 +166,8 @@ class FrameExtractor(Extractor):
     def _get_closest_substring(self, text:str, pattern:str, buffer_size:float=0.2) -> Tuple[Tuple[int, int], float]:
         """
         This method finds the closest (highest Jaccard score) substring in text that matches the pattern.
+        the substring must start with the same word token as the pattern. This is due to the observation that
+        LLM often generate the first few words consistently.

         Parameters
         ----------
@@ -181,23 +183,24 @@ class FrameExtractor(Extractor):
         """
         text_tokens, text_spans = self._get_word_tokens(text)
         pattern_tokens, _ = self._get_word_tokens(pattern)
-
+        pattern_tokens_set = set(pattern_tokens)
         window_size = len(pattern_tokens)
         window_size_min = int(window_size * (1 - buffer_size))
         window_size_max = int(window_size * (1 + buffer_size))
-
+        closest_substring_span = None
         best_score = 0

         for i in range(len(text_tokens) - window_size_max):
             for w in range(window_size_min, window_size_max):
-                sub_str_tokens =
-
-
-
-
-
+                sub_str_tokens = text_tokens[i:i + w]
+                if sub_str_tokens[0] == pattern_tokens[0]:
+                    score = self._jaccard_score(set(sub_str_tokens), pattern_tokens_set)
+                    if score > best_score:
+                        best_score = score
+                        sub_string_word_spans = text_spans[i:i + w]
+                        closest_substring_span = (sub_string_word_spans[0][0], sub_string_word_spans[-1][-1])

-        return
+        return closest_substring_span, best_score


     def _find_entity_spans(self, text: str, entities: List[str], case_sensitive:bool=False,
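For readers tracing this hunk outside the class, here is a self-contained sketch of the same sliding-window idea: tokenize both strings, slide windows of roughly the pattern's length over the text, score only windows whose first token matches the pattern's first token (the constraint described in the docstring above), and keep the character span with the best Jaccard score. Names and the tokenizer call are illustrative assumptions, not the package API.

    import re
    from typing import List, Optional, Tuple

    def word_tokens_with_spans(text: str) -> Tuple[List[str], List[Tuple[int, int]]]:
        # Same token pattern as the extractor: word runs or single punctuation marks
        matches = list(re.finditer(r'\w+|[^\w\s]', text))
        return [m.group() for m in matches], [m.span() for m in matches]

    def closest_substring(text: str, pattern: str, buffer_size: float = 0.2
                          ) -> Tuple[Optional[Tuple[int, int]], float]:
        text_tokens, text_spans = word_tokens_with_spans(text)
        pattern_tokens, _ = word_tokens_with_spans(pattern)
        if not pattern_tokens:
            return None, 0.0
        pattern_set = set(pattern_tokens)
        n = len(pattern_tokens)
        w_min, w_max = int(n * (1 - buffer_size)), int(n * (1 + buffer_size))
        best_span, best_score = None, 0.0
        for i in range(len(text_tokens) - w_max):
            for w in range(w_min, w_max):
                window = text_tokens[i:i + w]
                if window and window[0] == pattern_tokens[0]:   # anchor on the first token
                    union = set(window) | pattern_set
                    score = len(set(window) & pattern_set) / len(union) if union else 0.0
                    if score > best_score:
                        spans = text_spans[i:i + w]
                        best_span, best_score = (spans[0][0], spans[-1][1]), score
        return best_span, best_score

    # e.g. closest_substring("Pt reports severe chest pain today.", "severe chest pains")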
@@ -243,7 +246,7 @@ class FrameExtractor(Extractor):
             # Fuzzy match
             elif fuzzy_match:
                 closest_substring_span, best_score = self._get_closest_substring(text, entity, buffer_size=fuzzy_buffer_size)
-                if best_score >= fuzzy_score_cutoff:
+                if best_score >= fuzzy_score_cutoff and closest_substring_span:
                     entity_spans.append(closest_substring_span)
                     # Replace the found entity with spaces to avoid finding the same instance again
                     text = text[:closest_substring_span[0]] + ' ' * (closest_substring_span[1] - closest_substring_span[0]) + text[closest_substring_span[1]:]
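The span-blanking trick in the last context line above works because the matched characters are overwritten with an equal number of spaces: string length and all later offsets stay the same, while a repeated search for the same entity falls through to its next occurrence. A small standalone illustration (not the package API):

    text = "pain in the arm, then pain again"
    span = (0, 4)                                    # first occurrence of "pain"
    masked = text[:span[0]] + ' ' * (span[1] - span[0]) + text[span[1]:]
    assert len(masked) == len(text)                  # later offsets are unchanged
    assert masked.find("pain") == 22                 # next search hits the second occurrence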
@@ -432,7 +435,7 @@ class BasicFrameExtractor(FrameExtractor):
                 frame = LLMInformationExtractionFrame(frame_id=f"{i}",
                                                       start=start,
                                                       end=end,
-                                                      entity_text=
+                                                      entity_text=text[start:end],
                                                       attr={k: v for k, v in ent.items() if k != entity_key and v != ""})
                 frame_list.append(frame)
         return frame_list
@@ -731,12 +734,13 @@ class SentenceFrameExtractor(FrameExtractor):
             for ent, span in zip(entity_json, spans):
                 if span is not None:
                     start, end = span
+                    entity_text = sent['sentence_text'][start:end]
                     start += sent['sentence_start']
                     end += sent['sentence_start']
                     frame = LLMInformationExtractionFrame(frame_id=f"{len(frame_list)}",
                                                           start=start,
                                                           end=end,
-                                                          entity_text=
+                                                          entity_text=entity_text,
                                                           attr={k: v for k, v in ent.items() if k != entity_key and v != ""})
                     frame_list.append(frame)
         return frame_list
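The ordering in this hunk matters: entity_text is sliced from the sentence-local span first, and only then are start and end shifted by the sentence's offset into the full document, so the stored span still points at the same characters in the original text. A minimal sketch of that bookkeeping, with illustrative data only:

    doc = "Header line. The patient denies chest pain."
    sent = {"sentence_text": "The patient denies chest pain.", "sentence_start": 13}

    start, end = 19, 29                          # span of "chest pain" within the sentence
    entity_text = sent["sentence_text"][start:end]   # slice before shifting offsets
    start += sent["sentence_start"]
    end += sent["sentence_start"]

    assert entity_text == "chest pain"
    assert doc[start:end] == entity_text         # document-level span covers the same text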
llm_ie-0.3.2.dist-info/RECORD → llm_ie-0.3.3.dist-info/RECORD
RENAMED
@@ -16,8 +16,8 @@ llm_ie/asset/prompt_guide/SentenceFrameExtractor_prompt_guide.txt,sha256=oKH_QeD
 llm_ie/asset/prompt_guide/SentenceReviewFrameExtractor_prompt_guide.txt,sha256=oKH_QeDgpw771ZdHk3L7DYz2Jvfm7OolUoTiJyMJI30,9541
 llm_ie/data_types.py,sha256=hPz3WOeAzfn2QKmb0CxHmRdQWZQ4G9zq8U-RJBVFdYk,14329
 llm_ie/engines.py,sha256=PTYs7s_iCPmI-yFUCVCPY_cMGS77ma2VGoz4rdNkODI,9308
-llm_ie/extractors.py,sha256=
+llm_ie/extractors.py,sha256=yBdIcevjMfwto85Jb0KkRMN-AjIMk92fD5yWB3Qm8MY,64408
 llm_ie/prompt_editor.py,sha256=Xc5ZHsEnM8-YYITokIsM6BVsf2Ec_8ajJDaldPf-P8U,8577
-llm_ie-0.3.
-llm_ie-0.3.
-llm_ie-0.3.
+llm_ie-0.3.3.dist-info/METADATA,sha256=CeTsMNtWhEWCvOqHWSXu0KqOgDp3kMwN2WtBF4N-4zE,41266
+llm_ie-0.3.3.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+llm_ie-0.3.3.dist-info/RECORD,,
File without changes