llm-ie 0.3.3__py3-none-any.whl → 0.3.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
llm_ie/extractors.py
CHANGED
|
@@ -181,19 +181,22 @@ class FrameExtractor(Extractor):
|
|
|
181
181
|
Returns : Tuple[Tuple[int, int], float]
|
|
182
182
|
a tuple of 2-tuple span and Jaccard score.
|
|
183
183
|
"""
|
|
184
|
+
if not text or not pattern:
|
|
185
|
+
return None, 0
|
|
186
|
+
|
|
184
187
|
text_tokens, text_spans = self._get_word_tokens(text)
|
|
185
188
|
pattern_tokens, _ = self._get_word_tokens(pattern)
|
|
186
189
|
pattern_tokens_set = set(pattern_tokens)
|
|
187
190
|
window_size = len(pattern_tokens)
|
|
188
|
-
window_size_min = int(window_size * (1 - buffer_size))
|
|
189
|
-
window_size_max = int(window_size * (1 + buffer_size))
|
|
191
|
+
window_size_min = max(1, int(window_size * (1 - buffer_size)))
|
|
192
|
+
window_size_max = int(window_size * (1 + buffer_size)) + 1
|
|
190
193
|
closest_substring_span = None
|
|
191
194
|
best_score = 0
|
|
192
195
|
|
|
193
196
|
for i in range(len(text_tokens) - window_size_max):
|
|
194
197
|
for w in range(window_size_min, window_size_max):
|
|
195
198
|
sub_str_tokens = text_tokens[i:i + w]
|
|
196
|
-
if sub_str_tokens[0] == pattern_tokens[0]:
|
|
199
|
+
if len(sub_str_tokens) > 0 and sub_str_tokens[0] == pattern_tokens[0]:
|
|
197
200
|
score = self._jaccard_score(set(sub_str_tokens), pattern_tokens_set)
|
|
198
201
|
if score > best_score:
|
|
199
202
|
best_score = score
|
|
@@ -238,7 +241,7 @@ class FrameExtractor(Extractor):
|
|
|
238
241
|
|
|
239
242
|
# Exact match
|
|
240
243
|
match = re.search(re.escape(entity), text)
|
|
241
|
-
if match:
|
|
244
|
+
if match and entity:
|
|
242
245
|
start, end = match.span()
|
|
243
246
|
entity_spans.append((start, end))
|
|
244
247
|
# Replace the found entity with spaces to avoid finding the same instance again
|
|
@@ -246,7 +249,7 @@ class FrameExtractor(Extractor):
|
|
|
246
249
|
# Fuzzy match
|
|
247
250
|
elif fuzzy_match:
|
|
248
251
|
closest_substring_span, best_score = self._get_closest_substring(text, entity, buffer_size=fuzzy_buffer_size)
|
|
249
|
-
if best_score >= fuzzy_score_cutoff:
|
|
252
|
+
if closest_substring_span and best_score >= fuzzy_score_cutoff:
|
|
250
253
|
entity_spans.append(closest_substring_span)
|
|
251
254
|
# Replace the found entity with spaces to avoid finding the same instance again
|
|
252
255
|
text = text[:closest_substring_span[0]] + ' ' * (closest_substring_span[1] - closest_substring_span[0]) + text[closest_substring_span[1]:]
|
|
@@ -16,8 +16,8 @@ llm_ie/asset/prompt_guide/SentenceFrameExtractor_prompt_guide.txt,sha256=oKH_QeD
|
|
|
16
16
|
llm_ie/asset/prompt_guide/SentenceReviewFrameExtractor_prompt_guide.txt,sha256=oKH_QeDgpw771ZdHk3L7DYz2Jvfm7OolUoTiJyMJI30,9541
|
|
17
17
|
llm_ie/data_types.py,sha256=hPz3WOeAzfn2QKmb0CxHmRdQWZQ4G9zq8U-RJBVFdYk,14329
|
|
18
18
|
llm_ie/engines.py,sha256=PTYs7s_iCPmI-yFUCVCPY_cMGS77ma2VGoz4rdNkODI,9308
|
|
19
|
-
llm_ie/extractors.py,sha256=
|
|
19
|
+
llm_ie/extractors.py,sha256=j9L9USybJBmYZM4RAjGO6DR4StYBgzhqCN6nEFZZQVQ,64523
|
|
20
20
|
llm_ie/prompt_editor.py,sha256=Xc5ZHsEnM8-YYITokIsM6BVsf2Ec_8ajJDaldPf-P8U,8577
|
|
21
|
-
llm_ie-0.3.
|
|
22
|
-
llm_ie-0.3.
|
|
23
|
-
llm_ie-0.3.
|
|
21
|
+
llm_ie-0.3.4.dist-info/METADATA,sha256=-5Tf9TCWczCVOsdMavkBZ-KnYPGnbNrV1rsU-pMHfPA,41266
|
|
22
|
+
llm_ie-0.3.4.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
23
|
+
llm_ie-0.3.4.dist-info/RECORD,,
|
|
File without changes
|