llm-ie 0.3.3__py3-none-any.whl → 0.3.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
llm_ie/extractors.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import abc
|
|
2
2
|
import re
|
|
3
3
|
import json
|
|
4
|
+
import json_repair
|
|
4
5
|
import inspect
|
|
5
6
|
import importlib.resources
|
|
6
7
|
import warnings
|
|
@@ -117,7 +118,12 @@ class Extractor:
|
|
|
117
118
|
dict_obj = json.loads(dict_str)
|
|
118
119
|
out.append(dict_obj)
|
|
119
120
|
except json.JSONDecodeError:
|
|
120
|
-
|
|
121
|
+
dict_obj = json_repair.repair_json(dict_str, skip_json_loads=True, return_objects=True)
|
|
122
|
+
if dict_obj:
|
|
123
|
+
warnings.warn(f'JSONDecodeError detected, fixed with repair_json:\n{dict_str}', RuntimeWarning)
|
|
124
|
+
out.append(dict_obj)
|
|
125
|
+
else:
|
|
126
|
+
warnings.warn(f'JSONDecodeError could not be fixed:\n{dict_str}', RuntimeWarning)
|
|
121
127
|
return out
|
|
122
128
|
|
|
123
129
|
|
|
@@ -181,19 +187,22 @@ class FrameExtractor(Extractor):
|
|
|
181
187
|
Returns : Tuple[Tuple[int, int], float]
|
|
182
188
|
a tuple of 2-tuple span and Jaccard score.
|
|
183
189
|
"""
|
|
190
|
+
if not text or not pattern:
|
|
191
|
+
return None, 0
|
|
192
|
+
|
|
184
193
|
text_tokens, text_spans = self._get_word_tokens(text)
|
|
185
194
|
pattern_tokens, _ = self._get_word_tokens(pattern)
|
|
186
195
|
pattern_tokens_set = set(pattern_tokens)
|
|
187
196
|
window_size = len(pattern_tokens)
|
|
188
|
-
window_size_min = int(window_size * (1 - buffer_size))
|
|
189
|
-
window_size_max = int(window_size * (1 + buffer_size))
|
|
197
|
+
window_size_min = max(1, int(window_size * (1 - buffer_size)))
|
|
198
|
+
window_size_max = int(window_size * (1 + buffer_size)) + 1
|
|
190
199
|
closest_substring_span = None
|
|
191
200
|
best_score = 0
|
|
192
201
|
|
|
193
202
|
for i in range(len(text_tokens) - window_size_max):
|
|
194
203
|
for w in range(window_size_min, window_size_max):
|
|
195
204
|
sub_str_tokens = text_tokens[i:i + w]
|
|
196
|
-
if sub_str_tokens[0] == pattern_tokens[0]:
|
|
205
|
+
if len(sub_str_tokens) > 0 and sub_str_tokens[0] == pattern_tokens[0]:
|
|
197
206
|
score = self._jaccard_score(set(sub_str_tokens), pattern_tokens_set)
|
|
198
207
|
if score > best_score:
|
|
199
208
|
best_score = score
|
|
@@ -238,7 +247,7 @@ class FrameExtractor(Extractor):
|
|
|
238
247
|
|
|
239
248
|
# Exact match
|
|
240
249
|
match = re.search(re.escape(entity), text)
|
|
241
|
-
if match:
|
|
250
|
+
if match and entity:
|
|
242
251
|
start, end = match.span()
|
|
243
252
|
entity_spans.append((start, end))
|
|
244
253
|
# Replace the found entity with spaces to avoid finding the same instance again
|
|
@@ -246,7 +255,7 @@ class FrameExtractor(Extractor):
|
|
|
246
255
|
# Fuzzy match
|
|
247
256
|
elif fuzzy_match:
|
|
248
257
|
closest_substring_span, best_score = self._get_closest_substring(text, entity, buffer_size=fuzzy_buffer_size)
|
|
249
|
-
if best_score >= fuzzy_score_cutoff:
|
|
258
|
+
if closest_substring_span and best_score >= fuzzy_score_cutoff:
|
|
250
259
|
entity_spans.append(closest_substring_span)
|
|
251
260
|
# Replace the found entity with spaces to avoid finding the same instance again
|
|
252
261
|
text = text[:closest_substring_span[0]] + ' ' * (closest_substring_span[1] - closest_substring_span[0]) + text[closest_substring_span[1]:]
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: llm-ie
|
|
3
|
-
Version: 0.3.3
|
|
3
|
+
Version: 0.3.5
|
|
4
4
|
Summary: An LLM-powered tool that transforms everyday language into robust information extraction pipelines.
|
|
5
5
|
License: MIT
|
|
6
6
|
Author: Enshuo (David) Hsu
|
|
@@ -10,6 +10,7 @@ Classifier: Programming Language :: Python :: 3
|
|
|
10
10
|
Classifier: Programming Language :: Python :: 3.11
|
|
11
11
|
Classifier: Programming Language :: Python :: 3.12
|
|
12
12
|
Requires-Dist: colorama (>=0.4.6,<0.5.0)
|
|
13
|
+
Requires-Dist: json_repair (>=0.30,<0.31)
|
|
13
14
|
Requires-Dist: nltk (>=3.8,<4.0)
|
|
14
15
|
Description-Content-Type: text/markdown
|
|
15
16
|
|
|
@@ -16,8 +16,8 @@ llm_ie/asset/prompt_guide/SentenceFrameExtractor_prompt_guide.txt,sha256=oKH_QeD
|
|
|
16
16
|
llm_ie/asset/prompt_guide/SentenceReviewFrameExtractor_prompt_guide.txt,sha256=oKH_QeDgpw771ZdHk3L7DYz2Jvfm7OolUoTiJyMJI30,9541
|
|
17
17
|
llm_ie/data_types.py,sha256=hPz3WOeAzfn2QKmb0CxHmRdQWZQ4G9zq8U-RJBVFdYk,14329
|
|
18
18
|
llm_ie/engines.py,sha256=PTYs7s_iCPmI-yFUCVCPY_cMGS77ma2VGoz4rdNkODI,9308
|
|
19
|
-
llm_ie/extractors.py,sha256=
|
|
19
|
+
llm_ie/extractors.py,sha256=WM-9ZmhGYCYzXLPiShfF42grezjRz42JbgXwueixZRI,64870
|
|
20
20
|
llm_ie/prompt_editor.py,sha256=Xc5ZHsEnM8-YYITokIsM6BVsf2Ec_8ajJDaldPf-P8U,8577
|
|
21
|
-
llm_ie-0.3.
|
|
22
|
-
llm_ie-0.3.
|
|
23
|
-
llm_ie-0.3.
|
|
21
|
+
llm_ie-0.3.5.dist-info/METADATA,sha256=y5nABgPeeMPEkZ58WoOBR3sgFrKOE_mF4fqaHK59K1w,41308
|
|
22
|
+
llm_ie-0.3.5.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
23
|
+
llm_ie-0.3.5.dist-info/RECORD,,
|
|
File without changes
|