llm-ie 0.3.3__py3-none-any.whl → 0.3.5__py3-none-any.whl

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
llm_ie/extractors.py CHANGED
@@ -1,6 +1,7 @@
1
1
  import abc
2
2
  import re
3
3
  import json
4
+ import json_repair
4
5
  import inspect
5
6
  import importlib.resources
6
7
  import warnings
@@ -117,7 +118,12 @@ class Extractor:
117
118
  dict_obj = json.loads(dict_str)
118
119
  out.append(dict_obj)
119
120
  except json.JSONDecodeError:
120
- warnings.warn(f'Post-processing failed:\n{dict_str}', RuntimeWarning)
121
+ dict_obj = json_repair.repair_json(dict_str, skip_json_loads=True, return_objects=True)
122
+ if dict_obj:
123
+ warnings.warn(f'JSONDecodeError detected, fixed with repair_json:\n{dict_str}', RuntimeWarning)
124
+ out.append(dict_obj)
125
+ else:
126
+ warnings.warn(f'JSONDecodeError could not be fixed:\n{dict_str}', RuntimeWarning)
121
127
  return out
122
128
 
123
129
 
@@ -181,19 +187,22 @@ class FrameExtractor(Extractor):
181
187
  Returns : Tuple[Tuple[int, int], float]
182
188
  a tuple of 2-tuple span and Jaccard score.
183
189
  """
190
+ if not text or not pattern:
191
+ return None, 0
192
+
184
193
  text_tokens, text_spans = self._get_word_tokens(text)
185
194
  pattern_tokens, _ = self._get_word_tokens(pattern)
186
195
  pattern_tokens_set = set(pattern_tokens)
187
196
  window_size = len(pattern_tokens)
188
- window_size_min = int(window_size * (1 - buffer_size))
189
- window_size_max = int(window_size * (1 + buffer_size))
197
+ window_size_min = max(1, int(window_size * (1 - buffer_size)))
198
+ window_size_max = int(window_size * (1 + buffer_size)) + 1
190
199
  closest_substring_span = None
191
200
  best_score = 0
192
201
 
193
202
  for i in range(len(text_tokens) - window_size_max):
194
203
  for w in range(window_size_min, window_size_max):
195
204
  sub_str_tokens = text_tokens[i:i + w]
196
- if sub_str_tokens[0] == pattern_tokens[0]:
205
+ if len(sub_str_tokens) > 0 and sub_str_tokens[0] == pattern_tokens[0]:
197
206
  score = self._jaccard_score(set(sub_str_tokens), pattern_tokens_set)
198
207
  if score > best_score:
199
208
  best_score = score
@@ -238,7 +247,7 @@ class FrameExtractor(Extractor):
238
247
 
239
248
  # Exact match
240
249
  match = re.search(re.escape(entity), text)
241
- if match:
250
+ if match and entity:
242
251
  start, end = match.span()
243
252
  entity_spans.append((start, end))
244
253
  # Replace the found entity with spaces to avoid finding the same instance again
@@ -246,7 +255,7 @@ class FrameExtractor(Extractor):
246
255
  # Fuzzy match
247
256
  elif fuzzy_match:
248
257
  closest_substring_span, best_score = self._get_closest_substring(text, entity, buffer_size=fuzzy_buffer_size)
249
- if best_score >= fuzzy_score_cutoff and closest_substring_span:
258
+ if closest_substring_span and best_score >= fuzzy_score_cutoff:
250
259
  entity_spans.append(closest_substring_span)
251
260
  # Replace the found entity with spaces to avoid finding the same instance again
252
261
  text = text[:closest_substring_span[0]] + ' ' * (closest_substring_span[1] - closest_substring_span[0]) + text[closest_substring_span[1]:]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: llm-ie
3
- Version: 0.3.3
3
+ Version: 0.3.5
4
4
  Summary: An LLM-powered tool that transforms everyday language into robust information extraction pipelines.
5
5
  License: MIT
6
6
  Author: Enshuo (David) Hsu
@@ -10,6 +10,7 @@ Classifier: Programming Language :: Python :: 3
10
10
  Classifier: Programming Language :: Python :: 3.11
11
11
  Classifier: Programming Language :: Python :: 3.12
12
12
  Requires-Dist: colorama (>=0.4.6,<0.5.0)
13
+ Requires-Dist: json_repair (>=0.30,<0.31)
13
14
  Requires-Dist: nltk (>=3.8,<4.0)
14
15
  Description-Content-Type: text/markdown
15
16
 
@@ -16,8 +16,8 @@ llm_ie/asset/prompt_guide/SentenceFrameExtractor_prompt_guide.txt,sha256=oKH_QeD
16
16
  llm_ie/asset/prompt_guide/SentenceReviewFrameExtractor_prompt_guide.txt,sha256=oKH_QeDgpw771ZdHk3L7DYz2Jvfm7OolUoTiJyMJI30,9541
17
17
  llm_ie/data_types.py,sha256=hPz3WOeAzfn2QKmb0CxHmRdQWZQ4G9zq8U-RJBVFdYk,14329
18
18
  llm_ie/engines.py,sha256=PTYs7s_iCPmI-yFUCVCPY_cMGS77ma2VGoz4rdNkODI,9308
19
- llm_ie/extractors.py,sha256=yBdIcevjMfwto85Jb0KkRMN-AjIMk92fD5yWB3Qm8MY,64408
19
+ llm_ie/extractors.py,sha256=WM-9ZmhGYCYzXLPiShfF42grezjRz42JbgXwueixZRI,64870
20
20
  llm_ie/prompt_editor.py,sha256=Xc5ZHsEnM8-YYITokIsM6BVsf2Ec_8ajJDaldPf-P8U,8577
21
- llm_ie-0.3.3.dist-info/METADATA,sha256=CeTsMNtWhEWCvOqHWSXu0KqOgDp3kMwN2WtBF4N-4zE,41266
22
- llm_ie-0.3.3.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
23
- llm_ie-0.3.3.dist-info/RECORD,,
21
+ llm_ie-0.3.5.dist-info/METADATA,sha256=y5nABgPeeMPEkZ58WoOBR3sgFrKOE_mF4fqaHK59K1w,41308
22
+ llm_ie-0.3.5.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
23
+ llm_ie-0.3.5.dist-info/RECORD,,
File without changes