llm-ie 0.3.2__py3-none-any.whl → 0.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
llm_ie/extractors.py CHANGED
@@ -143,7 +143,7 @@ class FrameExtractor(Extractor):
143
143
  self.tokenizer = RegexpTokenizer(r'\w+|[^\w\s]')
144
144
 
145
145
 
146
- def _jaccard_score(self, s1:set, s2:set) -> float:
146
+ def _jaccard_score(self, s1:Set[str], s2:Set[str]) -> float:
147
147
  """
148
148
  This method calculates the Jaccard score between two sets of word tokens.
149
149
  """
@@ -166,6 +166,8 @@ class FrameExtractor(Extractor):
166
166
  def _get_closest_substring(self, text:str, pattern:str, buffer_size:float=0.2) -> Tuple[Tuple[int, int], float]:
167
167
  """
168
168
  This method finds the closest (highest Jaccard score) substring in text that matches the pattern.
169
+ The substring must start with the same word token as the pattern. This is due to the observation that
170
+ LLMs often generate the first few words consistently.
169
171
 
170
172
  Parameters
171
173
  ----------
@@ -179,25 +181,29 @@ class FrameExtractor(Extractor):
179
181
  Returns : Tuple[Tuple[int, int], float]
180
182
  a tuple of 2-tuple span and Jaccard score.
181
183
  """
184
+ if not text or not pattern:
185
+ return None, 0
186
+
182
187
  text_tokens, text_spans = self._get_word_tokens(text)
183
188
  pattern_tokens, _ = self._get_word_tokens(pattern)
184
- pattern_tokens = set(pattern_tokens)
189
+ pattern_tokens_set = set(pattern_tokens)
185
190
  window_size = len(pattern_tokens)
186
- window_size_min = int(window_size * (1 - buffer_size))
187
- window_size_max = int(window_size * (1 + buffer_size))
188
- closest_substring_spans = None
191
+ window_size_min = max(1, int(window_size * (1 - buffer_size)))
192
+ window_size_max = int(window_size * (1 + buffer_size)) + 1
193
+ closest_substring_span = None
189
194
  best_score = 0
190
195
 
191
196
  for i in range(len(text_tokens) - window_size_max):
192
197
  for w in range(window_size_min, window_size_max):
193
- sub_str_tokens = set(text_tokens[i:i + w])
194
- score = self._jaccard_score(sub_str_tokens, pattern_tokens)
195
- if score > best_score:
196
- best_score = score
197
- sub_string_word_spans = text_spans[i:i + w]
198
- closest_substring_spans = (sub_string_word_spans[0][0], sub_string_word_spans[-1][-1])
198
+ sub_str_tokens = text_tokens[i:i + w]
199
+ if len(sub_str_tokens) > 0 and sub_str_tokens[0] == pattern_tokens[0]:
200
+ score = self._jaccard_score(set(sub_str_tokens), pattern_tokens_set)
201
+ if score > best_score:
202
+ best_score = score
203
+ sub_string_word_spans = text_spans[i:i + w]
204
+ closest_substring_span = (sub_string_word_spans[0][0], sub_string_word_spans[-1][-1])
199
205
 
200
- return closest_substring_spans, best_score
206
+ return closest_substring_span, best_score
201
207
 
202
208
 
203
209
  def _find_entity_spans(self, text: str, entities: List[str], case_sensitive:bool=False,
@@ -235,7 +241,7 @@ class FrameExtractor(Extractor):
235
241
 
236
242
  # Exact match
237
243
  match = re.search(re.escape(entity), text)
238
- if match:
244
+ if match and entity:
239
245
  start, end = match.span()
240
246
  entity_spans.append((start, end))
241
247
  # Replace the found entity with spaces to avoid finding the same instance again
@@ -243,7 +249,7 @@ class FrameExtractor(Extractor):
243
249
  # Fuzzy match
244
250
  elif fuzzy_match:
245
251
  closest_substring_span, best_score = self._get_closest_substring(text, entity, buffer_size=fuzzy_buffer_size)
246
- if best_score >= fuzzy_score_cutoff:
252
+ if closest_substring_span and best_score >= fuzzy_score_cutoff:
247
253
  entity_spans.append(closest_substring_span)
248
254
  # Replace the found entity with spaces to avoid finding the same instance again
249
255
  text = text[:closest_substring_span[0]] + ' ' * (closest_substring_span[1] - closest_substring_span[0]) + text[closest_substring_span[1]:]
@@ -432,7 +438,7 @@ class BasicFrameExtractor(FrameExtractor):
432
438
  frame = LLMInformationExtractionFrame(frame_id=f"{i}",
433
439
  start=start,
434
440
  end=end,
435
- entity_text=ent[entity_key],
441
+ entity_text=text[start:end],
436
442
  attr={k: v for k, v in ent.items() if k != entity_key and v != ""})
437
443
  frame_list.append(frame)
438
444
  return frame_list
@@ -731,12 +737,13 @@ class SentenceFrameExtractor(FrameExtractor):
731
737
  for ent, span in zip(entity_json, spans):
732
738
  if span is not None:
733
739
  start, end = span
740
+ entity_text = sent['sentence_text'][start:end]
734
741
  start += sent['sentence_start']
735
742
  end += sent['sentence_start']
736
743
  frame = LLMInformationExtractionFrame(frame_id=f"{len(frame_list)}",
737
744
  start=start,
738
745
  end=end,
739
- entity_text=ent[entity_key],
746
+ entity_text=entity_text,
740
747
  attr={k: v for k, v in ent.items() if k != entity_key and v != ""})
741
748
  frame_list.append(frame)
742
749
  return frame_list
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: llm-ie
3
- Version: 0.3.2
3
+ Version: 0.3.4
4
4
  Summary: An LLM-powered tool that transforms everyday language into robust information extraction pipelines.
5
5
  License: MIT
6
6
  Author: Enshuo (David) Hsu
@@ -16,8 +16,8 @@ llm_ie/asset/prompt_guide/SentenceFrameExtractor_prompt_guide.txt,sha256=oKH_QeD
16
16
  llm_ie/asset/prompt_guide/SentenceReviewFrameExtractor_prompt_guide.txt,sha256=oKH_QeDgpw771ZdHk3L7DYz2Jvfm7OolUoTiJyMJI30,9541
17
17
  llm_ie/data_types.py,sha256=hPz3WOeAzfn2QKmb0CxHmRdQWZQ4G9zq8U-RJBVFdYk,14329
18
18
  llm_ie/engines.py,sha256=PTYs7s_iCPmI-yFUCVCPY_cMGS77ma2VGoz4rdNkODI,9308
19
- llm_ie/extractors.py,sha256=bZQh_qZP1yIwNLXIx1ZzVGN702q3qzrlwiTcHuMsJt0,64051
19
+ llm_ie/extractors.py,sha256=j9L9USybJBmYZM4RAjGO6DR4StYBgzhqCN6nEFZZQVQ,64523
20
20
  llm_ie/prompt_editor.py,sha256=Xc5ZHsEnM8-YYITokIsM6BVsf2Ec_8ajJDaldPf-P8U,8577
21
- llm_ie-0.3.2.dist-info/METADATA,sha256=Lpzaq0n-kpfiIpzic3qUQncVDoCpfw6CGmxc5NftXro,41266
22
- llm_ie-0.3.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
23
- llm_ie-0.3.2.dist-info/RECORD,,
21
+ llm_ie-0.3.4.dist-info/METADATA,sha256=-5Tf9TCWczCVOsdMavkBZ-KnYPGnbNrV1rsU-pMHfPA,41266
22
+ llm_ie-0.3.4.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
23
+ llm_ie-0.3.4.dist-info/RECORD,,
File without changes