llm-ie 0.3.2__tar.gz → 0.3.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (23)
  1. {llm_ie-0.3.2 → llm_ie-0.3.3}/PKG-INFO +1 -1
  2. {llm_ie-0.3.2 → llm_ie-0.3.3}/pyproject.toml +1 -1
  3. {llm_ie-0.3.2 → llm_ie-0.3.3}/src/llm_ie/extractors.py +17 -13
  4. {llm_ie-0.3.2 → llm_ie-0.3.3}/README.md +0 -0
  5. {llm_ie-0.3.2 → llm_ie-0.3.3}/src/llm_ie/__init__.py +0 -0
  6. {llm_ie-0.3.2 → llm_ie-0.3.3}/src/llm_ie/asset/PromptEditor_prompts/chat.txt +0 -0
  7. {llm_ie-0.3.2 → llm_ie-0.3.3}/src/llm_ie/asset/PromptEditor_prompts/comment.txt +0 -0
  8. {llm_ie-0.3.2 → llm_ie-0.3.3}/src/llm_ie/asset/PromptEditor_prompts/rewrite.txt +0 -0
  9. {llm_ie-0.3.2 → llm_ie-0.3.3}/src/llm_ie/asset/PromptEditor_prompts/system.txt +0 -0
  10. {llm_ie-0.3.2 → llm_ie-0.3.3}/src/llm_ie/asset/default_prompts/ReviewFrameExtractor_addition_review_prompt.txt +0 -0
  11. {llm_ie-0.3.2 → llm_ie-0.3.3}/src/llm_ie/asset/default_prompts/ReviewFrameExtractor_revision_review_prompt.txt +0 -0
  12. {llm_ie-0.3.2 → llm_ie-0.3.3}/src/llm_ie/asset/default_prompts/SentenceReviewFrameExtractor_addition_review_prompt.txt +0 -0
  13. {llm_ie-0.3.2 → llm_ie-0.3.3}/src/llm_ie/asset/default_prompts/SentenceReviewFrameExtractor_revision_review_prompt.txt +0 -0
  14. {llm_ie-0.3.2 → llm_ie-0.3.3}/src/llm_ie/asset/prompt_guide/BasicFrameExtractor_prompt_guide.txt +0 -0
  15. {llm_ie-0.3.2 → llm_ie-0.3.3}/src/llm_ie/asset/prompt_guide/BinaryRelationExtractor_prompt_guide.txt +0 -0
  16. {llm_ie-0.3.2 → llm_ie-0.3.3}/src/llm_ie/asset/prompt_guide/MultiClassRelationExtractor_prompt_guide.txt +0 -0
  17. {llm_ie-0.3.2 → llm_ie-0.3.3}/src/llm_ie/asset/prompt_guide/ReviewFrameExtractor_prompt_guide.txt +0 -0
  18. {llm_ie-0.3.2 → llm_ie-0.3.3}/src/llm_ie/asset/prompt_guide/SentenceCoTFrameExtractor_prompt_guide.txt +0 -0
  19. {llm_ie-0.3.2 → llm_ie-0.3.3}/src/llm_ie/asset/prompt_guide/SentenceFrameExtractor_prompt_guide.txt +0 -0
  20. {llm_ie-0.3.2 → llm_ie-0.3.3}/src/llm_ie/asset/prompt_guide/SentenceReviewFrameExtractor_prompt_guide.txt +0 -0
  21. {llm_ie-0.3.2 → llm_ie-0.3.3}/src/llm_ie/data_types.py +0 -0
  22. {llm_ie-0.3.2 → llm_ie-0.3.3}/src/llm_ie/engines.py +0 -0
  23. {llm_ie-0.3.2 → llm_ie-0.3.3}/src/llm_ie/prompt_editor.py +0 -0
--- llm_ie-0.3.2/PKG-INFO
+++ llm_ie-0.3.3/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: llm-ie
-Version: 0.3.2
+Version: 0.3.3
 Summary: An LLM-powered tool that transforms everyday language into robust information extraction pipelines.
 License: MIT
 Author: Enshuo (David) Hsu
--- llm_ie-0.3.2/pyproject.toml
+++ llm_ie-0.3.3/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "llm-ie"
-version = "0.3.2"
+version = "0.3.3"
 description = "An LLM-powered tool that transforms everyday language into robust information extraction pipelines."
 authors = ["Enshuo (David) Hsu"]
 license = "MIT"
--- llm_ie-0.3.2/src/llm_ie/extractors.py
+++ llm_ie-0.3.3/src/llm_ie/extractors.py
@@ -143,7 +143,7 @@ class FrameExtractor(Extractor):
         self.tokenizer = RegexpTokenizer(r'\w+|[^\w\s]')


-    def _jaccard_score(self, s1:set, s2:set) -> float:
+    def _jaccard_score(self, s1:Set[str], s2:Set[str]) -> float:
         """
         This method calculates the Jaccard score between two sets of word tokens.
         """
@@ -166,6 +166,8 @@ class FrameExtractor(Extractor):
     def _get_closest_substring(self, text:str, pattern:str, buffer_size:float=0.2) -> Tuple[Tuple[int, int], float]:
         """
         This method finds the closest (highest Jaccard score) substring in text that matches the pattern.
+        The substring must start with the same word token as the pattern. This is based on the observation that
+        LLMs often generate the first few words consistently.

         Parameters
         ----------
@@ -181,23 +183,24 @@ class FrameExtractor(Extractor):
         """
         text_tokens, text_spans = self._get_word_tokens(text)
         pattern_tokens, _ = self._get_word_tokens(pattern)
-        pattern_tokens = set(pattern_tokens)
+        pattern_tokens_set = set(pattern_tokens)
         window_size = len(pattern_tokens)
         window_size_min = int(window_size * (1 - buffer_size))
         window_size_max = int(window_size * (1 + buffer_size))
-        closest_substring_spans = None
+        closest_substring_span = None
         best_score = 0

         for i in range(len(text_tokens) - window_size_max):
             for w in range(window_size_min, window_size_max):
-                sub_str_tokens = set(text_tokens[i:i + w])
-                score = self._jaccard_score(sub_str_tokens, pattern_tokens)
-                if score > best_score:
-                    best_score = score
-                    sub_string_word_spans = text_spans[i:i + w]
-                    closest_substring_spans = (sub_string_word_spans[0][0], sub_string_word_spans[-1][-1])
+                sub_str_tokens = text_tokens[i:i + w]
+                if sub_str_tokens[0] == pattern_tokens[0]:
+                    score = self._jaccard_score(set(sub_str_tokens), pattern_tokens_set)
+                    if score > best_score:
+                        best_score = score
+                        sub_string_word_spans = text_spans[i:i + w]
+                        closest_substring_span = (sub_string_word_spans[0][0], sub_string_word_spans[-1][-1])

-        return closest_substring_spans, best_score
+        return closest_substring_span, best_score


     def _find_entity_spans(self, text: str, entities: List[str], case_sensitive:bool=False,
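
To make the revised search easier to follow, below is a hedged, self-contained sketch of the same windowed fuzzy match. Everything here (the function names, the regex tokenizer standing in for RegexpTokenizer, the inclusive window upper bound) is illustrative rather than the package's exact code:

import re
from typing import List, Optional, Tuple

def word_tokens(text: str) -> Tuple[List[str], List[Tuple[int, int]]]:
    # Tokens plus their (start, end) character spans, mirroring r'\w+|[^\w\s]'.
    spans = [(m.start(), m.end()) for m in re.finditer(r'\w+|[^\w\s]', text)]
    return [text[s:e] for s, e in spans], spans

def jaccard(s1: set, s2: set) -> float:
    union = s1 | s2
    return len(s1 & s2) / len(union) if union else 0.0

def closest_substring(text: str, pattern: str, buffer_size: float = 0.2
                      ) -> Tuple[Optional[Tuple[int, int]], float]:
    text_tokens, text_spans = word_tokens(text)
    pattern_tokens, _ = word_tokens(pattern)
    pattern_set = set(pattern_tokens)
    window = len(pattern_tokens)
    w_min = int(window * (1 - buffer_size))
    w_max = int(window * (1 + buffer_size))
    best_span, best_score = None, 0.0
    for i in range(len(text_tokens) - w_max):
        # Anchor on the pattern's first token, as the 0.3.3 change does.
        if text_tokens[i] != pattern_tokens[0]:
            continue
        for w in range(w_min, w_max + 1):  # inclusive upper bound, a sketch choice
            score = jaccard(set(text_tokens[i:i + w]), pattern_set)
            if score > best_score:
                best_score = score
                best_span = (text_spans[i][0], text_spans[i + w - 1][1])
    return best_span, best_score

span, score = closest_substring("Patient has type 2 diabetes mellitus.", "type 2 diabetes")
print(span, score)  # (12, 27) 1.0 -> the span covering "type 2 diabetes"
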
@@ -243,7 +246,7 @@ class FrameExtractor(Extractor):
             # Fuzzy match
             elif fuzzy_match:
                 closest_substring_span, best_score = self._get_closest_substring(text, entity, buffer_size=fuzzy_buffer_size)
-                if best_score >= fuzzy_score_cutoff:
+                if best_score >= fuzzy_score_cutoff and closest_substring_span:
                     entity_spans.append(closest_substring_span)
                     # Replace the found entity with spaces to avoid finding the same instance again
                     text = text[:closest_substring_span[0]] + ' ' * (closest_substring_span[1] - closest_substring_span[0]) + text[closest_substring_span[1]:]
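
A brief note on the added "and closest_substring_span" guard, with a toy illustration (the variable values below are made up): with the first-token anchoring above, _get_closest_substring can return (None, 0) when no candidate window starts with the pattern's first token, so a cutoff check alone would go on to index None.

closest_substring_span, best_score = None, 0   # what the helper returns when nothing anchors
fuzzy_score_cutoff = 0.0
if best_score >= fuzzy_score_cutoff and closest_substring_span:
    print(closest_substring_span[0])           # skipped, so no TypeError from indexing None
else:
    print("no fuzzy match for this entity")
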
@@ -432,7 +435,7 @@ class BasicFrameExtractor(FrameExtractor):
                 frame = LLMInformationExtractionFrame(frame_id=f"{i}",
                                                       start=start,
                                                       end=end,
-                                                      entity_text=ent[entity_key],
+                                                      entity_text=text[start:end],
                                                       attr={k: v for k, v in ent.items() if k != entity_key and v != ""})
                 frame_list.append(frame)
         return frame_list
@@ -731,12 +734,13 @@ class SentenceFrameExtractor(FrameExtractor):
             for ent, span in zip(entity_json, spans):
                 if span is not None:
                     start, end = span
+                    entity_text = sent['sentence_text'][start:end]
                     start += sent['sentence_start']
                     end += sent['sentence_start']
                     frame = LLMInformationExtractionFrame(frame_id=f"{len(frame_list)}",
                                                           start=start,
                                                           end=end,
-                                                          entity_text=ent[entity_key],
+                                                          entity_text=entity_text,
                                                           attr={k: v for k, v in ent.items() if k != entity_key and v != ""})
                     frame_list.append(frame)
         return frame_list
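
A hedged illustration of the bookkeeping in this hunk, using made-up sentence data: the entity text is sliced from the sentence while start and end are still sentence-local, and only then are the offsets shifted into document coordinates.

sent = {"sentence_text": "BP was 150/90 today.", "sentence_start": 100}
start, end = 7, 13                                  # sentence-local span of "150/90"
entity_text = sent["sentence_text"][start:end]      # slice before shifting -> "150/90"
start += sent["sentence_start"]                     # 107: document-level start
end += sent["sentence_start"]                       # 113: document-level end
print(entity_text, (start, end))                    # 150/90 (107, 113)
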