llm-ie 0.3.2__tar.gz → 0.3.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (23)
  1. {llm_ie-0.3.2 → llm_ie-0.3.3}/PKG-INFO +1 -1
  2. {llm_ie-0.3.2 → llm_ie-0.3.3}/pyproject.toml +1 -1
  3. {llm_ie-0.3.2 → llm_ie-0.3.3}/src/llm_ie/extractors.py +17 -13
  4. {llm_ie-0.3.2 → llm_ie-0.3.3}/README.md +0 -0
  5. {llm_ie-0.3.2 → llm_ie-0.3.3}/src/llm_ie/__init__.py +0 -0
  6. {llm_ie-0.3.2 → llm_ie-0.3.3}/src/llm_ie/asset/PromptEditor_prompts/chat.txt +0 -0
  7. {llm_ie-0.3.2 → llm_ie-0.3.3}/src/llm_ie/asset/PromptEditor_prompts/comment.txt +0 -0
  8. {llm_ie-0.3.2 → llm_ie-0.3.3}/src/llm_ie/asset/PromptEditor_prompts/rewrite.txt +0 -0
  9. {llm_ie-0.3.2 → llm_ie-0.3.3}/src/llm_ie/asset/PromptEditor_prompts/system.txt +0 -0
  10. {llm_ie-0.3.2 → llm_ie-0.3.3}/src/llm_ie/asset/default_prompts/ReviewFrameExtractor_addition_review_prompt.txt +0 -0
  11. {llm_ie-0.3.2 → llm_ie-0.3.3}/src/llm_ie/asset/default_prompts/ReviewFrameExtractor_revision_review_prompt.txt +0 -0
  12. {llm_ie-0.3.2 → llm_ie-0.3.3}/src/llm_ie/asset/default_prompts/SentenceReviewFrameExtractor_addition_review_prompt.txt +0 -0
  13. {llm_ie-0.3.2 → llm_ie-0.3.3}/src/llm_ie/asset/default_prompts/SentenceReviewFrameExtractor_revision_review_prompt.txt +0 -0
  14. {llm_ie-0.3.2 → llm_ie-0.3.3}/src/llm_ie/asset/prompt_guide/BasicFrameExtractor_prompt_guide.txt +0 -0
  15. {llm_ie-0.3.2 → llm_ie-0.3.3}/src/llm_ie/asset/prompt_guide/BinaryRelationExtractor_prompt_guide.txt +0 -0
  16. {llm_ie-0.3.2 → llm_ie-0.3.3}/src/llm_ie/asset/prompt_guide/MultiClassRelationExtractor_prompt_guide.txt +0 -0
  17. {llm_ie-0.3.2 → llm_ie-0.3.3}/src/llm_ie/asset/prompt_guide/ReviewFrameExtractor_prompt_guide.txt +0 -0
  18. {llm_ie-0.3.2 → llm_ie-0.3.3}/src/llm_ie/asset/prompt_guide/SentenceCoTFrameExtractor_prompt_guide.txt +0 -0
  19. {llm_ie-0.3.2 → llm_ie-0.3.3}/src/llm_ie/asset/prompt_guide/SentenceFrameExtractor_prompt_guide.txt +0 -0
  20. {llm_ie-0.3.2 → llm_ie-0.3.3}/src/llm_ie/asset/prompt_guide/SentenceReviewFrameExtractor_prompt_guide.txt +0 -0
  21. {llm_ie-0.3.2 → llm_ie-0.3.3}/src/llm_ie/data_types.py +0 -0
  22. {llm_ie-0.3.2 → llm_ie-0.3.3}/src/llm_ie/engines.py +0 -0
  23. {llm_ie-0.3.2 → llm_ie-0.3.3}/src/llm_ie/prompt_editor.py +0 -0
--- llm_ie-0.3.2/PKG-INFO
+++ llm_ie-0.3.3/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: llm-ie
-Version: 0.3.2
+Version: 0.3.3
 Summary: An LLM-powered tool that transforms everyday language into robust information extraction pipelines.
 License: MIT
 Author: Enshuo (David) Hsu
--- llm_ie-0.3.2/pyproject.toml
+++ llm_ie-0.3.3/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "llm-ie"
-version = "0.3.2"
+version = "0.3.3"
 description = "An LLM-powered tool that transforms everyday language into robust information extraction pipelines."
 authors = ["Enshuo (David) Hsu"]
 license = "MIT"
--- llm_ie-0.3.2/src/llm_ie/extractors.py
+++ llm_ie-0.3.3/src/llm_ie/extractors.py
@@ -143,7 +143,7 @@ class FrameExtractor(Extractor):
         self.tokenizer = RegexpTokenizer(r'\w+|[^\w\s]')


-    def _jaccard_score(self, s1:set, s2:set) -> float:
+    def _jaccard_score(self, s1:Set[str], s2:Set[str]) -> float:
         """
         This method calculates the Jaccard score between two sets of word tokens.
         """
@@ -166,6 +166,8 @@ class FrameExtractor(Extractor):
     def _get_closest_substring(self, text:str, pattern:str, buffer_size:float=0.2) -> Tuple[Tuple[int, int], float]:
         """
         This method finds the closest (highest Jaccard score) substring in text that matches the pattern.
+        The substring must start with the same word token as the pattern. This is based on the observation that
+        LLMs often generate the first few words consistently.

         Parameters
         ----------
@@ -181,23 +183,24 @@ class FrameExtractor(Extractor):
         """
         text_tokens, text_spans = self._get_word_tokens(text)
         pattern_tokens, _ = self._get_word_tokens(pattern)
-        pattern_tokens = set(pattern_tokens)
+        pattern_tokens_set = set(pattern_tokens)
         window_size = len(pattern_tokens)
         window_size_min = int(window_size * (1 - buffer_size))
         window_size_max = int(window_size * (1 + buffer_size))
-        closest_substring_spans = None
+        closest_substring_span = None
         best_score = 0

         for i in range(len(text_tokens) - window_size_max):
             for w in range(window_size_min, window_size_max):
-                sub_str_tokens = set(text_tokens[i:i + w])
-                score = self._jaccard_score(sub_str_tokens, pattern_tokens)
-                if score > best_score:
-                    best_score = score
-                    sub_string_word_spans = text_spans[i:i + w]
-                    closest_substring_spans = (sub_string_word_spans[0][0], sub_string_word_spans[-1][-1])
+                sub_str_tokens = text_tokens[i:i + w]
+                if sub_str_tokens[0] == pattern_tokens[0]:
+                    score = self._jaccard_score(set(sub_str_tokens), pattern_tokens_set)
+                    if score > best_score:
+                        best_score = score
+                        sub_string_word_spans = text_spans[i:i + w]
+                        closest_substring_span = (sub_string_word_spans[0][0], sub_string_word_spans[-1][-1])

-        return closest_substring_spans, best_score
+        return closest_substring_span, best_score


     def _find_entity_spans(self, text: str, entities: List[str], case_sensitive:bool=False,
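
To make the revised search easier to follow, below is a hedged, self-contained sketch of the same windowed fuzzy match. Everything here (the function names, the regex tokenizer standing in for RegexpTokenizer, the inclusive window upper bound) is illustrative rather than the package's exact code:

import re
from typing import List, Optional, Tuple

def word_tokens(text: str) -> Tuple[List[str], List[Tuple[int, int]]]:
    # Tokens plus their (start, end) character spans, mirroring r'\w+|[^\w\s]'.
    spans = [(m.start(), m.end()) for m in re.finditer(r'\w+|[^\w\s]', text)]
    return [text[s:e] for s, e in spans], spans

def jaccard(s1: set, s2: set) -> float:
    union = s1 | s2
    return len(s1 & s2) / len(union) if union else 0.0

def closest_substring(text: str, pattern: str, buffer_size: float = 0.2
                      ) -> Tuple[Optional[Tuple[int, int]], float]:
    text_tokens, text_spans = word_tokens(text)
    pattern_tokens, _ = word_tokens(pattern)
    pattern_set = set(pattern_tokens)
    window = len(pattern_tokens)
    w_min = int(window * (1 - buffer_size))
    w_max = int(window * (1 + buffer_size))
    best_span, best_score = None, 0.0
    for i in range(len(text_tokens) - w_max):
        # Anchor on the pattern's first token, as the 0.3.3 change does.
        if text_tokens[i] != pattern_tokens[0]:
            continue
        for w in range(w_min, w_max + 1):  # inclusive upper bound, a sketch choice
            score = jaccard(set(text_tokens[i:i + w]), pattern_set)
            if score > best_score:
                best_score = score
                best_span = (text_spans[i][0], text_spans[i + w - 1][1])
    return best_span, best_score

span, score = closest_substring("Patient has type 2 diabetes mellitus.", "type 2 diabetes")
print(span, score)  # (12, 27) 1.0 -> the span covering "type 2 diabetes"
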
@@ -243,7 +246,7 @@ class FrameExtractor(Extractor):
             # Fuzzy match
             elif fuzzy_match:
                 closest_substring_span, best_score = self._get_closest_substring(text, entity, buffer_size=fuzzy_buffer_size)
-                if best_score >= fuzzy_score_cutoff:
+                if best_score >= fuzzy_score_cutoff and closest_substring_span:
                     entity_spans.append(closest_substring_span)
                     # Replace the found entity with spaces to avoid finding the same instance again
                     text = text[:closest_substring_span[0]] + ' ' * (closest_substring_span[1] - closest_substring_span[0]) + text[closest_substring_span[1]:]
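
A brief note on the added "and closest_substring_span" guard, with a toy illustration (the variable values below are made up): with the first-token anchoring above, _get_closest_substring can return (None, 0) when no candidate window starts with the pattern's first token, so a cutoff check alone would go on to index None.

closest_substring_span, best_score = None, 0   # what the helper returns when nothing anchors
fuzzy_score_cutoff = 0.0
if best_score >= fuzzy_score_cutoff and closest_substring_span:
    print(closest_substring_span[0])           # skipped, so no TypeError from indexing None
else:
    print("no fuzzy match for this entity")
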
@@ -432,7 +435,7 @@ class BasicFrameExtractor(FrameExtractor):
                 frame = LLMInformationExtractionFrame(frame_id=f"{i}",
                                                       start=start,
                                                       end=end,
-                                                      entity_text=ent[entity_key],
+                                                      entity_text=text[start:end],
                                                       attr={k: v for k, v in ent.items() if k != entity_key and v != ""})
                 frame_list.append(frame)
         return frame_list
@@ -731,12 +734,13 @@ class SentenceFrameExtractor(FrameExtractor):
             for ent, span in zip(entity_json, spans):
                 if span is not None:
                     start, end = span
+                    entity_text = sent['sentence_text'][start:end]
                     start += sent['sentence_start']
                     end += sent['sentence_start']
                     frame = LLMInformationExtractionFrame(frame_id=f"{len(frame_list)}",
                                                           start=start,
                                                           end=end,
-                                                          entity_text=ent[entity_key],
+                                                          entity_text=entity_text,
                                                           attr={k: v for k, v in ent.items() if k != entity_key and v != ""})
                     frame_list.append(frame)
         return frame_list
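
A hedged illustration of the bookkeeping in this hunk, using made-up sentence data: the entity text is sliced from the sentence while start and end are still sentence-local, and only then are the offsets shifted into document coordinates.

sent = {"sentence_text": "BP was 150/90 today.", "sentence_start": 100}
start, end = 7, 13                                  # sentence-local span of "150/90"
entity_text = sent["sentence_text"][start:end]      # slice before shifting -> "150/90"
start += sent["sentence_start"]                     # 107: document-level start
end += sent["sentence_start"]                       # 113: document-level end
print(entity_text, (start, end))                    # 150/90 (107, 113)
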