llm-ie 0.3.2__py3-none-any.whl → 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
llm_ie/extractors.py
CHANGED
@@ -143,7 +143,7 @@ class FrameExtractor(Extractor):
         self.tokenizer = RegexpTokenizer(r'\w+|[^\w\s]')


-    def _jaccard_score(self, s1:
+    def _jaccard_score(self, s1:Set[str], s2:Set[str]) -> float:
         """
         This method calculates the Jaccard score between two sets of word tokens.
         """
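For reference, the Jaccard score computed by this method is the size of the intersection of the two token sets divided by the size of their union. A minimal standalone sketch of that calculation (illustrative only, not the package's implementation):

    from typing import Set

    def jaccard_score(s1: Set[str], s2: Set[str]) -> float:
        # |s1 ∩ s2| / |s1 ∪ s2|, defined as 0.0 when both sets are empty
        union = s1 | s2
        return len(s1 & s2) / len(union) if union else 0.0

    # Example: {"chest", "pain"} vs {"chest", "pain", "severe"} -> 2 / 3 ≈ 0.67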
@@ -166,6 +166,8 @@ class FrameExtractor(Extractor):
     def _get_closest_substring(self, text:str, pattern:str, buffer_size:float=0.2) -> Tuple[Tuple[int, int], float]:
         """
         This method finds the closest (highest Jaccard score) substring in text that matches the pattern.
+        the substring must start with the same word token as the pattern. This is due to the observation that
+        LLM often generate the first few words consistently.

         Parameters
         ----------
@@ -181,23 +183,24 @@ class FrameExtractor(Extractor):
         """
         text_tokens, text_spans = self._get_word_tokens(text)
         pattern_tokens, _ = self._get_word_tokens(pattern)
-
+        pattern_tokens_set = set(pattern_tokens)
         window_size = len(pattern_tokens)
         window_size_min = int(window_size * (1 - buffer_size))
         window_size_max = int(window_size * (1 + buffer_size))
-
+        closest_substring_span = None
         best_score = 0

         for i in range(len(text_tokens) - window_size_max):
             for w in range(window_size_min, window_size_max):
-                sub_str_tokens =
-
-
-
-
-
+                sub_str_tokens = text_tokens[i:i + w]
+                if sub_str_tokens[0] == pattern_tokens[0]:
+                    score = self._jaccard_score(set(sub_str_tokens), pattern_tokens_set)
+                    if score > best_score:
+                        best_score = score
+                        sub_string_word_spans = text_spans[i:i + w]
+                        closest_substring_span = (sub_string_word_spans[0][0], sub_string_word_spans[-1][-1])

-        return
+        return closest_substring_span, best_score


     def _find_entity_spans(self, text: str, entities: List[str], case_sensitive:bool=False,
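For readers tracing this hunk outside the class, here is a self-contained sketch of the same sliding-window idea: tokenize both strings, slide windows of roughly the pattern's length over the text, score only windows whose first token matches the pattern's first token (the constraint described in the docstring above), and keep the character span with the best Jaccard score. Names and the tokenizer call are illustrative assumptions, not the package API.

    import re
    from typing import List, Optional, Tuple

    def word_tokens_with_spans(text: str) -> Tuple[List[str], List[Tuple[int, int]]]:
        # Same token pattern as the extractor: word runs or single punctuation marks
        matches = list(re.finditer(r'\w+|[^\w\s]', text))
        return [m.group() for m in matches], [m.span() for m in matches]

    def closest_substring(text: str, pattern: str, buffer_size: float = 0.2
                          ) -> Tuple[Optional[Tuple[int, int]], float]:
        text_tokens, text_spans = word_tokens_with_spans(text)
        pattern_tokens, _ = word_tokens_with_spans(pattern)
        if not pattern_tokens:
            return None, 0.0
        pattern_set = set(pattern_tokens)
        n = len(pattern_tokens)
        w_min, w_max = int(n * (1 - buffer_size)), int(n * (1 + buffer_size))
        best_span, best_score = None, 0.0
        for i in range(len(text_tokens) - w_max):
            for w in range(w_min, w_max):
                window = text_tokens[i:i + w]
                if window and window[0] == pattern_tokens[0]:   # anchor on the first token
                    union = set(window) | pattern_set
                    score = len(set(window) & pattern_set) / len(union) if union else 0.0
                    if score > best_score:
                        spans = text_spans[i:i + w]
                        best_span, best_score = (spans[0][0], spans[-1][1]), score
        return best_span, best_score

    # e.g. closest_substring("Pt reports severe chest pain today.", "severe chest pains")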
@@ -243,7 +246,7 @@ class FrameExtractor(Extractor):
             # Fuzzy match
             elif fuzzy_match:
                 closest_substring_span, best_score = self._get_closest_substring(text, entity, buffer_size=fuzzy_buffer_size)
-                if best_score >= fuzzy_score_cutoff:
+                if best_score >= fuzzy_score_cutoff and closest_substring_span:
                     entity_spans.append(closest_substring_span)
                     # Replace the found entity with spaces to avoid finding the same instance again
                     text = text[:closest_substring_span[0]] + ' ' * (closest_substring_span[1] - closest_substring_span[0]) + text[closest_substring_span[1]:]
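The span-blanking trick in the last context line above works because the matched characters are overwritten with an equal number of spaces: string length and all later offsets stay the same, while a repeated search for the same entity falls through to its next occurrence. A small standalone illustration (not the package API):

    text = "pain in the arm, then pain again"
    span = (0, 4)                                    # first occurrence of "pain"
    masked = text[:span[0]] + ' ' * (span[1] - span[0]) + text[span[1]:]
    assert len(masked) == len(text)                  # later offsets are unchanged
    assert masked.find("pain") == 22                 # next search hits the second occurrence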
@@ -432,7 +435,7 @@ class BasicFrameExtractor(FrameExtractor):
                 frame = LLMInformationExtractionFrame(frame_id=f"{i}",
                                                       start=start,
                                                       end=end,
-                                                      entity_text=
+                                                      entity_text=text[start:end],
                                                       attr={k: v for k, v in ent.items() if k != entity_key and v != ""})
                 frame_list.append(frame)
         return frame_list
@@ -731,12 +734,13 @@ class SentenceFrameExtractor(FrameExtractor):
             for ent, span in zip(entity_json, spans):
                 if span is not None:
                     start, end = span
+                    entity_text = sent['sentence_text'][start:end]
                     start += sent['sentence_start']
                     end += sent['sentence_start']
                     frame = LLMInformationExtractionFrame(frame_id=f"{len(frame_list)}",
                                                           start=start,
                                                           end=end,
-                                                          entity_text=
+                                                          entity_text=entity_text,
                                                           attr={k: v for k, v in ent.items() if k != entity_key and v != ""})
                     frame_list.append(frame)
         return frame_list
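The ordering in this hunk matters: entity_text is sliced from the sentence-local span first, and only then are start and end shifted by the sentence's offset into the full document, so the stored span still points at the same characters in the original text. A minimal sketch of that bookkeeping, with illustrative data only:

    doc = "Header line. The patient denies chest pain."
    sent = {"sentence_text": "The patient denies chest pain.", "sentence_start": 13}

    start, end = 19, 29                          # span of "chest pain" within the sentence
    entity_text = sent["sentence_text"][start:end]   # slice before shifting offsets
    start += sent["sentence_start"]
    end += sent["sentence_start"]

    assert entity_text == "chest pain"
    assert doc[start:end] == entity_text         # document-level span covers the same text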
llm_ie-0.3.2.dist-info/RECORD → llm_ie-0.3.3.dist-info/RECORD
RENAMED
@@ -16,8 +16,8 @@ llm_ie/asset/prompt_guide/SentenceFrameExtractor_prompt_guide.txt,sha256=oKH_QeD
 llm_ie/asset/prompt_guide/SentenceReviewFrameExtractor_prompt_guide.txt,sha256=oKH_QeDgpw771ZdHk3L7DYz2Jvfm7OolUoTiJyMJI30,9541
 llm_ie/data_types.py,sha256=hPz3WOeAzfn2QKmb0CxHmRdQWZQ4G9zq8U-RJBVFdYk,14329
 llm_ie/engines.py,sha256=PTYs7s_iCPmI-yFUCVCPY_cMGS77ma2VGoz4rdNkODI,9308
-llm_ie/extractors.py,sha256=
+llm_ie/extractors.py,sha256=yBdIcevjMfwto85Jb0KkRMN-AjIMk92fD5yWB3Qm8MY,64408
 llm_ie/prompt_editor.py,sha256=Xc5ZHsEnM8-YYITokIsM6BVsf2Ec_8ajJDaldPf-P8U,8577
-llm_ie-0.3.
-llm_ie-0.3.
-llm_ie-0.3.
+llm_ie-0.3.3.dist-info/METADATA,sha256=CeTsMNtWhEWCvOqHWSXu0KqOgDp3kMwN2WtBF4N-4zE,41266
+llm_ie-0.3.3.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+llm_ie-0.3.3.dist-info/RECORD,,
File without changes