PyPI - llm-ie - Versions diffs - 0.3.2__py3-none-any.whl → 0.3.4__py3-none-any.whl - Mend

llm-ie 0.3.2__py3-none-any.whl → 0.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

llm_ie/extractors.py CHANGED Viewed

@@ -143,7 +143,7 @@ class FrameExtractor(Extractor):
         self.tokenizer = RegexpTokenizer(r'\w+|[^\w\s]')
-    def _jaccard_score(self, s1:set, s2:set) -> float:
+    def _jaccard_score(self, s1:Set[str], s2:Set[str]) -> float:
         """
         This method calculates the Jaccard score between two sets of word tokens.
         """
@@ -166,6 +166,8 @@ class FrameExtractor(Extractor):
     def _get_closest_substring(self, text:str, pattern:str, buffer_size:float=0.2) -> Tuple[Tuple[int, int], float]:
         """
         This method finds the closest (highest Jaccard score) substring in text that matches the pattern.
+        the substring must start with the same word token as the pattern. This is due to the observation that
+        LLM often generate the first few words consistently.
         Parameters
         ----------
@@ -179,25 +181,29 @@ class FrameExtractor(Extractor):
         Returns : Tuple[Tuple[int, int], float]
             a tuple of 2-tuple span and Jaccard score.
         """
+        if not text or not pattern:
+            return None, 0
         text_tokens, text_spans = self._get_word_tokens(text)
         pattern_tokens, _ = self._get_word_tokens(pattern)
-        pattern_tokens = set(pattern_tokens)
+        pattern_tokens_set = set(pattern_tokens)
         window_size = len(pattern_tokens)
-        window_size_min = int(window_size * (1 - buffer_size))
-        window_size_max = int(window_size * (1 + buffer_size))
-        closest_substring_spans = None
+        window_size_min = max(1, int(window_size * (1 - buffer_size)))
+        window_size_max = int(window_size * (1 + buffer_size)) + 1
+        closest_substring_span = None
         best_score = 0
         for i in range(len(text_tokens) - window_size_max):
             for w in range(window_size_min, window_size_max):
-                sub_str_tokens = set(text_tokens[i:i + w])
-                score = self._jaccard_score(sub_str_tokens, pattern_tokens)
-                if score > best_score:
-                    best_score = score
-                    sub_string_word_spans = text_spans[i:i + w]
-                    closest_substring_spans = (sub_string_word_spans[0][0], sub_string_word_spans[-1][-1])
+                sub_str_tokens = text_tokens[i:i + w]
+                if len(sub_str_tokens) > 0 and sub_str_tokens[0] == pattern_tokens[0]:
+                    score = self._jaccard_score(set(sub_str_tokens), pattern_tokens_set)
+                    if score > best_score:
+                        best_score = score
+                        sub_string_word_spans = text_spans[i:i + w]
+                        closest_substring_span = (sub_string_word_spans[0][0], sub_string_word_spans[-1][-1])
-        return closest_substring_spans, best_score
+        return closest_substring_span, best_score
     def _find_entity_spans(self, text: str, entities: List[str], case_sensitive:bool=False,
@@ -235,7 +241,7 @@ class FrameExtractor(Extractor):
             # Exact match
             match = re.search(re.escape(entity), text)
-            if match:
+            if match and entity:
                 start, end = match.span()
                 entity_spans.append((start, end))
                 # Replace the found entity with spaces to avoid finding the same instance again
@@ -243,7 +249,7 @@ class FrameExtractor(Extractor):
             # Fuzzy match
             elif fuzzy_match:
                 closest_substring_span, best_score = self._get_closest_substring(text, entity, buffer_size=fuzzy_buffer_size)
-                if best_score >= fuzzy_score_cutoff:
+                if closest_substring_span and best_score >= fuzzy_score_cutoff:
                     entity_spans.append(closest_substring_span)
                     # Replace the found entity with spaces to avoid finding the same instance again
                     text = text[:closest_substring_span[0]] + ' ' * (closest_substring_span[1] - closest_substring_span[0]) + text[closest_substring_span[1]:]
@@ -432,7 +438,7 @@ class BasicFrameExtractor(FrameExtractor):
                 frame = LLMInformationExtractionFrame(frame_id=f"{i}",
                             start=start,
                             end=end,
-                            entity_text=ent[entity_key],
+                            entity_text=text[start:end],
                             attr={k: v for k, v in ent.items() if k != entity_key and v != ""})
                 frame_list.append(frame)
         return frame_list
@@ -731,12 +737,13 @@ class SentenceFrameExtractor(FrameExtractor):
             for ent, span in zip(entity_json, spans):
                 if span is not None:
                     start, end = span
+                    entity_text = sent['sentence_text'][start:end]
                     start += sent['sentence_start']
                     end += sent['sentence_start']
                     frame = LLMInformationExtractionFrame(frame_id=f"{len(frame_list)}",
                                 start=start,
                                 end=end,
-                                entity_text=ent[entity_key],
+                                entity_text=entity_text,
                                 attr={k: v for k, v in ent.items() if k != entity_key and v != ""})
                     frame_list.append(frame)
         return frame_list

{llm_ie-0.3.2.dist-info → llm_ie-0.3.4.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: llm-ie
-Version: 0.3.2
+Version: 0.3.4
 Summary: An LLM-powered tool that transforms everyday language into robust information extraction pipelines.
 License: MIT
 Author: Enshuo (David) Hsu

{llm_ie-0.3.2.dist-info → llm_ie-0.3.4.dist-info}/RECORD RENAMED Viewed

@@ -16,8 +16,8 @@ llm_ie/asset/prompt_guide/SentenceFrameExtractor_prompt_guide.txt,sha256=oKH_QeD
 llm_ie/asset/prompt_guide/SentenceReviewFrameExtractor_prompt_guide.txt,sha256=oKH_QeDgpw771ZdHk3L7DYz2Jvfm7OolUoTiJyMJI30,9541
 llm_ie/data_types.py,sha256=hPz3WOeAzfn2QKmb0CxHmRdQWZQ4G9zq8U-RJBVFdYk,14329
 llm_ie/engines.py,sha256=PTYs7s_iCPmI-yFUCVCPY_cMGS77ma2VGoz4rdNkODI,9308
-llm_ie/extractors.py,sha256=bZQh_qZP1yIwNLXIx1ZzVGN702q3qzrlwiTcHuMsJt0,64051
+llm_ie/extractors.py,sha256=j9L9USybJBmYZM4RAjGO6DR4StYBgzhqCN6nEFZZQVQ,64523
 llm_ie/prompt_editor.py,sha256=Xc5ZHsEnM8-YYITokIsM6BVsf2Ec_8ajJDaldPf-P8U,8577
-llm_ie-0.3.2.dist-info/METADATA,sha256=Lpzaq0n-kpfiIpzic3qUQncVDoCpfw6CGmxc5NftXro,41266
-llm_ie-0.3.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-llm_ie-0.3.2.dist-info/RECORD,,
+llm_ie-0.3.4.dist-info/METADATA,sha256=-5Tf9TCWczCVOsdMavkBZ-KnYPGnbNrV1rsU-pMHfPA,41266
+llm_ie-0.3.4.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+llm_ie-0.3.4.dist-info/RECORD,,

{llm_ie-0.3.2.dist-info → llm_ie-0.3.4.dist-info}/WHEEL RENAMED Viewed

File without changes

llm-ie 0.3.2__py3-none-any.whl → 0.3.4__py3-none-any.whl

llm-ie 0.3.2__py3-none-any.whl → 0.3.4__py3-none-any.whl