llm-ie 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
llm_ie/extractors.py CHANGED
@@ -5,10 +5,11 @@ import inspect
  import importlib.resources
  import warnings
  import itertools
- from typing import List, Dict, Tuple, Union, Callable
+ from typing import Set, List, Dict, Tuple, Union, Callable
  from llm_ie.data_types import LLMInformationExtractionFrame, LLMInformationExtractionDocument
  from llm_ie.engines import InferenceEngine
  from colorama import Fore, Style
+ from nltk.tokenize import RegexpTokenizer


  class Extractor:
@@ -37,7 +38,7 @@ class Extractor:
  This method returns the pre-defined prompt guideline for the extractor from the package asset.
  """
  file_path = importlib.resources.files('llm_ie.asset.prompt_guide').joinpath(f"{cls.__name__}_prompt_guide.txt")
- with open(file_path, 'r') as f:
+ with open(file_path, 'r', encoding="utf-8") as f:
  return f.read()

@@ -139,9 +140,71 @@ class FrameExtractor(Extractor):
  prompt_template=prompt_template,
  system_prompt=system_prompt,
  **kwrs)
-
+ self.tokenizer = RegexpTokenizer(r'\w+|[^\w\s]')
+
+
+ def _jaccard_score(self, s1:Set[str], s2:Set[str]) -> float:
+ """
+ This method calculates the Jaccard score between two sets of word tokens.
+ """
+ return len(s1.intersection(s2)) / len(s1.union(s2))
+
+
+ def _get_word_tokens(self, text) -> Tuple[List[str], List[Tuple[int]]]:
+ """
+ This method tokenizes the input text into a list of word tokens and their spans.
+ """
+ tokens = []
+ spans = []
+ for span in self.tokenizer.span_tokenize(text):
+ spans.append(span)
+ start, end = span
+ tokens.append(text[start:end])
+ return tokens, spans
+
+
+ def _get_closest_substring(self, text:str, pattern:str, buffer_size:float=0.2) -> Tuple[Tuple[int, int], float]:
+ """
+ This method finds the closest (highest Jaccard score) substring in text that matches the pattern.
+ the substring must start with the same word token as the pattern. This is due to the observation that
+ LLM often generate the first few words consistently.
+
+ Parameters
+ ----------
+ text : str
+ the input text.
+ pattern : str
+ the pattern to match.
+ buffer_size : float, Optional
+ the buffer size for the matching window. Default is 20% of pattern length.
+
+ Returns : Tuple[Tuple[int, int], float]
+ a tuple of 2-tuple span and Jaccard score.
+ """
+ text_tokens, text_spans = self._get_word_tokens(text)
+ pattern_tokens, _ = self._get_word_tokens(pattern)
+ pattern_tokens_set = set(pattern_tokens)
+ window_size = len(pattern_tokens)
+ window_size_min = int(window_size * (1 - buffer_size))
+ window_size_max = int(window_size * (1 + buffer_size))
+ closest_substring_span = None
+ best_score = 0
+
+ for i in range(len(text_tokens) - window_size_max):
+ for w in range(window_size_min, window_size_max):
+ sub_str_tokens = text_tokens[i:i + w]
+ if sub_str_tokens[0] == pattern_tokens[0]:
+ score = self._jaccard_score(set(sub_str_tokens), pattern_tokens_set)
+ if score > best_score:
+ best_score = score
+ sub_string_word_spans = text_spans[i:i + w]
+ closest_substring_span = (sub_string_word_spans[0][0], sub_string_word_spans[-1][-1])

- def _find_entity_spans(self, text: str, entities: List[str], case_sensitive:bool=False) -> List[Tuple[int]]:
+ return closest_substring_span, best_score
+
+
+ def _find_entity_spans(self, text: str, entities: List[str], case_sensitive:bool=False,
+ fuzzy_match:bool=True, fuzzy_buffer_size:float=0.2, fuzzy_score_cutoff:float=0.8) -> List[Tuple[int]]:
  """
  This function inputs a text and a list of entity text,
  outputs a list of spans (2-tuple) for each entity.
@@ -151,19 +214,46 @@ class FrameExtractor(Extractor):
  ----------
  text : str
  text that contains entities
+ entities : List[str]
+ a list of entity text to find in the text
+ case_sensitive : bool, Optional
+ if True, entity text matching will be case-sensitive.
+ fuzzy_match : bool, Optional
+ if True, fuzzy matching will be applied to find entity text.
+ fuzzy_buffer_size : float, Optional
+ the buffer size for fuzzy matching. Default is 20% of entity text length.
+ fuzzy_score_cutoff : float, Optional
+ the Jaccard score cutoff for fuzzy matching.
+ Matched entity text must have a score higher than this value or a None will be returned.
  """
+ # Handle case sensitivity
+ if not case_sensitive:
+ text = text.lower()
+
+ # Match entities
  entity_spans = []
- for entity in entities:
- if case_sensitive:
- match = re.search(re.escape(entity), text)
- else:
- match = re.search(re.escape(entity), text, re.IGNORECASE)
-
+ for entity in entities:
+ if not case_sensitive:
+ entity = entity.lower()
+
+ # Exact match
+ match = re.search(re.escape(entity), text)
  if match:
  start, end = match.span()
  entity_spans.append((start, end))
  # Replace the found entity with spaces to avoid finding the same instance again
  text = text[:start] + ' ' * (end - start) + text[end:]
+ # Fuzzy match
+ elif fuzzy_match:
+ closest_substring_span, best_score = self._get_closest_substring(text, entity, buffer_size=fuzzy_buffer_size)
+ if best_score >= fuzzy_score_cutoff and closest_substring_span:
+ entity_spans.append(closest_substring_span)
+ # Replace the found entity with spaces to avoid finding the same instance again
+ text = text[:closest_substring_span[0]] + ' ' * (closest_substring_span[1] - closest_substring_span[0]) + text[closest_substring_span[1]:]
+ else:
+ entity_spans.append(None)
+
+ # No match
  else:
  entity_spans.append(None)

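The fallback added above handles entities that the LLM quotes slightly differently from the source text: a word-token window slides over the text, anchored on the entity's first word token, and each window is scored against the quoted entity with the Jaccard index. Below is a minimal standalone sketch of that windowed-Jaccard idea, assuming nltk is installed; the closest_substring function and the example sentence are illustrative, not part of the package.

# Illustrative sketch of the windowed Jaccard matching added above;
# not the package's internal implementation.
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+|[^\w\s]')   # same word/punctuation pattern as the diff

def closest_substring(text, pattern, buffer_size=0.2):
    """Return (span, score) of the substring of text most similar to pattern."""
    text_spans = list(tokenizer.span_tokenize(text))
    text_tokens = [text[s:e] for s, e in text_spans]
    pattern_tokens = tokenizer.tokenize(pattern)
    pattern_set = set(pattern_tokens)
    n = len(pattern_tokens)
    w_min, w_max = int(n * (1 - buffer_size)), int(n * (1 + buffer_size))
    best_span, best_score = None, 0.0
    for i in range(len(text_tokens) - w_max):
        if text_tokens[i] != pattern_tokens[0]:   # anchor on the first word token
            continue
        for w in range(w_min, w_max):             # try window sizes around the pattern length
            window = set(text_tokens[i:i + w])
            score = len(window & pattern_set) / len(window | pattern_set)
            if score > best_score:
                best_score = score
                best_span = (text_spans[i][0], text_spans[i + w - 1][1])
    return best_span, best_score

# The LLM may quote "blood pressure was 140 over 90 on admission" while the
# note reads "140/90"; the windowed match still recovers a character span.
text = "On exam, blood pressure was 140/90 on admission and heart rate was 80."
span, score = closest_substring(text, "blood pressure was 140 over 90 on admission")
print(span, round(score, 2), text[span[0]:span[1]])
# prints something like: (9, 47) 0.78 blood pressure was 140/90 on admission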
@@ -276,7 +366,9 @@ class BasicFrameExtractor(FrameExtractor):


  def extract_frames(self, text_content:Union[str, Dict[str,str]], entity_key:str, max_new_tokens:int=2048,
- temperature:float=0.0, case_sensitive:bool=False, document_key:str=None, **kwrs) -> List[LLMInformationExtractionFrame]:
+ temperature:float=0.0, document_key:str=None, stream:bool=False,
+ case_sensitive:bool=False, fuzzy_match:bool=True, fuzzy_buffer_size:float=0.2,
+ fuzzy_score_cutoff:float=0.8, **kwrs) -> List[LLMInformationExtractionFrame]:
  """
  This method inputs a text and outputs a list of LLMInformationExtractionFrame
  It use the extract() method and post-process outputs into frames.
@@ -293,18 +385,30 @@ class BasicFrameExtractor(FrameExtractor):
  the max number of new tokens LLM should generate.
  temperature : float, Optional
  the temperature for token sampling.
- case_sensitive : bool, Optional
- if True, entity text matching will be case-sensitive.
  document_key : str, Optional
  specify the key in text_content where document text is.
  If text_content is str, this parameter will be ignored.
+ stream : bool, Optional
+ if True, LLM generated text will be printed in terminal in real-time.
+ case_sensitive : bool, Optional
+ if True, entity text matching will be case-sensitive.
+ fuzzy_match : bool, Optional
+ if True, fuzzy matching will be applied to find entity text.
+ fuzzy_buffer_size : float, Optional
+ the buffer size for fuzzy matching. Default is 20% of entity text length.
+ fuzzy_score_cutoff : float, Optional
+ the Jaccard score cutoff for fuzzy matching.
+ Matched entity text must have a score higher than this value or a None will be returned.

  Return : str
  a list of frames.
  """
  frame_list = []
  gen_text = self.extract(text_content=text_content,
- max_new_tokens=max_new_tokens, temperature=temperature, **kwrs)
+ max_new_tokens=max_new_tokens,
+ temperature=temperature,
+ stream=stream,
+ **kwrs)

  entity_json = []
  for entity in self._extract_json(gen_text=gen_text):
@@ -320,7 +424,10 @@ class BasicFrameExtractor(FrameExtractor):

  spans = self._find_entity_spans(text=text,
  entities=[e[entity_key] for e in entity_json],
- case_sensitive=case_sensitive)
+ case_sensitive=case_sensitive,
+ fuzzy_match=fuzzy_match,
+ fuzzy_buffer_size=fuzzy_buffer_size,
+ fuzzy_score_cutoff=fuzzy_score_cutoff)

  for i, (ent, span) in enumerate(zip(entity_json, spans)):
  if span is not None:
@@ -328,7 +435,7 @@ class BasicFrameExtractor(FrameExtractor):
  frame = LLMInformationExtractionFrame(frame_id=f"{i}",
  start=start,
  end=end,
- entity_text=ent[entity_key],
+ entity_text=text[start:end],
  attr={k: v for k, v in ent.items() if k != entity_key and v != ""})
  frame_list.append(frame)
  return frame_list
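The new keyword arguments are exposed directly on BasicFrameExtractor.extract_frames, and each frame's entity_text is now the grounded source span text[start:end] rather than the string the LLM returned. A hedged usage sketch follows; the OllamaInferenceEngine name, the positional constructor call, and the entity_key value come from the package README and the user's own prompt design, not from this diff.

# Hypothetical usage sketch for the 0.3.3 extract_frames() arguments.
from llm_ie.engines import OllamaInferenceEngine
from llm_ie.extractors import BasicFrameExtractor

engine = OllamaInferenceEngine(model_name="llama3.1")   # any InferenceEngine should work here
prompt_template = "..."   # your frame-extraction prompt (see the package prompt guide assets)

extractor = BasicFrameExtractor(engine, prompt_template)
frames = extractor.extract_frames(
    text_content="On exam, blood pressure was 140/90 on admission.",
    entity_key="entity_text",     # key in the LLM's JSON output that quotes the entity
    stream=True,                  # print LLM generation in real time
    case_sensitive=False,
    fuzzy_match=True,             # fall back to the windowed Jaccard matching
    fuzzy_buffer_size=0.2,
    fuzzy_score_cutoff=0.8,
)
# Each frame's entity_text is sliced from the source text, so spans and text agree.
for frame in frames:
    print(frame.start, frame.end, frame.entity_text)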
@@ -370,7 +477,7 @@ class ReviewFrameExtractor(BasicFrameExtractor):
  else:
  file_path = importlib.resources.files('llm_ie.asset.default_prompts').\
  joinpath(f"{self.__class__.__name__}_{self.review_mode}_review_prompt.txt")
- with open(file_path, 'r') as f:
+ with open(file_path, 'r', encoding="utf-8") as f:
  self.review_prompt = f.read()

  warnings.warn(f'Custom review prompt not provided. The default review prompt is used:\n"{self.review_prompt}"', UserWarning)
@@ -559,8 +666,9 @@ class SentenceFrameExtractor(FrameExtractor):


  def extract_frames(self, text_content:Union[str, Dict[str,str]], entity_key:str, max_new_tokens:int=512,
- document_key:str=None, multi_turn:bool=False, temperature:float=0.0, case_sensitive:bool=False,
- stream:bool=False, **kwrs) -> List[LLMInformationExtractionFrame]:
+ document_key:str=None, multi_turn:bool=False, temperature:float=0.0, stream:bool=False,
+ case_sensitive:bool=False, fuzzy_match:bool=True, fuzzy_buffer_size:float=0.2, fuzzy_score_cutoff:float=0.8,
+ **kwrs) -> List[LLMInformationExtractionFrame]:
  """
  This method inputs a text and outputs a list of LLMInformationExtractionFrame
  It use the extract() method and post-process outputs into frames.
@@ -586,10 +694,17 @@ class SentenceFrameExtractor(FrameExtractor):
  can better utilize the KV caching.
  temperature : float, Optional
  the temperature for token sampling.
- case_sensitive : bool, Optional
- if True, entity text matching will be case-sensitive.
  stream : bool, Optional
  if True, LLM generated text will be printed in terminal in real-time.
+ case_sensitive : bool, Optional
+ if True, entity text matching will be case-sensitive.
+ fuzzy_match : bool, Optional
+ if True, fuzzy matching will be applied to find entity text.
+ fuzzy_buffer_size : float, Optional
+ the buffer size for fuzzy matching. Default is 20% of entity text length.
+ fuzzy_score_cutoff : float, Optional
+ the Jaccard score cutoff for fuzzy matching.
+ Matched entity text must have a score higher than this value or a None will be returned.

  Return : str
  a list of frames.
@@ -611,16 +726,21 @@ class SentenceFrameExtractor(FrameExtractor):
  warnings.warn(f'Extractor output "{entity}" does not have entity_key ("{entity_key}"). This frame will be dropped.', RuntimeWarning)

  spans = self._find_entity_spans(text=sent['sentence_text'],
- entities=[e[entity_key] for e in entity_json], case_sensitive=case_sensitive)
+ entities=[e[entity_key] for e in entity_json],
+ case_sensitive=case_sensitive,
+ fuzzy_match=fuzzy_match,
+ fuzzy_buffer_size=fuzzy_buffer_size,
+ fuzzy_score_cutoff=fuzzy_score_cutoff)
  for ent, span in zip(entity_json, spans):
  if span is not None:
  start, end = span
+ entity_text = sent['sentence_text'][start:end]
  start += sent['sentence_start']
  end += sent['sentence_start']
  frame = LLMInformationExtractionFrame(frame_id=f"{len(frame_list)}",
  start=start,
  end=end,
- entity_text=ent[entity_key],
+ entity_text=entity_text,
  attr={k: v for k, v in ent.items() if k != entity_key and v != ""})
  frame_list.append(frame)
  return frame_list
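In the sentence-level extractor, spans returned by _find_entity_spans are relative to a single sentence, so the entity text is now sliced from the sentence before the offsets are shifted into document coordinates. A toy sketch of that bookkeeping, with invented values:

# Sketch of the span arithmetic above; the sentence dict mirrors the
# sent['sentence_text'] / sent['sentence_start'] fields used in the diff.
document = "Mild headache reported. Patient denies chest pain or dyspnea."
sentence = {"sentence_text": "Patient denies chest pain or dyspnea.",
            "sentence_start": 24}

start, end = 15, 25                                  # sentence-relative span of "chest pain"
entity_text = sentence["sentence_text"][start:end]   # slice first, while offsets are local
start += sentence["sentence_start"]                  # then shift to document coordinates
end += sentence["sentence_start"]

assert document[start:end] == entity_text == "chest pain"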
@@ -663,7 +783,7 @@ class SentenceReviewFrameExtractor(SentenceFrameExtractor):
  else:
  file_path = importlib.resources.files('llm_ie.asset.default_prompts').\
  joinpath(f"{self.__class__.__name__}_{self.review_mode}_review_prompt.txt")
- with open(file_path, 'r') as f:
+ with open(file_path, 'r', encoding="utf-8") as f:
  self.review_prompt = f.read()

  warnings.warn(f'Custom review prompt not provided. The default review prompt is used:\n"{self.review_prompt}"', UserWarning)
llm_ie/prompt_editor.py CHANGED
@@ -5,8 +5,6 @@ from llm_ie.engines import InferenceEngine
  from llm_ie.extractors import FrameExtractor
  import re
  from colorama import Fore, Style
- import ipywidgets as widgets
- from IPython.display import display, HTML


  class PromptEditor:
@@ -121,6 +119,16 @@ class PromptEditor:
  """
  This method runs an interactive chat session in Jupyter/IPython using ipywidgets to help users write prompt templates.
  """
+ # Check if ipywidgets is installed
+ if importlib.util.find_spec("ipywidgets") is None:
+ raise ImportError("ipywidgets not found. Please install ipywidgets (```pip install ipywidgets```).")
+ import ipywidgets as widgets
+
+ # Check if IPython is installed
+ if importlib.util.find_spec("IPython") is None:
+ raise ImportError("IPython not found. Please install IPython (```pip install ipython```).")
+ from IPython.display import display, HTML
+
  # Load the chat prompt template from the resources
  file_path = importlib.resources.files('llm_ie.asset.PromptEditor_prompts').joinpath('chat.txt')
  with open(file_path, 'r') as f:
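The chat helper now probes for its optional dependencies at call time instead of importing them at module load, so ipywidgets and IPython are no longer required just to install and import the package. A generic sketch of the same pattern; require_optional is a hypothetical helper, not part of llm-ie:

# Optional-dependency pattern: probe with importlib.util.find_spec, then
# import lazily inside the code path that actually needs the module.
import importlib.util

def require_optional(module_name, pip_name):
    """Raise a helpful ImportError when an optional dependency is missing."""
    if importlib.util.find_spec(module_name) is None:
        raise ImportError(f"{module_name} not found. Please install it (pip install {pip_name}).")

require_optional("ipywidgets", "ipywidgets")   # raises only when the extra is absent
import ipywidgets as widgets                   # safe to import after the check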
llm_ie-0.3.1.dist-info/METADATA → llm_ie-0.3.3.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: llm-ie
- Version: 0.3.1
+ Version: 0.3.3
  Summary: An LLM-powered tool that transforms everyday language into robust information extraction pipelines.
  License: MIT
  Author: Enshuo (David) Hsu
llm_ie-0.3.1.dist-info/RECORD → llm_ie-0.3.3.dist-info/RECORD CHANGED
@@ -16,8 +16,8 @@ llm_ie/asset/prompt_guide/SentenceFrameExtractor_prompt_guide.txt,sha256=oKH_QeD
  llm_ie/asset/prompt_guide/SentenceReviewFrameExtractor_prompt_guide.txt,sha256=oKH_QeDgpw771ZdHk3L7DYz2Jvfm7OolUoTiJyMJI30,9541
  llm_ie/data_types.py,sha256=hPz3WOeAzfn2QKmb0CxHmRdQWZQ4G9zq8U-RJBVFdYk,14329
  llm_ie/engines.py,sha256=PTYs7s_iCPmI-yFUCVCPY_cMGS77ma2VGoz4rdNkODI,9308
- llm_ie/extractors.py,sha256=xgkicRzBPRaQPiKWmQJ5b_aiNv9VEc85jzBA7cQXic8,58331
- llm_ie/prompt_editor.py,sha256=3h_2yIe7OV4auv4Vb9Zdx2q26UhC0xp9c4tt_yDr78I,8144
- llm_ie-0.3.1.dist-info/METADATA,sha256=eJCzg7G_ivz0CcP9KycSeHo986se6tqA8cKLtQyTtw4,41266
- llm_ie-0.3.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
- llm_ie-0.3.1.dist-info/RECORD,,
+ llm_ie/extractors.py,sha256=yBdIcevjMfwto85Jb0KkRMN-AjIMk92fD5yWB3Qm8MY,64408
+ llm_ie/prompt_editor.py,sha256=Xc5ZHsEnM8-YYITokIsM6BVsf2Ec_8ajJDaldPf-P8U,8577
+ llm_ie-0.3.3.dist-info/METADATA,sha256=CeTsMNtWhEWCvOqHWSXu0KqOgDp3kMwN2WtBF4N-4zE,41266
+ llm_ie-0.3.3.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+ llm_ie-0.3.3.dist-info/RECORD,,