llm-ie 0.3.1__tar.gz → 0.3.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (23)
  1. {llm_ie-0.3.1 → llm_ie-0.3.2}/PKG-INFO +1 -1
  2. {llm_ie-0.3.1 → llm_ie-0.3.2}/pyproject.toml +1 -1
  3. {llm_ie-0.3.1 → llm_ie-0.3.2}/src/llm_ie/extractors.py +138 -22
  4. {llm_ie-0.3.1 → llm_ie-0.3.2}/src/llm_ie/prompt_editor.py +10 -2
  5. {llm_ie-0.3.1 → llm_ie-0.3.2}/README.md +0 -0
  6. {llm_ie-0.3.1 → llm_ie-0.3.2}/src/llm_ie/__init__.py +0 -0
  7. {llm_ie-0.3.1 → llm_ie-0.3.2}/src/llm_ie/asset/PromptEditor_prompts/chat.txt +0 -0
  8. {llm_ie-0.3.1 → llm_ie-0.3.2}/src/llm_ie/asset/PromptEditor_prompts/comment.txt +0 -0
  9. {llm_ie-0.3.1 → llm_ie-0.3.2}/src/llm_ie/asset/PromptEditor_prompts/rewrite.txt +0 -0
  10. {llm_ie-0.3.1 → llm_ie-0.3.2}/src/llm_ie/asset/PromptEditor_prompts/system.txt +0 -0
  11. {llm_ie-0.3.1 → llm_ie-0.3.2}/src/llm_ie/asset/default_prompts/ReviewFrameExtractor_addition_review_prompt.txt +0 -0
  12. {llm_ie-0.3.1 → llm_ie-0.3.2}/src/llm_ie/asset/default_prompts/ReviewFrameExtractor_revision_review_prompt.txt +0 -0
  13. {llm_ie-0.3.1 → llm_ie-0.3.2}/src/llm_ie/asset/default_prompts/SentenceReviewFrameExtractor_addition_review_prompt.txt +0 -0
  14. {llm_ie-0.3.1 → llm_ie-0.3.2}/src/llm_ie/asset/default_prompts/SentenceReviewFrameExtractor_revision_review_prompt.txt +0 -0
  15. {llm_ie-0.3.1 → llm_ie-0.3.2}/src/llm_ie/asset/prompt_guide/BasicFrameExtractor_prompt_guide.txt +0 -0
  16. {llm_ie-0.3.1 → llm_ie-0.3.2}/src/llm_ie/asset/prompt_guide/BinaryRelationExtractor_prompt_guide.txt +0 -0
  17. {llm_ie-0.3.1 → llm_ie-0.3.2}/src/llm_ie/asset/prompt_guide/MultiClassRelationExtractor_prompt_guide.txt +0 -0
  18. {llm_ie-0.3.1 → llm_ie-0.3.2}/src/llm_ie/asset/prompt_guide/ReviewFrameExtractor_prompt_guide.txt +0 -0
  19. {llm_ie-0.3.1 → llm_ie-0.3.2}/src/llm_ie/asset/prompt_guide/SentenceCoTFrameExtractor_prompt_guide.txt +0 -0
  20. {llm_ie-0.3.1 → llm_ie-0.3.2}/src/llm_ie/asset/prompt_guide/SentenceFrameExtractor_prompt_guide.txt +0 -0
  21. {llm_ie-0.3.1 → llm_ie-0.3.2}/src/llm_ie/asset/prompt_guide/SentenceReviewFrameExtractor_prompt_guide.txt +0 -0
  22. {llm_ie-0.3.1 → llm_ie-0.3.2}/src/llm_ie/data_types.py +0 -0
  23. {llm_ie-0.3.1 → llm_ie-0.3.2}/src/llm_ie/engines.py +0 -0
--- llm_ie-0.3.1/PKG-INFO
+++ llm_ie-0.3.2/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: llm-ie
-Version: 0.3.1
+Version: 0.3.2
 Summary: An LLM-powered tool that transforms everyday language into robust information extraction pipelines.
 License: MIT
 Author: Enshuo (David) Hsu
--- llm_ie-0.3.1/pyproject.toml
+++ llm_ie-0.3.2/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "llm-ie"
-version = "0.3.1"
+version = "0.3.2"
 description = "An LLM-powered tool that transforms everyday language into robust information extraction pipelines."
 authors = ["Enshuo (David) Hsu"]
 license = "MIT"
--- llm_ie-0.3.1/src/llm_ie/extractors.py
+++ llm_ie-0.3.2/src/llm_ie/extractors.py
@@ -5,10 +5,11 @@ import inspect
 import importlib.resources
 import warnings
 import itertools
-from typing import List, Dict, Tuple, Union, Callable
+from typing import Set, List, Dict, Tuple, Union, Callable
 from llm_ie.data_types import LLMInformationExtractionFrame, LLMInformationExtractionDocument
 from llm_ie.engines import InferenceEngine
 from colorama import Fore, Style
+from nltk.tokenize import RegexpTokenizer
 
 
 class Extractor:
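
For context on the new nltk dependency: RegexpTokenizer(r'\w+|[^\w\s]') splits text into word tokens and single punctuation marks, discarding whitespace, and its span_tokenize() yields character offsets. A minimal sketch of the behavior the fuzzy-matching hunks below rely on (the example text is illustrative):

    from nltk.tokenize import RegexpTokenizer

    # Words (\w+) or single non-word, non-space characters ([^\w\s])
    tokenizer = RegexpTokenizer(r'\w+|[^\w\s]')

    text = "HTN, type-2 DM."
    print(tokenizer.tokenize(text))
    # ['HTN', ',', 'type', '-', '2', 'DM', '.']

    # span_tokenize() yields (start, end) character offsets instead of strings
    print(list(tokenizer.span_tokenize(text)))
    # [(0, 3), (3, 4), (5, 9), (9, 10), (10, 11), (12, 14), (14, 15)]
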
@@ -37,7 +38,7 @@ class Extractor:
         This method returns the pre-defined prompt guideline for the extractor from the package asset.
         """
         file_path = importlib.resources.files('llm_ie.asset.prompt_guide').joinpath(f"{cls.__name__}_prompt_guide.txt")
-        with open(file_path, 'r') as f:
+        with open(file_path, 'r', encoding="utf-8") as f:
             return f.read()
 
 
@@ -139,9 +140,68 @@ class FrameExtractor(Extractor):
                          prompt_template=prompt_template,
                          system_prompt=system_prompt,
                          **kwrs)
-
+        self.tokenizer = RegexpTokenizer(r'\w+|[^\w\s]')
+
+
+    def _jaccard_score(self, s1:set, s2:set) -> float:
+        """
+        This method calculates the Jaccard score between two sets of word tokens.
+        """
+        return len(s1.intersection(s2)) / len(s1.union(s2))
+
+
+    def _get_word_tokens(self, text) -> Tuple[List[str], List[Tuple[int]]]:
+        """
+        This method tokenizes the input text into a list of word tokens and their spans.
+        """
+        tokens = []
+        spans = []
+        for span in self.tokenizer.span_tokenize(text):
+            spans.append(span)
+            start, end = span
+            tokens.append(text[start:end])
+        return tokens, spans
+
+
+    def _get_closest_substring(self, text:str, pattern:str, buffer_size:float=0.2) -> Tuple[Tuple[int, int], float]:
+        """
+        This method finds the closest (highest Jaccard score) substring in text that matches the pattern.
+
+        Parameters
+        ----------
+        text : str
+            the input text.
+        pattern : str
+            the pattern to match.
+        buffer_size : float, Optional
+            the buffer size for the matching window. Default is 20% of pattern length.
+
+        Returns : Tuple[Tuple[int, int], float]
+            a tuple of 2-tuple span and Jaccard score.
+        """
+        text_tokens, text_spans = self._get_word_tokens(text)
+        pattern_tokens, _ = self._get_word_tokens(pattern)
+        pattern_tokens = set(pattern_tokens)
+        window_size = len(pattern_tokens)
+        window_size_min = int(window_size * (1 - buffer_size))
+        window_size_max = int(window_size * (1 + buffer_size))
+        closest_substring_spans = None
+        best_score = 0
+
+        for i in range(len(text_tokens) - window_size_max):
+            for w in range(window_size_min, window_size_max):
+                sub_str_tokens = set(text_tokens[i:i + w])
+                score = self._jaccard_score(sub_str_tokens, pattern_tokens)
+                if score > best_score:
+                    best_score = score
+                    sub_string_word_spans = text_spans[i:i + w]
+                    closest_substring_spans = (sub_string_word_spans[0][0], sub_string_word_spans[-1][-1])
 
-    def _find_entity_spans(self, text: str, entities: List[str], case_sensitive:bool=False) -> List[Tuple[int]]:
+        return closest_substring_spans, best_score
+
+
+    def _find_entity_spans(self, text: str, entities: List[str], case_sensitive:bool=False,
+                           fuzzy_match:bool=True, fuzzy_buffer_size:float=0.2, fuzzy_score_cutoff:float=0.8) -> List[Tuple[int]]:
         """
         This function inputs a text and a list of entity text,
         outputs a list of spans (2-tuple) for each entity.
@@ -151,19 +211,46 @@ class FrameExtractor(Extractor):
         ----------
         text : str
             text that contains entities
+        entities : List[str]
+            a list of entity text to find in the text
+        case_sensitive : bool, Optional
+            if True, entity text matching will be case-sensitive.
+        fuzzy_match : bool, Optional
+            if True, fuzzy matching will be applied to find entity text.
+        fuzzy_buffer_size : float, Optional
+            the buffer size for fuzzy matching. Default is 20% of entity text length.
+        fuzzy_score_cutoff : float, Optional
+            the Jaccard score cutoff for fuzzy matching.
+            Matched entity text must have a score higher than this value or a None will be returned.
         """
+        # Handle case sensitivity
+        if not case_sensitive:
+            text = text.lower()
+
+        # Match entities
         entity_spans = []
-        for entity in entities:
-            if case_sensitive:
-                match = re.search(re.escape(entity), text)
-            else:
-                match = re.search(re.escape(entity), text, re.IGNORECASE)
-
+        for entity in entities:
+            if not case_sensitive:
+                entity = entity.lower()
+
+            # Exact match
+            match = re.search(re.escape(entity), text)
             if match:
                 start, end = match.span()
                 entity_spans.append((start, end))
                 # Replace the found entity with spaces to avoid finding the same instance again
                 text = text[:start] + ' ' * (end - start) + text[end:]
+            # Fuzzy match
+            elif fuzzy_match:
+                closest_substring_span, best_score = self._get_closest_substring(text, entity, buffer_size=fuzzy_buffer_size)
+                if best_score >= fuzzy_score_cutoff:
+                    entity_spans.append(closest_substring_span)
+                    # Replace the found entity with spaces to avoid finding the same instance again
+                    text = text[:closest_substring_span[0]] + ' ' * (closest_substring_span[1] - closest_substring_span[0]) + text[closest_substring_span[1]:]
+                else:
+                    entity_spans.append(None)
+
+            # No match
             else:
                 entity_spans.append(None)
 
@@ -276,7 +363,9 @@ class BasicFrameExtractor(FrameExtractor):
 
 
     def extract_frames(self, text_content:Union[str, Dict[str,str]], entity_key:str, max_new_tokens:int=2048,
-                       temperature:float=0.0, case_sensitive:bool=False, document_key:str=None, **kwrs) -> List[LLMInformationExtractionFrame]:
+                       temperature:float=0.0, document_key:str=None, stream:bool=False,
+                       case_sensitive:bool=False, fuzzy_match:bool=True, fuzzy_buffer_size:float=0.2,
+                       fuzzy_score_cutoff:float=0.8, **kwrs) -> List[LLMInformationExtractionFrame]:
         """
         This method inputs a text and outputs a list of LLMInformationExtractionFrame
         It use the extract() method and post-process outputs into frames.
@@ -293,18 +382,30 @@ class BasicFrameExtractor(FrameExtractor):
             the max number of new tokens LLM should generate.
         temperature : float, Optional
             the temperature for token sampling.
-        case_sensitive : bool, Optional
-            if True, entity text matching will be case-sensitive.
         document_key : str, Optional
             specify the key in text_content where document text is.
             If text_content is str, this parameter will be ignored.
+        stream : bool, Optional
+            if True, LLM generated text will be printed in terminal in real-time.
+        case_sensitive : bool, Optional
+            if True, entity text matching will be case-sensitive.
+        fuzzy_match : bool, Optional
+            if True, fuzzy matching will be applied to find entity text.
+        fuzzy_buffer_size : float, Optional
+            the buffer size for fuzzy matching. Default is 20% of entity text length.
+        fuzzy_score_cutoff : float, Optional
+            the Jaccard score cutoff for fuzzy matching.
+            Matched entity text must have a score higher than this value or a None will be returned.
 
         Return : str
             a list of frames.
         """
         frame_list = []
         gen_text = self.extract(text_content=text_content,
-                                max_new_tokens=max_new_tokens, temperature=temperature, **kwrs)
+                                max_new_tokens=max_new_tokens,
+                                temperature=temperature,
+                                stream=stream,
+                                **kwrs)
 
         entity_json = []
         for entity in self._extract_json(gen_text=gen_text):
@@ -320,7 +421,10 @@ class BasicFrameExtractor(FrameExtractor):
 
         spans = self._find_entity_spans(text=text,
                                         entities=[e[entity_key] for e in entity_json],
-                                        case_sensitive=case_sensitive)
+                                        case_sensitive=case_sensitive,
+                                        fuzzy_match=fuzzy_match,
+                                        fuzzy_buffer_size=fuzzy_buffer_size,
+                                        fuzzy_score_cutoff=fuzzy_score_cutoff)
 
         for i, (ent, span) in enumerate(zip(entity_json, spans)):
             if span is not None:
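
With these changes, BasicFrameExtractor.extract_frames exposes streaming and fuzzy-matching controls directly. A hedged usage sketch (engine construction, constructor arguments, and the entity_key value are placeholders; only the extract_frames parameters shown here come from this diff):

    from llm_ie.extractors import BasicFrameExtractor

    def run_extraction(engine, prompt_template: str, note_text: str):
        # engine: an InferenceEngine instance from llm_ie.engines (construction omitted)
        extractor = BasicFrameExtractor(engine, prompt_template)
        frames = extractor.extract_frames(
            text_content=note_text,
            entity_key="entity_text",   # key of the entity text in the LLM's JSON output (name assumed)
            stream=True,                # print LLM output in real time
            case_sensitive=False,
            fuzzy_match=True,           # fall back to Jaccard matching when exact match fails
            fuzzy_buffer_size=0.2,      # match window may be +/-20% of the entity's token count
            fuzzy_score_cutoff=0.8,     # fuzzy matches scoring below 0.8 are dropped (span is None)
        )
        return frames
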
@@ -370,7 +474,7 @@ class ReviewFrameExtractor(BasicFrameExtractor):
         else:
             file_path = importlib.resources.files('llm_ie.asset.default_prompts').\
                 joinpath(f"{self.__class__.__name__}_{self.review_mode}_review_prompt.txt")
-            with open(file_path, 'r') as f:
+            with open(file_path, 'r', encoding="utf-8") as f:
                 self.review_prompt = f.read()
 
         warnings.warn(f'Custom review prompt not provided. The default review prompt is used:\n"{self.review_prompt}"', UserWarning)
@@ -559,8 +663,9 @@ class SentenceFrameExtractor(FrameExtractor):
 
 
     def extract_frames(self, text_content:Union[str, Dict[str,str]], entity_key:str, max_new_tokens:int=512,
-                       document_key:str=None, multi_turn:bool=False, temperature:float=0.0, case_sensitive:bool=False,
-                       stream:bool=False, **kwrs) -> List[LLMInformationExtractionFrame]:
+                       document_key:str=None, multi_turn:bool=False, temperature:float=0.0, stream:bool=False,
+                       case_sensitive:bool=False, fuzzy_match:bool=True, fuzzy_buffer_size:float=0.2, fuzzy_score_cutoff:float=0.8,
+                       **kwrs) -> List[LLMInformationExtractionFrame]:
         """
         This method inputs a text and outputs a list of LLMInformationExtractionFrame
         It use the extract() method and post-process outputs into frames.
@@ -586,10 +691,17 @@ class SentenceFrameExtractor(FrameExtractor):
             can better utilize the KV caching.
         temperature : float, Optional
             the temperature for token sampling.
-        case_sensitive : bool, Optional
-            if True, entity text matching will be case-sensitive.
         stream : bool, Optional
             if True, LLM generated text will be printed in terminal in real-time.
+        case_sensitive : bool, Optional
+            if True, entity text matching will be case-sensitive.
+        fuzzy_match : bool, Optional
+            if True, fuzzy matching will be applied to find entity text.
+        fuzzy_buffer_size : float, Optional
+            the buffer size for fuzzy matching. Default is 20% of entity text length.
+        fuzzy_score_cutoff : float, Optional
+            the Jaccard score cutoff for fuzzy matching.
+            Matched entity text must have a score higher than this value or a None will be returned.
 
         Return : str
             a list of frames.
@@ -611,7 +723,11 @@ class SentenceFrameExtractor(FrameExtractor):
                     warnings.warn(f'Extractor output "{entity}" does not have entity_key ("{entity_key}"). This frame will be dropped.', RuntimeWarning)
 
             spans = self._find_entity_spans(text=sent['sentence_text'],
-                                            entities=[e[entity_key] for e in entity_json], case_sensitive=case_sensitive)
+                                            entities=[e[entity_key] for e in entity_json],
+                                            case_sensitive=case_sensitive,
+                                            fuzzy_match=fuzzy_match,
+                                            fuzzy_buffer_size=fuzzy_buffer_size,
+                                            fuzzy_score_cutoff=fuzzy_score_cutoff)
             for ent, span in zip(entity_json, spans):
                 if span is not None:
                     start, end = span
@@ -663,7 +779,7 @@ class SentenceReviewFrameExtractor(SentenceFrameExtractor):
         else:
             file_path = importlib.resources.files('llm_ie.asset.default_prompts').\
                 joinpath(f"{self.__class__.__name__}_{self.review_mode}_review_prompt.txt")
-            with open(file_path, 'r') as f:
+            with open(file_path, 'r', encoding="utf-8") as f:
                 self.review_prompt = f.read()
 
         warnings.warn(f'Custom review prompt not provided. The default review prompt is used:\n"{self.review_prompt}"', UserWarning)
--- llm_ie-0.3.1/src/llm_ie/prompt_editor.py
+++ llm_ie-0.3.2/src/llm_ie/prompt_editor.py
@@ -5,8 +5,6 @@ from llm_ie.engines import InferenceEngine
 from llm_ie.extractors import FrameExtractor
 import re
 from colorama import Fore, Style
-import ipywidgets as widgets
-from IPython.display import display, HTML
 
 
 class PromptEditor:
@@ -121,6 +119,16 @@ class PromptEditor:
         """
         This method runs an interactive chat session in Jupyter/IPython using ipywidgets to help users write prompt templates.
         """
+        # Check if ipywidgets is installed
+        if importlib.util.find_spec("ipywidgets") is None:
+            raise ImportError("ipywidgets not found. Please install ipywidgets (```pip install ipywidgets```).")
+        import ipywidgets as widgets
+
+        # Check if IPython is installed
+        if importlib.util.find_spec("IPython") is None:
+            raise ImportError("IPython not found. Please install IPython (```pip install ipython```).")
+        from IPython.display import display, HTML
+
         # Load the chat prompt template from the resources
         file_path = importlib.resources.files('llm_ie.asset.PromptEditor_prompts').joinpath('chat.txt')
         with open(file_path, 'r') as f:
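
The prompt_editor.py hunks replace module-level ipywidgets/IPython imports with on-demand imports guarded by importlib.util.find_spec, so both become optional dependencies that are only required when the chat UI is actually launched. The general pattern, as a standalone sketch (not the package's exact code; note that importlib.util must itself be imported for find_spec to resolve):

    import importlib.util

    def launch_chat_ui():
        # Fail with an actionable message only when the optional feature is used
        if importlib.util.find_spec("ipywidgets") is None:
            raise ImportError("ipywidgets not found. Please install it (pip install ipywidgets).")
        import ipywidgets as widgets

        if importlib.util.find_spec("IPython") is None:
            raise ImportError("IPython not found. Please install it (pip install ipython).")
        from IPython.display import display, HTML

        # ... build widgets and display(...) the chat UI here ...
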