llm-ie 0.4.5__py3-none-any.whl → 0.4.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
llm_ie/extractors.py CHANGED
@@ -224,7 +224,8 @@ class FrameExtractor(Extractor):
224
224
 
225
225
 
226
226
  def _find_entity_spans(self, text: str, entities: List[str], case_sensitive:bool=False,
227
- fuzzy_match:bool=True, fuzzy_buffer_size:float=0.2, fuzzy_score_cutoff:float=0.8) -> List[Tuple[int]]:
227
+ fuzzy_match:bool=True, fuzzy_buffer_size:float=0.2, fuzzy_score_cutoff:float=0.8,
228
+ allow_overlap_entities:bool=False) -> List[Tuple[int]]:
228
229
  """
229
230
  This function inputs a text and a list of entity text,
230
231
  outputs a list of spans (2-tuple) for each entity.
@@ -245,6 +246,8 @@ class FrameExtractor(Extractor):
245
246
  fuzzy_score_cutoff : float, Optional
246
247
  the Jaccard score cutoff for fuzzy matching.
247
248
  Matched entity text must have a score of at least this value; otherwise None is returned for that entity.
249
+ allow_overlap_entities : bool, Optional
250
+ if True, entities can overlap in the text.
248
251
  """
249
252
  # Handle case sensitivity
250
253
  if not case_sensitive:
@@ -264,15 +267,17 @@ class FrameExtractor(Extractor):
264
267
  if match and entity:
265
268
  start, end = match.span()
266
269
  entity_spans.append((start, end))
267
- # Replace the found entity with spaces to avoid finding the same instance again
268
- text = text[:start] + ' ' * (end - start) + text[end:]
270
+ if not allow_overlap_entities:
271
+ # Replace the found entity with spaces to avoid finding the same instance again
272
+ text = text[:start] + ' ' * (end - start) + text[end:]
269
273
  # Fuzzy match
270
274
  elif fuzzy_match:
271
275
  closest_substring_span, best_score = self._get_closest_substring(text, entity, buffer_size=fuzzy_buffer_size)
272
276
  if closest_substring_span and best_score >= fuzzy_score_cutoff:
273
277
  entity_spans.append(closest_substring_span)
274
- # Replace the found entity with spaces to avoid finding the same instance again
275
- text = text[:closest_substring_span[0]] + ' ' * (closest_substring_span[1] - closest_substring_span[0]) + text[closest_substring_span[1]:]
278
+ if not allow_overlap_entities:
279
+ # Replace the found entity with spaces to avoid finding the same instance again
280
+ text = text[:closest_substring_span[0]] + ' ' * (closest_substring_span[1] - closest_substring_span[0]) + text[closest_substring_span[1]:]
276
281
  else:
277
282
  entity_spans.append(None)
278
283
 
@@ -391,7 +396,7 @@ class BasicFrameExtractor(FrameExtractor):
391
396
  def extract_frames(self, text_content:Union[str, Dict[str,str]], entity_key:str, max_new_tokens:int=2048,
392
397
  temperature:float=0.0, document_key:str=None, stream:bool=False,
393
398
  case_sensitive:bool=False, fuzzy_match:bool=True, fuzzy_buffer_size:float=0.2,
394
- fuzzy_score_cutoff:float=0.8, **kwrs) -> List[LLMInformationExtractionFrame]:
399
+ fuzzy_score_cutoff:float=0.8, allow_overlap_entities:bool=False, **kwrs) -> List[LLMInformationExtractionFrame]:
395
400
  """
396
401
  This method inputs a text and outputs a list of LLMInformationExtractionFrame
397
402
  It uses the extract() method and post-processes outputs into frames.
@@ -422,6 +427,9 @@ class BasicFrameExtractor(FrameExtractor):
422
427
  fuzzy_score_cutoff : float, Optional
423
428
  the Jaccard score cutoff for fuzzy matching.
424
429
  Matched entity text must have a score of at least this value; otherwise None is returned for that entity.
430
+ allow_overlap_entities : bool, Optional
431
+ if True, entities can overlap in the text.
432
+ Note that this can cause multiple frames to be generated on the same entity span if they have the same entity text.
425
433
 
426
434
  Return : List[LLMInformationExtractionFrame]
427
435
  a list of frames.
@@ -452,7 +460,8 @@ class BasicFrameExtractor(FrameExtractor):
452
460
  case_sensitive=case_sensitive,
453
461
  fuzzy_match=fuzzy_match,
454
462
  fuzzy_buffer_size=fuzzy_buffer_size,
455
- fuzzy_score_cutoff=fuzzy_score_cutoff)
463
+ fuzzy_score_cutoff=fuzzy_score_cutoff,
464
+ allow_overlap_entities=allow_overlap_entities)
456
465
 
457
466
  for i, (ent, span) in enumerate(zip(entity_json, spans)):
458
467
  if span is not None:
@@ -761,8 +770,8 @@ class SentenceFrameExtractor(FrameExtractor):
761
770
  sentences = self._get_sentences(text_content[document_key])
762
771
 
763
772
  # generate sentence by sentence
764
- tasks = []
765
773
  for i in range(0, len(sentences), concurrent_batch_size):
774
+ tasks = []
766
775
  batch = sentences[i:i + concurrent_batch_size]
767
776
  for j, sent in enumerate(batch):
768
777
  # construct chat messages
@@ -797,12 +806,12 @@ class SentenceFrameExtractor(FrameExtractor):
797
806
  # Wait until the batch is done, collect results and move on to next batch
798
807
  responses = await asyncio.gather(*tasks)
799
808
 
800
- # Collect outputs
801
- for gen_text, sent in zip(responses, sentences):
802
- output.append({'sentence_start': sent['start'],
803
- 'sentence_end': sent['end'],
804
- 'sentence_text': sent['sentence_text'],
805
- 'gen_text': gen_text})
809
+ # Collect outputs
810
+ for gen_text, sent in zip(responses, batch):
811
+ output.append({'sentence_start': sent['start'],
812
+ 'sentence_end': sent['end'],
813
+ 'sentence_text': sent['sentence_text'],
814
+ 'gen_text': gen_text})
806
815
  return output
807
816
 
808
817
 
@@ -810,7 +819,7 @@ class SentenceFrameExtractor(FrameExtractor):
810
819
  document_key:str=None, temperature:float=0.0, stream:bool=False,
811
820
  concurrent:bool=False, concurrent_batch_size:int=32,
812
821
  case_sensitive:bool=False, fuzzy_match:bool=True, fuzzy_buffer_size:float=0.2, fuzzy_score_cutoff:float=0.8,
813
- **kwrs) -> List[LLMInformationExtractionFrame]:
822
+ allow_overlap_entities:bool=False, **kwrs) -> List[LLMInformationExtractionFrame]:
814
823
  """
815
824
  This method inputs a text and outputs a list of LLMInformationExtractionFrame
816
825
  It uses the extract() method and post-processes outputs into frames.
@@ -845,6 +854,9 @@ class SentenceFrameExtractor(FrameExtractor):
845
854
  fuzzy_score_cutoff : float, Optional
846
855
  the Jaccard score cutoff for fuzzy matching.
847
856
  Matched entity text must have a score of at least this value; otherwise None is returned for that entity.
857
+ allow_overlap_entities : bool, Optional
858
+ if True, entities can overlap in the text.
859
+ Note that this can cause multiple frames to be generated on the same entity span if they have the same entity text.
848
860
 
849
861
  Return : List[LLMInformationExtractionFrame]
850
862
  a list of frames.
@@ -882,7 +894,8 @@ class SentenceFrameExtractor(FrameExtractor):
882
894
  case_sensitive=case_sensitive,
883
895
  fuzzy_match=fuzzy_match,
884
896
  fuzzy_buffer_size=fuzzy_buffer_size,
885
- fuzzy_score_cutoff=fuzzy_score_cutoff)
897
+ fuzzy_score_cutoff=fuzzy_score_cutoff,
898
+ allow_overlap_entities=allow_overlap_entities)
886
899
  for ent, span in zip(entity_json, spans):
887
900
  if span is not None:
888
901
  start, end = span
@@ -1083,9 +1096,10 @@ class SentenceReviewFrameExtractor(SentenceFrameExtractor):
1083
1096
  sentences = self._get_sentences(text_content[document_key])
1084
1097
 
1085
1098
  # generate initial outputs sentence by sentence
1086
- tasks = []
1087
- messages_list = []
1088
1099
  for i in range(0, len(sentences), concurrent_batch_size):
1100
+ messages_list = []
1101
+ init_tasks = []
1102
+ review_tasks = []
1089
1103
  batch = sentences[i:i + concurrent_batch_size]
1090
1104
  for j, sent in enumerate(batch):
1091
1105
  # construct chat messages
@@ -1116,24 +1130,21 @@ class SentenceReviewFrameExtractor(SentenceFrameExtractor):
1116
1130
  **kwrs
1117
1131
  )
1118
1132
  )
1119
- tasks.append(task)
1133
+ init_tasks.append(task)
1120
1134
 
1121
- # Wait until the batch is done, collect results and move on to next batch
1122
- responses = await asyncio.gather(*tasks)
1123
- # Collect initials
1124
- initials = []
1125
- for gen_text, sent, messages in zip(responses, sentences, messages_list):
1126
- initials.append({'sentence_start': sent['start'],
1127
- 'sentence_end': sent['end'],
1128
- 'sentence_text': sent['sentence_text'],
1129
- 'gen_text': gen_text,
1130
- 'messages': messages})
1135
+ # Wait until the batch is done, collect results and move on to next batch
1136
+ init_responses = await asyncio.gather(*init_tasks)
1137
+ # Collect initials
1138
+ initials = []
1139
+ for gen_text, sent, messages in zip(init_responses, batch, messages_list):
1140
+ initials.append({'sentence_start': sent['start'],
1141
+ 'sentence_end': sent['end'],
1142
+ 'sentence_text': sent['sentence_text'],
1143
+ 'gen_text': gen_text,
1144
+ 'messages': messages})
1131
1145
 
1132
- # Review
1133
- tasks = []
1134
- for i in range(0, len(initials), concurrent_batch_size):
1135
- batch = initials[i:i + concurrent_batch_size]
1136
- for init in batch:
1146
+ # Review
1147
+ for init in initials:
1137
1148
  messages = init["messages"]
1138
1149
  initial = init["gen_text"]
1139
1150
  messages.append({'role': 'assistant', 'content': initial})
@@ -1146,29 +1157,29 @@ class SentenceReviewFrameExtractor(SentenceFrameExtractor):
1146
1157
  **kwrs
1147
1158
  )
1148
1159
  )
1149
- tasks.append(task)
1160
+ review_tasks.append(task)
1150
1161
 
1151
- responses = await asyncio.gather(*tasks)
1152
-
1153
- # Collect reviews
1154
- reviews = []
1155
- for gen_text, sent in zip(responses, sentences):
1156
- reviews.append({'sentence_start': sent['start'],
1157
- 'sentence_end': sent['end'],
1158
- 'sentence_text': sent['sentence_text'],
1159
- 'gen_text': gen_text})
1160
-
1161
- for init, rev in zip(initials, reviews):
1162
- if self.review_mode == "revision":
1163
- gen_text = rev['gen_text']
1164
- elif self.review_mode == "addition":
1165
- gen_text = init['gen_text'] + '\n' + rev['gen_text']
1166
-
1167
- # add to output
1168
- output.append({'sentence_start': init['sentence_start'],
1169
- 'sentence_end': init['sentence_end'],
1170
- 'sentence_text': init['sentence_text'],
1171
- 'gen_text': gen_text})
1162
+ review_responses = await asyncio.gather(*review_tasks)
1163
+
1164
+ # Collect reviews
1165
+ reviews = []
1166
+ for gen_text, sent in zip(review_responses, batch):
1167
+ reviews.append({'sentence_start': sent['start'],
1168
+ 'sentence_end': sent['end'],
1169
+ 'sentence_text': sent['sentence_text'],
1170
+ 'gen_text': gen_text})
1171
+
1172
+ for init, rev in zip(initials, reviews):
1173
+ if self.review_mode == "revision":
1174
+ gen_text = rev['gen_text']
1175
+ elif self.review_mode == "addition":
1176
+ gen_text = init['gen_text'] + '\n' + rev['gen_text']
1177
+
1178
+ # add to output
1179
+ output.append({'sentence_start': init['sentence_start'],
1180
+ 'sentence_end': init['sentence_end'],
1181
+ 'sentence_text': init['sentence_text'],
1182
+ 'gen_text': gen_text})
1172
1183
  return output
1173
1184
 
1174
1185
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: llm-ie
3
- Version: 0.4.5
3
+ Version: 0.4.6
4
4
  Summary: An LLM-powered tool that transforms everyday language into robust information extraction pipelines.
5
5
  License: MIT
6
6
  Author: Enshuo (David) Hsu
@@ -44,7 +44,7 @@ An LLM-powered tool that transforms everyday language into robust information ex
44
44
  - [v0.4.5](https://github.com/daviden1013/llm-ie/releases/tag/v0.4.5) (Feb 16, 2025):
45
45
  - Added option to adjust number of context sentences in sentence-based extractors.
46
46
  - Added support for OpenAI reasoning models ("o" series).
47
-
47
+ - [v0.4.6](https://github.com/daviden1013/llm-ie/releases/tag/v0.4.6) (Mar 1, 2025): Allow LLM to output overlapping frames.
48
48
 
49
49
  ## Table of Contents
50
50
  - [Overview](#overview)
@@ -16,8 +16,8 @@ llm_ie/asset/prompt_guide/SentenceFrameExtractor_prompt_guide.txt,sha256=YomMhiA
16
16
  llm_ie/asset/prompt_guide/SentenceReviewFrameExtractor_prompt_guide.txt,sha256=YomMhiA6BoBIJFOZ1jcoKRqY7_T3PGUsM0t0vd1IuZA,9325
17
17
  llm_ie/data_types.py,sha256=_Kt4Er1SMj1jg8U8TCXFJH_64prur-IbFngHKmZgWr8,15717
18
18
  llm_ie/engines.py,sha256=nWQzV7mcRCOkJ-U1iP-xrT9dVahVbj-nAjQci4XaRjY,22609
19
- llm_ie/extractors.py,sha256=JgMtENUXJMXPpPyVBg-QiaSRJdQul6ASnQmAg1nYmiI,88758
19
+ llm_ie/extractors.py,sha256=xdg1aYN_My9N9HrzKuMniqFCzNCq7E3DKI7Ru9gawFs,89822
20
20
  llm_ie/prompt_editor.py,sha256=pHRbg_yFZdoV63r3pvf0TsLfgH2EVJvzUQEVDH1Hj0s,9570
21
- llm_ie-0.4.5.dist-info/METADATA,sha256=Ff_g89MgEFANtaFMilNAS7FXNpdOg7C89XX3uLdR4WE,55621
22
- llm_ie-0.4.5.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
23
- llm_ie-0.4.5.dist-info/RECORD,,
21
+ llm_ie-0.4.6.dist-info/METADATA,sha256=wYKoOc0fvKvEAOvHGnMGOGgaKP49rmDUnHhA8vKBrgg,55745
22
+ llm_ie-0.4.6.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
23
+ llm_ie-0.4.6.dist-info/RECORD,,
File without changes