llm-ie 1.2.2__tar.gz → 1.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. {llm_ie-1.2.2 → llm_ie-1.2.3}/PKG-INFO +1 -1
  2. {llm_ie-1.2.2 → llm_ie-1.2.3}/pyproject.toml +1 -1
  3. {llm_ie-1.2.2 → llm_ie-1.2.3}/src/llm_ie/__init__.py +5 -4
  4. {llm_ie-1.2.2 → llm_ie-1.2.3}/src/llm_ie/chunkers.py +44 -5
  5. {llm_ie-1.2.2 → llm_ie-1.2.3}/src/llm_ie/data_types.py +23 -37
  6. {llm_ie-1.2.2 → llm_ie-1.2.3}/src/llm_ie/engines.py +577 -61
  7. {llm_ie-1.2.2 → llm_ie-1.2.3}/src/llm_ie/extractors.py +335 -219
  8. {llm_ie-1.2.2 → llm_ie-1.2.3}/README.md +0 -0
  9. {llm_ie-1.2.2 → llm_ie-1.2.3}/src/llm_ie/asset/PromptEditor_prompts/chat.txt +0 -0
  10. {llm_ie-1.2.2 → llm_ie-1.2.3}/src/llm_ie/asset/PromptEditor_prompts/comment.txt +0 -0
  11. {llm_ie-1.2.2 → llm_ie-1.2.3}/src/llm_ie/asset/PromptEditor_prompts/rewrite.txt +0 -0
  12. {llm_ie-1.2.2 → llm_ie-1.2.3}/src/llm_ie/asset/PromptEditor_prompts/system.txt +0 -0
  13. {llm_ie-1.2.2 → llm_ie-1.2.3}/src/llm_ie/asset/default_prompts/BasicReviewFrameExtractor_addition_review_prompt.txt +0 -0
  14. {llm_ie-1.2.2 → llm_ie-1.2.3}/src/llm_ie/asset/default_prompts/BasicReviewFrameExtractor_revision_review_prompt.txt +0 -0
  15. {llm_ie-1.2.2 → llm_ie-1.2.3}/src/llm_ie/asset/default_prompts/ReviewFrameExtractor_addition_review_prompt.txt +0 -0
  16. {llm_ie-1.2.2 → llm_ie-1.2.3}/src/llm_ie/asset/default_prompts/ReviewFrameExtractor_revision_review_prompt.txt +0 -0
  17. {llm_ie-1.2.2 → llm_ie-1.2.3}/src/llm_ie/asset/default_prompts/SentenceReviewFrameExtractor_addition_review_prompt.txt +0 -0
  18. {llm_ie-1.2.2 → llm_ie-1.2.3}/src/llm_ie/asset/default_prompts/SentenceReviewFrameExtractor_revision_review_prompt.txt +0 -0
  19. {llm_ie-1.2.2 → llm_ie-1.2.3}/src/llm_ie/asset/prompt_guide/AttributeExtractor_prompt_guide.txt +0 -0
  20. {llm_ie-1.2.2 → llm_ie-1.2.3}/src/llm_ie/asset/prompt_guide/BasicFrameExtractor_prompt_guide.txt +0 -0
  21. {llm_ie-1.2.2 → llm_ie-1.2.3}/src/llm_ie/asset/prompt_guide/BasicReviewFrameExtractor_prompt_guide.txt +0 -0
  22. {llm_ie-1.2.2 → llm_ie-1.2.3}/src/llm_ie/asset/prompt_guide/BinaryRelationExtractor_prompt_guide.txt +0 -0
  23. {llm_ie-1.2.2 → llm_ie-1.2.3}/src/llm_ie/asset/prompt_guide/DirectFrameExtractor_prompt_guide.txt +0 -0
  24. {llm_ie-1.2.2 → llm_ie-1.2.3}/src/llm_ie/asset/prompt_guide/MultiClassRelationExtractor_prompt_guide.txt +0 -0
  25. {llm_ie-1.2.2 → llm_ie-1.2.3}/src/llm_ie/asset/prompt_guide/ReviewFrameExtractor_prompt_guide.txt +0 -0
  26. {llm_ie-1.2.2 → llm_ie-1.2.3}/src/llm_ie/asset/prompt_guide/SentenceFrameExtractor_prompt_guide.txt +0 -0
  27. {llm_ie-1.2.2 → llm_ie-1.2.3}/src/llm_ie/asset/prompt_guide/SentenceReviewFrameExtractor_prompt_guide.txt +0 -0
  28. {llm_ie-1.2.2 → llm_ie-1.2.3}/src/llm_ie/prompt_editor.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: llm-ie
3
- Version: 1.2.2
3
+ Version: 1.2.3
4
4
  Summary: A comprehensive toolkit that provides building blocks for LLM-based named entity recognition, attribute extraction, and relation extraction pipelines.
5
5
  License: MIT
6
6
  Author: Enshuo (David) Hsu
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "llm-ie"
3
- version = "1.2.2"
3
+ version = "1.2.3"
4
4
  description = "A comprehensive toolkit that provides building blocks for LLM-based named entity recognition, attribute extraction, and relation extraction pipelines."
5
5
  authors = ["Enshuo (David) Hsu"]
6
6
  license = "MIT"
@@ -1,11 +1,12 @@
1
1
  from .data_types import LLMInformationExtractionFrame, LLMInformationExtractionDocument
2
- from .engines import BasicLLMConfig, ReasoningLLMConfig, Qwen3LLMConfig, OpenAIReasoningLLMConfig, LlamaCppInferenceEngine, OllamaInferenceEngine, HuggingFaceHubInferenceEngine, OpenAIInferenceEngine, AzureOpenAIInferenceEngine, LiteLLMInferenceEngine
2
+ from .engines import BasicLLMConfig, ReasoningLLMConfig, Qwen3LLMConfig, OpenAIReasoningLLMConfig
3
+ from .engines import LlamaCppInferenceEngine, OllamaInferenceEngine, HuggingFaceHubInferenceEngine, VLLMInferenceEngine, OpenRouterInferenceEngine, OpenAIInferenceEngine, AzureOpenAIInferenceEngine, LiteLLMInferenceEngine
3
4
  from .extractors import DirectFrameExtractor, ReviewFrameExtractor, BasicFrameExtractor, BasicReviewFrameExtractor, SentenceFrameExtractor, SentenceReviewFrameExtractor, AttributeExtractor, BinaryRelationExtractor, MultiClassRelationExtractor
4
- from .chunkers import UnitChunker, WholeDocumentUnitChunker, SentenceUnitChunker, TextLineUnitChunker, ContextChunker, NoContextChunker, WholeDocumentContextChunker, SlideWindowContextChunker
5
+ from .chunkers import UnitChunker, WholeDocumentUnitChunker, SentenceUnitChunker, TextLineUnitChunker, SeparatorUnitChunker, ContextChunker, NoContextChunker, WholeDocumentContextChunker, SlideWindowContextChunker
5
6
  from .prompt_editor import PromptEditor
6
7
 
7
8
  __all__ = ["LLMInformationExtractionFrame", "LLMInformationExtractionDocument",
8
- "BasicLLMConfig", "ReasoningLLMConfig", "Qwen3LLMConfig", "OpenAIReasoningLLMConfig", "LlamaCppInferenceEngine", "OllamaInferenceEngine", "HuggingFaceHubInferenceEngine", "OpenAIInferenceEngine", "AzureOpenAIInferenceEngine", "LiteLLMInferenceEngine",
9
+ "BasicLLMConfig", "ReasoningLLMConfig", "Qwen3LLMConfig", "OpenAIReasoningLLMConfig", "LlamaCppInferenceEngine", "OllamaInferenceEngine", "HuggingFaceHubInferenceEngine", "VLLMInferenceEngine", "OpenRouterInferenceEngine", "OpenAIInferenceEngine", "AzureOpenAIInferenceEngine", "LiteLLMInferenceEngine",
9
10
  "DirectFrameExtractor", "ReviewFrameExtractor", "BasicFrameExtractor", "BasicReviewFrameExtractor", "SentenceFrameExtractor", "SentenceReviewFrameExtractor", "AttributeExtractor", "BinaryRelationExtractor", "MultiClassRelationExtractor",
10
- "UnitChunker", "WholeDocumentUnitChunker", "SentenceUnitChunker", "TextLineUnitChunker", "ContextChunker", "NoContextChunker", "WholeDocumentContextChunker", "SlideWindowContextChunker",
11
+ "UnitChunker", "WholeDocumentUnitChunker", "SentenceUnitChunker", "TextLineUnitChunker", "SeparatorUnitChunker", "ContextChunker", "NoContextChunker", "WholeDocumentContextChunker", "SlideWindowContextChunker",
11
12
  "PromptEditor"]
@@ -1,5 +1,7 @@
1
1
  import abc
2
2
  from typing import Set, List, Dict, Tuple, Union, Callable
3
+ import asyncio
4
+ import uuid
3
5
  from llm_ie.data_types import FrameExtractionUnit
4
6
 
5
7
 
@@ -11,7 +13,8 @@ class UnitChunker(abc.ABC):
11
13
  """
12
14
  pass
13
15
 
14
- def chunk(self, text:str) -> List[FrameExtractionUnit]:
16
+ @abc.abstractmethod
17
+ def chunk(self, text:str, doc_id:str=None) -> List[FrameExtractionUnit]:
15
18
  """
16
19
  Parameters:
17
20
  ----------
@@ -20,6 +23,12 @@ class UnitChunker(abc.ABC):
20
23
  """
21
24
  return NotImplemented
22
25
 
26
+ async def chunk_async(self, text:str, doc_id:str=None, executor=None) -> List[FrameExtractionUnit]:
27
+ """
28
+ asynchronous version of chunk method.
29
+ """
30
+ loop = asyncio.get_running_loop()
31
+ return await loop.run_in_executor(executor, self.chunk, text, doc_id)
23
32
 
24
33
  class WholeDocumentUnitChunker(UnitChunker):
25
34
  def __init__(self):
@@ -28,7 +37,7 @@ class WholeDocumentUnitChunker(UnitChunker):
28
37
  """
29
38
  super().__init__()
30
39
 
31
- def chunk(self, text:str) -> List[FrameExtractionUnit]:
40
+ def chunk(self, text:str, doc_id:str=None) -> List[FrameExtractionUnit]:
32
41
  """
33
42
  Parameters:
34
43
  ----------
@@ -36,6 +45,7 @@ class WholeDocumentUnitChunker(UnitChunker):
36
45
  The document text.
37
46
  """
38
47
  return [FrameExtractionUnit(
48
+ doc_id=doc_id if doc_id is not None else str(uuid.uuid4()),
39
49
  start=0,
40
50
  end=len(text),
41
51
  text=text
@@ -57,7 +67,7 @@ class SeparatorUnitChunker(UnitChunker):
57
67
 
58
68
  self.sep = sep
59
69
 
60
- def chunk(self, text:str) -> List[FrameExtractionUnit]:
70
+ def chunk(self, text:str, doc_id:str=None) -> List[FrameExtractionUnit]:
61
71
  """
62
72
  Parameters:
63
73
  ----------
@@ -70,6 +80,7 @@ class SeparatorUnitChunker(UnitChunker):
70
80
  for paragraph in paragraphs:
71
81
  end = start + len(paragraph)
72
82
  paragraph_units.append(FrameExtractionUnit(
83
+ doc_id=doc_id if doc_id is not None else str(uuid.uuid4()),
73
84
  start=start,
74
85
  end=end,
75
86
  text=paragraph
@@ -77,6 +88,7 @@ class SeparatorUnitChunker(UnitChunker):
77
88
  start = end + len(self.sep)
78
89
  return paragraph_units
79
90
 
91
+
80
92
  class SentenceUnitChunker(UnitChunker):
81
93
  from nltk.tokenize.punkt import PunktSentenceTokenizer
82
94
  def __init__(self):
@@ -85,7 +97,7 @@ class SentenceUnitChunker(UnitChunker):
85
97
  """
86
98
  super().__init__()
87
99
 
88
- def chunk(self, text:str) -> List[FrameExtractionUnit]:
100
+ def chunk(self, text:str, doc_id:str=None) -> List[FrameExtractionUnit]:
89
101
  """
90
102
  Parameters:
91
103
  ----------
@@ -95,6 +107,7 @@ class SentenceUnitChunker(UnitChunker):
95
107
  sentences = []
96
108
  for start, end in self.PunktSentenceTokenizer().span_tokenize(text):
97
109
  sentences.append(FrameExtractionUnit(
110
+ doc_id=doc_id if doc_id is not None else str(uuid.uuid4()),
98
111
  start=start,
99
112
  end=end,
100
113
  text=text[start:end]
@@ -109,7 +122,7 @@ class TextLineUnitChunker(UnitChunker):
109
122
  """
110
123
  super().__init__()
111
124
 
112
- def chunk(self, text:str) -> List[FrameExtractionUnit]:
125
+ def chunk(self, text:str, doc_id:str=None) -> List[FrameExtractionUnit]:
113
126
  """
114
127
  Parameters:
115
128
  ----------
@@ -122,6 +135,7 @@ class TextLineUnitChunker(UnitChunker):
122
135
  for line in lines:
123
136
  end = start + len(line)
124
137
  line_units.append(FrameExtractionUnit(
138
+ doc_id=doc_id if doc_id is not None else str(uuid.uuid4()),
125
139
  start=start,
126
140
  end=end,
127
141
  text=line
@@ -138,6 +152,24 @@ class ContextChunker(abc.ABC):
138
152
  """
139
153
  pass
140
154
 
155
+ @abc.abstractmethod
156
+ def fit(self, text:str, units:List[FrameExtractionUnit]):
157
+ """
158
+ Parameters:
159
+ ----------
160
+ text : str
161
+ The document text.
162
+ """
163
+ pass
164
+
165
+ async def fit_async(self, text:str, units:List[FrameExtractionUnit], executor=None):
166
+ """
167
+ asynchronous version of fit method.
168
+ """
169
+ loop = asyncio.get_running_loop()
170
+ return await loop.run_in_executor(executor, self.fit, text, units)
171
+
172
+ @abc.abstractmethod
141
173
  def chunk(self, unit:FrameExtractionUnit) -> str:
142
174
  """
143
175
  Parameters:
@@ -150,6 +182,13 @@ class ContextChunker(abc.ABC):
150
182
  """
151
183
  return NotImplemented
152
184
 
185
+ async def chunk_async(self, unit:FrameExtractionUnit, executor=None) -> str:
186
+ """
187
+ asynchronous version of chunk method.
188
+ """
189
+ loop = asyncio.get_running_loop()
190
+ return await loop.run_in_executor(executor, self.chunk, unit)
191
+
153
192
 
154
193
  class NoContextChunker(ContextChunker):
155
194
  def __init__(self):
@@ -7,13 +7,15 @@ import json
7
7
 
8
8
  @dataclass
9
9
  class FrameExtractionUnit:
10
- def __init__(self, start:int, end:int, text:str):
10
+ def __init__(self, doc_id:str, start:int, end:int, text:str):
11
11
  """
12
12
  This class holds the unit text for frame extraction, for example, a sentence.
13
13
  FrameExtractor prompt it one at a time to extract frames.
14
14
 
15
15
  Parameters
16
16
  ----------
17
+ doc_id : str, Optional
18
+ document ID.
17
19
  start : int
18
20
  start character position of the unit text, relative to the whole document
19
21
  end : int
@@ -21,9 +23,28 @@ class FrameExtractionUnit:
21
23
  text : str
22
24
  the unit text. Should be the exact string by [start:end]
23
25
  """
26
+ self.doc_id = doc_id
24
27
  self.start = start
25
28
  self.end = end
26
29
  self.text = text
30
+ # status: "pending", "success", "fail"
31
+ self.status = "pending"
32
+ # generated text by LLM
33
+ self.gen_text = None
34
+
35
+ def get_status(self) -> str:
36
+ return self.status
37
+
38
+ def set_status(self, status:str):
39
+ if status not in {"pending", "success", "fail"}:
40
+ raise ValueError('status must be one of {"pending", "success", "fail"}.')
41
+ self.status = status
42
+
43
+ def get_generated_text(self) -> str:
44
+ return self.gen_text
45
+
46
+ def set_generated_text(self, gen_text:str):
47
+ self.gen_text = gen_text
27
48
 
28
49
  def __eq__(self, other):
29
50
  if not isinstance(other, FrameExtractionUnit):
@@ -39,43 +60,8 @@ class FrameExtractionUnit:
39
60
  return self.start < other.start
40
61
 
41
62
  def __repr__(self):
42
- return f"FrameExtractionUnit(start={self.start}, end={self.end}, text='{self.text[:100]}...')"
43
-
44
-
45
- @dataclass
46
- class FrameExtractionUnitResult:
47
- def __init__(self, start:int, end:int, text:str, gen_text:str):
48
- """
49
- This class holds the unit text for frame extraction, for example, a sentence.
50
- FrameExtractor prompt it one at a time to extract frames.
51
-
52
- Parameters
53
- ----------
54
- start : int
55
- start character position of the unit text, relative to the whole document
56
- end : int
57
- end character position of the unit text, relative to the whole document
58
- text : str
59
- the unit text. Should be the exact string by [start:end]
60
- gen_text : str
61
- the generated text by LLM (ideally) following '[{"entity_text": "xxx", "attr": {"key": "value"}}]' format. Does not contain spans (start/end).
62
- """
63
- self.start = start
64
- self.end = end
65
- self.text = text
66
- self.gen_text = gen_text
67
-
68
- def __eq__(self, other):
69
- if not isinstance(other, FrameExtractionUnit):
70
- return NotImplemented
71
- return (self.start == other.start and self.end == other.end and self.text == other.text and self.gen_text == other.gen_text)
63
+ return f"FrameExtractionUnit(doc_id={self.doc_id}, start={self.start}, end={self.end}, status={self.status}, text='{self.text[:100]}...')"
72
64
 
73
- def __hash__(self):
74
- return hash((self.start, self.end, self.text, self.gen_text))
75
-
76
- def __repr__(self):
77
- return f"FrameExtractionUnitResult(start={self.start}, end={self.end}, text='{self.text[:100]}...', gen_text='{self.gen_text[:100]}...')"
78
-
79
65
 
80
66
  @dataclass
81
67
  class LLMInformationExtractionFrame: