llm-ie 1.2.1__tar.gz → 1.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. {llm_ie-1.2.1 → llm_ie-1.2.3}/PKG-INFO +1 -1
  2. {llm_ie-1.2.1 → llm_ie-1.2.3}/pyproject.toml +1 -1
  3. llm_ie-1.2.3/src/llm_ie/__init__.py +12 -0
  4. {llm_ie-1.2.1 → llm_ie-1.2.3}/src/llm_ie/chunkers.py +78 -4
  5. {llm_ie-1.2.1 → llm_ie-1.2.3}/src/llm_ie/data_types.py +23 -37
  6. {llm_ie-1.2.1 → llm_ie-1.2.3}/src/llm_ie/engines.py +663 -112
  7. {llm_ie-1.2.1 → llm_ie-1.2.3}/src/llm_ie/extractors.py +357 -206
  8. {llm_ie-1.2.1 → llm_ie-1.2.3}/src/llm_ie/prompt_editor.py +4 -4
  9. llm_ie-1.2.1/src/llm_ie/__init__.py +0 -11
  10. {llm_ie-1.2.1 → llm_ie-1.2.3}/README.md +0 -0
  11. {llm_ie-1.2.1 → llm_ie-1.2.3}/src/llm_ie/asset/PromptEditor_prompts/chat.txt +0 -0
  12. {llm_ie-1.2.1 → llm_ie-1.2.3}/src/llm_ie/asset/PromptEditor_prompts/comment.txt +0 -0
  13. {llm_ie-1.2.1 → llm_ie-1.2.3}/src/llm_ie/asset/PromptEditor_prompts/rewrite.txt +0 -0
  14. {llm_ie-1.2.1 → llm_ie-1.2.3}/src/llm_ie/asset/PromptEditor_prompts/system.txt +0 -0
  15. {llm_ie-1.2.1 → llm_ie-1.2.3}/src/llm_ie/asset/default_prompts/BasicReviewFrameExtractor_addition_review_prompt.txt +0 -0
  16. {llm_ie-1.2.1 → llm_ie-1.2.3}/src/llm_ie/asset/default_prompts/BasicReviewFrameExtractor_revision_review_prompt.txt +0 -0
  17. {llm_ie-1.2.1 → llm_ie-1.2.3}/src/llm_ie/asset/default_prompts/ReviewFrameExtractor_addition_review_prompt.txt +0 -0
  18. {llm_ie-1.2.1 → llm_ie-1.2.3}/src/llm_ie/asset/default_prompts/ReviewFrameExtractor_revision_review_prompt.txt +0 -0
  19. {llm_ie-1.2.1 → llm_ie-1.2.3}/src/llm_ie/asset/default_prompts/SentenceReviewFrameExtractor_addition_review_prompt.txt +0 -0
  20. {llm_ie-1.2.1 → llm_ie-1.2.3}/src/llm_ie/asset/default_prompts/SentenceReviewFrameExtractor_revision_review_prompt.txt +0 -0
  21. {llm_ie-1.2.1 → llm_ie-1.2.3}/src/llm_ie/asset/prompt_guide/AttributeExtractor_prompt_guide.txt +0 -0
  22. {llm_ie-1.2.1 → llm_ie-1.2.3}/src/llm_ie/asset/prompt_guide/BasicFrameExtractor_prompt_guide.txt +0 -0
  23. {llm_ie-1.2.1 → llm_ie-1.2.3}/src/llm_ie/asset/prompt_guide/BasicReviewFrameExtractor_prompt_guide.txt +0 -0
  24. {llm_ie-1.2.1 → llm_ie-1.2.3}/src/llm_ie/asset/prompt_guide/BinaryRelationExtractor_prompt_guide.txt +0 -0
  25. {llm_ie-1.2.1 → llm_ie-1.2.3}/src/llm_ie/asset/prompt_guide/DirectFrameExtractor_prompt_guide.txt +0 -0
  26. {llm_ie-1.2.1 → llm_ie-1.2.3}/src/llm_ie/asset/prompt_guide/MultiClassRelationExtractor_prompt_guide.txt +0 -0
  27. {llm_ie-1.2.1 → llm_ie-1.2.3}/src/llm_ie/asset/prompt_guide/ReviewFrameExtractor_prompt_guide.txt +0 -0
  28. {llm_ie-1.2.1 → llm_ie-1.2.3}/src/llm_ie/asset/prompt_guide/SentenceFrameExtractor_prompt_guide.txt +0 -0
  29. {llm_ie-1.2.1 → llm_ie-1.2.3}/src/llm_ie/asset/prompt_guide/SentenceReviewFrameExtractor_prompt_guide.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: llm-ie
3
- Version: 1.2.1
3
+ Version: 1.2.3
4
4
  Summary: A comprehensive toolkit that provides building blocks for LLM-based named entity recognition, attribute extraction, and relation extraction pipelines.
5
5
  License: MIT
6
6
  Author: Enshuo (David) Hsu
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "llm-ie"
3
- version = "1.2.1"
3
+ version = "1.2.3"
4
4
  description = "A comprehensive toolkit that provides building blocks for LLM-based named entity recognition, attribute extraction, and relation extraction pipelines."
5
5
  authors = ["Enshuo (David) Hsu"]
6
6
  license = "MIT"
@@ -0,0 +1,12 @@
1
+ from .data_types import LLMInformationExtractionFrame, LLMInformationExtractionDocument
2
+ from .engines import BasicLLMConfig, ReasoningLLMConfig, Qwen3LLMConfig, OpenAIReasoningLLMConfig
3
+ from .engines import LlamaCppInferenceEngine, OllamaInferenceEngine, HuggingFaceHubInferenceEngine, VLLMInferenceEngine, OpenRouterInferenceEngine, OpenAIInferenceEngine, AzureOpenAIInferenceEngine, LiteLLMInferenceEngine
4
+ from .extractors import DirectFrameExtractor, ReviewFrameExtractor, BasicFrameExtractor, BasicReviewFrameExtractor, SentenceFrameExtractor, SentenceReviewFrameExtractor, AttributeExtractor, BinaryRelationExtractor, MultiClassRelationExtractor
5
+ from .chunkers import UnitChunker, WholeDocumentUnitChunker, SentenceUnitChunker, TextLineUnitChunker, SeparatorUnitChunker, ContextChunker, NoContextChunker, WholeDocumentContextChunker, SlideWindowContextChunker
6
+ from .prompt_editor import PromptEditor
7
+
8
+ __all__ = ["LLMInformationExtractionFrame", "LLMInformationExtractionDocument",
9
+ "BasicLLMConfig", "ReasoningLLMConfig", "Qwen3LLMConfig", "OpenAIReasoningLLMConfig", "LlamaCppInferenceEngine", "OllamaInferenceEngine", "HuggingFaceHubInferenceEngine", "VLLMInferenceEngine", "OpenRouterInferenceEngine", "OpenAIInferenceEngine", "AzureOpenAIInferenceEngine", "LiteLLMInferenceEngine",
10
+ "DirectFrameExtractor", "ReviewFrameExtractor", "BasicFrameExtractor", "BasicReviewFrameExtractor", "SentenceFrameExtractor", "SentenceReviewFrameExtractor", "AttributeExtractor", "BinaryRelationExtractor", "MultiClassRelationExtractor",
11
+ "UnitChunker", "WholeDocumentUnitChunker", "SentenceUnitChunker", "TextLineUnitChunker", "SeparatorUnitChunker", "ContextChunker", "NoContextChunker", "WholeDocumentContextChunker", "SlideWindowContextChunker",
12
+ "PromptEditor"]
@@ -1,5 +1,7 @@
1
1
  import abc
2
2
  from typing import Set, List, Dict, Tuple, Union, Callable
3
+ import asyncio
4
+ import uuid
3
5
  from llm_ie.data_types import FrameExtractionUnit
4
6
 
5
7
 
@@ -11,7 +13,8 @@ class UnitChunker(abc.ABC):
11
13
  """
12
14
  pass
13
15
 
14
- def chunk(self, text:str) -> List[FrameExtractionUnit]:
16
+ @abc.abstractmethod
17
+ def chunk(self, text:str, doc_id:str=None) -> List[FrameExtractionUnit]:
15
18
  """
16
19
  Parameters:
17
20
  ----------
@@ -20,6 +23,12 @@ class UnitChunker(abc.ABC):
20
23
  """
21
24
  return NotImplemented
22
25
 
26
+ async def chunk_async(self, text:str, doc_id:str=None, executor=None) -> List[FrameExtractionUnit]:
27
+ """
28
+ asynchronous version of chunk method.
29
+ """
30
+ loop = asyncio.get_running_loop()
31
+ return await loop.run_in_executor(executor, self.chunk, text, doc_id)
23
32
 
24
33
  class WholeDocumentUnitChunker(UnitChunker):
25
34
  def __init__(self):
@@ -28,7 +37,7 @@ class WholeDocumentUnitChunker(UnitChunker):
28
37
  """
29
38
  super().__init__()
30
39
 
31
- def chunk(self, text:str) -> List[FrameExtractionUnit]:
40
+ def chunk(self, text:str, doc_id:str=None) -> List[FrameExtractionUnit]:
32
41
  """
33
42
  Parameters:
34
43
  ----------
@@ -36,11 +45,49 @@ class WholeDocumentUnitChunker(UnitChunker):
36
45
  The document text.
37
46
  """
38
47
  return [FrameExtractionUnit(
48
+ doc_id=doc_id if doc_id is not None else str(uuid.uuid4()),
39
49
  start=0,
40
50
  end=len(text),
41
51
  text=text
42
52
  )]
43
53
 
54
+ class SeparatorUnitChunker(UnitChunker):
55
+ def __init__(self, sep:str):
56
+ """
57
+ This class chunks a document by separator provided.
58
+
59
+ Parameters:
60
+ ----------
61
+ sep : str
62
+ a separator string.
63
+ """
64
+ super().__init__()
65
+ if not isinstance(sep, str):
66
+ raise ValueError("sep must be a string")
67
+
68
+ self.sep = sep
69
+
70
+ def chunk(self, text:str, doc_id:str=None) -> List[FrameExtractionUnit]:
71
+ """
72
+ Parameters:
73
+ ----------
74
+ text : str
75
+ The document text.
76
+ """
77
+ paragraphs = text.split(self.sep)
78
+ paragraph_units = []
79
+ start = 0
80
+ for paragraph in paragraphs:
81
+ end = start + len(paragraph)
82
+ paragraph_units.append(FrameExtractionUnit(
83
+ doc_id=doc_id if doc_id is not None else str(uuid.uuid4()),
84
+ start=start,
85
+ end=end,
86
+ text=paragraph
87
+ ))
88
+ start = end + len(self.sep)
89
+ return paragraph_units
90
+
44
91
 
45
92
  class SentenceUnitChunker(UnitChunker):
46
93
  from nltk.tokenize.punkt import PunktSentenceTokenizer
@@ -50,7 +97,7 @@ class SentenceUnitChunker(UnitChunker):
50
97
  """
51
98
  super().__init__()
52
99
 
53
- def chunk(self, text:str) -> List[FrameExtractionUnit]:
100
+ def chunk(self, text:str, doc_id:str=None) -> List[FrameExtractionUnit]:
54
101
  """
55
102
  Parameters:
56
103
  ----------
@@ -60,6 +107,7 @@ class SentenceUnitChunker(UnitChunker):
60
107
  sentences = []
61
108
  for start, end in self.PunktSentenceTokenizer().span_tokenize(text):
62
109
  sentences.append(FrameExtractionUnit(
110
+ doc_id=doc_id if doc_id is not None else str(uuid.uuid4()),
63
111
  start=start,
64
112
  end=end,
65
113
  text=text[start:end]
@@ -74,7 +122,7 @@ class TextLineUnitChunker(UnitChunker):
74
122
  """
75
123
  super().__init__()
76
124
 
77
- def chunk(self, text:str) -> List[FrameExtractionUnit]:
125
+ def chunk(self, text:str, doc_id:str=None) -> List[FrameExtractionUnit]:
78
126
  """
79
127
  Parameters:
80
128
  ----------
@@ -87,6 +135,7 @@ class TextLineUnitChunker(UnitChunker):
87
135
  for line in lines:
88
136
  end = start + len(line)
89
137
  line_units.append(FrameExtractionUnit(
138
+ doc_id=doc_id if doc_id is not None else str(uuid.uuid4()),
90
139
  start=start,
91
140
  end=end,
92
141
  text=line
@@ -103,6 +152,24 @@ class ContextChunker(abc.ABC):
103
152
  """
104
153
  pass
105
154
 
155
+ @abc.abstractmethod
156
+ def fit(self, text:str, units:List[FrameExtractionUnit]):
157
+ """
158
+ Parameters:
159
+ ----------
160
+ text : str
161
+ The document text.
162
+ """
163
+ pass
164
+
165
+ async def fit_async(self, text:str, units:List[FrameExtractionUnit], executor=None):
166
+ """
167
+ asynchronous version of fit method.
168
+ """
169
+ loop = asyncio.get_running_loop()
170
+ return await loop.run_in_executor(executor, self.fit, text, units)
171
+
172
+ @abc.abstractmethod
106
173
  def chunk(self, unit:FrameExtractionUnit) -> str:
107
174
  """
108
175
  Parameters:
@@ -115,6 +182,13 @@ class ContextChunker(abc.ABC):
115
182
  """
116
183
  return NotImplemented
117
184
 
185
+ async def chunk_async(self, unit:FrameExtractionUnit, executor=None) -> str:
186
+ """
187
+ asynchronous version of chunk method.
188
+ """
189
+ loop = asyncio.get_running_loop()
190
+ return await loop.run_in_executor(executor, self.chunk, unit)
191
+
118
192
 
119
193
  class NoContextChunker(ContextChunker):
120
194
  def __init__(self):
@@ -7,13 +7,15 @@ import json
7
7
 
8
8
  @dataclass
9
9
  class FrameExtractionUnit:
10
- def __init__(self, start:int, end:int, text:str):
10
+ def __init__(self, doc_id:str, start:int, end:int, text:str):
11
11
  """
12
12
  This class holds the unit text for frame extraction, for example, a sentence.
13
13
  FrameExtractor prompt it one at a time to extract frames.
14
14
 
15
15
  Parameters
16
16
  ----------
17
+ doc_id : str, Optional
18
+ document ID.
17
19
  start : int
18
20
  start character position of the unit text, relative to the whole document
19
21
  end : int
@@ -21,9 +23,28 @@ class FrameExtractionUnit:
21
23
  text : str
22
24
  the unit text. Should be the exact string by [start:end]
23
25
  """
26
+ self.doc_id = doc_id
24
27
  self.start = start
25
28
  self.end = end
26
29
  self.text = text
30
+ # status: "pending", "success", "fail"
31
+ self.status = "pending"
32
+ # generated text by LLM
33
+ self.gen_text = None
34
+
35
+ def get_status(self) -> str:
36
+ return self.status
37
+
38
+ def set_status(self, status:str):
39
+ if status not in {"pending", "success", "fail"}:
40
+ raise ValueError('status must be one of {"pending", "success", "fail"}.')
41
+ self.status = status
42
+
43
+ def get_generated_text(self) -> str:
44
+ return self.gen_text
45
+
46
+ def set_generated_text(self, gen_text:str):
47
+ self.gen_text = gen_text
27
48
 
28
49
  def __eq__(self, other):
29
50
  if not isinstance(other, FrameExtractionUnit):
@@ -39,43 +60,8 @@ class FrameExtractionUnit:
39
60
  return self.start < other.start
40
61
 
41
62
  def __repr__(self):
42
- return f"FrameExtractionUnit(start={self.start}, end={self.end}, text='{self.text[:100]}...')"
43
-
44
-
45
- @dataclass
46
- class FrameExtractionUnitResult:
47
- def __init__(self, start:int, end:int, text:str, gen_text:str):
48
- """
49
- This class holds the unit text for frame extraction, for example, a sentence.
50
- FrameExtractor prompt it one at a time to extract frames.
51
-
52
- Parameters
53
- ----------
54
- start : int
55
- start character position of the unit text, relative to the whole document
56
- end : int
57
- end character position of the unit text, relative to the whole document
58
- text : str
59
- the unit text. Should be the exact string by [start:end]
60
- gen_text : str
61
- the generated text by LLM (ideally) following '[{"entity_text": "xxx", "attr": {"key": "value"}}]' format. Does not contain spans (start/end).
62
- """
63
- self.start = start
64
- self.end = end
65
- self.text = text
66
- self.gen_text = gen_text
67
-
68
- def __eq__(self, other):
69
- if not isinstance(other, FrameExtractionUnit):
70
- return NotImplemented
71
- return (self.start == other.start and self.end == other.end and self.text == other.text and self.gen_text == other.gen_text)
63
+ return f"FrameExtractionUnit(doc_id={self.doc_id}, start={self.start}, end={self.end}, status={self.status}, text='{self.text[:100]}...')"
72
64
 
73
- def __hash__(self):
74
- return hash((self.start, self.end, self.text, self.gen_text))
75
-
76
- def __repr__(self):
77
- return f"FrameExtractionUnitResult(start={self.start}, end={self.end}, text='{self.text[:100]}...', gen_text='{self.gen_text[:100]}...')"
78
-
79
65
 
80
66
  @dataclass
81
67
  class LLMInformationExtractionFrame: