OntoLearner 1.4.6__tar.gz → 1.4.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ontolearner-1.4.6 → ontolearner-1.4.7}/PKG-INFO +2 -1
- ontolearner-1.4.7/ontolearner/VERSION +1 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/base/learner.py +5 -2
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/learner/__init__.py +1 -1
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/learner/llm.py +73 -3
- {ontolearner-1.4.6 → ontolearner-1.4.7}/pyproject.toml +2 -1
- ontolearner-1.4.6/ontolearner/VERSION +0 -1
- {ontolearner-1.4.6 → ontolearner-1.4.7}/LICENSE +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/README.md +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/images/logo.png +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/__init__.py +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/_learner.py +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/_ontology.py +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/base/__init__.py +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/base/ontology.py +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/base/text2onto.py +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/data_structure/__init__.py +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/data_structure/data.py +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/data_structure/metric.py +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/evaluation/__init__.py +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/evaluation/evaluate.py +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/evaluation/metrics.py +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/learner/label_mapper.py +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/learner/prompt.py +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/learner/rag.py +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/learner/retriever.py +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/ontology/__init__.py +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/ontology/agriculture.py +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/ontology/arts_humanities.py +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/ontology/biology.py +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/ontology/chemistry.py +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/ontology/ecology_environment.py +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/ontology/education.py +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/ontology/events.py +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/ontology/finance.py +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/ontology/food_beverage.py +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/ontology/general.py +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/ontology/geography.py +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/ontology/industry.py +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/ontology/law.py +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/ontology/library_cultural_heritage.py +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/ontology/material_science_engineering.py +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/ontology/medicine.py +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/ontology/news_media.py +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/ontology/scholarly_knowledge.py +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/ontology/social_sciences.py +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/ontology/units_measurements.py +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/ontology/upper_ontologies.py +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/ontology/web.py +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/processor.py +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/text2onto/__init__.py +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/text2onto/batchifier.py +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/text2onto/general.py +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/text2onto/splitter.py +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/text2onto/synthesizer.py +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/tools/__init__.py +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/tools/analyzer.py +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/tools/visualizer.py +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/utils/__init__.py +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/utils/io.py +0 -0
- {ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/utils/train_test_split.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: OntoLearner
|
|
3
|
-
Version: 1.4.6
|
|
3
|
+
Version: 1.4.7
|
|
4
4
|
Summary: OntoLearner: A Modular Python Library for Ontology Learning with LLMs.
|
|
5
5
|
License: MIT
|
|
6
6
|
License-File: LICENSE
|
|
@@ -17,6 +17,7 @@ Requires-Dist: bitsandbytes (>=0.45.1,<0.46.0)
|
|
|
17
17
|
Requires-Dist: dspy (>=2.6.14,<3.0.0)
|
|
18
18
|
Requires-Dist: huggingface-hub (>=0.34.4,<0.35.0)
|
|
19
19
|
Requires-Dist: matplotlib
|
|
20
|
+
Requires-Dist: mistral-common[sentencepiece] (>=1.8.5,<2.0.0)
|
|
20
21
|
Requires-Dist: networkx (==3.2.1)
|
|
21
22
|
Requires-Dist: numpy
|
|
22
23
|
Requires-Dist: openpyxl
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
1.4.7
|
|
@@ -238,7 +238,7 @@ class AutoLLM(ABC):
|
|
|
238
238
|
if self.device == "cpu":
|
|
239
239
|
device_map = "cpu"
|
|
240
240
|
else:
|
|
241
|
-
device_map = "
|
|
241
|
+
device_map = "balanced"
|
|
242
242
|
self.model = AutoModelForCausalLM.from_pretrained(
|
|
243
243
|
model_id,
|
|
244
244
|
device_map=device_map,
|
|
@@ -271,7 +271,10 @@ class AutoLLM(ABC):
|
|
|
271
271
|
Responses include the original input plus generated continuation.
|
|
272
272
|
"""
|
|
273
273
|
# Tokenize inputs and move to device
|
|
274
|
-
encoded_inputs = self.tokenizer(inputs,
|
|
274
|
+
encoded_inputs = self.tokenizer(inputs,
|
|
275
|
+
return_tensors="pt",
|
|
276
|
+
padding=True,
|
|
277
|
+
truncation=True).to(self.model.device)
|
|
275
278
|
input_ids = encoded_inputs["input_ids"]
|
|
276
279
|
input_length = input_ids.shape[1]
|
|
277
280
|
|
|
@@ -12,7 +12,7 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
|
-
from .llm import AutoLLMLearner
|
|
15
|
+
from .llm import AutoLLMLearner, FalconLLM, MistralLLM
|
|
16
16
|
from .retriever import AutoRetrieverLearner
|
|
17
17
|
from .rag import AutoRAGLearner
|
|
18
18
|
from .prompt import StandardizedPrompting
|
|
@@ -13,23 +13,27 @@
|
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
15
|
from ..base import AutoLLM, AutoLearner
|
|
16
|
-
from typing import Any
|
|
16
|
+
from typing import Any, List
|
|
17
17
|
import warnings
|
|
18
18
|
from tqdm import tqdm
|
|
19
19
|
from torch.utils.data import DataLoader
|
|
20
|
-
|
|
20
|
+
import torch
|
|
21
|
+
from transformers import Mistral3ForConditionalGeneration
|
|
22
|
+
from mistral_common.protocol.instruct.request import ChatCompletionRequest
|
|
23
|
+
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
|
|
21
24
|
|
|
22
25
|
class AutoLLMLearner(AutoLearner):
|
|
23
26
|
|
|
24
27
|
def __init__(self,
|
|
25
28
|
prompting,
|
|
26
29
|
label_mapper,
|
|
30
|
+
llm: AutoLLM = AutoLLM,
|
|
27
31
|
token: str = "",
|
|
28
32
|
max_new_tokens: int = 5,
|
|
29
33
|
batch_size: int = 10,
|
|
30
34
|
device='cpu') -> None:
|
|
31
35
|
super().__init__()
|
|
32
|
-
self.llm =
|
|
36
|
+
self.llm = llm(token=token, label_mapper=label_mapper, device=device)
|
|
33
37
|
self.prompting = prompting
|
|
34
38
|
self.batch_size = batch_size
|
|
35
39
|
self.max_new_tokens = max_new_tokens
|
|
@@ -136,3 +140,69 @@ class AutoLLMLearner(AutoLearner):
|
|
|
136
140
|
return self._non_taxonomic_re_predict(dataset=dataset)
|
|
137
141
|
else:
|
|
138
142
|
warnings.warn("No requirement for fiting the non-taxonomic-re model, the predict module will use the input data to do the task.")
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
class FalconLLM(AutoLLM):
|
|
146
|
+
|
|
147
|
+
def generate(self, inputs: List[str], max_new_tokens: int = 50) -> List[str]:
|
|
148
|
+
encoded_inputs = self.tokenizer(inputs,
|
|
149
|
+
return_tensors="pt",
|
|
150
|
+
padding=True,
|
|
151
|
+
truncation=True).to(self.model.device)
|
|
152
|
+
input_ids = encoded_inputs["input_ids"]
|
|
153
|
+
input_length = input_ids.shape[1]
|
|
154
|
+
outputs = self.model.generate(
|
|
155
|
+
input_ids,
|
|
156
|
+
max_new_tokens=max_new_tokens,
|
|
157
|
+
pad_token_id=self.tokenizer.eos_token_id
|
|
158
|
+
)
|
|
159
|
+
generated_tokens = outputs[:, input_length:]
|
|
160
|
+
decoded_outputs = [self.tokenizer.decode(g, skip_special_tokens=True).strip() for g in generated_tokens]
|
|
161
|
+
return self.label_mapper.predict(decoded_outputs)
|
|
162
|
+
|
|
163
|
+
class MistralLLM(AutoLLM):
|
|
164
|
+
|
|
165
|
+
def load(self, model_id: str) -> None:
|
|
166
|
+
self.tokenizer = MistralTokenizer.from_hf_hub(model_id)
|
|
167
|
+
if self.device == "cpu":
|
|
168
|
+
device_map = "cpu"
|
|
169
|
+
else:
|
|
170
|
+
device_map = "balanced"
|
|
171
|
+
self.model = Mistral3ForConditionalGeneration.from_pretrained(
|
|
172
|
+
model_id,
|
|
173
|
+
device_map=device_map,
|
|
174
|
+
torch_dtype=torch.bfloat16,
|
|
175
|
+
token=self.token
|
|
176
|
+
)
|
|
177
|
+
if not hasattr(self.tokenizer, "pad_token_id") or self.tokenizer.pad_token_id is None:
|
|
178
|
+
self.tokenizer.pad_token_id = self.model.generation_config.eos_token_id
|
|
179
|
+
self.label_mapper.fit()
|
|
180
|
+
|
|
181
|
+
def generate(self, inputs: List[str], max_new_tokens: int = 50) -> List[str]:
|
|
182
|
+
tokenized_list = []
|
|
183
|
+
for prompt in inputs:
|
|
184
|
+
messages = [{"role": "user", "content": [{"type": "text", "text": prompt}]}]
|
|
185
|
+
tokenized = self.tokenizer.encode_chat_completion(ChatCompletionRequest(messages=messages))
|
|
186
|
+
tokenized_list.append(tokenized.tokens)
|
|
187
|
+
max_len = max(len(tokens) for tokens in tokenized_list)
|
|
188
|
+
input_ids, attention_masks = [], []
|
|
189
|
+
for tokens in tokenized_list:
|
|
190
|
+
pad_length = max_len - len(tokens)
|
|
191
|
+
input_ids.append(tokens + [self.tokenizer.pad_token_id] * pad_length)
|
|
192
|
+
attention_masks.append([1] * len(tokens) + [0] * pad_length)
|
|
193
|
+
|
|
194
|
+
input_ids = torch.tensor(input_ids).to(self.model.device)
|
|
195
|
+
attention_masks = torch.tensor(attention_masks).to(self.model.device)
|
|
196
|
+
|
|
197
|
+
outputs =self.model.generate(
|
|
198
|
+
input_ids=input_ids,
|
|
199
|
+
attention_mask=attention_masks,
|
|
200
|
+
eos_token_id=self.model.generation_config.eos_token_id,
|
|
201
|
+
pad_token_id=self.tokenizer.pad_token_id,
|
|
202
|
+
max_new_tokens=max_new_tokens,
|
|
203
|
+
)
|
|
204
|
+
decoded_outputs = []
|
|
205
|
+
for i, tokens in enumerate(outputs):
|
|
206
|
+
output_text = self.tokenizer.decode(tokens[len(tokenized_list[i]):])
|
|
207
|
+
decoded_outputs.append(output_text)
|
|
208
|
+
return self.label_mapper.predict(decoded_outputs)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "OntoLearner"
|
|
3
|
-
version = "1.4.6"
|
|
3
|
+
version = "1.4.7"
|
|
4
4
|
description = "OntoLearner: A Modular Python Library for Ontology Learning with LLMs."
|
|
5
5
|
authors = ["Hamed Babaei Giglou <hamedbabaeigiglou@gmail.com>", "Andrei C. Aioanei <andrei.c.aioanei@gmail.com>"]
|
|
6
6
|
license = "MIT License"
|
|
@@ -29,6 +29,7 @@ transformers = "^4.56.0"
|
|
|
29
29
|
sentence-transformers = "^5.1.0"
|
|
30
30
|
dspy = "^2.6.14"
|
|
31
31
|
bitsandbytes="^0.45.1"
|
|
32
|
+
mistral-common = { version = "^1.8.5", extras = ["sentencepiece"] }
|
|
32
33
|
|
|
33
34
|
[tool.poetry.dev-dependencies]
|
|
34
35
|
ruff = "*"
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
1.4.6
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{ontolearner-1.4.6 → ontolearner-1.4.7}/ontolearner/ontology/material_science_engineering.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|