wikontic 0.0.3__py3-none-any.whl → 0.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wikontic/create_wikidata_ontology_db.py +17 -4
- wikontic/utils/base_inference_with_db.py +9 -5
- wikontic/utils/inference_with_db.py +1 -0
- wikontic/utils/openai_utils.py +15 -8
- wikontic/utils/structured_aligner.py +25 -6
- wikontic/utils/structured_inference_with_db.py +1 -0
- {wikontic-0.0.3.dist-info → wikontic-0.0.4.dist-info}/METADATA +4 -3
- {wikontic-0.0.3.dist-info → wikontic-0.0.4.dist-info}/RECORD +11 -11
- {wikontic-0.0.3.dist-info → wikontic-0.0.4.dist-info}/WHEEL +0 -0
- {wikontic-0.0.3.dist-info → wikontic-0.0.4.dist-info}/licenses/LICENSE +0 -0
- {wikontic-0.0.3.dist-info → wikontic-0.0.4.dist-info}/top_level.txt +0 -0
|
@@ -13,7 +13,9 @@ import logging
|
|
|
13
13
|
import os
|
|
14
14
|
from pathlib import Path
|
|
15
15
|
import torch
|
|
16
|
+
from dotenv import load_dotenv, find_dotenv
|
|
16
17
|
|
|
18
|
+
_ = load_dotenv(find_dotenv())
|
|
17
19
|
# Configure logging
|
|
18
20
|
logging.basicConfig(
|
|
19
21
|
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
|
|
@@ -21,10 +23,21 @@ logging.basicConfig(
|
|
|
21
23
|
logger = logging.getLogger(__name__)
|
|
22
24
|
|
|
23
25
|
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
|
|
24
|
-
|
|
25
|
-
model
|
|
26
|
-
|
|
27
|
-
)
|
|
26
|
+
|
|
27
|
+
# Check for local model first, then fall back to remote
|
|
28
|
+
model_name = "facebook/contriever"
|
|
29
|
+
# local_model_path = os.getenv("HF_MODEL_PATH") or str(
|
|
30
|
+
# Path(__file__).parent.parent.parent.parent / "models" / "facebook--contriever"
|
|
31
|
+
# )
|
|
32
|
+
|
|
33
|
+
# if os.path.exists(local_model_path) and os.path.isdir(local_model_path):
|
|
34
|
+
# model_path = local_model_path
|
|
35
|
+
# else:
|
|
36
|
+
model_path = model_name
|
|
37
|
+
|
|
38
|
+
tokenizer = AutoTokenizer.from_pretrained(model_path)
|
|
39
|
+
model = AutoModel.from_pretrained(model_path, use_safetensors=True).to(device)
|
|
40
|
+
# model = AutoModel.from_pretrained(model_path).to(device)
|
|
28
41
|
|
|
29
42
|
|
|
30
43
|
class EntityType(BaseModel):
|
|
@@ -113,6 +113,8 @@ class BaseInferenceWithDB:
|
|
|
113
113
|
Returns:
|
|
114
114
|
A list of dictionaries with the subject, relation, object, and qualifiers that correspond to the 1-hop supporting triplets for the given entities.
|
|
115
115
|
"""
|
|
116
|
+
if len(entities4search) == 0:
|
|
117
|
+
return []
|
|
116
118
|
or_conditions = []
|
|
117
119
|
for ent in entities4search:
|
|
118
120
|
or_conditions.append({"$and": [{"subject": ent}]})
|
|
@@ -187,21 +189,23 @@ class BaseInferenceWithDB:
|
|
|
187
189
|
supporting_triplets = []
|
|
188
190
|
|
|
189
191
|
for _ in range(hop_depth):
|
|
190
|
-
new_entities4search =
|
|
192
|
+
new_entities4search = set()
|
|
191
193
|
new_supporting_triplets = self.get_1_hop_supporting_triplets(
|
|
192
194
|
entities4search, sample_id, use_qualifiers, use_filtered_triplets
|
|
193
195
|
)
|
|
194
|
-
|
|
196
|
+
for triplet in new_supporting_triplets:
|
|
197
|
+
if triplet not in supporting_triplets:
|
|
198
|
+
supporting_triplets.append(triplet)
|
|
195
199
|
|
|
196
200
|
for doc in supporting_triplets:
|
|
197
201
|
if doc["subject"] not in entities4search:
|
|
198
|
-
new_entities4search.
|
|
202
|
+
new_entities4search.add(doc["subject"])
|
|
199
203
|
if doc["object"] not in entities4search:
|
|
200
|
-
new_entities4search.
|
|
204
|
+
new_entities4search.add(doc["object"])
|
|
201
205
|
if use_qualifiers:
|
|
202
206
|
for q in doc["qualifiers"]:
|
|
203
207
|
if q["object"] not in entities4search:
|
|
204
|
-
new_entities4search.
|
|
208
|
+
new_entities4search.add(q["object"])
|
|
205
209
|
|
|
206
210
|
entities4search = list(set(new_entities4search))
|
|
207
211
|
|
|
@@ -29,6 +29,7 @@ class InferenceWithDB(BaseInferenceWithDB):
|
|
|
29
29
|
self.get_1_hop_supporting_triplets_tool = tool(
|
|
30
30
|
self.get_1_hop_supporting_triplets
|
|
31
31
|
)
|
|
32
|
+
self.answer_question_with_llm_tool = tool(self.answer_question_with_llm)
|
|
32
33
|
|
|
33
34
|
def sanitize_string(self, s):
|
|
34
35
|
s = str(s).strip().replace('\\"', "")
|
wikontic/utils/openai_utils.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import openai
|
|
2
|
-
|
|
3
|
-
|
|
2
|
+
|
|
3
|
+
# import os
|
|
4
|
+
# from dotenv import load_dotenv, find_dotenv
|
|
4
5
|
from tenacity import (
|
|
5
6
|
retry,
|
|
6
7
|
wait_random_exponential,
|
|
@@ -19,13 +20,11 @@ import httpx
|
|
|
19
20
|
# Configure logging
|
|
20
21
|
logging.basicConfig(stream=sys.stderr, level=logging.WARNING)
|
|
21
22
|
logger = logging.getLogger("OpenAIUtils")
|
|
22
|
-
logger.setLevel(logging.
|
|
23
|
-
logging.getLogger("httpx").setLevel(logging.
|
|
23
|
+
logger.setLevel(logging.DEBUG)
|
|
24
|
+
logging.getLogger("httpx").setLevel(logging.ERROR)
|
|
24
25
|
|
|
25
|
-
_ = load_dotenv(find_dotenv())
|
|
26
|
+
# _ = load_dotenv(find_dotenv())
|
|
26
27
|
# OpenAI
|
|
27
|
-
client = openai.OpenAI(api_key=os.getenv("KEY"))
|
|
28
|
-
|
|
29
28
|
MAX_ATTEMPTS = 1
|
|
30
29
|
|
|
31
30
|
|
|
@@ -43,11 +42,19 @@ class LLMTripletExtractor:
|
|
|
43
42
|
|
|
44
43
|
def __init__(
|
|
45
44
|
self,
|
|
45
|
+
api_key: str,
|
|
46
46
|
prompt_folder_path: str = str(Path(__file__).parent / "prompts"),
|
|
47
47
|
system_prompt_paths: Optional[Dict[str, str]] = None,
|
|
48
48
|
model: str = "gpt-4o",
|
|
49
49
|
max_attempts=MAX_ATTEMPTS,
|
|
50
|
+
proxy: str = None,
|
|
50
51
|
):
|
|
52
|
+
if proxy:
|
|
53
|
+
http_client = httpx.Client(proxy=proxy)
|
|
54
|
+
self.client = openai.OpenAI(api_key=api_key, http_client=http_client)
|
|
55
|
+
else:
|
|
56
|
+
self.client = openai.OpenAI(api_key=api_key)
|
|
57
|
+
|
|
51
58
|
"""
|
|
52
59
|
Initialize the LLMTripletExtractor.
|
|
53
60
|
|
|
@@ -141,7 +148,7 @@ class LLMTripletExtractor:
|
|
|
141
148
|
{"role": "user", "content": user_prompt},
|
|
142
149
|
]
|
|
143
150
|
|
|
144
|
-
response = client.chat.completions.create(
|
|
151
|
+
response = self.client.chat.completions.create(
|
|
145
152
|
model=self.model, messages=messages, temperature=0
|
|
146
153
|
)
|
|
147
154
|
self.completion_tokens_num += response.usage.completion_tokens
|
|
@@ -6,6 +6,7 @@ from pymongo import MongoClient, UpdateOne
|
|
|
6
6
|
import torch
|
|
7
7
|
from dotenv import load_dotenv, find_dotenv
|
|
8
8
|
import os
|
|
9
|
+
from pathlib import Path
|
|
9
10
|
|
|
10
11
|
# os.environ["CUDA_VISIBLE_DEVICES"] = "1"
|
|
11
12
|
_ = load_dotenv(find_dotenv())
|
|
@@ -47,12 +48,30 @@ class Aligner:
|
|
|
47
48
|
self.entities_vector_index_name = "entity_aliases"
|
|
48
49
|
|
|
49
50
|
self.device = torch.device(device)
|
|
50
|
-
# self.tokenizer = AutoTokenizer.from_pretrained(
|
|
51
|
-
|
|
52
|
-
#
|
|
53
|
-
self.model = AutoModel.from_pretrained(
|
|
54
|
-
|
|
55
|
-
).to(self.device)
|
|
51
|
+
# self.tokenizer = AutoTokenizer.from_pretrained(
|
|
52
|
+
# "facebook/contriever", token=os.getenv("HF_KEY")
|
|
53
|
+
# )
|
|
54
|
+
# self.model = AutoModel.from_pretrained(
|
|
55
|
+
# "facebook/contriever", token=os.getenv("HF_KEY"), use_safetensors=True
|
|
56
|
+
# ).to(self.device)
|
|
57
|
+
# Check for local model first, then fall back to remote
|
|
58
|
+
model_name = "facebook/contriever"
|
|
59
|
+
# local_model_path = os.getenv("HF_MODEL_PATH") or str(
|
|
60
|
+
# Path(__file__).parent.parent.parent.parent
|
|
61
|
+
# / "models"
|
|
62
|
+
# / "facebook--contriever"
|
|
63
|
+
# )
|
|
64
|
+
|
|
65
|
+
# if os.path.exists(local_model_path) and os.path.isdir(local_model_path):
|
|
66
|
+
# model_path = local_model_path
|
|
67
|
+
# else:
|
|
68
|
+
model_path = model_name
|
|
69
|
+
|
|
70
|
+
self.tokenizer = AutoTokenizer.from_pretrained(model_path)
|
|
71
|
+
self.model = AutoModel.from_pretrained(model_path, use_safetensors=True).to(
|
|
72
|
+
self.device
|
|
73
|
+
)
|
|
74
|
+
# self.model = AutoModel.from_pretrained(model_path).to(self.device)
|
|
56
75
|
|
|
57
76
|
def get_embedding(self, text):
|
|
58
77
|
|
|
@@ -33,6 +33,7 @@ class StructuredInferenceWithDB(BaseInferenceWithDB):
|
|
|
33
33
|
self.get_1_hop_supporting_triplets_tool = tool(
|
|
34
34
|
self.get_1_hop_supporting_triplets
|
|
35
35
|
)
|
|
36
|
+
self.answer_question_with_llm_tool = tool(self.answer_question_with_llm)
|
|
36
37
|
# 1st step extraction without database
|
|
37
38
|
|
|
38
39
|
def _refine_entity_types(self, text, triplet):
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: wikontic
|
|
3
|
-
Version: 0.0.3
|
|
3
|
+
Version: 0.0.4
|
|
4
4
|
Summary: Extract a knowledge graph with LLM from texts and perform QA over the resulted KG
|
|
5
5
|
Author-email: Alla Chepurova <chepurova.data@gmail.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -27,6 +27,7 @@ Requires-Dist: dataclasses
|
|
|
27
27
|
Requires-Dist: pydantic
|
|
28
28
|
Requires-Dist: accelerate
|
|
29
29
|
Requires-Dist: langchain
|
|
30
|
+
Requires-Dist: langchain_openai
|
|
30
31
|
Dynamic: license-file
|
|
31
32
|
|
|
32
33
|

|
|
@@ -78,9 +79,9 @@ Knowledge Graphs (KGs) provide structured, verifiable representations of knowled
|
|
|
78
79
|
- `Aligner` class: entity and relation name refinement
|
|
79
80
|
|
|
80
81
|
### Evaluation:
|
|
81
|
-
- `inference_and_eval
|
|
82
|
+
- `inference_and_eval/`
|
|
82
83
|
- Scripts for building KGs for MuSiQue and HotPot datasets and evaluation of QA performance
|
|
83
|
-
- `analysis
|
|
84
|
+
- `analysis/`
|
|
84
85
|
- Notebooks with downstream analysis of the resulted KG
|
|
85
86
|
|
|
86
87
|
### Use Wikontic as a service:
|
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
wikontic/__init__.py,sha256=9zw-dHDIyJ49TJ0PI3vHuW4wucW7_EhSyX98XE3_rys,483
|
|
2
2
|
wikontic/create_ontological_triplets_db.py,sha256=yz2Nc1kxbtAagZuovKpxc2P3OH4qBalQNd7m7s9kpWo,6764
|
|
3
3
|
wikontic/create_triplets_db.py,sha256=MsrNQmzkk6wxwy8gMV19FdRFCMiFmF13Z3bn2P9ZAQQ,8845
|
|
4
|
-
wikontic/create_wikidata_ontology_db.py,sha256=
|
|
4
|
+
wikontic/create_wikidata_ontology_db.py,sha256=hDI1prU4eU6DBe90EA_dN0ZqyWNyQOHlVRTxOipzq64,18559
|
|
5
5
|
wikontic/utils/__init__.py,sha256=U41kQFNPpfYV6KJpMnkqgqLkozqXiG4tgV6rj8IW1BU,7
|
|
6
|
-
wikontic/utils/base_inference_with_db.py,sha256=
|
|
6
|
+
wikontic/utils/base_inference_with_db.py,sha256=utG-ykcM88Y6JYCbbgOtc7HUOqtC3O3gSmKGY4WN_5E,13118
|
|
7
7
|
wikontic/utils/dynamic_aligner.py,sha256=xKw0spAHn6lxNRX_9xuLY5FtWEHAoIfwB2HYaAEgKJY,9616
|
|
8
|
-
wikontic/utils/inference_with_db.py,sha256=
|
|
9
|
-
wikontic/utils/openai_utils.py,sha256=
|
|
10
|
-
wikontic/utils/structured_aligner.py,sha256=
|
|
11
|
-
wikontic/utils/structured_inference_with_db.py,sha256=
|
|
8
|
+
wikontic/utils/inference_with_db.py,sha256=E6xUNfzOHS7mKbo_20dXiy6zgpVnMxegKi3zfw-p2vI,8905
|
|
9
|
+
wikontic/utils/openai_utils.py,sha256=zq9Z87FEbuqo3upqn4GSEkzggm7Yy69JDTQxfnLkTwo,20695
|
|
10
|
+
wikontic/utils/structured_aligner.py,sha256=ixRKKgtcLcsRkoLkfsXmYhDATiV0DonDBpPc9pbDPcM,22728
|
|
11
|
+
wikontic/utils/structured_inference_with_db.py,sha256=3vE3sZI2aVunRzr08S2IwFfb_MXmXYYBiksq-VD2DwM,22997
|
|
12
12
|
wikontic/utils/ontology_mappings/entity_hierarchy.json,sha256=QG-uGxDlgw_wHI40Y57wTUUbd5fPeKYtDsYGzdwP448,951258
|
|
13
13
|
wikontic/utils/ontology_mappings/entity_names.json,sha256=if_hnOL4RGgZyYgDmgKL41GL6zq0rG6WZ6EKIf6UXr0,144317
|
|
14
14
|
wikontic/utils/ontology_mappings/entity_type2aliases.json,sha256=r0QVK70KXAWVfDJ5ii0oKfyK59oy5YNzCu5LPiM8jZI,219271
|
|
@@ -46,8 +46,8 @@ wikontic/utils/prompts/qa/question_decomposition_1.txt,sha256=lDhkPRugox4zlqJxjr
|
|
|
46
46
|
wikontic/utils/prompts/triplet_extraction/prompt_1_types_qualifiers_dialog_bench.txt,sha256=1cnJxjTXvOMKIplJKaUgx-tLxe1hpZJtwNbcNxX8fuw,3885
|
|
47
47
|
wikontic/utils/prompts/triplet_extraction/prompt_1_types_qualifiers_dialog_bench_in_russian.txt,sha256=CqoM-9iMcbihwqkh38lVdZF3We1GYpju6u2cyKsb_AU,5425
|
|
48
48
|
wikontic/utils/prompts/triplet_extraction/propmt_1_types_qualifiers.txt,sha256=Nq940rMcrNZmVTOxMKL3xYB0bDhxINVWi2ShJkJ2xRo,4034
|
|
49
|
-
wikontic-0.0.3.dist-info/licenses/LICENSE,sha256=2bm9uFabQZ3Ykb_SaSU_uUbAj2-htc6WJQmS_65qD00,1073
|
|
50
|
-
wikontic-0.0.
|
|
51
|
-
wikontic-0.0.
|
|
52
|
-
wikontic-0.0.3.dist-info/top_level.txt,sha256=VkTVWaTtu5zD7QL2iF2cS4LOQAiPp_P0pssCxETRB_o,9
|
|
53
|
-
wikontic-0.0.3.dist-info/RECORD,,
|
|
49
|
+
wikontic-0.0.4.dist-info/licenses/LICENSE,sha256=2bm9uFabQZ3Ykb_SaSU_uUbAj2-htc6WJQmS_65qD00,1073
|
|
50
|
+
wikontic-0.0.4.dist-info/METADATA,sha256=Mx2aQ__vpco8NL6QnqQd68DpE4WclKKqO2Xu-1ZOS94,3346
|
|
51
|
+
wikontic-0.0.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
52
|
+
wikontic-0.0.4.dist-info/top_level.txt,sha256=VkTVWaTtu5zD7QL2iF2cS4LOQAiPp_P0pssCxETRB_o,9
|
|
53
|
+
wikontic-0.0.4.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|