wikontic 0.0.3__py3-none-any.whl → 0.0.4__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registry.
@@ -13,7 +13,9 @@ import logging
13
13
  import os
14
14
  from pathlib import Path
15
15
  import torch
16
+ from dotenv import load_dotenv, find_dotenv
16
17
 
18
+ _ = load_dotenv(find_dotenv())
17
19
  # Configure logging
18
20
  logging.basicConfig(
19
21
  level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
@@ -21,10 +23,21 @@ logging.basicConfig(
21
23
  logger = logging.getLogger(__name__)
22
24
 
23
25
  device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
24
- tokenizer = AutoTokenizer.from_pretrained("facebook/contriever")
25
- model = AutoModel.from_pretrained("facebook/contriever", use_safetensors=True).to(
26
- device
27
- )
26
+
27
+ # Check for local model first, then fall back to remote
28
+ model_name = "facebook/contriever"
29
+ # local_model_path = os.getenv("HF_MODEL_PATH") or str(
30
+ # Path(__file__).parent.parent.parent.parent / "models" / "facebook--contriever"
31
+ # )
32
+
33
+ # if os.path.exists(local_model_path) and os.path.isdir(local_model_path):
34
+ # model_path = local_model_path
35
+ # else:
36
+ model_path = model_name
37
+
38
+ tokenizer = AutoTokenizer.from_pretrained(model_path)
39
+ model = AutoModel.from_pretrained(model_path, use_safetensors=True).to(device)
40
+ # model = AutoModel.from_pretrained(model_path).to(device)
28
41
 
29
42
 
30
43
  class EntityType(BaseModel):
@@ -113,6 +113,8 @@ class BaseInferenceWithDB:
113
113
  Returns:
114
114
  A list of dictionaries with the subject, relation, object, and qualifiers that correspond to the 1-hop supporting triplets for the given entities.
115
115
  """
116
+ if len(entities4search) == 0:
117
+ return []
116
118
  or_conditions = []
117
119
  for ent in entities4search:
118
120
  or_conditions.append({"$and": [{"subject": ent}]})
@@ -187,21 +189,23 @@ class BaseInferenceWithDB:
187
189
  supporting_triplets = []
188
190
 
189
191
  for _ in range(hop_depth):
190
- new_entities4search = []
192
+ new_entities4search = set()
191
193
  new_supporting_triplets = self.get_1_hop_supporting_triplets(
192
194
  entities4search, sample_id, use_qualifiers, use_filtered_triplets
193
195
  )
194
- supporting_triplets.extend(new_supporting_triplets)
196
+ for triplet in new_supporting_triplets:
197
+ if triplet not in supporting_triplets:
198
+ supporting_triplets.append(triplet)
195
199
 
196
200
  for doc in supporting_triplets:
197
201
  if doc["subject"] not in entities4search:
198
- new_entities4search.append(doc["subject"])
202
+ new_entities4search.add(doc["subject"])
199
203
  if doc["object"] not in entities4search:
200
- new_entities4search.append(doc["object"])
204
+ new_entities4search.add(doc["object"])
201
205
  if use_qualifiers:
202
206
  for q in doc["qualifiers"]:
203
207
  if q["object"] not in entities4search:
204
- new_entities4search.append(q["object"])
208
+ new_entities4search.add(q["object"])
205
209
 
206
210
  entities4search = list(set(new_entities4search))
207
211
 
@@ -29,6 +29,7 @@ class InferenceWithDB(BaseInferenceWithDB):
29
29
  self.get_1_hop_supporting_triplets_tool = tool(
30
30
  self.get_1_hop_supporting_triplets
31
31
  )
32
+ self.answer_question_with_llm_tool = tool(self.answer_question_with_llm)
32
33
 
33
34
  def sanitize_string(self, s):
34
35
  s = str(s).strip().replace('\\"', "")
@@ -1,6 +1,7 @@
1
1
  import openai
2
- import os
3
- from dotenv import load_dotenv, find_dotenv
2
+
3
+ # import os
4
+ # from dotenv import load_dotenv, find_dotenv
4
5
  from tenacity import (
5
6
  retry,
6
7
  wait_random_exponential,
@@ -19,13 +20,11 @@ import httpx
19
20
  # Configure logging
20
21
  logging.basicConfig(stream=sys.stderr, level=logging.WARNING)
21
22
  logger = logging.getLogger("OpenAIUtils")
22
- logger.setLevel(logging.ERROR)
23
- logging.getLogger("httpx").setLevel(logging.WARNING)
23
+ logger.setLevel(logging.DEBUG)
24
+ logging.getLogger("httpx").setLevel(logging.ERROR)
24
25
 
25
- _ = load_dotenv(find_dotenv())
26
+ # _ = load_dotenv(find_dotenv())
26
27
  # OpenAI
27
- client = openai.OpenAI(api_key=os.getenv("KEY"))
28
-
29
28
  MAX_ATTEMPTS = 1
30
29
 
31
30
 
@@ -43,11 +42,19 @@ class LLMTripletExtractor:
43
42
 
44
43
  def __init__(
45
44
  self,
45
+ api_key: str,
46
46
  prompt_folder_path: str = str(Path(__file__).parent / "prompts"),
47
47
  system_prompt_paths: Optional[Dict[str, str]] = None,
48
48
  model: str = "gpt-4o",
49
49
  max_attempts=MAX_ATTEMPTS,
50
+ proxy: str = None,
50
51
  ):
52
+ if proxy:
53
+ http_client = httpx.Client(proxy=proxy)
54
+ self.client = openai.OpenAI(api_key=api_key, http_client=http_client)
55
+ else:
56
+ self.client = openai.OpenAI(api_key=api_key)
57
+
51
58
  """
52
59
  Initialize the LLMTripletExtractor.
53
60
 
@@ -141,7 +148,7 @@ class LLMTripletExtractor:
141
148
  {"role": "user", "content": user_prompt},
142
149
  ]
143
150
 
144
- response = client.chat.completions.create(
151
+ response = self.client.chat.completions.create(
145
152
  model=self.model, messages=messages, temperature=0
146
153
  )
147
154
  self.completion_tokens_num += response.usage.completion_tokens
@@ -6,6 +6,7 @@ from pymongo import MongoClient, UpdateOne
6
6
  import torch
7
7
  from dotenv import load_dotenv, find_dotenv
8
8
  import os
9
+ from pathlib import Path
9
10
 
10
11
  # os.environ["CUDA_VISIBLE_DEVICES"] = "1"
11
12
  _ = load_dotenv(find_dotenv())
@@ -47,12 +48,30 @@ class Aligner:
47
48
  self.entities_vector_index_name = "entity_aliases"
48
49
 
49
50
  self.device = torch.device(device)
50
- # self.tokenizer = AutoTokenizer.from_pretrained('facebook/contriever', token=os.getenv("HF_KEY"))
51
- self.tokenizer = AutoTokenizer.from_pretrained("facebook/contriever")
52
- # self.model = AutoModel.from_pretrained('facebook/contriever', token=os.getenv("HF_KEY")).to(self.device)
53
- self.model = AutoModel.from_pretrained(
54
- "facebook/contriever", use_safetensors=True
55
- ).to(self.device)
51
+ # self.tokenizer = AutoTokenizer.from_pretrained(
52
+ # "facebook/contriever", token=os.getenv("HF_KEY")
53
+ # )
54
+ # self.model = AutoModel.from_pretrained(
55
+ # "facebook/contriever", token=os.getenv("HF_KEY"), use_safetensors=True
56
+ # ).to(self.device)
57
+ # Check for local model first, then fall back to remote
58
+ model_name = "facebook/contriever"
59
+ # local_model_path = os.getenv("HF_MODEL_PATH") or str(
60
+ # Path(__file__).parent.parent.parent.parent
61
+ # / "models"
62
+ # / "facebook--contriever"
63
+ # )
64
+
65
+ # if os.path.exists(local_model_path) and os.path.isdir(local_model_path):
66
+ # model_path = local_model_path
67
+ # else:
68
+ model_path = model_name
69
+
70
+ self.tokenizer = AutoTokenizer.from_pretrained(model_path)
71
+ self.model = AutoModel.from_pretrained(model_path, use_safetensors=True).to(
72
+ self.device
73
+ )
74
+ # self.model = AutoModel.from_pretrained(model_path).to(self.device)
56
75
 
57
76
  def get_embedding(self, text):
58
77
 
@@ -33,6 +33,7 @@ class StructuredInferenceWithDB(BaseInferenceWithDB):
33
33
  self.get_1_hop_supporting_triplets_tool = tool(
34
34
  self.get_1_hop_supporting_triplets
35
35
  )
36
+ self.answer_question_with_llm_tool = tool(self.answer_question_with_llm)
36
37
  # 1st step extraction without database
37
38
 
38
39
  def _refine_entity_types(self, text, triplet):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: wikontic
3
- Version: 0.0.3
3
+ Version: 0.0.4
4
4
  Summary: Extract a knowledge graph with LLM from texts and perform QA over the resulting KG
5
5
  Author-email: Alla Chepurova <chepurova.data@gmail.com>
6
6
  License-Expression: MIT
@@ -27,6 +27,7 @@ Requires-Dist: dataclasses
27
27
  Requires-Dist: pydantic
28
28
  Requires-Dist: accelerate
29
29
  Requires-Dist: langchain
30
+ Requires-Dist: langchain_openai
30
31
  Dynamic: license-file
31
32
 
32
33
  ![Wikontic logo](/media/wikontic.png)
@@ -78,9 +79,9 @@ Knowledge Graphs (KGs) provide structured, verifiable representations of knowled
78
79
  - `Aligner` class: entity and relation name refinement
79
80
 
80
81
  ### Evaluation:
81
- - `inference_and_eval`
82
+ - `inference_and_eval/`
82
83
  - Scripts for building KGs for MuSiQue and HotPot datasets and evaluation of QA performance
83
- - `analysis`
84
+ - `analysis/`
84
85
  - Notebooks with downstream analysis of the resulting KG
85
86
 
86
87
  ### Use Wikontic as a service:
@@ -1,14 +1,14 @@
1
1
  wikontic/__init__.py,sha256=9zw-dHDIyJ49TJ0PI3vHuW4wucW7_EhSyX98XE3_rys,483
2
2
  wikontic/create_ontological_triplets_db.py,sha256=yz2Nc1kxbtAagZuovKpxc2P3OH4qBalQNd7m7s9kpWo,6764
3
3
  wikontic/create_triplets_db.py,sha256=MsrNQmzkk6wxwy8gMV19FdRFCMiFmF13Z3bn2P9ZAQQ,8845
4
- wikontic/create_wikidata_ontology_db.py,sha256=O_dDMtqTlVGhaLCDy9yIbFUhQXCdg5pGohR6MmONkAI,18071
4
+ wikontic/create_wikidata_ontology_db.py,sha256=hDI1prU4eU6DBe90EA_dN0ZqyWNyQOHlVRTxOipzq64,18559
5
5
  wikontic/utils/__init__.py,sha256=U41kQFNPpfYV6KJpMnkqgqLkozqXiG4tgV6rj8IW1BU,7
6
- wikontic/utils/base_inference_with_db.py,sha256=Jv8HxHwg2mBtqDHZTCzQdYY3Jjv8jDMr8nMR9FI6rWc,12965
6
+ wikontic/utils/base_inference_with_db.py,sha256=utG-ykcM88Y6JYCbbgOtc7HUOqtC3O3gSmKGY4WN_5E,13118
7
7
  wikontic/utils/dynamic_aligner.py,sha256=xKw0spAHn6lxNRX_9xuLY5FtWEHAoIfwB2HYaAEgKJY,9616
8
- wikontic/utils/inference_with_db.py,sha256=iuvF08DQ16SThovSGLCLyApme-AidLLWO9DPP0ozM3c,8824
9
- wikontic/utils/openai_utils.py,sha256=93f9w7V-zAVub4b79_zFvRWnyPyjqR4qBYRPUv6fVE8,20462
10
- wikontic/utils/structured_aligner.py,sha256=WzX_J0MaAUip1w3nXGXF8AZJ2n6UZraKBVgLa69Br9A,22093
11
- wikontic/utils/structured_inference_with_db.py,sha256=kbDC1q87cdqqtWEnJmopJCvG9Da75R_Xj2EOt0USAtA,22916
8
+ wikontic/utils/inference_with_db.py,sha256=E6xUNfzOHS7mKbo_20dXiy6zgpVnMxegKi3zfw-p2vI,8905
9
+ wikontic/utils/openai_utils.py,sha256=zq9Z87FEbuqo3upqn4GSEkzggm7Yy69JDTQxfnLkTwo,20695
10
+ wikontic/utils/structured_aligner.py,sha256=ixRKKgtcLcsRkoLkfsXmYhDATiV0DonDBpPc9pbDPcM,22728
11
+ wikontic/utils/structured_inference_with_db.py,sha256=3vE3sZI2aVunRzr08S2IwFfb_MXmXYYBiksq-VD2DwM,22997
12
12
  wikontic/utils/ontology_mappings/entity_hierarchy.json,sha256=QG-uGxDlgw_wHI40Y57wTUUbd5fPeKYtDsYGzdwP448,951258
13
13
  wikontic/utils/ontology_mappings/entity_names.json,sha256=if_hnOL4RGgZyYgDmgKL41GL6zq0rG6WZ6EKIf6UXr0,144317
14
14
  wikontic/utils/ontology_mappings/entity_type2aliases.json,sha256=r0QVK70KXAWVfDJ5ii0oKfyK59oy5YNzCu5LPiM8jZI,219271
@@ -46,8 +46,8 @@ wikontic/utils/prompts/qa/question_decomposition_1.txt,sha256=lDhkPRugox4zlqJxjr
46
46
  wikontic/utils/prompts/triplet_extraction/prompt_1_types_qualifiers_dialog_bench.txt,sha256=1cnJxjTXvOMKIplJKaUgx-tLxe1hpZJtwNbcNxX8fuw,3885
47
47
  wikontic/utils/prompts/triplet_extraction/prompt_1_types_qualifiers_dialog_bench_in_russian.txt,sha256=CqoM-9iMcbihwqkh38lVdZF3We1GYpju6u2cyKsb_AU,5425
48
48
  wikontic/utils/prompts/triplet_extraction/propmt_1_types_qualifiers.txt,sha256=Nq940rMcrNZmVTOxMKL3xYB0bDhxINVWi2ShJkJ2xRo,4034
49
- wikontic-0.0.3.dist-info/licenses/LICENSE,sha256=2bm9uFabQZ3Ykb_SaSU_uUbAj2-htc6WJQmS_65qD00,1073
50
- wikontic-0.0.3.dist-info/METADATA,sha256=DEQ-rt1sOOLzqOAvvmBFsufOOcnDzzDWO893P9KIApo,3312
51
- wikontic-0.0.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
52
- wikontic-0.0.3.dist-info/top_level.txt,sha256=VkTVWaTtu5zD7QL2iF2cS4LOQAiPp_P0pssCxETRB_o,9
53
- wikontic-0.0.3.dist-info/RECORD,,
49
+ wikontic-0.0.4.dist-info/licenses/LICENSE,sha256=2bm9uFabQZ3Ykb_SaSU_uUbAj2-htc6WJQmS_65qD00,1073
50
+ wikontic-0.0.4.dist-info/METADATA,sha256=Mx2aQ__vpco8NL6QnqQd68DpE4WclKKqO2Xu-1ZOS94,3346
51
+ wikontic-0.0.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
52
+ wikontic-0.0.4.dist-info/top_level.txt,sha256=VkTVWaTtu5zD7QL2iF2cS4LOQAiPp_P0pssCxETRB_o,9
53
+ wikontic-0.0.4.dist-info/RECORD,,