wikontic 0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wikontic/__init__.py +16 -0
- wikontic/create_ontological_triplets_db.py +193 -0
- wikontic/create_triplets_db.py +259 -0
- wikontic/create_wikidata_ontology_db.py +555 -0
- wikontic/utils/__init__.py +7 -0
- wikontic/utils/base_inference_with_db.py +329 -0
- wikontic/utils/dynamic_aligner.py +281 -0
- wikontic/utils/inference_with_db.py +224 -0
- wikontic/utils/ontology_mappings/entity_hierarchy.json +1 -0
- wikontic/utils/ontology_mappings/entity_names.json +1 -0
- wikontic/utils/ontology_mappings/entity_type2aliases.json +1 -0
- wikontic/utils/ontology_mappings/entity_type2hierarchy.json +1 -0
- wikontic/utils/ontology_mappings/entity_type2label.json +1 -0
- wikontic/utils/ontology_mappings/enum_entity_ids.json +1 -0
- wikontic/utils/ontology_mappings/enum_prop_ids.json +1 -0
- wikontic/utils/ontology_mappings/label2entity.json +1 -0
- wikontic/utils/ontology_mappings/obj_constraint2prop.json +1 -0
- wikontic/utils/ontology_mappings/prop2aliases.json +1 -0
- wikontic/utils/ontology_mappings/prop2constraints.json +1 -0
- wikontic/utils/ontology_mappings/prop2data_type.json +1 -0
- wikontic/utils/ontology_mappings/prop2label.json +1 -0
- wikontic/utils/ontology_mappings/propid2enum.json +1 -0
- wikontic/utils/ontology_mappings/subj_constraint2prop.json +1 -0
- wikontic/utils/ontology_mappings/subject_object_constraints.json +1 -0
- wikontic/utils/openai_utils.py +517 -0
- wikontic/utils/prompts/name_refinement/prompt_choose_relation_wo_entity_types.txt +17 -0
- wikontic/utils/prompts/name_refinement/prompt_choose_relation_wo_entity_types_dialog_bench.txt +18 -0
- wikontic/utils/prompts/name_refinement/rank_object_names.txt +17 -0
- wikontic/utils/prompts/name_refinement/rank_object_names_dialog_bench.txt +18 -0
- wikontic/utils/prompts/name_refinement/rank_object_qualifiers.txt +20 -0
- wikontic/utils/prompts/name_refinement/rank_subject_names.txt +18 -0
- wikontic/utils/prompts/name_refinement/rank_subject_names_dialog_bench.txt +20 -0
- wikontic/utils/prompts/ontology_refinement/prompt_choose_entity_types.txt +26 -0
- wikontic/utils/prompts/ontology_refinement/prompt_choose_relation.txt +24 -0
- wikontic/utils/prompts/ontology_refinement/prompt_choose_relation_and_types.txt +28 -0
- wikontic/utils/prompts/qa/prompt_choose_relevant_entities_for_question.txt +17 -0
- wikontic/utils/prompts/qa/prompt_choose_relevant_entities_for_question_wo_types.txt +16 -0
- wikontic/utils/prompts/qa/prompt_entity_extraction_from_question.txt +3 -0
- wikontic/utils/prompts/qa/prompt_is_answered.txt +43 -0
- wikontic/utils/prompts/qa/qa_collapsing_prompt.txt +22 -0
- wikontic/utils/prompts/qa/qa_prompt.txt +5 -0
- wikontic/utils/prompts/qa/qa_prompt_hotpot.txt +6 -0
- wikontic/utils/prompts/qa/question_decomposition_1.txt +7 -0
- wikontic/utils/prompts/triplet_extraction/prompt_1_types_qualifiers_dialog_bench.txt +75 -0
- wikontic/utils/prompts/triplet_extraction/prompt_1_types_qualifiers_dialog_bench_in_russian.txt +78 -0
- wikontic/utils/prompts/triplet_extraction/propmt_1_types_qualifiers.txt +91 -0
- wikontic/utils/structured_aligner.py +606 -0
- wikontic/utils/structured_inference_with_db.py +561 -0
- wikontic-0.0.3.dist-info/METADATA +111 -0
- wikontic-0.0.3.dist-info/RECORD +53 -0
- wikontic-0.0.3.dist-info/WHEEL +5 -0
- wikontic-0.0.3.dist-info/licenses/LICENSE +19 -0
- wikontic-0.0.3.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,555 @@
|
|
|
1
|
+
from pymongo.mongo_client import MongoClient
|
|
2
|
+
from pymongo.operations import SearchIndexModel
|
|
3
|
+
import pymongo
|
|
4
|
+
|
|
5
|
+
from typing import List
|
|
6
|
+
from pydantic import BaseModel, ValidationError
|
|
7
|
+
from transformers import AutoTokenizer, AutoModel
|
|
8
|
+
from tqdm import tqdm
|
|
9
|
+
import json
|
|
10
|
+
import time
|
|
11
|
+
import argparse
|
|
12
|
+
import logging
|
|
13
|
+
import os
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
import torch
|
|
16
|
+
|
|
17
|
+
# Configure logging
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Run the encoder on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Contriever tokenizer/encoder pair used by get_embedding(). Both are loaded
# once, when this module is imported.
tokenizer = AutoTokenizer.from_pretrained("facebook/contriever")
model = AutoModel.from_pretrained("facebook/contriever", use_safetensors=True)
model = model.to(device)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class EntityType(BaseModel):
    """Schema used to validate one entity-type record before it is inserted."""

    # NOTE(review): pydantic v2 treats underscore-prefixed names as private
    # attributes rather than fields, so `_id` is presumably neither validated
    # nor emitted by model_dump() -- confirm against the pydantic docs.
    _id: int
    # Identifier of the entity type (the key of the label mapping).
    entity_type_id: str
    # Human-readable label of the entity type.
    label: str
    # Identifiers taken from the hierarchy mapping for this entity type.
    parent_type_ids: List[str]
    # Property ids listed for this type in the subject-constraint mapping.
    valid_subject_property_ids: List[str]
    # Property ids listed for this type in the object-constraint mapping.
    valid_object_property_ids: List[str]
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class Property(BaseModel):
    """Schema used to validate one property record before it is inserted."""

    # NOTE(review): pydantic v2 treats underscore-prefixed names as private
    # attributes rather than fields, so `_id` is presumably neither validated
    # nor emitted by model_dump() -- confirm against the pydantic docs.
    _id: int
    # Identifier of the property (the key of the property label mapping).
    property_id: str
    # Human-readable label of the property.
    label: str
    # Entity type ids from the property's "Subject type constraint" entry.
    valid_subject_type_ids: List[str]
    # Entity type ids from the property's "Value-type constraint" entry.
    valid_object_type_ids: List[str]
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class EntityTypeAlias(BaseModel):
    """Schema for one (entity type, alias text, embedding) record."""

    # NOTE(review): pydantic v2 treats underscore-prefixed names as private
    # attributes rather than fields, so `_id` is presumably neither validated
    # nor emitted by model_dump() -- confirm against the pydantic docs.
    _id: int
    # Identifier of the entity type this alias belongs to.
    entity_type_id: str
    # The alias text; the canonical label is stored as an alias as well.
    alias_label: str
    # Embedding of `alias_label` as produced by get_embedding().
    alias_text_embedding: List[float]
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class PropertyAlias(BaseModel):
    """Schema for one (property, alias text, embedding) record."""

    # NOTE(review): pydantic v2 treats underscore-prefixed names as private
    # attributes rather than fields, so `_id` is presumably neither validated
    # nor emitted by model_dump() -- confirm against the pydantic docs.
    _id: int
    # Identifier of the property this alias belongs to.
    relation_id: str
    # The alias text; the canonical label is stored as an alias as well.
    alias_label: str
    # Embedding of `alias_label` as produced by get_embedding().
    alias_text_embedding: List[float]
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def get_embedding(text):
    """
    Embed a piece of text with the module-level Contriever encoder.

    The model's first output (`outputs[0]`) is averaged over the non-padding
    token positions (mean pooling) to give a single vector for the input.

    Args:
        text: The string to embed.

    Returns:
        The embedding as a list of floats, or None when `text` is empty or not
        a string, or when tokenization/inference raises (the error is logged).
    """

    def mean_pooling(token_embeddings, mask):
        # Zero the padding positions so they do not contribute to the sum,
        # then divide by the number of real tokens in each sequence.
        token_embeddings = token_embeddings.masked_fill(~mask[..., None].bool(), 0.0)
        return token_embeddings.sum(dim=1) / mask.sum(dim=1)[..., None]

    if not text or not isinstance(text, str):
        return None

    try:
        inputs = tokenizer([text], padding=True, truncation=True, return_tensors="pt")
        # Move the inputs to the device the model was loaded on. A hard-coded
        # "cuda" raises on CPU-only hosts, which made every embedding None.
        inputs = inputs.to(device)
        # Inference only: skip autograd bookkeeping to save memory.
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings = mean_pooling(outputs[0], inputs["attention_mask"])
        return embeddings.detach().cpu().tolist()[0]
    except Exception as e:
        logger.error(f"Error in get_embedding: {e}")
        return None
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def get_mongo_client(mongo_uri):
    """
    Build a MongoClient for the given connection URI.

    Args:
        mongo_uri: MongoDB connection URI.

    Returns:
        The newly constructed MongoClient.
    """
    mongo_client = MongoClient(mongo_uri)
    # NOTE(review): no command is sent to the server here, so this message
    # presumably only confirms that the client object was constructed --
    # confirm against the PyMongo connection semantics.
    logger.info("Connection to MongoDB successful")
    return mongo_client
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def populate_entity_types(
    ENTITY_2_LABEL,
    ENTITY_2_HIERARCHY,
    SUBJ_2_PROP_CONSTRAINTS,
    OBJ_2_PROP_CONSTRAINTS,
    db,
    collection_name="entity_types",
):
    """
    Build, validate and insert the entity-type records.

    One record is created per key of `ENTITY_2_LABEL`, plus a final catch-all
    record whose id and label are both "ANY".

    Args:
        ENTITY_2_LABEL: Mapping of entity type id to label.
        ENTITY_2_HIERARCHY: Mapping of entity type id to its parent type ids.
        SUBJ_2_PROP_CONSTRAINTS: Mapping of entity type id to the property ids
            it may be the subject of; must contain the key "<ANY SUBJECT>".
        OBJ_2_PROP_CONSTRAINTS: Mapping of entity type id to the property ids
            it may be the object of; must contain the key "<ANY OBJECT>".
        db: Database object providing `get_collection`.
        collection_name: Name of the target collection.

    Raises:
        KeyError: If an entity type is missing from `ENTITY_2_HIERARCHY`, or
            one of the "<ANY ...>" keys is missing.
        ValidationError: If a record does not match the EntityType schema.
    """
    logger.info(f"Starting to populate {collection_name} collection")

    entity_metadata_list = [
        {
            "_id": i,
            "entity_type_id": entity,
            "label": label,
            "parent_type_ids": ENTITY_2_HIERARCHY[entity],
            # Types without an explicit constraint entry get an empty list.
            "valid_subject_property_ids": SUBJ_2_PROP_CONSTRAINTS.get(entity, []),
            "valid_object_property_ids": OBJ_2_PROP_CONSTRAINTS.get(entity, []),
        }
        for i, (entity, label) in enumerate(ENTITY_2_LABEL.items())
    ]

    # Catch-all type. Its id is the next free index, which is also well
    # defined when ENTITY_2_LABEL is empty (the loop index would not be).
    entity_metadata_list.append(
        {
            "_id": len(entity_metadata_list),
            "entity_type_id": "ANY",
            "label": "ANY",
            "parent_type_ids": [],
            "valid_subject_property_ids": SUBJ_2_PROP_CONSTRAINTS["<ANY SUBJECT>"],
            "valid_object_property_ids": OBJ_2_PROP_CONSTRAINTS["<ANY OBJECT>"],
        }
    )

    try:
        # NOTE(review): EntityType declares `_id` with a leading underscore,
        # which pydantic v2 presumably treats as a private attribute, so the
        # dumped records may not carry `_id` -- confirm.
        records = [EntityType(**record).model_dump() for record in entity_metadata_list]
    except ValidationError as e:
        logger.error(f"Validation error while populating {collection_name}: {e}")
        # Without re-raising, `records` is unbound below and the real cause is
        # masked by an UnboundLocalError.
        raise

    collection = db.get_collection(collection_name)
    collection.insert_many(records)
    logger.info(f"Successfully populated {collection_name} with {len(records)} records")
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def populate_entity_type_aliases(
    ENTITY_2_LABEL, ENTITY_2_ALIASES, db, collection_name="entity_type_aliases"
):
    """
    Build, validate and insert the entity-type alias records with embeddings.

    For every entity type in `ENTITY_2_ALIASES`, one record is created for its
    canonical label followed by one record per alias. Each record carries the
    embedding returned by get_embedding() for its text.

    Args:
        ENTITY_2_LABEL: Mapping of entity type id to label.
        ENTITY_2_ALIASES: Mapping of entity type id to an iterable of aliases.
        db: Database object providing `get_collection`.
        collection_name: Name of the target collection.

    Raises:
        KeyError: If an entity type has aliases but no label.
        ValidationError: If a record does not match the EntityTypeAlias schema
            (for example when get_embedding() returned None).
    """
    logger.info(f"Starting to populate {collection_name} collection")
    entity_types_list = []

    for entity_type_id, aliases in tqdm(ENTITY_2_ALIASES.items()):
        # The canonical label is stored first, then every alias.
        for alias_label in [ENTITY_2_LABEL[entity_type_id], *aliases]:
            entity_types_list.append(
                {
                    # Sequential id: the index the record will occupy.
                    "_id": len(entity_types_list),
                    "entity_type_id": entity_type_id,
                    "alias_label": alias_label,
                    "alias_text_embedding": get_embedding(alias_label),
                }
            )

    try:
        records = [
            EntityTypeAlias(**record).model_dump() for record in entity_types_list
        ]
    except ValidationError as e:
        logger.error(f"Validation error while populating {collection_name}: {e}")
        # Without re-raising, `records` is unbound below and the real cause is
        # masked by an UnboundLocalError.
        raise

    if not records:
        # insert_many() rejects an empty document list.
        logger.warning(f"No records to insert into {collection_name}")
        return

    collection = db.get_collection(collection_name)
    collection.insert_many(records)
    logger.info(f"Successfully populated {collection_name} with {len(records)} records")
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def populate_properties(
    PROP_2_LABEL, PROP_2_CONSTRAINT, db, collection_name="properties"
):
    """
    Build, validate and insert the property records.

    Args:
        PROP_2_LABEL: Mapping of property id to label.
        PROP_2_CONSTRAINT: Mapping of property id to a dict holding the keys
            "Subject type constraint" and "Value-type constraint".
        db: Database object providing `get_collection`.
        collection_name: Name of the target collection.

    Raises:
        KeyError: If a property has no constraint entry or lacks one of the
            two constraint keys.
        ValidationError: If a record does not match the Property schema.
    """
    logger.info(f"Starting to populate {collection_name} collection")

    property_list = [
        {
            "_id": i,
            "property_id": prop_id,
            "label": label,
            "valid_subject_type_ids": PROP_2_CONSTRAINT[prop_id][
                "Subject type constraint"
            ],
            "valid_object_type_ids": PROP_2_CONSTRAINT[prop_id][
                "Value-type constraint"
            ],
        }
        for i, (prop_id, label) in enumerate(PROP_2_LABEL.items())
    ]

    try:
        records = [Property(**record).model_dump() for record in property_list]
    except ValidationError as e:
        logger.error(f"Validation error while populating {collection_name}: {e}")
        # Without re-raising, `records` is unbound below and the real cause is
        # masked by an UnboundLocalError.
        raise

    if not records:
        # insert_many() rejects an empty document list.
        logger.warning(f"No records to insert into {collection_name}")
        return

    collection = db.get_collection(collection_name)
    collection.insert_many(records)
    logger.info(f"Successfully populated {collection_name} with {len(records)} records")
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def populate_property_aliases(
    PROP_2_LABEL, PROP_2_ALIASES, db, collection_name="property_aliases"
):
    """
    Build, validate and insert the property alias records with embeddings.

    For every property in `PROP_2_ALIASES`, one record is created for its
    canonical label followed by one record per alias. Each record carries the
    embedding returned by get_embedding() for its text.

    Args:
        PROP_2_LABEL: Mapping of property id to label.
        PROP_2_ALIASES: Mapping of property id to an iterable of aliases.
        db: Database object providing `get_collection`.
        collection_name: Name of the target collection.

    Raises:
        KeyError: If a property has aliases but no label.
        ValidationError: If a record does not match the PropertyAlias schema
            (for example when get_embedding() returned None).
    """
    logger.info(f"Starting to populate {collection_name} collection")
    relation_alias_id_pairs = []

    for relation_id, aliases in tqdm(PROP_2_ALIASES.items()):
        # The canonical label is stored first, then every alias.
        for alias_label in [PROP_2_LABEL[relation_id], *aliases]:
            relation_alias_id_pairs.append(
                {
                    # Sequential id: the index the record will occupy.
                    "_id": len(relation_alias_id_pairs),
                    "relation_id": relation_id,
                    "alias_label": alias_label,
                    "alias_text_embedding": get_embedding(alias_label),
                }
            )

    try:
        records = [
            PropertyAlias(**record).model_dump() for record in relation_alias_id_pairs
        ]
    except ValidationError as e:
        logger.error(f"Validation error while populating {collection_name}: {e}")
        # Without re-raising, `records` is unbound below and the real cause is
        # masked by an UnboundLocalError.
        raise

    if not records:
        # insert_many() rejects an empty document list.
        logger.warning(f"No records to insert into {collection_name}")
        return

    collection = db.get_collection(collection_name)
    collection.insert_many(records)
    logger.info(f"Successfully populated {collection_name} with {len(records)} records")
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
def create_search_index_for_entity_types(
    db,
    collection_name="entity_type_aliases",
    embedding_field_name="alias_text_embedding",
    index_name="entity_type_aliases",
):
    """
    Request a search index with a 768-dimensional cosine knnVector field.

    Any exception raised while creating the index is logged, not propagated.

    Args:
        db: Database object providing `get_collection`.
        collection_name: Collection the index is created on.
        embedding_field_name: Name of the document field holding the vector.
        index_name: Name given to the search index.
    """
    logger.info(f"Starting to create index {index_name} for {collection_name}")
    target_collection = db.get_collection(collection_name)

    vector_field = {
        "dimensions": 768,
        "similarity": "cosine",
        "type": "knnVector",
    }
    index_definition = {
        "mappings": {
            "dynamic": True,
            "fields": {embedding_field_name: vector_field},
        }
    }
    index_model = SearchIndexModel(definition=index_definition, name=index_name)

    try:
        created = target_collection.create_search_index(model=index_model)
        logger.info("Creating index...")
        # Fixed pause after the request; the index state is not polled.
        time.sleep(20)
        logger.info(f"New index {index_name} created successfully: {created}")
    except Exception as exc:
        logger.error(f"Error creating new vector search index {index_name}: {str(exc)}")
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def create_search_index_for_properties(
    db,
    collection_name="property_aliases",
    embedding_field_name="alias_text_embedding",
    prop_id_field_name="relation_id",
    index_name="property_aliases_ids",
):
    """
    Request a search index with a vector field and a token field.

    The vector field is a 768-dimensional cosine knnVector; the property id
    field is mapped with type "token". Any exception raised while creating the
    index is logged, not propagated.

    Args:
        db: Database object providing `get_collection`.
        collection_name: Collection the index is created on.
        embedding_field_name: Name of the document field holding the vector.
        prop_id_field_name: Name of the document field holding the property id.
        index_name: Name given to the search index.
    """
    logger.info(f"Starting to create index {index_name} for {collection_name}")
    target_collection = db.get_collection(collection_name)

    field_mappings = {
        embedding_field_name: {
            "dimensions": 768,
            "similarity": "cosine",
            "type": "knnVector",
        },
        prop_id_field_name: {"type": "token"},
    }
    index_definition = {"mappings": {"dynamic": True, "fields": field_mappings}}
    index_model = SearchIndexModel(definition=index_definition, name=index_name)

    try:
        created = target_collection.create_search_index(model=index_model)
        logger.info("Creating index...")
        # Fixed pause after the request; the index state is not polled.
        time.sleep(20)
        logger.info(f"New index {index_name} created successfully: {created}")
    except Exception as exc:
        logger.error(f"Error creating new vector search index {index_name}: {str(exc)}")
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
def create_indexes(db):
    """
    Create the regular (non-search) indexes and the `triplets` collection.

    The collection names are fixed: entity_types, entity_type_aliases,
    properties, entity_aliases and triplets.

    Args:
        db: Database object exposing the collections as attributes.
    """
    logger.info("Creating indexes for entity_types collection...")
    db.entity_types.create_index([("entity_type_id", 1)])
    db.entity_types.create_index([("label", 1)])

    logger.info("Creating indexes for entity_type_aliases collection...")
    db.entity_type_aliases.create_index([("entity_type_id", 1)])
    db.entity_type_aliases.create_index([("alias_label", 1)])

    logger.info("Creating indexes for properties collection...")
    db.properties.create_index([("property_id", 1)])

    # logger.info("Creating indexes for property_aliases collection...")
    # db.property_aliases.create_index("relation_id")

    logger.info("Creating indexes for entity_aliases collection...")
    db.entity_aliases.create_index([("entity_type", 1), ("sample_id", 1)])
    db.entity_aliases.create_index([("label", 1)])

    # create_collection() raises when the collection already exists, which
    # happens on a re-run that did not drop the database first.
    if "triplets" not in db.list_collection_names():
        db.create_collection("triplets")
    logger.info("Creating indexes for triplets collection...")
    db.triplets.create_index([("sample_id", 1)])

    logger.info("All indexes created successfully")
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
def create_wikidata_ontology_database(
    mongo_uri: str = "mongodb://localhost:27018/?directConnection=true",
    database: str = "wikidata_ontology",
    mappings_dir: str = None,
    entity_types_collection: str = "entity_types",
    entity_type_aliases_collection: str = "entity_type_aliases",
    properties_collection: str = "properties",
    property_aliases_collection: str = "property_aliases",
    entity_types_index: str = "entity_type_aliases",
    property_aliases_index: str = "property_aliases",
    drop_collections: bool = True,
):
    """
    Populate MongoDB with Wikidata ontology data.

    Loads the JSON mapping files, optionally drops the existing collections,
    fills the four ontology collections, then creates the search indexes and
    the regular indexes.

    Args:
        mongo_uri: MongoDB connection URI
        database: MongoDB database name
        mappings_dir: Directory containing ontology mapping files. If None, uses
            `utils/ontology_mappings` next to this file, falling back to the
            relative path "utils/ontology_mappings/" when that does not exist.
        entity_types_collection: Collection name for entity types
        entity_type_aliases_collection: Collection name for entity type aliases
        properties_collection: Collection name for properties
        property_aliases_collection: Collection name for property aliases
        entity_types_index: Index name for entity types
        property_aliases_index: Index name for property aliases
        drop_collections: Whether to drop existing collections before creating
            new ones. This drops EVERY collection in `database`, not only the
            ones populated here.

    Returns:
        Database object
    """

    # Default mappings directory
    if mappings_dir is None:
        # Try to find the mappings directory relative to this file
        mappings_dir = str(Path(__file__).parent / "utils" / "ontology_mappings")
        if not os.path.exists(mappings_dir):
            # Fallback to relative path
            mappings_dir = "utils/ontology_mappings/"

    logger.info("Starting database population process")
    logger.info(f"Using database: {database}")
    logger.info(f"Loading mapping files from: {mappings_dir}")

    def load_mapping(filename):
        # Read as UTF-8 explicitly so the result does not depend on the
        # platform's default encoding.
        with open(os.path.join(mappings_dir, filename), "r", encoding="utf-8") as f:
            return json.load(f)

    # Load mapping files
    subj2prop_constraints = load_mapping("subj_constraint2prop.json")
    obj2prop_constraints = load_mapping("obj_constraint2prop.json")
    ENTITY_2_LABEL = load_mapping("entity_type2label.json")
    ENTITY_2_HIERARCHY = load_mapping("entity_type2hierarchy.json")
    ENTITY_2_ALIASES = load_mapping("entity_type2aliases.json")
    PROP_2_CONSTRAINT = load_mapping("prop2constraints.json")
    PROP_2_LABEL = load_mapping("prop2label.json")
    PROP_2_ALIASES = load_mapping("prop2aliases.json")

    logger.info("Successfully loaded all mapping files")

    # Connect to MongoDB
    mongo_client = get_mongo_client(mongo_uri)
    db = mongo_client.get_database(database)

    # Drop all existing collections
    if drop_collections:
        logger.info("Dropping existing collections...")
        for collection_name in db.list_collection_names():
            logger.info(f"Dropping collection: {collection_name}")
            db.drop_collection(collection_name)
        logger.info("Successfully dropped all existing collections")

    # Populate collections
    populate_entity_types(
        ENTITY_2_LABEL,
        ENTITY_2_HIERARCHY,
        subj2prop_constraints,
        obj2prop_constraints,
        db,
        collection_name=entity_types_collection,
    )

    populate_entity_type_aliases(
        ENTITY_2_LABEL,
        ENTITY_2_ALIASES,
        db,
        collection_name=entity_type_aliases_collection,
    )

    populate_properties(
        PROP_2_LABEL, PROP_2_CONSTRAINT, db, collection_name=properties_collection
    )

    populate_property_aliases(
        PROP_2_LABEL,
        PROP_2_ALIASES,
        db,
        collection_name=property_aliases_collection,
    )

    # Create search indexes
    create_search_index_for_entity_types(
        db,
        collection_name=entity_type_aliases_collection,
        index_name=entity_types_index,
    )
    create_search_index_for_properties(
        db,
        collection_name=property_aliases_collection,
        index_name=property_aliases_index,
    )

    # Create indexes
    # NOTE(review): create_indexes() addresses collections by fixed attribute
    # names, so custom collection names passed to this function are presumably
    # not covered by the regular indexes -- confirm.
    create_indexes(db)
    logger.info("Database population process completed")

    return db
|
|
478
|
+
|
|
479
|
+
|
|
480
|
+
if __name__ == "__main__":
    # Command-line entry point: every option maps one-to-one onto a keyword
    # argument of create_wikidata_ontology_database().
    parser = argparse.ArgumentParser(
        description="Populate MongoDB with Wikidata ontology data"
    )

    parser.add_argument(
        "--mappings_dir",
        type=str,
        # None lets create_wikidata_ontology_database() resolve the mappings
        # directory next to this file first, so the script also works when it
        # is not launched from the package directory. The old CWD-relative
        # path remains that function's fallback.
        default=None,
        help="Directory containing ontology mapping files "
        "(default: utils/ontology_mappings next to this script)",
    )
    parser.add_argument(
        "--mongo_uri",
        type=str,
        default="mongodb://localhost:27018/?directConnection=true",
        help="MongoDB connection URI",
    )
    parser.add_argument(
        "--database",
        type=str,
        default="wikidata_ontology",
        help="MongoDB database name",
    )

    # Collection and index names: (flag, default, help text).
    name_options = [
        ("--entity_types_collection", "entity_types", "Collection name for entity types"),
        (
            "--entity_type_aliases_collection",
            "entity_type_aliases",
            "Collection name for entity type aliases",
        ),
        ("--properties_collection", "properties", "Collection name for properties"),
        (
            "--property_aliases_collection",
            "property_aliases",
            "Collection name for property aliases",
        ),
        ("--entity_types_index", "entity_type_aliases", "Index name for entity types"),
        ("--property_aliases_index", "property_aliases", "Index name for property aliases"),
    ]
    for flag, default_value, help_text in name_options:
        parser.add_argument(flag, type=str, default=default_value, help=help_text)

    args = parser.parse_args()
    create_wikidata_ontology_database(
        mongo_uri=args.mongo_uri,
        database=args.database,
        mappings_dir=args.mappings_dir,
        entity_types_collection=args.entity_types_collection,
        entity_type_aliases_collection=args.entity_type_aliases_collection,
        properties_collection=args.properties_collection,
        property_aliases_collection=args.property_aliases_collection,
        entity_types_index=args.entity_types_index,
        property_aliases_index=args.property_aliases_index,
    )
|