wikontic 0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wikontic/__init__.py +16 -0
- wikontic/create_ontological_triplets_db.py +193 -0
- wikontic/create_triplets_db.py +259 -0
- wikontic/create_wikidata_ontology_db.py +555 -0
- wikontic/utils/__init__.py +7 -0
- wikontic/utils/base_inference_with_db.py +329 -0
- wikontic/utils/dynamic_aligner.py +281 -0
- wikontic/utils/inference_with_db.py +224 -0
- wikontic/utils/ontology_mappings/entity_hierarchy.json +1 -0
- wikontic/utils/ontology_mappings/entity_names.json +1 -0
- wikontic/utils/ontology_mappings/entity_type2aliases.json +1 -0
- wikontic/utils/ontology_mappings/entity_type2hierarchy.json +1 -0
- wikontic/utils/ontology_mappings/entity_type2label.json +1 -0
- wikontic/utils/ontology_mappings/enum_entity_ids.json +1 -0
- wikontic/utils/ontology_mappings/enum_prop_ids.json +1 -0
- wikontic/utils/ontology_mappings/label2entity.json +1 -0
- wikontic/utils/ontology_mappings/obj_constraint2prop.json +1 -0
- wikontic/utils/ontology_mappings/prop2aliases.json +1 -0
- wikontic/utils/ontology_mappings/prop2constraints.json +1 -0
- wikontic/utils/ontology_mappings/prop2data_type.json +1 -0
- wikontic/utils/ontology_mappings/prop2label.json +1 -0
- wikontic/utils/ontology_mappings/propid2enum.json +1 -0
- wikontic/utils/ontology_mappings/subj_constraint2prop.json +1 -0
- wikontic/utils/ontology_mappings/subject_object_constraints.json +1 -0
- wikontic/utils/openai_utils.py +517 -0
- wikontic/utils/prompts/name_refinement/prompt_choose_relation_wo_entity_types.txt +17 -0
- wikontic/utils/prompts/name_refinement/prompt_choose_relation_wo_entity_types_dialog_bench.txt +18 -0
- wikontic/utils/prompts/name_refinement/rank_object_names.txt +17 -0
- wikontic/utils/prompts/name_refinement/rank_object_names_dialog_bench.txt +18 -0
- wikontic/utils/prompts/name_refinement/rank_object_qualifiers.txt +20 -0
- wikontic/utils/prompts/name_refinement/rank_subject_names.txt +18 -0
- wikontic/utils/prompts/name_refinement/rank_subject_names_dialog_bench.txt +20 -0
- wikontic/utils/prompts/ontology_refinement/prompt_choose_entity_types.txt +26 -0
- wikontic/utils/prompts/ontology_refinement/prompt_choose_relation.txt +24 -0
- wikontic/utils/prompts/ontology_refinement/prompt_choose_relation_and_types.txt +28 -0
- wikontic/utils/prompts/qa/prompt_choose_relevant_entities_for_question.txt +17 -0
- wikontic/utils/prompts/qa/prompt_choose_relevant_entities_for_question_wo_types.txt +16 -0
- wikontic/utils/prompts/qa/prompt_entity_extraction_from_question.txt +3 -0
- wikontic/utils/prompts/qa/prompt_is_answered.txt +43 -0
- wikontic/utils/prompts/qa/qa_collapsing_prompt.txt +22 -0
- wikontic/utils/prompts/qa/qa_prompt.txt +5 -0
- wikontic/utils/prompts/qa/qa_prompt_hotpot.txt +6 -0
- wikontic/utils/prompts/qa/question_decomposition_1.txt +7 -0
- wikontic/utils/prompts/triplet_extraction/prompt_1_types_qualifiers_dialog_bench.txt +75 -0
- wikontic/utils/prompts/triplet_extraction/prompt_1_types_qualifiers_dialog_bench_in_russian.txt +78 -0
- wikontic/utils/prompts/triplet_extraction/propmt_1_types_qualifiers.txt +91 -0
- wikontic/utils/structured_aligner.py +606 -0
- wikontic/utils/structured_inference_with_db.py +561 -0
- wikontic-0.0.3.dist-info/METADATA +111 -0
- wikontic-0.0.3.dist-info/RECORD +53 -0
- wikontic-0.0.3.dist-info/WHEEL +5 -0
- wikontic-0.0.3.dist-info/licenses/LICENSE +19 -0
- wikontic-0.0.3.dist-info/top_level.txt +1 -0
wikontic/__init__.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Wikontic - Extract ontology-aware, Wikidata-aligned knowledge graphs from raw text using LLMs.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from .create_triplets_db import create_triplets_database
|
|
6
|
+
from .create_ontological_triplets_db import create_ontological_triplets_database
|
|
7
|
+
from .create_wikidata_ontology_db import create_wikidata_ontology_database
|
|
8
|
+
|
|
9
|
+
from . import utils
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"create_triplets_database",
|
|
13
|
+
"create_ontological_triplets_database",
|
|
14
|
+
"create_wikidata_ontology_database",
|
|
15
|
+
"utils",
|
|
16
|
+
]
|
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
from pymongo.mongo_client import MongoClient
|
|
2
|
+
from pymongo.operations import SearchIndexModel
|
|
3
|
+
import pymongo
|
|
4
|
+
|
|
5
|
+
from typing import List
|
|
6
|
+
from pydantic import BaseModel, ValidationError
|
|
7
|
+
from transformers import AutoTokenizer, AutoModel
|
|
8
|
+
from tqdm import tqdm
|
|
9
|
+
import json
|
|
10
|
+
import time
|
|
11
|
+
import argparse
|
|
12
|
+
import logging
|
|
13
|
+
import os
|
|
14
|
+
|
|
15
|
+
# Logging setup: install a root configuration (INFO threshold, timestamped
# "time - level - message" layout) and create this module's own logger.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def get_mongo_client(mongo_uri):
    """Create a MongoDB client and verify that the server answers.

    ``MongoClient`` connects lazily, so constructing it succeeds even when
    no server is listening. A ``ping`` command is issued first so that the
    success message is only logged after a real round trip.

    Args:
        mongo_uri: MongoDB connection URI.

    Returns:
        The ``MongoClient`` instance.

    Raises:
        pymongo.errors.PyMongoError: If the ``ping`` fails, e.g. because the
            server cannot be reached.
    """
    client = MongoClient(mongo_uri)
    # The constructor alone does not contact the server; force a round trip.
    client.admin.command("ping")
    logger.info("Connection to MongoDB successful")
    return client
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def create_search_index_for_entities(
    db,
    collection_name="entity_aliases",
    embedding_field_name="alias_text_embedding",
    entity_type_id_field_name="entity_type",
    index_name="entities",
    dimensions=768,
    wait_seconds=20,
):
    """Request a vector search index on an entity-alias collection.

    The index maps ``embedding_field_name`` as a cosine ``knnVector`` field
    and maps ``entity_type_id_field_name`` and ``sample_id`` as ``token``
    fields. Other fields are covered by ``"dynamic": True``.

    Failures are logged and swallowed, so this function never raises for an
    index-creation error and always returns ``None``.

    Args:
        db: Database object exposing ``get_collection``.
        collection_name: Collection on which the index is created.
        embedding_field_name: Document field holding the embedding vector.
        entity_type_id_field_name: Document field holding the entity type.
        index_name: Name given to the search index.
        dimensions: Length of the embedding vectors (default 768).
        wait_seconds: Seconds to sleep after the creation request
            (default 20). This is a fixed grace period only; the index state
            is not checked afterwards.
    """
    logger.info(f"Starting to create index {index_name} for {collection_name}")
    collection = db.get_collection(collection_name)
    vector_search_index_model = SearchIndexModel(
        definition={
            "mappings": {
                "dynamic": True,
                "fields": {
                    embedding_field_name: {
                        "dimensions": dimensions,
                        "similarity": "cosine",
                        "type": "knnVector",
                    },
                    entity_type_id_field_name: {"type": "token"},
                    "sample_id": {"type": "token"},
                },
            }
        },
        name=index_name,
    )

    try:
        result = collection.create_search_index(model=vector_search_index_model)
        logger.info("Creating index...")
        # NOTE(review): presumably gives the server time to build the index
        # before callers query it; readiness is not verified here.
        time.sleep(wait_seconds)
        logger.info(f"New index {index_name} created successfully: {result}")
    except Exception as e:
        # Best effort: report the problem but let database setup continue.
        logger.error(f"Error creating new vector search index {index_name}: {str(e)}")
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def create_ontological_triplets_database(
    mongo_uri: str = "mongodb://localhost:27018/?directConnection=true",
    db_name: str = "triplets_db",
    entity_aliases_collection: str = "entity_aliases",
    triplets_collection: str = "triplets",
    initial_triplets_collection: str = "initial_triplets",
    filtered_triplets_collection: str = "filtered_triplets",
    ontology_filtered_triplets_collection: str = "ontology_filtered_triplets",
    entity_aliases_index: str = "entity_aliases",
    drop_collections: bool = False,
):
    """
    Create collections and indexes for the ontological triplets database.

    Every index is created on the collection whose name was passed in, so
    custom collection names are honoured. With the default names the result
    is the same as addressing ``db.entity_aliases``, ``db.triplets`` and so
    on directly.

    Args:
        mongo_uri: MongoDB connection URI
        db_name: Name of the database to create
        entity_aliases_collection: Collection name for entity aliases
        triplets_collection: Collection name for triplets
        initial_triplets_collection: Collection name for initial triplets
        filtered_triplets_collection: Collection name for filtered triplets
        ontology_filtered_triplets_collection: Collection name for ontology filtered triplets
        entity_aliases_index: Index name for entities
        drop_collections: Whether to drop existing collections before creating new ones

    Returns:
        Database object

    Note:
        ``create_collection`` is called unconditionally. Per the PyMongo
        documentation it raises ``CollectionInvalid`` when the collection
        already exists, so re-running against a populated database requires
        ``drop_collections=True``.
    """
    mongo_client = get_mongo_client(mongo_uri)
    db = mongo_client.get_database(db_name)

    if drop_collections:
        # Drops EVERY collection in the database, not only the ones named above.
        for collection_name in db.list_collection_names():
            db.drop_collection(collection_name)
            logger.info(f"Dropped collection: {collection_name}")

    db.create_collection(entity_aliases_collection)
    db.create_collection(initial_triplets_collection)
    db.create_collection(filtered_triplets_collection)
    db.create_collection(ontology_filtered_triplets_collection)
    db.create_collection(triplets_collection)

    logger.info("Collections created successfully")

    # Address collections by their configured names rather than by fixed
    # attribute names, otherwise custom names would be left without indexes.
    entity_aliases = db[entity_aliases_collection]
    entity_aliases.create_index([("entity_type", 1), ("sample_id", 1)])
    entity_aliases.create_index([("label", 1)])

    for sample_scoped_collection in (
        triplets_collection,
        initial_triplets_collection,
        filtered_triplets_collection,
        ontology_filtered_triplets_collection,
    ):
        db[sample_scoped_collection].create_index([("sample_id", 1)])
    logger.info("Indexes created successfully")

    create_search_index_for_entities(
        db,
        collection_name=entity_aliases_collection,
        embedding_field_name="alias_text_embedding",
        entity_type_id_field_name="entity_type",
        index_name=entity_aliases_index,
    )
    logger.info("Search index created successfully")
    logger.info("All indexes created successfully")

    return db
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def str2bool(value):
    """Convert a command-line token to a real boolean.

    ``argparse`` with ``type=bool`` calls ``bool("False")``, which is ``True``
    because any non-empty string is truthy. This parser understands the usual
    spellings instead.

    Args:
        value: A ``bool`` (returned unchanged) or a string such as
            ``"true"``, ``"False"``, ``"1"``, ``"no"``.

    Returns:
        The parsed boolean. The empty string parses as ``False``.

    Raises:
        argparse.ArgumentTypeError: If the text is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    normalized = str(value).strip().lower()
    if normalized in {"true", "t", "yes", "y", "1"}:
        return True
    if normalized in {"false", "f", "no", "n", "0", ""}:
        return False
    raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}")


if __name__ == "__main__":
    # Command-line entry point: every option mirrors a keyword argument of
    # create_ontological_triplets_database and carries the same default.
    parser = argparse.ArgumentParser(
        description="Create collections and indexes for the ontological triplets database"
    )
    parser.add_argument(
        "--mongo_uri",
        type=str,
        default="mongodb://localhost:27018/?directConnection=true",
    )
    parser.add_argument("--db_name", type=str, default="triplets_db")
    parser.add_argument(
        "--entity_aliases_collection",
        type=str,
        default="entity_aliases",
        help="Collection name for entity aliases",
    )
    parser.add_argument(
        "--triplets_collection",
        type=str,
        default="triplets",
        help="Collection name for triplets",
    )
    parser.add_argument(
        "--initial_triplets_collection",
        type=str,
        default="initial_triplets",
        help="Collection name for initial triplets",
    )
    parser.add_argument(
        "--filtered_triplets_collection",
        type=str,
        default="filtered_triplets",
        help="Collection name for filtered triplets",
    )
    parser.add_argument(
        "--ontology_filtered_triplets_collection",
        type=str,
        default="ontology_filtered_triplets",
        help="Collection name for ontology filtered triplets",
    )
    parser.add_argument(
        "--entity_aliases_index",
        type=str,
        default="entity_aliases",
        help="Index name for entities",
    )
    # nargs="?" + const=True accepts both a bare "--drop_collections" and the
    # explicit "--drop_collections true|false" form.
    parser.add_argument(
        "--drop_collections",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
        help="Drop existing collections",
    )

    args = parser.parse_args()
    create_ontological_triplets_database(
        mongo_uri=args.mongo_uri,
        db_name=args.db_name,
        entity_aliases_collection=args.entity_aliases_collection,
        triplets_collection=args.triplets_collection,
        initial_triplets_collection=args.initial_triplets_collection,
        filtered_triplets_collection=args.filtered_triplets_collection,
        ontology_filtered_triplets_collection=args.ontology_filtered_triplets_collection,
        entity_aliases_index=args.entity_aliases_index,
        drop_collections=args.drop_collections,
    )
|
|
@@ -0,0 +1,259 @@
|
|
|
1
|
+
from pymongo.mongo_client import MongoClient
|
|
2
|
+
from pymongo.operations import SearchIndexModel
|
|
3
|
+
import pymongo
|
|
4
|
+
|
|
5
|
+
from typing import List
|
|
6
|
+
from pydantic import BaseModel, ValidationError
|
|
7
|
+
from transformers import AutoTokenizer, AutoModel
|
|
8
|
+
from tqdm import tqdm
|
|
9
|
+
import json
|
|
10
|
+
import time
|
|
11
|
+
import argparse
|
|
12
|
+
import logging
|
|
13
|
+
import os
|
|
14
|
+
|
|
15
|
+
# Logging setup: install a root configuration (INFO threshold, timestamped
# "time - level - message" layout) and create this module's own logger.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def get_mongo_client(mongo_uri):
    """Create a MongoDB client and verify that the server answers.

    ``MongoClient`` connects lazily, so constructing it succeeds even when
    no server is listening. A ``ping`` command is issued first so that the
    success message is only logged after a real round trip.

    Args:
        mongo_uri: MongoDB connection URI.

    Returns:
        The ``MongoClient`` instance.

    Raises:
        pymongo.errors.PyMongoError: If the ``ping`` fails, e.g. because the
            server cannot be reached.
    """
    client = MongoClient(mongo_uri)
    # The constructor alone does not contact the server; force a round trip.
    client.admin.command("ping")
    logger.info("Connection to MongoDB successful")
    return client
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def create_search_index_for_entities(
    db,
    collection_name="entity_aliases",
    embedding_field_name="alias_text_embedding",
    index_name="entity_aliases",
    dimensions=768,
):
    """Request a vector search index on the entity-alias collection.

    The embedding field is mapped as a cosine ``knnVector`` of ``dimensions``
    entries and ``sample_id`` is mapped as a ``token`` field. Other fields
    are covered by ``"dynamic": True``. After the request the function
    sleeps for 20 seconds. Any exception is logged and swallowed, so the
    function always returns ``None``.

    Args:
        db: Database object exposing ``get_collection``.
        collection_name: Collection on which the index is created.
        embedding_field_name: Document field holding the embedding vector.
        index_name: Name given to the search index.
        dimensions: Length of the embedding vectors.
    """
    logger.info(
        f"Starting to create index {index_name} for {collection_name} with dimensions {dimensions}"
    )
    target = db.get_collection(collection_name)

    # Assemble the mapping bottom-up; key order matches the nested literal form.
    embedding_mapping = {
        "dimensions": dimensions,
        "similarity": "cosine",
        "type": "knnVector",
    }
    field_mappings = {
        embedding_field_name: embedding_mapping,
        "sample_id": {"type": "token"},
    }
    index_definition = {"mappings": {"dynamic": True, "fields": field_mappings}}
    index_model = SearchIndexModel(definition=index_definition, name=index_name)

    try:
        created = target.create_search_index(model=index_model)
        logger.info("Creating index...")
        time.sleep(20)
        logger.info(f"New index {index_name} created successfully: {created}")
    except Exception as e:
        # Best effort: the failure is reported and setup carries on.
        logger.error(f"Error creating new vector search index {index_name}: {str(e)}")
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def create_search_index_for_properties(
    db,
    collection_name="property_aliases",
    embedding_field_name="alias_text_embedding",
    index_name="property_aliases",
    dimensions=768,
):
    """Request a vector search index on the property-alias collection.

    Only the embedding field gets an explicit mapping (cosine ``knnVector``
    of ``dimensions`` entries). Other fields are covered by
    ``"dynamic": True``. After the request the function sleeps for 20
    seconds. Any exception is logged and swallowed, so the function always
    returns ``None``.

    Args:
        db: Database object exposing ``get_collection``.
        collection_name: Collection on which the index is created.
        embedding_field_name: Document field holding the embedding vector.
        index_name: Name given to the search index.
        dimensions: Length of the embedding vectors.
    """
    logger.info(
        f"Starting to create index {index_name} for {collection_name} with dimensions {dimensions}"
    )
    target = db.get_collection(collection_name)

    embedding_mapping = {
        "dimensions": dimensions,
        "similarity": "cosine",
        "type": "knnVector",
    }
    index_definition = {
        "mappings": {
            "dynamic": True,
            "fields": {embedding_field_name: embedding_mapping},
        }
    }
    index_model = SearchIndexModel(definition=index_definition, name=index_name)

    try:
        created = target.create_search_index(model=index_model)
        logger.info("Creating index...")
        time.sleep(20)
        logger.info(f"New index {index_name} created successfully: {created}")
    except Exception as e:
        # Best effort: the failure is reported and setup carries on.
        logger.error(f"Error creating new vector search index {index_name}: {str(e)}")
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def create_triplets_database(
    mongo_uri: str = "mongodb://localhost:27018/?directConnection=true",
    db_name: str = "triplets_db",
    entity_aliases_collection: str = "entity_aliases",
    property_aliases_collection: str = "property_aliases",
    triplets_collection: str = "triplets",
    initial_triplets_collection: str = "initial_triplets",
    filtered_triplets_collection: str = "filtered_triplets",
    entity_aliases_index: str = "entity_aliases",
    property_aliases_index: str = "property_aliases",
    embedding_dimensions: int = 768,
    drop_collections: bool = False,
):
    """
    Create collections and indexes for the dynamic triplets database.

    Every index is created on the collection whose name was passed in, so
    custom collection names are honoured. With the default names the result
    is the same as addressing ``db.entity_aliases``, ``db.property_aliases``
    and so on directly.

    Args:
        mongo_uri: MongoDB connection URI
        db_name: Name of the database to create
        entity_aliases_collection: Collection name for entity aliases
        property_aliases_collection: Collection name for property aliases
        triplets_collection: Collection name for triplets
        initial_triplets_collection: Collection name for initial triplets
        filtered_triplets_collection: Collection name for filtered triplets
        entity_aliases_index: Index name for entities
        property_aliases_index: Index name for property aliases
        embedding_dimensions: Dimensions for embeddings (default: 768)
        drop_collections: Whether to drop existing collections before creating new ones

    Returns:
        Database object

    Note:
        ``create_collection`` is called unconditionally. Per the PyMongo
        documentation it raises ``CollectionInvalid`` when the collection
        already exists, so re-running against a populated database requires
        ``drop_collections=True``.
    """
    mongo_client = get_mongo_client(mongo_uri)
    db = mongo_client.get_database(db_name)

    # Drop all existing collections (every collection in the database, not
    # only the ones named in the arguments).
    if drop_collections:
        logger.info("Dropping existing collections...")
        for collection_name in db.list_collection_names():
            logger.info(f"Dropping collection: {collection_name}")
            db.drop_collection(collection_name)
        logger.info("Successfully dropped all existing collections")

    db.create_collection(entity_aliases_collection)
    db.create_collection(property_aliases_collection)
    db.create_collection(triplets_collection)
    db.create_collection(initial_triplets_collection)
    db.create_collection(filtered_triplets_collection)

    logger.info("Collections created successfully")

    # Address collections by their configured names rather than by fixed
    # attribute names, otherwise custom names would be left without indexes.
    for alias_collection in (entity_aliases_collection, property_aliases_collection):
        db[alias_collection].create_index([("sample_id", 1)])
        db[alias_collection].create_index([("label", 1)])

    for sample_scoped_collection in (
        triplets_collection,
        initial_triplets_collection,
        filtered_triplets_collection,
    ):
        db[sample_scoped_collection].create_index([("sample_id", 1)])
    logger.info("Indexes created successfully")

    create_search_index_for_entities(
        db,
        collection_name=entity_aliases_collection,
        embedding_field_name="alias_text_embedding",
        index_name=entity_aliases_index,
        dimensions=embedding_dimensions,
    )
    create_search_index_for_properties(
        db,
        collection_name=property_aliases_collection,
        embedding_field_name="alias_text_embedding",
        index_name=property_aliases_index,
        dimensions=embedding_dimensions,
    )
    logger.info("Search index created successfully")
    logger.info("All indexes created successfully")

    return db
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def str2bool(value):
    """Convert a command-line token to a real boolean.

    ``argparse`` with ``type=bool`` calls ``bool("False")``, which is ``True``
    because any non-empty string is truthy. This parser understands the usual
    spellings instead.

    Args:
        value: A ``bool`` (returned unchanged) or a string such as
            ``"true"``, ``"False"``, ``"1"``, ``"no"``.

    Returns:
        The parsed boolean. The empty string parses as ``False``.

    Raises:
        argparse.ArgumentTypeError: If the text is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    normalized = str(value).strip().lower()
    if normalized in {"true", "t", "yes", "y", "1"}:
        return True
    if normalized in {"false", "f", "no", "n", "0", ""}:
        return False
    raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}")


if __name__ == "__main__":
    # Command-line entry point: every option mirrors a keyword argument of
    # create_triplets_database and carries the same default.
    parser = argparse.ArgumentParser(
        description="Create collections and indexes for the dynamic triplets database"
    )
    parser.add_argument(
        "--mongo_uri",
        type=str,
        default="mongodb://localhost:27018/?directConnection=true",
    )
    parser.add_argument("--db_name", type=str, default="triplets_db")
    parser.add_argument(
        "--entity_aliases_collection",
        type=str,
        default="entity_aliases",
        help="Collection name for entity aliases",
    )
    parser.add_argument(
        "--property_aliases_collection",
        type=str,
        default="property_aliases",
        help="Collection name for property aliases",
    )
    parser.add_argument(
        "--triplets_collection",
        type=str,
        default="triplets",
        help="Collection name for triplets",
    )
    parser.add_argument(
        "--initial_triplets_collection",
        type=str,
        default="initial_triplets",
        help="Collection name for initial triplets",
    )
    parser.add_argument(
        "--filtered_triplets_collection",
        type=str,
        default="filtered_triplets",
        help="Collection name for filtered triplets",
    )
    parser.add_argument(
        "--entity_aliases_index",
        type=str,
        default="entity_aliases",
        help="Index name for entities",
    )
    parser.add_argument(
        "--property_aliases_index",
        type=str,
        default="property_aliases",
        help="Index name for property aliases",
    )
    parser.add_argument(
        "--embedding_dimensions",
        type=int,
        default=768,
        help="Dimensions for embeddings",
    )
    # nargs="?" + const=True accepts both a bare "--drop_collections" and the
    # explicit "--drop_collections true|false" form.
    parser.add_argument(
        "--drop_collections",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
        help="Drop existing collections",
    )

    args = parser.parse_args()
    create_triplets_database(
        mongo_uri=args.mongo_uri,
        db_name=args.db_name,
        entity_aliases_collection=args.entity_aliases_collection,
        property_aliases_collection=args.property_aliases_collection,
        triplets_collection=args.triplets_collection,
        initial_triplets_collection=args.initial_triplets_collection,
        filtered_triplets_collection=args.filtered_triplets_collection,
        entity_aliases_index=args.entity_aliases_index,
        property_aliases_index=args.property_aliases_index,
        embedding_dimensions=args.embedding_dimensions,
        drop_collections=args.drop_collections,
    )
|