wikontic 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. wikontic/__init__.py +16 -0
  2. wikontic/create_ontological_triplets_db.py +193 -0
  3. wikontic/create_triplets_db.py +259 -0
  4. wikontic/create_wikidata_ontology_db.py +555 -0
  5. wikontic/utils/__init__.py +7 -0
  6. wikontic/utils/base_inference_with_db.py +329 -0
  7. wikontic/utils/dynamic_aligner.py +281 -0
  8. wikontic/utils/inference_with_db.py +224 -0
  9. wikontic/utils/ontology_mappings/entity_hierarchy.json +1 -0
  10. wikontic/utils/ontology_mappings/entity_names.json +1 -0
  11. wikontic/utils/ontology_mappings/entity_type2aliases.json +1 -0
  12. wikontic/utils/ontology_mappings/entity_type2hierarchy.json +1 -0
  13. wikontic/utils/ontology_mappings/entity_type2label.json +1 -0
  14. wikontic/utils/ontology_mappings/enum_entity_ids.json +1 -0
  15. wikontic/utils/ontology_mappings/enum_prop_ids.json +1 -0
  16. wikontic/utils/ontology_mappings/label2entity.json +1 -0
  17. wikontic/utils/ontology_mappings/obj_constraint2prop.json +1 -0
  18. wikontic/utils/ontology_mappings/prop2aliases.json +1 -0
  19. wikontic/utils/ontology_mappings/prop2constraints.json +1 -0
  20. wikontic/utils/ontology_mappings/prop2data_type.json +1 -0
  21. wikontic/utils/ontology_mappings/prop2label.json +1 -0
  22. wikontic/utils/ontology_mappings/propid2enum.json +1 -0
  23. wikontic/utils/ontology_mappings/subj_constraint2prop.json +1 -0
  24. wikontic/utils/ontology_mappings/subject_object_constraints.json +1 -0
  25. wikontic/utils/openai_utils.py +517 -0
  26. wikontic/utils/prompts/name_refinement/prompt_choose_relation_wo_entity_types.txt +17 -0
  27. wikontic/utils/prompts/name_refinement/prompt_choose_relation_wo_entity_types_dialog_bench.txt +18 -0
  28. wikontic/utils/prompts/name_refinement/rank_object_names.txt +17 -0
  29. wikontic/utils/prompts/name_refinement/rank_object_names_dialog_bench.txt +18 -0
  30. wikontic/utils/prompts/name_refinement/rank_object_qualifiers.txt +20 -0
  31. wikontic/utils/prompts/name_refinement/rank_subject_names.txt +18 -0
  32. wikontic/utils/prompts/name_refinement/rank_subject_names_dialog_bench.txt +20 -0
  33. wikontic/utils/prompts/ontology_refinement/prompt_choose_entity_types.txt +26 -0
  34. wikontic/utils/prompts/ontology_refinement/prompt_choose_relation.txt +24 -0
  35. wikontic/utils/prompts/ontology_refinement/prompt_choose_relation_and_types.txt +28 -0
  36. wikontic/utils/prompts/qa/prompt_choose_relevant_entities_for_question.txt +17 -0
  37. wikontic/utils/prompts/qa/prompt_choose_relevant_entities_for_question_wo_types.txt +16 -0
  38. wikontic/utils/prompts/qa/prompt_entity_extraction_from_question.txt +3 -0
  39. wikontic/utils/prompts/qa/prompt_is_answered.txt +43 -0
  40. wikontic/utils/prompts/qa/qa_collapsing_prompt.txt +22 -0
  41. wikontic/utils/prompts/qa/qa_prompt.txt +5 -0
  42. wikontic/utils/prompts/qa/qa_prompt_hotpot.txt +6 -0
  43. wikontic/utils/prompts/qa/question_decomposition_1.txt +7 -0
  44. wikontic/utils/prompts/triplet_extraction/prompt_1_types_qualifiers_dialog_bench.txt +75 -0
  45. wikontic/utils/prompts/triplet_extraction/prompt_1_types_qualifiers_dialog_bench_in_russian.txt +78 -0
  46. wikontic/utils/prompts/triplet_extraction/propmt_1_types_qualifiers.txt +91 -0
  47. wikontic/utils/structured_aligner.py +606 -0
  48. wikontic/utils/structured_inference_with_db.py +561 -0
  49. wikontic-0.0.3.dist-info/METADATA +111 -0
  50. wikontic-0.0.3.dist-info/RECORD +53 -0
  51. wikontic-0.0.3.dist-info/WHEEL +5 -0
  52. wikontic-0.0.3.dist-info/licenses/LICENSE +19 -0
  53. wikontic-0.0.3.dist-info/top_level.txt +1 -0
wikontic/__init__.py ADDED
@@ -0,0 +1,16 @@
1
+ """
2
+ Wikontic - Extract ontology-aware, Wikidata-aligned knowledge graphs from raw text using LLMs.
3
+ """
4
+
5
+ from .create_triplets_db import create_triplets_database
6
+ from .create_ontological_triplets_db import create_ontological_triplets_database
7
+ from .create_wikidata_ontology_db import create_wikidata_ontology_database
8
+
9
+ from . import utils
10
+
11
+ __all__ = [
12
+ "create_triplets_database",
13
+ "create_ontological_triplets_database",
14
+ "create_wikidata_ontology_database",
15
+ "utils",
16
+ ]
wikontic/create_ontological_triplets_db.py ADDED
@@ -0,0 +1,193 @@
1
+ from pymongo.mongo_client import MongoClient
2
+ from pymongo.operations import SearchIndexModel
3
+ import pymongo
4
+
5
+ from typing import List
6
+ from pydantic import BaseModel, ValidationError
7
+ from transformers import AutoTokenizer, AutoModel
8
+ from tqdm import tqdm
9
+ import json
10
+ import time
11
+ import argparse
12
+ import logging
13
+ import os
14
+
15
+ # Configure logging
16
+ logging.basicConfig(
17
+ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
18
+ )
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
def get_mongo_client(mongo_uri):
    """Create a MongoDB client for ``mongo_uri`` and verify the server responds.

    ``MongoClient`` does not block waiting for the server when it is
    constructed, so building the client alone does not prove that a
    connection exists. A ``ping`` command is issued first so the success
    message is only logged after a real round trip to the server.

    Args:
        mongo_uri: MongoDB connection URI.

    Returns:
        The ``MongoClient`` instance.

    Raises:
        pymongo.errors.ConnectionFailure: If the server cannot be reached
            while executing the ``ping`` command.
    """
    client = MongoClient(mongo_uri)
    # Fail fast here instead of on the first real database operation.
    client.admin.command("ping")
    logger.info("Connection to MongoDB successful")
    return client
26
+
27
+
28
def create_search_index_for_entities(
    db,
    collection_name="entity_aliases",
    embedding_field_name="alias_text_embedding",
    entity_type_id_field_name="entity_type",
    index_name="entities",
    dimensions=768,
):
    """Request a search index over the entity-alias embeddings of a collection.

    The index definition uses dynamic mappings plus three explicit fields:
    the embedding field (``knnVector``, cosine similarity), the entity type
    field (``token``) and ``sample_id`` (``token``).

    Failures are logged and swallowed, so this function never raises on an
    index-creation error and always returns ``None``. After submitting the
    request it sleeps for a fixed 20 seconds; it does not check whether the
    index has actually become ready.

    Args:
        db: Database object providing ``get_collection``.
        collection_name: Collection to create the search index on.
        embedding_field_name: Document field that stores the embedding vector.
        entity_type_id_field_name: Document field that stores the entity type.
        index_name: Name of the search index to create.
        dimensions: Size of the embedding vectors. Defaults to 768, the value
            that was previously hard-coded.
    """
    logger.info(f"Starting to create index {index_name} for {collection_name}")
    collection = db.get_collection(collection_name)
    vector_search_index_model = SearchIndexModel(
        definition={
            "mappings": {
                "dynamic": True,
                "fields": {
                    embedding_field_name: {
                        "dimensions": dimensions,
                        "similarity": "cosine",
                        "type": "knnVector",
                    },
                    entity_type_id_field_name: {"type": "token"},
                    # sample_id is indexed as a token, not as a number.
                    "sample_id": {"type": "token"},
                },
            }
        },
        name=index_name,
    )

    try:
        result = collection.create_search_index(model=vector_search_index_model)
        logger.info("Creating index...")
        # Fixed grace period after submitting the request; readiness is not polled.
        time.sleep(20)
        logger.info(f"New index {index_name} created successfully: {result}")
    except Exception as e:
        # Best-effort: report the problem but let the caller continue.
        logger.error(f"Error creating new vector search index {index_name}: {str(e)}")
65
+
66
+
67
def create_ontological_triplets_database(
    mongo_uri: str = "mongodb://localhost:27018/?directConnection=true",
    db_name: str = "triplets_db",
    entity_aliases_collection: str = "entity_aliases",
    triplets_collection: str = "triplets",
    initial_triplets_collection: str = "initial_triplets",
    filtered_triplets_collection: str = "filtered_triplets",
    ontology_filtered_triplets_collection: str = "ontology_filtered_triplets",
    entity_aliases_index: str = "entity_aliases",
    drop_collections: bool = False,
):
    """
    Create collections and indexes for the ontological triplets database.

    Regular indexes are created on the collections named by the arguments
    (not on fixed default names), and collections that already exist are
    left in place, so the function can be re-run without dropping data.

    Args:
        mongo_uri: MongoDB connection URI
        db_name: Name of the database to create
        entity_aliases_collection: Collection name for entity aliases
        triplets_collection: Collection name for triplets
        initial_triplets_collection: Collection name for initial triplets
        filtered_triplets_collection: Collection name for filtered triplets
        ontology_filtered_triplets_collection: Collection name for ontology filtered triplets
        entity_aliases_index: Index name for entities
        drop_collections: Whether to drop existing collections before creating
            new ones. WARNING: this drops every collection in the database,
            not only the ones named above.

    Returns:
        Database object
    """
    mongo_client = get_mongo_client(mongo_uri)
    db = mongo_client.get_database(db_name)

    if drop_collections:
        for collection_name in db.list_collection_names():
            db.drop_collection(collection_name)
            logger.info(f"Dropped collection: {collection_name}")

    # create_collection raises when the collection already exists, so only
    # create the ones that are missing.
    existing_collections = set(db.list_collection_names())
    for collection_name in (
        entity_aliases_collection,
        initial_triplets_collection,
        filtered_triplets_collection,
        ontology_filtered_triplets_collection,
        triplets_collection,
    ):
        if collection_name in existing_collections:
            logger.info(f"Collection already exists, skipping: {collection_name}")
            continue
        db.create_collection(collection_name)
        existing_collections.add(collection_name)

    logger.info("Collections created successfully")

    # Index the collections that were actually requested by name.
    entity_aliases = db[entity_aliases_collection]
    entity_aliases.create_index([("entity_type", 1), ("sample_id", 1)])
    entity_aliases.create_index([("label", 1)])

    for collection_name in (
        triplets_collection,
        initial_triplets_collection,
        filtered_triplets_collection,
        ontology_filtered_triplets_collection,
    ):
        db[collection_name].create_index([("sample_id", 1)])
    logger.info("Indexes created successfully")

    create_search_index_for_entities(
        db,
        collection_name=entity_aliases_collection,
        embedding_field_name="alias_text_embedding",
        entity_type_id_field_name="entity_type",
        index_name=entity_aliases_index,
    )
    logger.info("Search index created successfully")
    logger.info("All indexes created successfully")

    return db
130
+
131
+
132
def parse_bool_flag(value):
    """Convert a command-line string into a real boolean.

    ``argparse`` with ``type=bool`` treats every non-empty string as ``True``
    (so ``--drop_collections False`` would drop the collections). This parser
    interprets the usual textual spellings instead.

    Args:
        value: Raw option value (or an actual ``bool``).

    Returns:
        ``True`` or ``False``.

    Raises:
        argparse.ArgumentTypeError: If ``value`` is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    normalized = str(value).strip().lower()
    if normalized in {"1", "true", "t", "yes", "y", "on"}:
        return True
    if normalized in {"", "0", "false", "f", "no", "n", "off"}:
        return False
    raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}")


def build_arg_parser():
    """Build the command-line parser for the ontological triplets database script.

    Returns:
        A configured ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser(
        description="Create collections and indexes for the ontological triplets database"
    )
    parser.add_argument(
        "--mongo_uri",
        type=str,
        default="mongodb://localhost:27018/?directConnection=true",
    )
    parser.add_argument("--db_name", type=str, default="triplets_db")
    parser.add_argument(
        "--entity_aliases_collection",
        type=str,
        default="entity_aliases",
        help="Collection name for entity aliases",
    )
    parser.add_argument(
        "--triplets_collection",
        type=str,
        default="triplets",
        help="Collection name for triplets",
    )
    parser.add_argument(
        "--initial_triplets_collection",
        type=str,
        default="initial_triplets",
        help="Collection name for initial triplets",
    )
    parser.add_argument(
        "--filtered_triplets_collection",
        type=str,
        default="filtered_triplets",
        help="Collection name for filtered triplets",
    )
    parser.add_argument(
        "--ontology_filtered_triplets_collection",
        type=str,
        default="ontology_filtered_triplets",
        help="Collection name for ontology filtered triplets",
    )
    parser.add_argument(
        "--entity_aliases_index",
        type=str,
        default="entity_aliases",
        help="Index name for entities",
    )
    # Accepts both the bare flag (--drop_collections) and an explicit value
    # (--drop_collections true/false); dropping is destructive, so the value
    # must be parsed correctly.
    parser.add_argument(
        "--drop_collections",
        type=parse_bool_flag,
        nargs="?",
        const=True,
        default=False,
        help="Drop existing collections",
    )
    return parser


if __name__ == "__main__":
    args = build_arg_parser().parse_args()
    create_ontological_triplets_database(
        mongo_uri=args.mongo_uri,
        db_name=args.db_name,
        entity_aliases_collection=args.entity_aliases_collection,
        triplets_collection=args.triplets_collection,
        initial_triplets_collection=args.initial_triplets_collection,
        filtered_triplets_collection=args.filtered_triplets_collection,
        ontology_filtered_triplets_collection=args.ontology_filtered_triplets_collection,
        entity_aliases_index=args.entity_aliases_index,
        drop_collections=args.drop_collections,
    )
wikontic/create_triplets_db.py ADDED
@@ -0,0 +1,259 @@
1
+ from pymongo.mongo_client import MongoClient
2
+ from pymongo.operations import SearchIndexModel
3
+ import pymongo
4
+
5
+ from typing import List
6
+ from pydantic import BaseModel, ValidationError
7
+ from transformers import AutoTokenizer, AutoModel
8
+ from tqdm import tqdm
9
+ import json
10
+ import time
11
+ import argparse
12
+ import logging
13
+ import os
14
+
15
+ # Configure logging
16
+ logging.basicConfig(
17
+ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
18
+ )
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
def get_mongo_client(mongo_uri):
    """Create a MongoDB client for ``mongo_uri`` and verify the server responds.

    Constructing ``MongoClient`` does not block waiting for the server, so
    the success message would otherwise be logged without any connection
    having been made. A ``ping`` command forces a real round trip first.

    Args:
        mongo_uri: MongoDB connection URI.

    Returns:
        The ``MongoClient`` instance.

    Raises:
        pymongo.errors.ConnectionFailure: If the server cannot be reached
            while executing the ``ping`` command.
    """
    client = MongoClient(mongo_uri)
    # Surface connection problems here rather than on the first database call.
    client.admin.command("ping")
    logger.info("Connection to MongoDB successful")
    return client
26
+
27
+
28
def create_search_index_for_entities(
    db,
    collection_name="entity_aliases",
    embedding_field_name="alias_text_embedding",
    index_name="entity_aliases",
    dimensions=768,
):
    """Request a search index over the entity-alias embeddings of a collection.

    The definition uses dynamic mappings with two explicit fields: the
    embedding field (``knnVector``, cosine similarity, ``dimensions`` wide)
    and ``sample_id`` (``token``). Any exception raised while creating the
    index is logged and swallowed; the function returns ``None``.

    Args:
        db: Database object providing ``get_collection``.
        collection_name: Collection to create the search index on.
        embedding_field_name: Document field that stores the embedding vector.
        index_name: Name of the search index to create.
        dimensions: Size of the embedding vectors.
    """
    logger.info(
        f"Starting to create index {index_name} for {collection_name} with dimensions {dimensions}"
    )
    target = db.get_collection(collection_name)

    # Explicit field mappings: the embedding vector, then sample_id as a token.
    embedding_mapping = {
        "dimensions": dimensions,
        "similarity": "cosine",
        "type": "knnVector",
    }
    field_mappings = {
        embedding_field_name: embedding_mapping,
        "sample_id": {"type": "token"},
    }
    index_model = SearchIndexModel(
        definition={"mappings": {"dynamic": True, "fields": field_mappings}},
        name=index_name,
    )

    try:
        created = target.create_search_index(model=index_model)
        logger.info("Creating index...")
        # Fixed pause after submitting the request; readiness is not checked.
        time.sleep(20)
        logger.info(f"New index {index_name} created successfully: {created}")
    except Exception as exc:
        logger.error(f"Error creating new vector search index {index_name}: {str(exc)}")
66
+
67
+
68
def create_search_index_for_properties(
    db,
    collection_name="property_aliases",
    embedding_field_name="alias_text_embedding",
    index_name="property_aliases",
    dimensions=768,
):
    """Request a search index over the property-alias embeddings of a collection.

    The definition uses dynamic mappings with one explicit field: the
    embedding field (``knnVector``, cosine similarity, ``dimensions`` wide).
    Any exception raised while creating the index is logged and swallowed;
    the function returns ``None``.

    Args:
        db: Database object providing ``get_collection``.
        collection_name: Collection to create the search index on.
        embedding_field_name: Document field that stores the embedding vector.
        index_name: Name of the search index to create.
        dimensions: Size of the embedding vectors.
    """
    logger.info(
        f"Starting to create index {index_name} for {collection_name} with dimensions {dimensions}"
    )
    target = db.get_collection(collection_name)

    # Only the embedding vector gets an explicit mapping here.
    field_mappings = {
        embedding_field_name: {
            "dimensions": dimensions,
            "similarity": "cosine",
            "type": "knnVector",
        },
    }
    index_model = SearchIndexModel(
        definition={"mappings": {"dynamic": True, "fields": field_mappings}},
        name=index_name,
    )

    try:
        created = target.create_search_index(model=index_model)
        logger.info("Creating index...")
        # Fixed pause after submitting the request; readiness is not checked.
        time.sleep(20)
        logger.info(f"New index {index_name} created successfully: {created}")
    except Exception as exc:
        logger.error(f"Error creating new vector search index {index_name}: {str(exc)}")
101
+
102
+
103
def create_triplets_database(
    mongo_uri: str = "mongodb://localhost:27018/?directConnection=true",
    db_name: str = "triplets_db",
    entity_aliases_collection: str = "entity_aliases",
    property_aliases_collection: str = "property_aliases",
    triplets_collection: str = "triplets",
    initial_triplets_collection: str = "initial_triplets",
    filtered_triplets_collection: str = "filtered_triplets",
    entity_aliases_index: str = "entity_aliases",
    property_aliases_index: str = "property_aliases",
    embedding_dimensions: int = 768,
    drop_collections: bool = False,
):
    """
    Create collections and indexes for the dynamic triplets database.

    Regular indexes are created on the collections named by the arguments
    (not on fixed default names), and collections that already exist are
    left in place, so the function can be re-run without dropping data.

    Args:
        mongo_uri: MongoDB connection URI
        db_name: Name of the database to create
        entity_aliases_collection: Collection name for entity aliases
        property_aliases_collection: Collection name for property aliases
        triplets_collection: Collection name for triplets
        initial_triplets_collection: Collection name for initial triplets
        filtered_triplets_collection: Collection name for filtered triplets
        entity_aliases_index: Index name for entities
        property_aliases_index: Index name for property aliases
        embedding_dimensions: Dimensions for embeddings (default: 768)
        drop_collections: Whether to drop existing collections before creating
            new ones. WARNING: this drops every collection in the database,
            not only the ones named above.

    Returns:
        Database object
    """
    mongo_client = get_mongo_client(mongo_uri)
    db = mongo_client.get_database(db_name)

    # Drop all existing collections
    if drop_collections:
        logger.info("Dropping existing collections...")
        for collection_name in db.list_collection_names():
            logger.info(f"Dropping collection: {collection_name}")
            db.drop_collection(collection_name)
        logger.info("Successfully dropped all existing collections")

    # create_collection raises when the collection already exists, so only
    # create the ones that are missing.
    existing_collections = set(db.list_collection_names())
    for collection_name in (
        entity_aliases_collection,
        property_aliases_collection,
        triplets_collection,
        initial_triplets_collection,
        filtered_triplets_collection,
    ):
        if collection_name in existing_collections:
            logger.info(f"Collection already exists, skipping: {collection_name}")
            continue
        db.create_collection(collection_name)
        existing_collections.add(collection_name)

    logger.info("Collections created successfully")

    # Index the collections that were actually requested by name.
    for collection_name in (entity_aliases_collection, property_aliases_collection):
        db[collection_name].create_index([("sample_id", 1)])
        db[collection_name].create_index([("label", 1)])

    for collection_name in (
        triplets_collection,
        initial_triplets_collection,
        filtered_triplets_collection,
    ):
        db[collection_name].create_index([("sample_id", 1)])
    logger.info("Indexes created successfully")

    create_search_index_for_entities(
        db,
        collection_name=entity_aliases_collection,
        embedding_field_name="alias_text_embedding",
        index_name=entity_aliases_index,
        dimensions=embedding_dimensions,
    )
    create_search_index_for_properties(
        db,
        collection_name=property_aliases_collection,
        embedding_field_name="alias_text_embedding",
        index_name=property_aliases_index,
        dimensions=embedding_dimensions,
    )
    logger.info("Search index created successfully")
    logger.info("All indexes created successfully")

    return db
182
+
183
+
184
def parse_bool_flag(value):
    """Convert a command-line string into a real boolean.

    ``argparse`` with ``type=bool`` treats every non-empty string as ``True``
    (so ``--drop_collections False`` would drop the collections). This parser
    interprets the usual textual spellings instead.

    Args:
        value: Raw option value (or an actual ``bool``).

    Returns:
        ``True`` or ``False``.

    Raises:
        argparse.ArgumentTypeError: If ``value`` is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    normalized = str(value).strip().lower()
    if normalized in {"1", "true", "t", "yes", "y", "on"}:
        return True
    if normalized in {"", "0", "false", "f", "no", "n", "off"}:
        return False
    raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}")


def build_arg_parser():
    """Build the command-line parser for the dynamic triplets database script.

    Returns:
        A configured ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser(
        description="Create collections and indexes for the dynamic triplets database"
    )
    parser.add_argument(
        "--mongo_uri",
        type=str,
        default="mongodb://localhost:27018/?directConnection=true",
    )
    parser.add_argument("--db_name", type=str, default="triplets_db")
    parser.add_argument(
        "--entity_aliases_collection",
        type=str,
        default="entity_aliases",
        help="Collection name for entity aliases",
    )
    parser.add_argument(
        "--property_aliases_collection",
        type=str,
        default="property_aliases",
        help="Collection name for property aliases",
    )
    parser.add_argument(
        "--triplets_collection",
        type=str,
        default="triplets",
        help="Collection name for triplets",
    )
    parser.add_argument(
        "--initial_triplets_collection",
        type=str,
        default="initial_triplets",
        help="Collection name for initial triplets",
    )
    parser.add_argument(
        "--filtered_triplets_collection",
        type=str,
        default="filtered_triplets",
        help="Collection name for filtered triplets",
    )
    parser.add_argument(
        "--entity_aliases_index",
        type=str,
        default="entity_aliases",
        help="Index name for entities",
    )
    parser.add_argument(
        "--property_aliases_index",
        type=str,
        default="property_aliases",
        help="Index name for property aliases",
    )
    parser.add_argument(
        "--embedding_dimensions",
        type=int,
        default=768,
        help="Dimensions for embeddings",
    )
    # Accepts both the bare flag (--drop_collections) and an explicit value
    # (--drop_collections true/false); dropping is destructive, so the value
    # must be parsed correctly.
    parser.add_argument(
        "--drop_collections",
        type=parse_bool_flag,
        nargs="?",
        const=True,
        default=False,
        help="Drop existing collections",
    )
    return parser


if __name__ == "__main__":
    args = build_arg_parser().parse_args()
    create_triplets_database(
        mongo_uri=args.mongo_uri,
        db_name=args.db_name,
        entity_aliases_collection=args.entity_aliases_collection,
        property_aliases_collection=args.property_aliases_collection,
        triplets_collection=args.triplets_collection,
        initial_triplets_collection=args.initial_triplets_collection,
        filtered_triplets_collection=args.filtered_triplets_collection,
        entity_aliases_index=args.entity_aliases_index,
        property_aliases_index=args.property_aliases_index,
        embedding_dimensions=args.embedding_dimensions,
        drop_collections=args.drop_collections,
    )