kgnode 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kgnode/__init__.py +60 -0
- kgnode/_entity_descriptor.py +474 -0
- kgnode/_node_ranker.py +138 -0
- kgnode/chroma_db.py +782 -0
- kgnode/core/__init__.py +3 -0
- kgnode/core/kg_config.py +496 -0
- kgnode/core/schema_chromadb.py +215 -0
- kgnode/core/schema_extractor.py +226 -0
- kgnode/core/schema_selector.py +127 -0
- kgnode/core/sparql_query.py +77 -0
- kgnode/generator.py +814 -0
- kgnode/keyword_search.py +55 -0
- kgnode/py.typed +0 -0
- kgnode/seed_finder.py +462 -0
- kgnode/subgraph_extraction.py +747 -0
- kgnode/validator.py +135 -0
- kgnode-0.1.0.dist-info/METADATA +234 -0
- kgnode-0.1.0.dist-info/RECORD +19 -0
- kgnode-0.1.0.dist-info/WHEEL +4 -0
kgnode/__init__.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
"""
|
|
2
|
+
kgnode - Knowledge Graph Agnostic Node for Knowledge-Aware LLM Applications.
|
|
3
|
+
|
|
4
|
+
Public API for knowledge graph retrieval and answer generation.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
# Main Pipeline APIs
|
|
8
|
+
from kgnode.seed_finder import citable, get_seed_nodes
|
|
9
|
+
from kgnode.subgraph_extraction import get_subgraphs
|
|
10
|
+
from kgnode.generator import (
|
|
11
|
+
generate_sparql,
|
|
12
|
+
kg_retrieve,
|
|
13
|
+
generate_answer,
|
|
14
|
+
generate_answer_using_subgraph,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
# Validation
|
|
18
|
+
from kgnode.validator import validate_subgraph
|
|
19
|
+
|
|
20
|
+
# Search Operations
|
|
21
|
+
from kgnode.keyword_search import search_entities_by_keywords
|
|
22
|
+
|
|
23
|
+
# VectorDB Operations
|
|
24
|
+
from kgnode.chroma_db import (
|
|
25
|
+
compile_chromadb,
|
|
26
|
+
compile_chromadb_from_csv,
|
|
27
|
+
semantic_search_entities,
|
|
28
|
+
get_or_create_chromadb,
|
|
29
|
+
add_or_update_entities,
|
|
30
|
+
delete_entities,
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
# Core Configuration
|
|
34
|
+
from kgnode.core.kg_config import KGConfig
|
|
35
|
+
from kgnode.core.sparql_query import execute_sparql_query
|
|
36
|
+
|
|
37
|
+
__all__ = [
|
|
38
|
+
# Main Pipeline APIs
|
|
39
|
+
"citable",
|
|
40
|
+
"get_seed_nodes",
|
|
41
|
+
"get_subgraphs",
|
|
42
|
+
"generate_sparql",
|
|
43
|
+
"kg_retrieve",
|
|
44
|
+
"generate_answer",
|
|
45
|
+
"generate_answer_using_subgraph",
|
|
46
|
+
# Validation
|
|
47
|
+
"validate_subgraph",
|
|
48
|
+
# Search Operations
|
|
49
|
+
"search_entities_by_keywords",
|
|
50
|
+
# VectorDB Operations
|
|
51
|
+
"compile_chromadb",
|
|
52
|
+
"compile_chromadb_from_csv",
|
|
53
|
+
"semantic_search_entities",
|
|
54
|
+
"get_or_create_chromadb",
|
|
55
|
+
"add_or_update_entities",
|
|
56
|
+
"delete_entities",
|
|
57
|
+
# Core Configuration
|
|
58
|
+
"KGConfig",
|
|
59
|
+
"execute_sparql_query",
|
|
60
|
+
]
|
|
@@ -0,0 +1,474 @@
|
|
|
1
|
+
from kgnode.core.sparql_query import execute_sparql_query
|
|
2
|
+
from typing import List, Dict, Callable, Optional, Any
|
|
3
|
+
from kgnode.core.kg_config import KGConfig
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def _default_entity_descriptor_logic(entity_uri: str, triples: List[Dict[str, Any]]) -> str:
    """Default DBLP-specific entity descriptor logic.

    Builds a short, search-oriented sentence from the outgoing triples of an
    entity. Predicates are classified by substring match on their readable
    label (type, title, author, venue, year, affiliation, co-author), and the
    collected fields are rendered differently for persons, publications and
    everything else. It can be replaced by user-provided functions for other
    knowledge graphs.

    Note that every object value is passed through ``_uri_to_label``, so
    literal values (titles, names) are lowercased and reduced to the text
    after their last ``#`` or ``/`` as well.

    Args:
        entity_uri: URI of the entity (cleaned, no brackets).
        triples: List of dicts with 'predicate' and 'object' keys.

    Returns:
        Natural language description optimized for search. When ``triples``
        is empty, just the readable label of ``entity_uri``.
    """
    if not triples:
        return _uri_to_label(entity_uri)

    # Fields collected from the triples
    entity_type = None
    title = None
    authors = []
    venue = None
    year = None
    affiliation = None
    coauthors = []

    for triple in triples:
        pred_label = _uri_to_label(triple['predicate']).lower()
        obj_label = _uri_to_label(triple['object'])
        # camelCase predicates such as "coAuthorWith" are split into
        # "co author with" by _uri_to_label; compare without spaces so both
        # spellings are recognised as a co-author relation.
        compact_pred = pred_label.replace(' ', '')

        # Identify entity type
        if 'type' in pred_label:
            entity_type = obj_label.lower()

        # Extract key fields based on predicate
        elif 'title' in pred_label:
            title = obj_label
        elif 'coauthor' in compact_pred:
            # Must be tested before the generic 'author' check below:
            # "coauthor" contains "author", so the later branch would
            # otherwise swallow every co-author triple.
            coauthors.append(obj_label)
        elif 'authored by' in pred_label or 'author' in pred_label or 'creator' in pred_label:
            authors.append(obj_label)
        elif 'published in' in pred_label or 'venue' in pred_label or 'journal' in pred_label:
            venue = obj_label
        elif 'year' in pred_label:
            year = obj_label
        elif 'affiliation' in pred_label or 'organization' in pred_label:
            affiliation = obj_label
        elif 'collaborate' in pred_label:
            coauthors.append(obj_label)

    # Get base entity label
    entity_label = _uri_to_label(entity_uri)

    # Build focused description based on entity type
    description_parts = []

    # Always start with the entity label/name
    if entity_type == 'person' or entity_type == 'creator':
        description_parts.append(f"Person: {entity_label}")
        if affiliation:
            description_parts.append(f"affiliated with {affiliation}")
        if authors:  # For a person these are the papers they authored
            description_parts.append(f"author of {len(authors)} publications")
        if coauthors:
            coauthor_names = ", ".join(coauthors[:5])
            description_parts.append(f"collaborates with {coauthor_names}")

    elif entity_type in ['article', 'publication', 'inproceedings', 'informal']:
        description_parts.append(f"Publication: {entity_label}")
        if title:
            description_parts.append(f"titled '{title}'")
        if authors:
            # Limit to the first ten authors for readability
            author_names = ", ".join(authors[:10])
            if len(authors) > 10:
                author_names += f" and {len(authors) - 10} more"
            description_parts.append(f"authored by {author_names}")
        if venue:
            description_parts.append(f"published in {venue}")
        if year:
            description_parts.append(f"in year {year}")

    else:
        # Generic fallback for other entity types
        description_parts.append(f"{entity_type or 'Entity'}: {entity_label}")
        if title:
            description_parts.append(f"titled '{title}'")
        if authors:
            description_parts.append(f"associated with authors: {', '.join(authors[:5])}")
        if venue:
            description_parts.append(f"venue: {venue}")
        if year:
            description_parts.append(f"year: {year}")

    # Join with proper punctuation
    return ". ".join(description_parts) + "."
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _default_relation_descriptor_logic(relation_uri: str) -> str:
    """Describe a relation by converting its URI into a readable label.

    This is the default relation descriptor; it delegates entirely to
    ``_uri_to_label``.

    Args:
        relation_uri: URI of the relation/predicate.

    Returns:
        Human-readable label.
    """
    readable_label = _uri_to_label(relation_uri)
    return readable_label
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def create_entity_description(entity_uri: str, config: Optional[KGConfig] = None) -> str:
    """
    Build a search-oriented natural language description of one entity.

    Fetches every (predicate, object) pair whose subject is the entity and
    hands them to the default DBLP descriptor logic, which prioritizes names,
    titles, venues and years.

    Note: This function always uses the default DBLP logic. For custom logic,
    use EntityDescriptorWrapper with KGConfig.

    Args:
        entity_uri: URI of the entity; surrounding whitespace and one pair of
            enclosing angle brackets are removed before querying.
        config: Optional KGConfig instance for configuration.
            If None, ``KGConfig.default()`` is used.

    Returns:
        Natural language description optimized for search
    """
    active_config = KGConfig.default() if config is None else config

    # Normalise the URI: trim whitespace, then drop enclosing <...> if present
    bare_uri = entity_uri.strip()
    is_bracketed = bare_uri.startswith('<') and bare_uri.endswith('>')
    if is_bracketed:
        bare_uri = bare_uri[1:-1]

    query = f"""
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX dblp: <https://dblp.org/rdf/schema#>
SELECT ?predicate ?object
WHERE {{
<{bare_uri}> ?predicate ?object .
}}
"""

    outgoing_triples = execute_sparql_query(query, config=active_config)
    return _default_entity_descriptor_logic(bare_uri, outgoing_triples)
|
|
153
|
+
|
|
154
|
+
def create_entity_descriptions_batch(entity_uris: List[str], config: Optional[KGConfig] = None) -> Dict[str, str]:
    """
    Create focused descriptions for multiple entities in batch.
    Optimized for semantic search of author names and paper titles.

    All triples are fetched with a single SPARQL query using a VALUES clause
    and then grouped per entity. The description logic here is an inline,
    slightly reduced variant of ``_default_entity_descriptor_logic``: it does
    not collect co-authors, and the generic fallback emits only
    ``"<type>: <label>"``.

    Blank entries in ``entity_uris`` are skipped and get no key in the result.

    Args:
        entity_uris: List of entity URIs to describe (with or without
            enclosing angle brackets).
        config: Optional KGConfig instance for configuration.
            If None, ``KGConfig.default()`` is used.

    Returns:
        Dictionary mapping each original (uncleaned) entity URI to its
        description. Entities for which the query returned no triples map to
        the readable label of their URI.
    """
    # Initialize config if not provided
    if config is None:
        config = KGConfig.default()

    if not entity_uris:
        return {}

    # Clean URIs while keeping every original URI paired with its cleaned
    # form. Zipping the input list against a separately filtered list would
    # shift the pairing as soon as one blank entry is skipped.
    uri_pairs = []
    for original_uri in entity_uris:
        cleaned_uri = original_uri.strip()
        if not cleaned_uri:
            continue
        if cleaned_uri.startswith('<') and cleaned_uri.endswith('>'):
            cleaned_uri = cleaned_uri[1:-1]
        uri_pairs.append((original_uri, cleaned_uri))

    if not uri_pairs:
        return {}

    # Build VALUES clause
    values_clause = " ".join(f"<{cleaned_uri}>" for _, cleaned_uri in uri_pairs)

    query = f"""
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX dblp: <https://dblp.org/rdf/schema#>

SELECT ?entity ?predicate ?object
WHERE {{
VALUES ?entity {{ {values_clause} }}
?entity ?predicate ?object .
}}
"""

    triples = execute_sparql_query(query, config=config)

    # Group triples by entity
    # NOTE(review): the lookup below assumes the 'entity' value comes back as
    # a bare URI without angle brackets - confirm against execute_sparql_query.
    entity_triples = {}
    for triple in triples:
        entity_triples.setdefault(triple['entity'], []).append(triple)

    # Create descriptions using the same logic
    descriptions = {}
    for original_uri, cleaned_uri in uri_pairs:
        if cleaned_uri not in entity_triples:
            descriptions[original_uri] = _uri_to_label(original_uri)
            continue

        # Use similar logic as create_entity_description
        entity_type = None
        title = None
        authors = []
        venue = None
        year = None
        affiliation = None

        for triple in entity_triples[cleaned_uri]:
            pred_label = _uri_to_label(triple['predicate']).lower()
            obj_label = _uri_to_label(triple['object'])

            if 'type' in pred_label:
                entity_type = obj_label.lower()
            elif 'title' in pred_label:
                title = obj_label
            elif 'authored by' in pred_label or 'author' in pred_label or 'creator' in pred_label:
                authors.append(obj_label)
            elif 'published in' in pred_label or 'venue' in pred_label or 'journal' in pred_label:
                venue = obj_label
            elif 'year' in pred_label:
                year = obj_label
            elif 'affiliation' in pred_label:
                affiliation = obj_label

        entity_label = _uri_to_label(cleaned_uri)
        description_parts = []

        if entity_type == 'person' or entity_type == 'creator':
            description_parts.append(f"Person: {entity_label}")
            if affiliation:
                description_parts.append(f"affiliated with {affiliation}")
            if authors:
                description_parts.append(f"author of {len(authors)} publications")

        elif entity_type in ['article', 'publication', 'inproceedings', 'informal']:
            description_parts.append(f"Publication: {entity_label}")
            if title:
                description_parts.append(f"titled '{title}'")
            if authors:
                author_names = ", ".join(authors[:10])
                if len(authors) > 10:
                    author_names += f" and {len(authors) - 10} more"
                description_parts.append(f"authored by {author_names}")
            if venue:
                description_parts.append(f"published in {venue}")
            if year:
                description_parts.append(f"in year {year}")
        else:
            description_parts.append(f"{entity_type or 'Entity'}: {entity_label}")

        descriptions[original_uri] = ". ".join(description_parts) + "." if description_parts else entity_label

    return descriptions
|
|
272
|
+
|
|
273
|
+
def create_relation_description(relation_uri: str) -> str:
    """
    Turn a relation URI into readable text.

    Knowledge graph agnostic: the result is simply the label that
    ``_uri_to_label`` derives from the URI.

    Args:
        relation_uri: URI of the relation/predicate

    Returns:
        Natural language description
    """
    description = _uri_to_label(relation_uri)
    return description
|
|
285
|
+
|
|
286
|
+
def _uri_to_label(uri: str) -> str:
|
|
287
|
+
"""
|
|
288
|
+
Convert URI to human-readable label.
|
|
289
|
+
Works for any knowledge graph by extracting and formatting the URI fragment.
|
|
290
|
+
|
|
291
|
+
Args:
|
|
292
|
+
uri: Full URI (e.g., "http://example.org/ontology#hasName" or "<http://example.org/ontology#hasName>")
|
|
293
|
+
|
|
294
|
+
Returns:
|
|
295
|
+
Human-readable label (e.g., "has name")
|
|
296
|
+
"""
|
|
297
|
+
# Remove SPARQL brackets if present
|
|
298
|
+
uri = uri.strip('<>')
|
|
299
|
+
|
|
300
|
+
# Extract the fragment/local name from URI
|
|
301
|
+
if '#' in uri:
|
|
302
|
+
label = uri.split('#')[-1]
|
|
303
|
+
elif '/' in uri:
|
|
304
|
+
label = uri.split('/')[-1]
|
|
305
|
+
else:
|
|
306
|
+
label = uri
|
|
307
|
+
|
|
308
|
+
# Convert camelCase or PascalCase to spaces
|
|
309
|
+
# hasName -> has Name -> has name
|
|
310
|
+
import re
|
|
311
|
+
label = re.sub(r'([a-z])([A-Z])', r'\1 \2', label)
|
|
312
|
+
|
|
313
|
+
# Convert snake_case or kebab-case to spaces
|
|
314
|
+
label = label.replace('_', ' ').replace('-', ' ')
|
|
315
|
+
|
|
316
|
+
# Lowercase and clean up
|
|
317
|
+
label = label.lower().strip()
|
|
318
|
+
|
|
319
|
+
return label
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
class EntityDescriptorWrapper:
    """Wrapper class that handles SPARQL queries and applies descriptor logic.

    This class provides both single and batch entity description functionality,
    using either user-provided descriptor logic or default DBLP logic. Query
    and descriptor failures are reported with a printed warning and fall back
    to the readable label of the URI instead of raising.
    """

    def __init__(
        self,
        descriptor_function: Optional[Callable[[str, List[Dict[str, Any]]], str]] = None,
        config: Optional[KGConfig] = None
    ):
        """Initialize the wrapper with descriptor function.

        Args:
            descriptor_function: Function that takes (entity_uri, triples) and returns
                description string. If None, uses default DBLP logic.
            config: Optional KGConfig instance for configuration.
                If None, ``KGConfig.default()`` is used.
        """
        # Initialize config if not provided
        if config is None:
            config = KGConfig.default()

        self.descriptor_function = descriptor_function or _default_entity_descriptor_logic
        self.config = config

    def describe_single(self, entity_uri: str) -> str:
        """Create description for a single entity.

        Args:
            entity_uri: URI of the entity (with or without angle brackets).

        Returns:
            Natural language description of the entity. If the query or the
            descriptor function raises, a warning is printed and the readable
            label of the URI is returned instead.
        """
        # Clean URI
        entity_uri = entity_uri.strip()
        if entity_uri.startswith('<') and entity_uri.endswith('>'):
            entity_uri = entity_uri[1:-1]

        # Query triples for this entity
        query = f"""
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX dblp: <https://dblp.org/rdf/schema#>
SELECT ?predicate ?object
WHERE {{
<{entity_uri}> ?predicate ?object .
}}
"""

        try:
            triples = execute_sparql_query(query, config=self.config)
            return self.descriptor_function(entity_uri, triples)
        except Exception as e:
            # Deliberate best-effort: describing an entity must not abort the caller
            print(f"Warning: Error describing entity {entity_uri}: {e}")
            return _uri_to_label(entity_uri)

    def describe_batch(self, entity_uris: List[str]) -> Dict[str, str]:
        """Create descriptions for multiple entities in batch (optimized).

        Uses a single SPARQL query with VALUES clause to fetch all triples,
        then applies descriptor logic to each entity. Batch size limited to
        80 entities to stay within 8KB SPARQL query size limit; any further
        URIs are dropped after a printed warning.

        Blank entries in ``entity_uris`` are skipped and get no key in the
        result (except in the query-failure fallback, which labels every
        input URI).

        Args:
            entity_uris: List of entity URIs (with or without angle brackets).

        Returns:
            Dictionary mapping original URIs to their descriptions.
        """
        if not entity_uris:
            return {}

        # Clean URIs while keeping every original URI paired with its cleaned
        # form. Pairing by position against a separately filtered list would
        # shift the mapping as soon as one blank entry is skipped.
        uri_pairs = []
        for original_uri in entity_uris:
            cleaned_uri = original_uri.strip()
            if not cleaned_uri:
                continue
            if cleaned_uri.startswith('<') and cleaned_uri.endswith('>'):
                cleaned_uri = cleaned_uri[1:-1]
            uri_pairs.append((original_uri, cleaned_uri))

        if not uri_pairs:
            return {}

        # Build VALUES clause (limited to 80 URIs to stay within 8KB query limit)
        if len(uri_pairs) > 80:
            print(f"Warning: Batch size {len(uri_pairs)} exceeds limit of 80. "
                  f"Consider splitting into multiple batches.")
            uri_pairs = uri_pairs[:80]

        values_clause = " ".join(f"<{cleaned_uri}>" for _, cleaned_uri in uri_pairs)

        query = f"""
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX dblp: <https://dblp.org/rdf/schema#>

SELECT ?entity ?predicate ?object
WHERE {{
VALUES ?entity {{ {values_clause} }}
?entity ?predicate ?object .
}}
"""

        try:
            triples = execute_sparql_query(query, config=self.config)
        except Exception as e:
            print(f"Warning: Error fetching triples for batch: {e}")
            # Fallback to URI labels
            return {uri: _uri_to_label(uri) for uri in entity_uris}

        # Group triples by entity
        # NOTE(review): the lookup below assumes the 'entity' value comes back
        # as a bare URI without angle brackets - confirm against
        # execute_sparql_query.
        entity_triples = {}
        for triple in triples:
            entity_triples.setdefault(triple['entity'], []).append(triple)

        # Create descriptions using descriptor function
        descriptions = {}
        for original_uri, cleaned_uri in uri_pairs:
            if cleaned_uri not in entity_triples:
                descriptions[original_uri] = _uri_to_label(original_uri)
            else:
                try:
                    descriptions[original_uri] = self.descriptor_function(
                        cleaned_uri,
                        entity_triples[cleaned_uri]
                    )
                except Exception as e:
                    # A failing user-supplied descriptor degrades one entry only
                    print(f"Warning: Error applying descriptor function to {cleaned_uri}: {e}")
                    descriptions[original_uri] = _uri_to_label(original_uri)

        return descriptions
|
|
462
|
+
|
|
463
|
+
|
|
464
|
+
if __name__ == "__main__":
    # Manual smoke test: describe one publication (bracketed URI) and one
    # schema class (bare URI) in a single batch call.
    sample_uris = [
        "<https://dblp.org/rec/journals/nature/RheinbayNAWSTHH20>",
        "https://dblp.org/rdf/schema#Person",
    ]
    print(create_entity_descriptions_batch(sample_uris))

    # Other entry points that can be tried the same way:
    # print(create_entity_description("https://dblp.org/rdf/schema#Publication"))
    # print(create_relation_description("http://www.w3.org/2000/01/rdf-schema#comment"))

    # Example usage:
    # entities = ["http://example.org/entity1", "http://example.org/entity2", "http://example.org/entity3"]
    # descriptions = create_entity_descriptions_batch(entities)
    #
    # for entity, desc in descriptions.items():
    #     print(f"{entity}: {desc}")
|
kgnode/_node_ranker.py
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
import csv
|
|
2
|
+
import os
|
|
3
|
+
from functools import lru_cache
|
|
4
|
+
from typing import List, Dict, Optional
|
|
5
|
+
import time
|
|
6
|
+
import threading
|
|
7
|
+
from kgnode.core.kg_config import KGConfig
|
|
8
|
+
|
|
9
|
+
from kgnode.core.sparql_query import execute_sparql_query
|
|
10
|
+
|
|
11
|
+
@lru_cache(maxsize=10)
def get_top_entities_by_degree(
    limit: int = 1_000_000,
    output_file: Optional[str] = None,
    config: Optional[KGConfig] = None
) -> List[Dict[str, str]]:
    """
    Get top N entities from knowledge graph sorted by degree (number of connections).
    Saves results to CSV file.

    The degree counted here is the out-degree: the number of triples in which
    the entity is the subject. A console spinner with an elapsed-time counter
    runs in a helper thread while the query is executing.

    Because the function is wrapped in ``lru_cache``, a repeated call with the
    same arguments returns the previously computed list without querying again
    or rewriting the CSV, and all arguments (including ``config``) must be
    hashable.

    Args:
        limit (int): Number of top entities to retrieve. Default is 1,000,000.
        output_file (str): Path to output CSV file. If None, ``config.csv_path`` is used.
        config: Optional KGConfig instance for configuration.
            If None, ``KGConfig.default()`` is used.

    Returns:
        List[Dict]: List of entities with their URIs and degrees.
            Each dict has keys: 'entity', 'degree'
    """
    # Initialize config if not provided
    if config is None:
        config = KGConfig.default()

    # Use default path if not provided
    if output_file is None:
        output_file = config.csv_path

    # Ensure parent directory exists. dirname is "" for a bare filename, and
    # os.makedirs("") raises, so only create a directory when there is one.
    parent_dir = os.path.dirname(output_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    print(f"Querying top {limit:,} entities by degree, For 1 million nodes KG it takes 7 seconds to run.")

    sparql_query = f"""
SELECT ?entity (COUNT(?o) as ?degree)
WHERE {{
?entity ?p ?o .
}}
GROUP BY ?entity
ORDER BY DESC(?degree)
LIMIT {limit}
"""

    # Alternative degree definitions, kept for reference:
    #
    # in+out
    # SELECT ?entity (COUNT(?connection) as ?degree)
    # WHERE {{
    # {{ ?entity ?p ?o }}
    # UNION
    # {{ ?s ?p ?entity }}
    # }}
    # GROUP BY ?entity
    # ORDER BY DESC(?degree)
    # LIMIT {limit}
    #
    # indegree
    # SELECT ?entity (COUNT(?s) as ?degree)
    # WHERE {{
    # ?s ?p ?entity .
    # }}
    # GROUP BY ?entity
    # ORDER BY DESC(?degree)
    # LIMIT {limit}
    #
    # outdegree
    # SELECT ?entity (COUNT(?o) as ?degree)
    # WHERE {{
    # ?entity ?p ?o .
    # }}
    # GROUP BY ?entity
    # ORDER BY DESC(?degree)
    # LIMIT {limit}

    # Spinner setup
    spinner_chars = ['⠋', '⠙', '⠹', '⠸', '⠼', '⠴', '⠦', '⠧', '⠇', '⠏']
    stop_spinner = threading.Event()

    def spin():
        i = 0
        start = time.time()
        while not stop_spinner.is_set():
            elapsed = int(time.time() - start)
            mins, secs = divmod(elapsed, 60)
            print(f'\r{spinner_chars[i % len(spinner_chars)]} Querying... {mins:02d}:{secs:02d}', end='', flush=True)
            i += 1
            # Wait instead of sleep so the spinner stops as soon as it is told to
            stop_spinner.wait(0.1)

    # Start spinner
    spinner_thread = threading.Thread(target=spin)
    spinner_thread.start()

    start_time = time.time()
    try:
        results = execute_sparql_query(sparql_query, config=config)
        query_time = time.time() - start_time
    finally:
        # Always stop the spinner, even when the query raises; otherwise the
        # non-daemon thread keeps printing and prevents the process from exiting.
        stop_spinner.set()
        spinner_thread.join()

    print(f"\r✓ Query completed in {query_time:.1f} seconds ({query_time / 60:.1f} minutes)")
    print(f"✓ Retrieved {len(results):,} entities")
    print(f"Saving to {output_file}...")

    # Save to CSV
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['entity', 'degree']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for idx, row in enumerate(results):
            writer.writerow({
                'entity': row['entity'],
                'degree': row['degree']
            })

            # Progress line every 100,000 rows for very large exports
            if (idx + 1) % 100_000 == 0:
                print(f" Written {idx + 1:,} rows...")

    total_time = time.time() - start_time
    print(f"✓ Done! Saved {len(results):,} entities to {output_file}")
    print(f"Total time: {total_time:.1f} seconds")

    return results
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
if __name__ == "__main__":
    # Script entry point: fetch the 10,000 highest-degree entities (which also
    # writes the CSV) and print the resulting list.
    entities = get_top_entities_by_degree(limit=10_000)
    print(entities)
|