graph-seeder 1.0.0.dev5__tar.gz → 1.0.0.dev8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/PKG-INFO +1 -1
- {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/pyproject.toml +1 -1
- {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/SubgraphExtractor.py +17 -14
- {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/densification/GraphConnector.py +14 -17
- {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/extraction/BFS/BFS.py +30 -14
- {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/extraction/ExtractionStrategy.py +5 -2
- {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/extraction/Hop/HopExpansion.py +33 -15
- graph_seeder-1.0.0.dev8/src/graph_seeder/models/RDFNode.py +45 -0
- {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/utils/GraphExporter.py +41 -31
- {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/utils/GraphStatistics.py +9 -8
- {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/utils/utils.py +14 -2
- {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/wrapper/NeighborhoodWrapper.py +2 -1
- {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/wrapper/hashmap/HashMapWrapper.py +16 -4
- {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/wrapper/sparql/GraphWrapper.py +17 -15
- {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/wrapper/sparql/client/TurtleClient.py +6 -11
- {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/uv.lock +1 -1
- {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/.github/workflows/publish.yml +0 -0
- {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/.gitignore +0 -0
- {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/README.md +0 -0
- {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/requirements.txt +0 -0
- {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/GraphSeeder.py +0 -0
- {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/configs/dbpedia_default.json +0 -0
- {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/configs/default.json +0 -0
- {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/configs/europeana_default.json +0 -0
- {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/configs/pgxlod_default.json +0 -0
- {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/configs/wikidata_default.json +0 -0
- {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/utils/ConsoleUI.py +0 -0
- {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/utils/Factory.py +0 -0
- {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/utils/URIManager.py +0 -0
- {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/wrapper/sparql/BaseClient.py +0 -0
- {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/wrapper/sparql/SparqlQueryBuilder.py +0 -0
- {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/wrapper/sparql/client/SparqlClient.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: graph-seeder
|
|
3
|
-
Version: 1.0.0.dev5
|
|
3
|
+
Version: 1.0.0.dev8
|
|
4
4
|
Summary: A powerful tool to extract and densify subgraphs from Knowledge Graphs via SPARQL or LMDB, with different extraction strategies.
|
|
5
5
|
Requires-Python: >=3.9
|
|
6
6
|
Requires-Dist: lmdb>=2.2.0
|
|
@@ -7,7 +7,7 @@ packages = ["src/graph_seeder"]
|
|
|
7
7
|
|
|
8
8
|
[project]
|
|
9
9
|
name = "graph-seeder"
|
|
10
|
-
version = "1.0.0.dev5"
|
|
10
|
+
version = "1.0.0.dev8"
|
|
11
11
|
description = "A powerful tool to extract and densify subgraphs from Knowledge Graphs via SPARQL or LMDB, with different extraction strategies."
|
|
12
12
|
readme = "README.md"
|
|
13
13
|
requires-python = ">=3.9"
|
|
@@ -8,6 +8,7 @@ from rich.logging import RichHandler
|
|
|
8
8
|
from rich.prompt import Confirm
|
|
9
9
|
from graph_seeder.extraction.ExtractionStrategy import ExtractionStrategy
|
|
10
10
|
from graph_seeder.densification.GraphConnector import GraphConnector
|
|
11
|
+
from graph_seeder.models.RDFNode import ExtractionResult
|
|
11
12
|
from graph_seeder.utils.Factory import ComponentFactory
|
|
12
13
|
from graph_seeder.utils.ConsoleUI import ConsoleUI
|
|
13
14
|
from graph_seeder.utils.GraphExporter import GraphExporter
|
|
@@ -242,8 +243,10 @@ class SubgraphExtractor:
|
|
|
242
243
|
|
|
243
244
|
return []
|
|
244
245
|
|
|
245
|
-
def extract_subgraph(
|
|
246
|
-
|
|
246
|
+
def extract_subgraph(
|
|
247
|
+
self, seeds: list[str]
|
|
248
|
+
) -> tuple[list[ExtractionResult], set[str]]:
|
|
249
|
+
all_results: list[ExtractionResult] = []
|
|
247
250
|
seeds_found: set[str] = set()
|
|
248
251
|
|
|
249
252
|
with self.ui.create_progress_bar() as progress:
|
|
@@ -264,7 +267,7 @@ class SubgraphExtractor:
|
|
|
264
267
|
if not triplets:
|
|
265
268
|
self.stats["not_found"] += 1
|
|
266
269
|
else:
|
|
267
|
-
|
|
270
|
+
all_results.append(
|
|
268
271
|
{
|
|
269
272
|
"seed": clean_nodes[0],
|
|
270
273
|
"target": clean_nodes[1] if len(clean_nodes) > 1 else None,
|
|
@@ -274,18 +277,18 @@ class SubgraphExtractor:
|
|
|
274
277
|
seeds_found.update(clean_nodes)
|
|
275
278
|
self.stats["found"] += 1
|
|
276
279
|
|
|
277
|
-
self.exporter.save_results(
|
|
280
|
+
self.exporter.save_results(all_results, self.uri_manager)
|
|
278
281
|
self.exporter.save_graph(self.extractor_strategy.graph)
|
|
279
282
|
|
|
280
283
|
logger.info("Computing final graph statistics...")
|
|
281
284
|
|
|
282
285
|
self.print_final_summary(
|
|
283
|
-
|
|
286
|
+
all_results, self.extractor_strategy.graph, "Extraction summary"
|
|
284
287
|
)
|
|
285
288
|
|
|
286
|
-
return
|
|
289
|
+
return all_results, seeds_found
|
|
287
290
|
|
|
288
|
-
def densify_graph(self,
|
|
291
|
+
def densify_graph(self, all_results: list[ExtractionResult], seeds_found: set[str]):
|
|
289
292
|
explored_nodes = self.extractor_strategy.explored_nodes
|
|
290
293
|
|
|
291
294
|
graph_connector: GraphConnector = GraphConnector(
|
|
@@ -296,13 +299,13 @@ class SubgraphExtractor:
|
|
|
296
299
|
self.ui,
|
|
297
300
|
self.cfg,
|
|
298
301
|
)
|
|
299
|
-
nb_components = get_connected_components(
|
|
302
|
+
nb_components = get_connected_components(all_results)
|
|
300
303
|
if len(nb_components) > 1:
|
|
301
304
|
logger.warning(
|
|
302
305
|
f"The extracted graph has {len(nb_components)} disconnected components. Starting densification to connect them...\n"
|
|
303
306
|
)
|
|
304
307
|
|
|
305
|
-
densified_paths = graph_connector.connect(
|
|
308
|
+
densified_paths = graph_connector.connect(all_results, seeds_found)
|
|
306
309
|
|
|
307
310
|
self.save(
|
|
308
311
|
densified_paths, graph_connector.bfs.graph, name_suffix="_densified"
|
|
@@ -321,7 +324,7 @@ class SubgraphExtractor:
|
|
|
321
324
|
|
|
322
325
|
def print_final_summary(
|
|
323
326
|
self,
|
|
324
|
-
paths: list[
|
|
327
|
+
paths: list[ExtractionResult],
|
|
325
328
|
graph: nx.MultiGraph,
|
|
326
329
|
table_title: str,
|
|
327
330
|
name_suffix: str = "",
|
|
@@ -333,7 +336,7 @@ class SubgraphExtractor:
|
|
|
333
336
|
|
|
334
337
|
def save(
|
|
335
338
|
self,
|
|
336
|
-
paths: list[
|
|
339
|
+
paths: list[ExtractionResult],
|
|
337
340
|
graph: nx.MultiGraph,
|
|
338
341
|
name_suffix: str = "",
|
|
339
342
|
) -> None:
|
|
@@ -391,13 +394,13 @@ class SubgraphExtractor:
|
|
|
391
394
|
seeds = [list(pair) for pair in itertools.combinations(unique_seeds, 2)]
|
|
392
395
|
logger.info(f"Created {len(seeds)} pairs to explore.\n")
|
|
393
396
|
|
|
394
|
-
|
|
397
|
+
all_results, seeds_found = self.extract_subgraph(seeds)
|
|
395
398
|
|
|
396
|
-
if not
|
|
399
|
+
if not all_results:
|
|
397
400
|
logger.error("No paths were extracted. Exiting without saving.")
|
|
398
401
|
return
|
|
399
402
|
|
|
400
403
|
if self.skip_densification:
|
|
401
404
|
return
|
|
402
405
|
|
|
403
|
-
self.densify_graph(
|
|
406
|
+
self.densify_graph(all_results, seeds_found)
|
{graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/densification/GraphConnector.py
RENAMED
|
@@ -8,6 +8,7 @@ from graph_seeder.utils.URIManager import URIManager
|
|
|
8
8
|
from graph_seeder.utils.utils import get_connected_components
|
|
9
9
|
from graph_seeder.utils.Factory import ComponentFactory
|
|
10
10
|
from graph_seeder.wrapper.NeighborhoodWrapper import NeighborhoodWrapper
|
|
11
|
+
from graph_seeder.models.RDFNode import ExtractionResult
|
|
11
12
|
|
|
12
13
|
logger = logging.getLogger("subgraph")
|
|
13
14
|
|
|
@@ -50,20 +51,20 @@ class GraphConnector:
|
|
|
50
51
|
|
|
51
52
|
def connect(
|
|
52
53
|
self,
|
|
53
|
-
|
|
54
|
+
initial_results: list[ExtractionResult],
|
|
54
55
|
found_seeds: set[str],
|
|
55
|
-
) -> list[
|
|
56
|
+
) -> list[ExtractionResult]:
|
|
56
57
|
|
|
57
|
-
|
|
58
|
+
current_results: list[ExtractionResult] = list(initial_results)
|
|
58
59
|
failed_component_pairs = set()
|
|
59
60
|
|
|
60
|
-
initial_triplets_count = sum(len(p.get("triples", [])) for p in
|
|
61
|
+
initial_triplets_count = sum(len(p.get("triples", [])) for p in current_results)
|
|
61
62
|
|
|
62
63
|
with self.ui.create_progress_bar() as progress:
|
|
63
64
|
task = progress.add_task("Components densification", total=None)
|
|
64
65
|
|
|
65
66
|
while True:
|
|
66
|
-
components = get_connected_components(
|
|
67
|
+
components = get_connected_components(current_results)
|
|
67
68
|
|
|
68
69
|
if len(components) <= 1:
|
|
69
70
|
current_completed = progress.tasks[0].completed
|
|
@@ -110,23 +111,19 @@ class GraphConnector:
|
|
|
110
111
|
)
|
|
111
112
|
|
|
112
113
|
if path_triplets:
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
}
|
|
118
|
-
current_paths.append(new_path)
|
|
114
|
+
new_result = ExtractionResult(
|
|
115
|
+
seed=source, target=target, triples=path_triplets
|
|
116
|
+
)
|
|
117
|
+
current_results.append(new_result)
|
|
119
118
|
|
|
120
119
|
for s, p, o in path_triplets:
|
|
121
|
-
self.graph.add_edge(s, o)
|
|
120
|
+
self.graph.add_edge(s.value, o.value)
|
|
122
121
|
else:
|
|
123
|
-
failed_component_pairs.add(
|
|
124
|
-
tuple(sorted([tuple(seeds_a), tuple(seeds_b)]))
|
|
125
|
-
)
|
|
122
|
+
failed_component_pairs.add(pair_id)
|
|
126
123
|
|
|
127
|
-
final_triplets_count = sum(len(p.get("triples", [])) for p in
|
|
124
|
+
final_triplets_count = sum(len(p.get("triples", [])) for p in current_results)
|
|
128
125
|
logger.info(
|
|
129
126
|
f"Found {final_triplets_count - initial_triplets_count} new triplets during densification."
|
|
130
127
|
)
|
|
131
128
|
|
|
132
|
-
return
|
|
129
|
+
return current_results
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import logging
|
|
2
|
+
from graph_seeder.models.RDFNode import RDFNode
|
|
2
3
|
from graph_seeder.utils.URIManager import URIManager
|
|
3
4
|
from graph_seeder.wrapper.NeighborhoodWrapper import NeighborhoodWrapper
|
|
4
5
|
from graph_seeder.extraction.ExtractionStrategy import ExtractionStrategy
|
|
@@ -46,7 +47,9 @@ class BidirectionalBFS(ExtractionStrategy):
|
|
|
46
47
|
) -> str:
|
|
47
48
|
return f"Extracting path for {self.format_progress_description(nodes)}"
|
|
48
49
|
|
|
49
|
-
def extract(
|
|
50
|
+
def extract(
|
|
51
|
+
self, nodes: list[str]
|
|
52
|
+
) -> tuple[list[tuple[RDFNode, RDFNode, RDFNode]], str]:
|
|
50
53
|
"""Extract a subgraph connecting the given seed nodes using bidirectional BFS.
|
|
51
54
|
Returns:
|
|
52
55
|
Tuple of (list of path triplets, result message)"""
|
|
@@ -60,7 +63,7 @@ class BidirectionalBFS(ExtractionStrategy):
|
|
|
60
63
|
|
|
61
64
|
def _find_path(
|
|
62
65
|
self, source: str, target: str
|
|
63
|
-
) -> tuple[list[tuple[
|
|
66
|
+
) -> tuple[list[tuple[RDFNode, RDFNode, RDFNode]], str]:
|
|
64
67
|
"""Find a path between two nodes within the configured hop limit.
|
|
65
68
|
Returns:
|
|
66
69
|
Tuple of (path_triplets, result message)
|
|
@@ -160,19 +163,28 @@ class BidirectionalBFS(ExtractionStrategy):
|
|
|
160
163
|
self.graph.add_node(node)
|
|
161
164
|
|
|
162
165
|
for triplets in self.wrapper.get_neighborhood(nodes_to_query):
|
|
163
|
-
for
|
|
166
|
+
for subj_node, pred_node, obj_node in triplets:
|
|
167
|
+
subj_val = subj_node.value
|
|
168
|
+
obj_val = obj_node.value
|
|
169
|
+
pred_val = pred_node.value
|
|
170
|
+
|
|
164
171
|
if (
|
|
165
|
-
|
|
166
|
-
or
|
|
167
|
-
or
|
|
172
|
+
subj_val in self._excluded_nodes
|
|
173
|
+
or obj_val in self._excluded_nodes
|
|
174
|
+
or pred_val in self._excluded_properties
|
|
168
175
|
):
|
|
169
176
|
continue
|
|
170
177
|
|
|
171
178
|
self.graph.add_edge(
|
|
172
|
-
|
|
179
|
+
subj_val,
|
|
180
|
+
obj_val,
|
|
181
|
+
predicate=pred_val,
|
|
182
|
+
subj_node=subj_node,
|
|
183
|
+
pred_node=pred_node,
|
|
184
|
+
obj_node=obj_node,
|
|
173
185
|
)
|
|
174
186
|
|
|
175
|
-
for n in (
|
|
187
|
+
for n in (subj_val, obj_val):
|
|
176
188
|
if n not in nodes_visited:
|
|
177
189
|
next_level.add(n)
|
|
178
190
|
|
|
@@ -184,19 +196,23 @@ class BidirectionalBFS(ExtractionStrategy):
|
|
|
184
196
|
|
|
185
197
|
def _extract_path_triplets(
|
|
186
198
|
self, source: str, target: str
|
|
187
|
-
) -> list[tuple[
|
|
199
|
+
) -> list[tuple[RDFNode, RDFNode, RDFNode]]:
|
|
188
200
|
"""Build a triple sequence for the shortest path currently in the graph."""
|
|
189
201
|
path_nodes: list[str] = nx.shortest_path(
|
|
190
202
|
self.graph, source=source, target=target
|
|
191
203
|
)
|
|
192
|
-
triplets: list[tuple[
|
|
204
|
+
triplets: list[tuple[RDFNode, RDFNode, RDFNode]] = []
|
|
193
205
|
|
|
194
206
|
for u, v in zip(path_nodes, path_nodes[1:]):
|
|
195
207
|
edges = self.graph[u][v]
|
|
196
208
|
edge_data = edges[next(iter(edges))]
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
209
|
+
|
|
210
|
+
subj_node = edge_data.get("subj_node", RDFNode(u, "uri"))
|
|
211
|
+
pred_node = edge_data.get(
|
|
212
|
+
"pred_node", RDFNode(edge_data.get("predicate", "unknown"), "uri")
|
|
213
|
+
)
|
|
214
|
+
obj_node = edge_data.get("obj_node", RDFNode(v, "uri"))
|
|
215
|
+
|
|
216
|
+
triplets.append((subj_node, pred_node, obj_node))
|
|
201
217
|
|
|
202
218
|
return triplets
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
from abc import ABC, abstractmethod
|
|
2
2
|
from networkx import MultiGraph
|
|
3
|
+
from graph_seeder.models.RDFNode import RDFNode
|
|
3
4
|
from graph_seeder.utils.URIManager import URIManager
|
|
4
5
|
from graph_seeder.wrapper.NeighborhoodWrapper import NeighborhoodWrapper
|
|
5
6
|
from rich.progress import (
|
|
@@ -28,7 +29,9 @@ class ExtractionStrategy(ABC):
|
|
|
28
29
|
self.explored_nodes = set() if explored_nodes is None else explored_nodes
|
|
29
30
|
|
|
30
31
|
@abstractmethod
|
|
31
|
-
def extract(
|
|
32
|
+
def extract(
|
|
33
|
+
self, nodes: list[str]
|
|
34
|
+
) -> tuple[list[tuple[RDFNode, RDFNode, RDFNode]], str]:
|
|
32
35
|
"""Extract a subgraph given a list of seed nodes.
|
|
33
36
|
Returns:
|
|
34
37
|
Tuple of (list of triplets, result message)"""
|
|
@@ -46,7 +49,7 @@ class ExtractionStrategy(ABC):
|
|
|
46
49
|
|
|
47
50
|
def execute_task(
|
|
48
51
|
self, nodes: list[str], progress: Progress, task: TaskID
|
|
49
|
-
) -> list[tuple[
|
|
52
|
+
) -> list[tuple[RDFNode, RDFNode, RDFNode]]:
|
|
50
53
|
"""Execute the extraction task with progress bar updates and error handling."""
|
|
51
54
|
task_description = self.format_progress_description(nodes)
|
|
52
55
|
start_message = self.format_start_message(nodes)
|
{graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/extraction/Hop/HopExpansion.py
RENAMED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from graph_seeder.extraction.ExtractionStrategy import ExtractionStrategy
|
|
3
3
|
|
|
4
|
+
from graph_seeder.models.RDFNode import RDFNode
|
|
4
5
|
from graph_seeder.utils.URIManager import URIManager
|
|
5
6
|
from graph_seeder.wrapper.NeighborhoodWrapper import NeighborhoodWrapper
|
|
6
7
|
|
|
@@ -34,7 +35,9 @@ class HopExpansion(ExtractionStrategy):
|
|
|
34
35
|
) -> str:
|
|
35
36
|
return f"Expanding {self.max_hops} hops for {self.format_progress_description(nodes)}"
|
|
36
37
|
|
|
37
|
-
def extract(
|
|
38
|
+
def extract(
|
|
39
|
+
self, nodes: list[str]
|
|
40
|
+
) -> tuple[list[tuple[RDFNode, RDFNode, RDFNode]], str]:
|
|
38
41
|
"""
|
|
39
42
|
Extract a subgraph by expanding from the given seed nodes up to max_hops.
|
|
40
43
|
Returns:
|
|
@@ -50,7 +53,9 @@ class HopExpansion(ExtractionStrategy):
|
|
|
50
53
|
f"[green]✓[/] Extracted {len(triplets)} triplets within {self.max_hops} hops.",
|
|
51
54
|
)
|
|
52
55
|
|
|
53
|
-
def _expand(
|
|
56
|
+
def _expand(
|
|
57
|
+
self, nodes: list[str]
|
|
58
|
+
) -> list[tuple[RDFNode, RDFNode, RDFNode]] | None:
|
|
54
59
|
"""
|
|
55
60
|
Expand a list of nodes radially up to max_hops.
|
|
56
61
|
Returns a list of all discovered unique triplets.
|
|
@@ -62,7 +67,7 @@ class HopExpansion(ExtractionStrategy):
|
|
|
62
67
|
visited_nodes: set[str] = set(valid_nodes)
|
|
63
68
|
current_level_nodes: set[str] = set(valid_nodes)
|
|
64
69
|
self.graph.add_nodes_from(valid_nodes)
|
|
65
|
-
all_triplets: set[tuple[
|
|
70
|
+
all_triplets: set[tuple[RDFNode, RDFNode, RDFNode]] = set()
|
|
66
71
|
|
|
67
72
|
for hop in range(self.max_hops):
|
|
68
73
|
logger.info(
|
|
@@ -76,11 +81,14 @@ class HopExpansion(ExtractionStrategy):
|
|
|
76
81
|
if node in self.explored_nodes:
|
|
77
82
|
if node in self.graph:
|
|
78
83
|
for u, v, data in self.graph.edges(node, data=True):
|
|
79
|
-
pred = data.get("key")
|
|
80
84
|
neighbor = v if u == node else u
|
|
81
85
|
|
|
82
86
|
if neighbor not in self._excluded_nodes:
|
|
83
|
-
|
|
87
|
+
subj_node = data.get("subj_node")
|
|
88
|
+
pred_node = data.get("pred_node")
|
|
89
|
+
obj_node = data.get("obj_node")
|
|
90
|
+
|
|
91
|
+
all_triplets.add((subj_node, pred_node, obj_node))
|
|
84
92
|
if neighbor not in visited_nodes:
|
|
85
93
|
next_level_nodes.add(neighbor)
|
|
86
94
|
else:
|
|
@@ -88,23 +96,33 @@ class HopExpansion(ExtractionStrategy):
|
|
|
88
96
|
|
|
89
97
|
if nodes_to_query:
|
|
90
98
|
for triplets in self.wrapper.get_neighborhood(nodes_to_query):
|
|
91
|
-
for
|
|
99
|
+
for subj_node, pred_node, obj_node in triplets:
|
|
100
|
+
subj_val = subj_node.value
|
|
101
|
+
obj_val = obj_node.value
|
|
102
|
+
pred_val = pred_node.value
|
|
103
|
+
|
|
92
104
|
if (
|
|
93
|
-
|
|
94
|
-
or
|
|
95
|
-
or
|
|
105
|
+
subj_val in self._excluded_nodes
|
|
106
|
+
or obj_val in self._excluded_nodes
|
|
107
|
+
or pred_val in self._excluded_properties
|
|
96
108
|
):
|
|
97
109
|
continue
|
|
110
|
+
|
|
98
111
|
self.graph.add_edge(
|
|
99
|
-
|
|
112
|
+
subj_val,
|
|
113
|
+
obj_val,
|
|
114
|
+
key=pred_val,
|
|
115
|
+
subj_node=subj_node,
|
|
116
|
+
pred_node=pred_node,
|
|
117
|
+
obj_node=obj_node,
|
|
100
118
|
)
|
|
101
119
|
|
|
102
|
-
all_triplets.add((
|
|
120
|
+
all_triplets.add((subj_node, pred_node, obj_node))
|
|
103
121
|
|
|
104
|
-
if
|
|
105
|
-
next_level_nodes.add(
|
|
106
|
-
if
|
|
107
|
-
next_level_nodes.add(
|
|
122
|
+
if subj_val not in visited_nodes:
|
|
123
|
+
next_level_nodes.add(subj_val)
|
|
124
|
+
if obj_val not in visited_nodes:
|
|
125
|
+
next_level_nodes.add(obj_val)
|
|
108
126
|
|
|
109
127
|
self.explored_nodes.update(nodes_to_query)
|
|
110
128
|
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from typing import Optional, TypedDict
|
|
3
|
+
from rdflib import URIRef, Literal
|
|
4
|
+
from graph_seeder.utils.URIManager import URIManager
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass(frozen=True)
|
|
8
|
+
class RDFNode:
|
|
9
|
+
"""Represents a node in an RDF graph, which can be a URI or a literal value."""
|
|
10
|
+
|
|
11
|
+
value: str
|
|
12
|
+
node_type: str
|
|
13
|
+
datatype: Optional[str] = None
|
|
14
|
+
language: Optional[str] = None
|
|
15
|
+
|
|
16
|
+
@property
|
|
17
|
+
def is_uri(self) -> bool:
|
|
18
|
+
return self.node_type == "uri"
|
|
19
|
+
|
|
20
|
+
@property
|
|
21
|
+
def is_literal(self) -> bool:
|
|
22
|
+
return self.node_type in ("literal", "typed-literal")
|
|
23
|
+
|
|
24
|
+
def __str__(self) -> str:
|
|
25
|
+
return self.value
|
|
26
|
+
|
|
27
|
+
def to_rdflib(self, uri_manager: URIManager):
|
|
28
|
+
"""Convert internal string representation to proper rdflib Nodes."""
|
|
29
|
+
if self.is_uri:
|
|
30
|
+
if ":" in self.value and not self.value.startswith("http"):
|
|
31
|
+
prefix, local_name = self.value.split(":", 1)
|
|
32
|
+
if prefix in uri_manager.namespaces:
|
|
33
|
+
return URIRef(f"{uri_manager.namespaces[prefix]}{local_name}")
|
|
34
|
+
return URIRef(self.value)
|
|
35
|
+
|
|
36
|
+
else:
|
|
37
|
+
return Literal(self.value, lang=self.language, datatype=self.datatype)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class ExtractionResult(TypedDict):
|
|
41
|
+
"""Type hinting for dictionaries of extracted paths."""
|
|
42
|
+
|
|
43
|
+
seed: str
|
|
44
|
+
target: Optional[str]
|
|
45
|
+
triples: list[tuple[RDFNode, RDFNode, RDFNode]]
|
|
@@ -3,7 +3,8 @@ import logging
|
|
|
3
3
|
import pickle
|
|
4
4
|
from pathlib import Path
|
|
5
5
|
import networkx as nx
|
|
6
|
-
from rdflib import Graph, Namespace
|
|
6
|
+
from rdflib import Graph, Namespace
|
|
7
|
+
from graph_seeder.models.RDFNode import ExtractionResult
|
|
7
8
|
from graph_seeder.utils.URIManager import URIManager
|
|
8
9
|
|
|
9
10
|
logger = logging.getLogger("subgraph")
|
|
@@ -24,7 +25,7 @@ class GraphExporter:
|
|
|
24
25
|
|
|
25
26
|
def save_results(
|
|
26
27
|
self,
|
|
27
|
-
|
|
28
|
+
extraction_results: list[ExtractionResult],
|
|
28
29
|
uri_manager: URIManager,
|
|
29
30
|
name_suffix: str = "",
|
|
30
31
|
) -> None:
|
|
@@ -35,45 +36,54 @@ class GraphExporter:
|
|
|
35
36
|
fmt = self.output_format
|
|
36
37
|
|
|
37
38
|
if fmt == "json":
|
|
38
|
-
|
|
39
|
-
for
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
"
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
39
|
+
results_list = []
|
|
40
|
+
for result_data in extraction_results:
|
|
41
|
+
extracted_item = {"seed": uri_manager.compress_uri(result_data["seed"])}
|
|
42
|
+
|
|
43
|
+
if result_data.get("target"):
|
|
44
|
+
extracted_item["target"] = uri_manager.compress_uri(
|
|
45
|
+
result_data["target"]
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
extracted_item["triples"] = [
|
|
49
|
+
[
|
|
50
|
+
uri_manager.compress_uri(s.value),
|
|
51
|
+
uri_manager.compress_uri(p.value),
|
|
52
|
+
uri_manager.compress_uri(o.value) if o.is_uri else o.value,
|
|
53
|
+
]
|
|
54
|
+
for s, p, o in result_data.get("triples", [])
|
|
55
|
+
]
|
|
56
|
+
|
|
57
|
+
results_list.append(extracted_item)
|
|
58
|
+
|
|
59
|
+
final_json = {"@context": uri_manager.namespaces, "results": results_list}
|
|
56
60
|
|
|
57
61
|
with open(path, "w", encoding="utf-8") as f:
|
|
58
62
|
json.dump(final_json, f, indent=2)
|
|
59
63
|
|
|
60
64
|
elif fmt == "ttl":
|
|
61
65
|
rdf = Graph()
|
|
62
|
-
|
|
63
|
-
for prefix, uri in uri_manager.namespaces.items():
|
|
64
|
-
rdf.bind(prefix, Namespace(uri))
|
|
65
|
-
|
|
66
66
|
added_triples = set()
|
|
67
67
|
|
|
68
|
-
for
|
|
69
|
-
for s, p, o in
|
|
70
|
-
if (s, p, o) not in added_triples:
|
|
71
|
-
|
|
72
|
-
|
|
68
|
+
for result_data in extraction_results:
|
|
69
|
+
for s, p, o in result_data.get("triples", []):
|
|
70
|
+
if (s.value, p.value, o.value) not in added_triples:
|
|
71
|
+
uri_manager.compress_uri(s.value)
|
|
72
|
+
uri_manager.compress_uri(p.value)
|
|
73
|
+
if o.is_uri:
|
|
74
|
+
uri_manager.compress_uri(o.value)
|
|
75
|
+
|
|
76
|
+
rdf.add(
|
|
77
|
+
(
|
|
78
|
+
s.to_rdflib(uri_manager),
|
|
79
|
+
p.to_rdflib(uri_manager),
|
|
80
|
+
o.to_rdflib(uri_manager),
|
|
81
|
+
)
|
|
73
82
|
)
|
|
83
|
+
added_triples.add((s.value, p.value, o.value))
|
|
74
84
|
|
|
75
|
-
|
|
76
|
-
|
|
85
|
+
for prefix, uri in uri_manager.namespaces.items():
|
|
86
|
+
rdf.bind(prefix, Namespace(uri))
|
|
77
87
|
|
|
78
88
|
rdf.serialize(destination=str(path), format="turtle")
|
|
79
89
|
|
{graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/utils/GraphStatistics.py
RENAMED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import statistics
|
|
2
|
+
from graph_seeder.models.RDFNode import ExtractionResult
|
|
2
3
|
from graph_seeder.utils.utils import get_connected_components
|
|
3
4
|
|
|
4
5
|
|
|
@@ -6,23 +7,23 @@ class GraphStatistics:
|
|
|
6
7
|
"""Utility class for computing statistics on a graph."""
|
|
7
8
|
|
|
8
9
|
@staticmethod
|
|
9
|
-
def compute(
|
|
10
|
-
"""Compute statistics on the graph given the extracted
|
|
10
|
+
def compute(results: list[ExtractionResult]) -> dict:
|
|
11
|
+
"""Compute statistics on the graph given the extracted results."""
|
|
11
12
|
subjects = set()
|
|
12
13
|
predicates = set()
|
|
13
14
|
objects = set()
|
|
14
15
|
triplets = []
|
|
15
16
|
|
|
16
|
-
for path_data in
|
|
17
|
+
for path_data in results:
|
|
17
18
|
for s, p, o in path_data.get("triples", []):
|
|
18
|
-
triplets.append((s, p, o))
|
|
19
|
-
subjects.add(s)
|
|
20
|
-
predicates.add(p)
|
|
21
|
-
objects.add(o)
|
|
19
|
+
triplets.append((s.value, p.value, o.value))
|
|
20
|
+
subjects.add(s.value)
|
|
21
|
+
predicates.add(p.value)
|
|
22
|
+
objects.add(o.value)
|
|
22
23
|
|
|
23
24
|
unique_entities = len(subjects | objects)
|
|
24
25
|
|
|
25
|
-
components = get_connected_components(
|
|
26
|
+
components = get_connected_components(results)
|
|
26
27
|
|
|
27
28
|
comp_sizes = [len(comp) for comp in components]
|
|
28
29
|
mean_size = statistics.mean(comp_sizes) if comp_sizes else 0
|
|
@@ -2,6 +2,7 @@ import networkx as nx
|
|
|
2
2
|
import json
|
|
3
3
|
from importlib import resources
|
|
4
4
|
from pathlib import Path
|
|
5
|
+
from graph_seeder.models.RDFNode import ExtractionResult, RDFNode
|
|
5
6
|
|
|
6
7
|
BUILTIN_CONFIGS = [
|
|
7
8
|
"dbpedia_default",
|
|
@@ -285,9 +286,20 @@ def generate_config_template(output_path: str = "config_template.json") -> None:
|
|
|
285
286
|
print(f"Configuration template successfully generated at: {path}")
|
|
286
287
|
|
|
287
288
|
|
|
288
|
-
def get_connected_components(paths: list[
|
|
289
|
+
def get_connected_components(paths: list[ExtractionResult]) -> list[set[str]]:
|
|
289
290
|
"""Get the connected components from a list of structured paths."""
|
|
290
291
|
graph: nx.Graph = nx.Graph()
|
|
291
292
|
for path_data in paths:
|
|
292
|
-
graph.add_edges_from(
|
|
293
|
+
graph.add_edges_from(
|
|
294
|
+
(s.value, o.value) for s, p, o in path_data.get("triples", [])
|
|
295
|
+
)
|
|
293
296
|
return list(nx.connected_components(graph))
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
def parse_node(binding: dict) -> RDFNode:
|
|
300
|
+
return RDFNode(
|
|
301
|
+
value=binding["value"],
|
|
302
|
+
node_type=binding["type"],
|
|
303
|
+
datatype=binding.get("datatype"),
|
|
304
|
+
language=binding.get("xml:lang"),
|
|
305
|
+
)
|
{graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/wrapper/NeighborhoodWrapper.py
RENAMED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
from abc import ABC, abstractmethod
|
|
2
2
|
from collections.abc import Generator
|
|
3
|
+
from graph_seeder.models.RDFNode import RDFNode
|
|
3
4
|
from graph_seeder.utils.URIManager import URIManager
|
|
4
5
|
|
|
5
6
|
|
|
@@ -40,7 +41,7 @@ class NeighborhoodWrapper(ABC):
|
|
|
40
41
|
@abstractmethod
|
|
41
42
|
def get_neighborhood(
|
|
42
43
|
self, nodes: list[str]
|
|
43
|
-
) -> Generator[list[tuple[
|
|
44
|
+
) -> Generator[list[tuple[RDFNode, RDFNode, RDFNode]], None, None]:
|
|
44
45
|
"""
|
|
45
46
|
Yields the neighborhood of a list of nodes in batches.
|
|
46
47
|
Allows the consumer to break the loop to stop early.
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from collections.abc import Generator
|
|
2
2
|
import lmdb
|
|
3
3
|
import logging
|
|
4
|
+
from graph_seeder.models.RDFNode import RDFNode
|
|
4
5
|
from graph_seeder.wrapper.NeighborhoodWrapper import NeighborhoodWrapper
|
|
5
6
|
from graph_seeder.utils.URIManager import URIManager
|
|
6
7
|
import json
|
|
@@ -48,8 +49,8 @@ class HashMapWrapper(NeighborhoodWrapper):
|
|
|
48
49
|
|
|
49
50
|
def get_neighborhood(
|
|
50
51
|
self, nodes: list[str]
|
|
51
|
-
) -> Generator[list[tuple[
|
|
52
|
-
triplets: list[tuple[
|
|
52
|
+
) -> Generator[list[tuple[RDFNode, RDFNode, RDFNode]], None, None]:
|
|
53
|
+
triplets: list[tuple[RDFNode, RDFNode, RDFNode]] = []
|
|
53
54
|
skipped_nodes: set[str] = set()
|
|
54
55
|
|
|
55
56
|
with self.env.begin() as txn:
|
|
@@ -109,10 +110,21 @@ class HashMapWrapper(NeighborhoodWrapper):
|
|
|
109
110
|
):
|
|
110
111
|
continue
|
|
111
112
|
|
|
113
|
+
subj_node = RDFNode(original_node, "uri")
|
|
114
|
+
pred_node = RDFNode(clean_prop, "uri")
|
|
115
|
+
|
|
116
|
+
is_uri = neighbor.startswith("http") or (
|
|
117
|
+
":" in neighbor and " " not in neighbor
|
|
118
|
+
)
|
|
119
|
+
obj_type = "uri" if is_uri else "literal"
|
|
120
|
+
obj_node = RDFNode(neighbor, obj_type)
|
|
121
|
+
|
|
112
122
|
if is_inverse:
|
|
113
|
-
|
|
123
|
+
inv_subj = RDFNode(neighbor, "uri")
|
|
124
|
+
inv_obj = RDFNode(original_node, "uri")
|
|
125
|
+
triplets.append((inv_subj, pred_node, inv_obj))
|
|
114
126
|
else:
|
|
115
|
-
triplets.append((
|
|
127
|
+
triplets.append((subj_node, pred_node, obj_node))
|
|
116
128
|
|
|
117
129
|
if triplets:
|
|
118
130
|
yield triplets
|
{graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/wrapper/sparql/GraphWrapper.py
RENAMED
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
from collections.abc import Generator
|
|
2
|
-
|
|
3
|
-
from graph_seeder.wrapper.NeighborhoodWrapper import NeighborhoodWrapper
|
|
4
2
|
import logging
|
|
3
|
+
from graph_seeder.models.RDFNode import RDFNode
|
|
4
|
+
from graph_seeder.wrapper.NeighborhoodWrapper import NeighborhoodWrapper
|
|
5
5
|
from graph_seeder.wrapper.sparql.SparqlQueryBuilder import SparqlQueryBuilder
|
|
6
|
-
from graph_seeder.utils.URIManager import URIManager
|
|
7
6
|
from graph_seeder.wrapper.sparql.BaseClient import BaseClient
|
|
7
|
+
from graph_seeder.utils.URIManager import URIManager
|
|
8
|
+
from graph_seeder.utils.utils import parse_node
|
|
8
9
|
|
|
9
10
|
logger = logging.getLogger("subgraph")
|
|
10
11
|
|
|
@@ -135,7 +136,7 @@ class GraphWrapper(NeighborhoodWrapper):
|
|
|
135
136
|
|
|
136
137
|
def get_neighborhood(
|
|
137
138
|
self, nodes: list[str]
|
|
138
|
-
) -> Generator[list[tuple[
|
|
139
|
+
) -> Generator[list[tuple[RDFNode, RDFNode, RDFNode]], None, None]:
|
|
139
140
|
"""Fetch one-hop neighbors using property occurrence to decide strategy."""
|
|
140
141
|
if not nodes:
|
|
141
142
|
return None
|
|
@@ -178,9 +179,9 @@ class GraphWrapper(NeighborhoodWrapper):
|
|
|
178
179
|
):
|
|
179
180
|
yield [
|
|
180
181
|
(
|
|
181
|
-
r["subject"]
|
|
182
|
-
r["property"]
|
|
183
|
-
r["object"]
|
|
182
|
+
parse_node(r["subject"]),
|
|
183
|
+
parse_node(r["property"]),
|
|
184
|
+
parse_node(r["object"]),
|
|
184
185
|
)
|
|
185
186
|
for r in raw_rows
|
|
186
187
|
]
|
|
@@ -192,9 +193,9 @@ class GraphWrapper(NeighborhoodWrapper):
|
|
|
192
193
|
):
|
|
193
194
|
yield [
|
|
194
195
|
(
|
|
195
|
-
r["subject"]
|
|
196
|
-
r["property"]
|
|
197
|
-
r["object"]
|
|
196
|
+
parse_node(r["subject"]),
|
|
197
|
+
parse_node(r["property"]),
|
|
198
|
+
parse_node(r["object"]),
|
|
198
199
|
)
|
|
199
200
|
for r in raw_rows
|
|
200
201
|
]
|
|
@@ -212,16 +213,17 @@ class GraphWrapper(NeighborhoodWrapper):
|
|
|
212
213
|
for raw_rows in self._execute_with_dichotomy(
|
|
213
214
|
nodes, self.query_builder.build_prop_occurrence_query
|
|
214
215
|
):
|
|
215
|
-
|
|
216
|
-
|
|
216
|
+
for r in raw_rows:
|
|
217
|
+
if not r or "entity" not in r:
|
|
218
|
+
continue
|
|
219
|
+
|
|
220
|
+
all_stats.append(
|
|
217
221
|
(
|
|
218
222
|
r["entity"]["value"],
|
|
219
223
|
r["property"]["value"],
|
|
220
224
|
int(r["count"]["value"]),
|
|
221
225
|
)
|
|
222
|
-
|
|
223
|
-
]
|
|
224
|
-
)
|
|
226
|
+
)
|
|
225
227
|
return all_stats
|
|
226
228
|
|
|
227
229
|
def _chunk_hub_properties(
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
import rdflib
|
|
3
3
|
from graph_seeder.wrapper.sparql.BaseClient import BaseClient
|
|
4
|
+
import json
|
|
4
5
|
|
|
5
6
|
logger = logging.getLogger("subgraph")
|
|
6
7
|
|
|
@@ -23,25 +24,19 @@ class TurtleClient(BaseClient):
|
|
|
23
24
|
self.graph.parse(self.file_path, format="turtle")
|
|
24
25
|
logger.info(f"Successfully loaded {len(self.graph)} triples.")
|
|
25
26
|
|
|
26
|
-
self.optimal_batch_size = 500
|
|
27
|
-
|
|
28
27
|
def query(
|
|
29
28
|
self, sparql_query: str, silent: bool = False, retries: int = None
|
|
30
29
|
) -> list[dict]:
|
|
31
30
|
"""Execute the SPARQL query on the local rdflib graph and format the output."""
|
|
32
31
|
try:
|
|
33
32
|
results = self.graph.query(sparql_query)
|
|
34
|
-
bindings = []
|
|
35
33
|
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
binding[str(var)] = {"value": str(val)}
|
|
42
|
-
bindings.append(binding)
|
|
34
|
+
json_bytes = results.serialize(format="json")
|
|
35
|
+
|
|
36
|
+
json_dict = json.loads(json_bytes)
|
|
37
|
+
|
|
38
|
+
return json_dict["results"]["bindings"]
|
|
43
39
|
|
|
44
|
-
return bindings
|
|
45
40
|
except Exception as e:
|
|
46
41
|
logger.error(f"Failed to execute local Turtle query: {e}")
|
|
47
42
|
raise RuntimeError(f"Turtle query failed: {e}") from e
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/configs/dbpedia_default.json
RENAMED
|
File without changes
|
|
File without changes
|
{graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/configs/europeana_default.json
RENAMED
|
File without changes
|
{graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/configs/pgxlod_default.json
RENAMED
|
File without changes
|
{graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/configs/wikidata_default.json
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/wrapper/sparql/BaseClient.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|