graph-seeder 1.0.0.dev2__tar.gz → 1.0.0.dev3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/PKG-INFO +1 -1
- {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/pyproject.toml +1 -1
- {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/SubgraphExtractor.py +31 -29
- {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/configs/dbpedia_default.json +2 -2
- {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/configs/default.json +1 -1
- {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/configs/europeana_default.json +2 -2
- {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/configs/pgxlod_default.json +9 -9
- {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/configs/wikidata_default.json +1 -1
- {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/densification/GraphConnector.py +25 -9
- {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/utils/ConsoleUI.py +1 -1
- {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/utils/GraphExporter.py +46 -36
- graph_seeder-1.0.0.dev3/src/graph_seeder/utils/GraphStatistics.py +41 -0
- {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/utils/utils.py +4 -2
- {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/uv.lock +1 -1
- graph_seeder-1.0.0.dev2/src/graph_seeder/utils/GraphStatistics.py +0 -32
- {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/.github/workflows/publish.yml +0 -0
- {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/.gitignore +0 -0
- {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/README.md +0 -0
- {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/requirements.txt +0 -0
- {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/GraphSeeder.py +0 -0
- {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/extraction/BFS/BFS.py +0 -0
- {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/extraction/ExtractionStrategy.py +0 -0
- {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/extraction/Hop/HopExpansion.py +0 -0
- {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/utils/Factory.py +0 -0
- {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/utils/URIManager.py +0 -0
- {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/wrapper/NeighborhoodWrapper.py +0 -0
- {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/wrapper/hashmap/HashMapWrapper.py +0 -0
- {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/wrapper/sparql/BaseClient.py +0 -0
- {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/wrapper/sparql/GraphWrapper.py +0 -0
- {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/wrapper/sparql/SparqlQueryBuilder.py +0 -0
- {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/wrapper/sparql/client/SparqlClient.py +0 -0
- {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/wrapper/sparql/client/TurtleClient.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: graph-seeder
|
|
3
|
-
Version: 1.0.0.dev2
|
|
3
|
+
Version: 1.0.0.dev3
|
|
4
4
|
Summary: A powerful tool to extract and densify subgraphs from Knowledge Graphs via SPARQL or LMDB, with different extraction strategies.
|
|
5
5
|
Requires-Python: >=3.9
|
|
6
6
|
Requires-Dist: lmdb>=2.2.0
|
|
@@ -7,7 +7,7 @@ packages = ["src/graph_seeder"]
|
|
|
7
7
|
|
|
8
8
|
[project]
|
|
9
9
|
name = "graph-seeder"
|
|
10
|
-
version = "1.0.0.dev2"
|
|
10
|
+
version = "1.0.0.dev3"
|
|
11
11
|
description = "A powerful tool to extract and densify subgraphs from Knowledge Graphs via SPARQL or LMDB, with different extraction strategies."
|
|
12
12
|
readme = "README.md"
|
|
13
13
|
requires-python = ">=3.9"
|
|
@@ -102,9 +102,9 @@ class SubgraphExtractor:
|
|
|
102
102
|
raise ValueError("Data config requires an 'input_path' to a CSV file.")
|
|
103
103
|
|
|
104
104
|
output_format = data_cfg.get("output_format", {})
|
|
105
|
-
if output_format not in ["
|
|
105
|
+
if output_format not in ["json", "ttl"]:
|
|
106
106
|
raise ValueError(
|
|
107
|
-
"Data config 'output_format' must be one of: '
|
|
107
|
+
"Data config 'output_format' must be one of: 'json', 'ttl'."
|
|
108
108
|
)
|
|
109
109
|
|
|
110
110
|
# Validate client config
|
|
@@ -242,10 +242,8 @@ class SubgraphExtractor:
|
|
|
242
242
|
|
|
243
243
|
return []
|
|
244
244
|
|
|
245
|
-
def extract_subgraph(
|
|
246
|
-
|
|
247
|
-
) -> tuple[list[tuple[str, str, str]], set[str]]:
|
|
248
|
-
all_triplets: list[tuple[str, str, str]] = []
|
|
245
|
+
def extract_subgraph(self, seeds: list[str]) -> tuple[list[dict], set[str]]:
|
|
246
|
+
all_paths: list[dict] = []
|
|
249
247
|
seeds_found: set[str] = set()
|
|
250
248
|
|
|
251
249
|
with self.ui.create_progress_bar() as progress:
|
|
@@ -266,24 +264,28 @@ class SubgraphExtractor:
|
|
|
266
264
|
if not triplets:
|
|
267
265
|
self.stats["not_found"] += 1
|
|
268
266
|
else:
|
|
269
|
-
|
|
267
|
+
all_paths.append(
|
|
268
|
+
{
|
|
269
|
+
"seed": clean_nodes[0],
|
|
270
|
+
"target": clean_nodes[1] if len(clean_nodes) > 1 else None,
|
|
271
|
+
"triples": triplets,
|
|
272
|
+
}
|
|
273
|
+
)
|
|
270
274
|
seeds_found.update(clean_nodes)
|
|
271
275
|
self.stats["found"] += 1
|
|
272
276
|
|
|
273
|
-
self.exporter.
|
|
277
|
+
self.exporter.save_results(all_paths, self.uri_manager)
|
|
274
278
|
self.exporter.save_graph(self.extractor_strategy.graph)
|
|
275
279
|
|
|
276
280
|
logger.info("Computing final graph statistics...")
|
|
277
281
|
|
|
278
282
|
self.print_final_summary(
|
|
279
|
-
|
|
283
|
+
all_paths, self.extractor_strategy.graph, "Extraction summary"
|
|
280
284
|
)
|
|
281
285
|
|
|
282
|
-
return
|
|
286
|
+
return all_paths, seeds_found
|
|
283
287
|
|
|
284
|
-
def densify_graph(
|
|
285
|
-
self, triplets: list[tuple[str, str, str]], seeds_found: set[str]
|
|
286
|
-
):
|
|
288
|
+
def densify_graph(self, all_paths: list[dict], seeds_found: set[str]):
|
|
287
289
|
explored_nodes = self.extractor_strategy.explored_nodes
|
|
288
290
|
|
|
289
291
|
graph_connector: GraphConnector = GraphConnector(
|
|
@@ -294,22 +296,20 @@ class SubgraphExtractor:
|
|
|
294
296
|
self.ui,
|
|
295
297
|
self.cfg,
|
|
296
298
|
)
|
|
297
|
-
nb_components = get_connected_components(
|
|
299
|
+
nb_components = get_connected_components(all_paths)
|
|
298
300
|
if len(nb_components) > 1:
|
|
299
301
|
logger.warning(
|
|
300
302
|
f"The extracted graph has {len(nb_components)} disconnected components. Starting densification to connect them...\n"
|
|
301
303
|
)
|
|
302
304
|
|
|
303
|
-
|
|
305
|
+
densified_paths = graph_connector.connect(all_paths, seeds_found)
|
|
304
306
|
|
|
305
|
-
|
|
306
|
-
|
|
307
|
+
self.save(
|
|
308
|
+
densified_paths, graph_connector.bfs.graph, name_suffix="_densified"
|
|
307
309
|
)
|
|
308
310
|
|
|
309
|
-
self.save(new_triplets, graph_connector.bfs.graph, name_suffix="_densified")
|
|
310
|
-
|
|
311
311
|
self.print_final_summary(
|
|
312
|
-
|
|
312
|
+
densified_paths,
|
|
313
313
|
graph_connector.bfs.graph,
|
|
314
314
|
"Densification summary",
|
|
315
315
|
"_densified",
|
|
@@ -321,23 +321,23 @@ class SubgraphExtractor:
|
|
|
321
321
|
|
|
322
322
|
def print_final_summary(
|
|
323
323
|
self,
|
|
324
|
-
|
|
324
|
+
paths: list[dict],
|
|
325
325
|
graph: nx.MultiGraph,
|
|
326
326
|
table_title: str,
|
|
327
327
|
name_suffix: str = "",
|
|
328
328
|
) -> None:
|
|
329
|
-
detailed_stats = GraphStatistics.compute(
|
|
329
|
+
detailed_stats = GraphStatistics.compute(paths)
|
|
330
330
|
self.ui.print_summary(self.stats, detailed_stats, graph, table_title)
|
|
331
331
|
|
|
332
332
|
self.exporter.save_stats(self.stats, detailed_stats, name_suffix)
|
|
333
333
|
|
|
334
334
|
def save(
|
|
335
335
|
self,
|
|
336
|
-
|
|
336
|
+
paths: list[dict],
|
|
337
337
|
graph: nx.MultiGraph,
|
|
338
338
|
name_suffix: str = "",
|
|
339
339
|
) -> None:
|
|
340
|
-
self.exporter.
|
|
340
|
+
self.exporter.save_results(paths, self.uri_manager, name_suffix)
|
|
341
341
|
self.exporter.save_graph(graph, name_suffix)
|
|
342
342
|
|
|
343
343
|
def run(self) -> None:
|
|
@@ -352,7 +352,9 @@ class SubgraphExtractor:
|
|
|
352
352
|
)
|
|
353
353
|
return
|
|
354
354
|
|
|
355
|
-
unique_seeds = list(
|
|
355
|
+
unique_seeds = list(
|
|
356
|
+
dict.fromkeys([str(seed).strip() for row in seeds for seed in row])
|
|
357
|
+
)
|
|
356
358
|
|
|
357
359
|
if self.check_seeds_validity:
|
|
358
360
|
invalid_seeds = self._check_seeds_validity(unique_seeds)
|
|
@@ -389,13 +391,13 @@ class SubgraphExtractor:
|
|
|
389
391
|
seeds = [list(pair) for pair in itertools.combinations(unique_seeds, 2)]
|
|
390
392
|
logger.info(f"Created {len(seeds)} pairs to explore.\n")
|
|
391
393
|
|
|
392
|
-
|
|
394
|
+
all_paths, seeds_found = self.extract_subgraph(seeds)
|
|
393
395
|
|
|
394
|
-
if not
|
|
395
|
-
logger.error("No
|
|
396
|
+
if not all_paths:
|
|
397
|
+
logger.error("No paths were extracted. Exiting without saving.")
|
|
396
398
|
return
|
|
397
399
|
|
|
398
400
|
if self.skip_densification:
|
|
399
401
|
return
|
|
400
402
|
|
|
401
|
-
self.densify_graph(
|
|
403
|
+
self.densify_graph(all_paths, seeds_found)
|
{graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/configs/dbpedia_default.json
RENAMED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"data": {
|
|
3
3
|
"input_path": "seed.csv",
|
|
4
|
-
"output_format": "
|
|
4
|
+
"output_format": "json",
|
|
5
5
|
"output_path": "output/result",
|
|
6
6
|
"stats_output_path": "output/stats.json"
|
|
7
7
|
},
|
|
@@ -40,7 +40,7 @@
|
|
|
40
40
|
"extraction": {
|
|
41
41
|
"strategy": "bfs",
|
|
42
42
|
"create_all_pairs": false,
|
|
43
|
-
"batch_size":
|
|
43
|
+
"batch_size": 30,
|
|
44
44
|
"max_hops": 6,
|
|
45
45
|
"hub_pagination_threshold": 70000,
|
|
46
46
|
"max_neighbors_threshold": 300000,
|
{graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/configs/europeana_default.json
RENAMED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"data": {
|
|
3
3
|
"input_path": "seed.csv",
|
|
4
|
-
"output_format": "
|
|
4
|
+
"output_format": "json",
|
|
5
5
|
"output_path": "output/result",
|
|
6
6
|
"stats_output_path": "output/stats.json"
|
|
7
7
|
},
|
|
@@ -31,7 +31,7 @@
|
|
|
31
31
|
"extraction": {
|
|
32
32
|
"strategy": "bfs",
|
|
33
33
|
"create_all_pairs": false,
|
|
34
|
-
"batch_size":
|
|
34
|
+
"batch_size": 30,
|
|
35
35
|
"max_hops": 6,
|
|
36
36
|
"hub_pagination_threshold": 60000,
|
|
37
37
|
"max_neighbors_threshold": 150000,
|
{graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/configs/pgxlod_default.json
RENAMED
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
{
|
|
2
2
|
"data": {
|
|
3
3
|
"input_path": "seed.csv",
|
|
4
|
-
"output_format": "
|
|
4
|
+
"output_format": "json",
|
|
5
5
|
"output_path": "output/result",
|
|
6
|
-
"stats_output_path": "output/stats.json"
|
|
6
|
+
"stats_output_path": "output/stats.json"
|
|
7
7
|
},
|
|
8
8
|
"client": {
|
|
9
9
|
"type": "SPARQL",
|
|
@@ -19,16 +19,16 @@
|
|
|
19
19
|
"include_uri_prefixes": [],
|
|
20
20
|
"exclude_uri_prefixes": [],
|
|
21
21
|
"exclude_nodes": [],
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
22
|
+
"exclude_properties": [],
|
|
23
|
+
"namespaces": {
|
|
24
|
+
"pharmgkb": "http://bio2rdf.org/pharmgkb",
|
|
25
|
+
"pgxlod": "http://pgxlod.loria.fr/resource/"
|
|
26
|
+
}
|
|
27
27
|
},
|
|
28
28
|
"extraction": {
|
|
29
29
|
"strategy": "bfs",
|
|
30
30
|
"create_all_pairs": false,
|
|
31
|
-
"batch_size":
|
|
31
|
+
"batch_size": 30,
|
|
32
32
|
"max_hops": 6,
|
|
33
33
|
"hub_pagination_threshold": 60000,
|
|
34
34
|
"max_neighbors_threshold": 150000,
|
|
@@ -46,4 +46,4 @@
|
|
|
46
46
|
"debug_enabled": false,
|
|
47
47
|
"request_logging": false
|
|
48
48
|
}
|
|
49
|
-
}
|
|
49
|
+
}
|
{graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/densification/GraphConnector.py
RENAMED
|
@@ -47,16 +47,21 @@ class GraphConnector:
|
|
|
47
47
|
return comp_seeds[0]
|
|
48
48
|
|
|
49
49
|
def connect(
|
|
50
|
-
self,
|
|
51
|
-
|
|
52
|
-
|
|
50
|
+
self,
|
|
51
|
+
initial_paths: list[dict],
|
|
52
|
+
found_seeds: set[str],
|
|
53
|
+
) -> list[dict]:
|
|
54
|
+
|
|
55
|
+
current_paths: list[dict] = list(initial_paths)
|
|
53
56
|
failed_component_pairs = set()
|
|
54
57
|
|
|
58
|
+
initial_triplets_count = sum(len(p.get("triples", [])) for p in current_paths)
|
|
59
|
+
|
|
55
60
|
with self.ui.create_progress_bar() as progress:
|
|
56
61
|
task = progress.add_task("Components densification", total=None)
|
|
57
62
|
|
|
58
63
|
while True:
|
|
59
|
-
components = get_connected_components(
|
|
64
|
+
components = get_connected_components(current_paths)
|
|
60
65
|
|
|
61
66
|
if len(components) <= 1:
|
|
62
67
|
current_completed = progress.tasks[0].completed
|
|
@@ -96,19 +101,30 @@ class GraphConnector:
|
|
|
96
101
|
source = self._pick_representative(list(seeds_a))
|
|
97
102
|
target = self._pick_representative(list(seeds_b))
|
|
98
103
|
|
|
99
|
-
|
|
104
|
+
path_triplets = self.bfs.execute_task(
|
|
100
105
|
[source, target],
|
|
101
106
|
progress,
|
|
102
107
|
task,
|
|
103
108
|
)
|
|
104
109
|
|
|
105
|
-
if
|
|
106
|
-
|
|
107
|
-
|
|
110
|
+
if path_triplets:
|
|
111
|
+
new_path = {
|
|
112
|
+
"seed": source,
|
|
113
|
+
"target": target,
|
|
114
|
+
"triples": path_triplets,
|
|
115
|
+
}
|
|
116
|
+
current_paths.append(new_path)
|
|
117
|
+
|
|
118
|
+
for s, p, o in path_triplets:
|
|
108
119
|
self.graph.add_edge(s, o)
|
|
109
120
|
else:
|
|
110
121
|
failed_component_pairs.add(
|
|
111
122
|
tuple(sorted([tuple(seeds_a), tuple(seeds_b)]))
|
|
112
123
|
)
|
|
113
124
|
|
|
114
|
-
|
|
125
|
+
final_triplets_count = sum(len(p.get("triples", [])) for p in current_paths)
|
|
126
|
+
logger.info(
|
|
127
|
+
f"Found {final_triplets_count - initial_triplets_count} new triplets during densification."
|
|
128
|
+
)
|
|
129
|
+
|
|
130
|
+
return current_paths
|
|
@@ -251,7 +251,7 @@ class ConsoleUI:
|
|
|
251
251
|
|
|
252
252
|
table.add_section()
|
|
253
253
|
table.add_row("Graph nodes", str(len(graph.nodes)))
|
|
254
|
-
table.add_row("
|
|
254
|
+
table.add_row("Unique triples in graph", str(len(graph.edges)))
|
|
255
255
|
|
|
256
256
|
self.console.print(
|
|
257
257
|
Panel(table, title=f"[bold green]{table_title}[/]", expand=False)
|
|
@@ -1,24 +1,18 @@
|
|
|
1
|
-
import csv
|
|
2
1
|
import json
|
|
3
2
|
import logging
|
|
4
3
|
import pickle
|
|
5
4
|
from pathlib import Path
|
|
6
|
-
|
|
7
5
|
import networkx as nx
|
|
8
|
-
from rdflib import Graph, Namespace, URIRef
|
|
6
|
+
from rdflib import Graph, Namespace, URIRef, Literal
|
|
7
|
+
from graph_seeder.utils.URIManager import URIManager
|
|
9
8
|
|
|
10
9
|
logger = logging.getLogger("subgraph")
|
|
11
10
|
|
|
12
11
|
|
|
13
12
|
class GraphExporter:
|
|
14
|
-
"""Export
|
|
13
|
+
"""Export paths and graphs to disk in various formats."""
|
|
15
14
|
|
|
16
15
|
def __init__(self, data_cfg: dict) -> None:
|
|
17
|
-
"""Initialize exporter settings.
|
|
18
|
-
|
|
19
|
-
Args:
|
|
20
|
-
output_format: Target format for triplet export.
|
|
21
|
-
"""
|
|
22
16
|
self.output_format = data_cfg.get("output_format")
|
|
23
17
|
self.output_path = Path(data_cfg.get("output_path", ".")).resolve()
|
|
24
18
|
self.output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
@@ -28,51 +22,67 @@ class GraphExporter:
|
|
|
28
22
|
if self.stats_output_path:
|
|
29
23
|
self.stats_output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
30
24
|
|
|
31
|
-
def
|
|
25
|
+
def save_results(
|
|
32
26
|
self,
|
|
33
|
-
|
|
34
|
-
|
|
27
|
+
extracted_paths: list[dict],
|
|
28
|
+
uri_manager: URIManager,
|
|
35
29
|
name_suffix: str = "",
|
|
36
30
|
) -> None:
|
|
37
|
-
"""Write
|
|
38
|
-
|
|
39
|
-
Args:
|
|
40
|
-
triplets: Sequence of ``(subject, predicate, object)`` identifiers.
|
|
41
|
-
namespaces: Dictionary mapping namespace prefixes to URIs.
|
|
42
|
-
name_suffix: Optional suffix for the output file name (before extension).
|
|
43
|
-
|
|
44
|
-
Raises:
|
|
45
|
-
ValueError: If the configured output format is not supported.
|
|
46
|
-
"""
|
|
31
|
+
"""Write extracted paths to disk using the configured output format."""
|
|
47
32
|
path = self.output_path.with_name(
|
|
48
33
|
f"{self.output_path.stem}{name_suffix}.{self.output_format}"
|
|
49
34
|
)
|
|
50
35
|
fmt = self.output_format
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
36
|
+
|
|
37
|
+
if fmt == "json":
|
|
38
|
+
compressed_paths = []
|
|
39
|
+
for path_data in extracted_paths:
|
|
40
|
+
compressed_paths.append(
|
|
41
|
+
{
|
|
42
|
+
"seed": uri_manager.compress_uri(path_data["seed"]),
|
|
43
|
+
"target": uri_manager.compress_uri(path_data["target"]),
|
|
44
|
+
"triples": [
|
|
45
|
+
[
|
|
46
|
+
uri_manager.compress_uri(s),
|
|
47
|
+
uri_manager.compress_uri(p),
|
|
48
|
+
uri_manager.compress_uri(o),
|
|
49
|
+
]
|
|
50
|
+
for s, p, o in path_data.get("triples", [])
|
|
51
|
+
],
|
|
52
|
+
}
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
final_json = {"@context": uri_manager.namespaces, "paths": compressed_paths}
|
|
56
|
+
|
|
59
57
|
with open(path, "w", encoding="utf-8") as f:
|
|
60
|
-
json.dump(
|
|
58
|
+
json.dump(final_json, f, indent=2)
|
|
61
59
|
|
|
62
60
|
elif fmt == "ttl":
|
|
63
61
|
rdf = Graph()
|
|
64
62
|
|
|
65
|
-
for prefix, uri in namespaces.items():
|
|
63
|
+
for prefix, uri in uri_manager.namespaces.items():
|
|
66
64
|
rdf.bind(prefix, Namespace(uri))
|
|
67
65
|
|
|
68
|
-
|
|
69
|
-
|
|
66
|
+
added_triples = set()
|
|
67
|
+
|
|
68
|
+
for path_data in extracted_paths:
|
|
69
|
+
for s, p, o in path_data.get("triples", []):
|
|
70
|
+
if (s, p, o) not in added_triples:
|
|
71
|
+
obj_node = (
|
|
72
|
+
URIRef(o) if str(o).startswith("http") else Literal(o)
|
|
73
|
+
)
|
|
74
|
+
|
|
75
|
+
rdf.add((URIRef(s), URIRef(p), obj_node))
|
|
76
|
+
added_triples.add((s, p, o))
|
|
70
77
|
|
|
71
78
|
rdf.serialize(destination=str(path), format="turtle")
|
|
79
|
+
|
|
72
80
|
else:
|
|
73
|
-
raise ValueError(
|
|
81
|
+
raise ValueError(
|
|
82
|
+
f"Unsupported output format: '{fmt}'. Please use 'json' or 'ttl'."
|
|
83
|
+
)
|
|
74
84
|
|
|
75
|
-
logger.info(f"
|
|
85
|
+
logger.info(f"Results saved → [bold]{path}[/]")
|
|
76
86
|
|
|
77
87
|
def save_graph(self, graph: nx.MultiGraph, name_suffix: str = "") -> None:
|
|
78
88
|
"""Serialize a NetworkX graph to a gpickle file.
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import statistics
|
|
2
|
+
from graph_seeder.utils.utils import get_connected_components
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class GraphStatistics:
|
|
6
|
+
"""Utility class for computing statistics on a graph."""
|
|
7
|
+
|
|
8
|
+
@staticmethod
|
|
9
|
+
def compute(paths: list[dict]) -> dict:
|
|
10
|
+
"""Compute statistics on the graph given the extracted paths."""
|
|
11
|
+
subjects = set()
|
|
12
|
+
predicates = set()
|
|
13
|
+
objects = set()
|
|
14
|
+
triplets = []
|
|
15
|
+
|
|
16
|
+
for path_data in paths:
|
|
17
|
+
for s, p, o in path_data.get("triples", []):
|
|
18
|
+
triplets.append((s, p, o))
|
|
19
|
+
subjects.add(s)
|
|
20
|
+
predicates.add(p)
|
|
21
|
+
objects.add(o)
|
|
22
|
+
|
|
23
|
+
unique_entities = len(subjects | objects)
|
|
24
|
+
|
|
25
|
+
components = get_connected_components(paths)
|
|
26
|
+
|
|
27
|
+
comp_sizes = [len(comp) for comp in components]
|
|
28
|
+
mean_size = statistics.mean(comp_sizes) if comp_sizes else 0
|
|
29
|
+
stdev_size = statistics.stdev(comp_sizes) if len(comp_sizes) > 1 else 0
|
|
30
|
+
|
|
31
|
+
return {
|
|
32
|
+
"Traversed triples": len(triplets),
|
|
33
|
+
"Unique triples": len(set(triplets)),
|
|
34
|
+
"Unique subjects": len(subjects),
|
|
35
|
+
"Unique predicates": len(predicates),
|
|
36
|
+
"Unique objects": len(objects),
|
|
37
|
+
"Unique entities": unique_entities,
|
|
38
|
+
"Connected components": len(components),
|
|
39
|
+
"Mean component size": round(mean_size, 2),
|
|
40
|
+
"Std dev component size": round(stdev_size, 2),
|
|
41
|
+
}
|
|
@@ -221,7 +221,9 @@ def load_config(config_path: str | None, overrides: dict) -> dict:
|
|
|
221
221
|
return cfg
|
|
222
222
|
|
|
223
223
|
|
|
224
|
-
def get_connected_components(
|
|
224
|
+
def get_connected_components(paths: list[dict]) -> list[set[str]]:
|
|
225
|
+
"""Get the connected components from a list of structured paths."""
|
|
225
226
|
graph: nx.Graph = nx.Graph()
|
|
226
|
-
|
|
227
|
+
for path_data in paths:
|
|
228
|
+
graph.add_edges_from((s, o) for s, p, o in path_data.get("triples", []))
|
|
227
229
|
return list(nx.connected_components(graph))
|
|
@@ -1,32 +0,0 @@
|
|
|
1
|
-
import networkx as nx
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
class GraphStatistics:
|
|
5
|
-
"""Utility class for computing statistics on a graph."""
|
|
6
|
-
|
|
7
|
-
@staticmethod
|
|
8
|
-
def compute(triplets: list[tuple[str, str, str]]) -> dict:
|
|
9
|
-
"""Compute statistics on the graph given a list of triplets and the graph itself."""
|
|
10
|
-
subjects = set()
|
|
11
|
-
predicates = set()
|
|
12
|
-
objects = set()
|
|
13
|
-
triplets_graph = nx.Graph()
|
|
14
|
-
|
|
15
|
-
for s, p, o in triplets:
|
|
16
|
-
subjects.add(s)
|
|
17
|
-
predicates.add(p)
|
|
18
|
-
objects.add(o)
|
|
19
|
-
triplets_graph.add_edge(s, o)
|
|
20
|
-
|
|
21
|
-
nb_components = nx.number_connected_components(triplets_graph)
|
|
22
|
-
|
|
23
|
-
unique_entities = len(subjects | objects)
|
|
24
|
-
|
|
25
|
-
return {
|
|
26
|
-
"total_triplets": len(triplets),
|
|
27
|
-
"unique_subjects": len(subjects),
|
|
28
|
-
"unique_predicates": len(predicates),
|
|
29
|
-
"unique_objects": len(objects),
|
|
30
|
-
"unique_entities": unique_entities,
|
|
31
|
-
"connected_components": nb_components,
|
|
32
|
-
}
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/extraction/Hop/HopExpansion.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/wrapper/NeighborhoodWrapper.py
RENAMED
|
File without changes
|
|
File without changes
|
{graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/wrapper/sparql/BaseClient.py
RENAMED
|
File without changes
|
{graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/wrapper/sparql/GraphWrapper.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|