graph-seeder 1.0.0.dev2__tar.gz → 1.0.0.dev3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/PKG-INFO +1 -1
  2. {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/pyproject.toml +1 -1
  3. {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/SubgraphExtractor.py +31 -29
  4. {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/configs/dbpedia_default.json +2 -2
  5. {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/configs/default.json +1 -1
  6. {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/configs/europeana_default.json +2 -2
  7. {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/configs/pgxlod_default.json +9 -9
  8. {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/configs/wikidata_default.json +1 -1
  9. {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/densification/GraphConnector.py +25 -9
  10. {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/utils/ConsoleUI.py +1 -1
  11. {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/utils/GraphExporter.py +46 -36
  12. graph_seeder-1.0.0.dev3/src/graph_seeder/utils/GraphStatistics.py +41 -0
  13. {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/utils/utils.py +4 -2
  14. {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/uv.lock +1 -1
  15. graph_seeder-1.0.0.dev2/src/graph_seeder/utils/GraphStatistics.py +0 -32
  16. {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/.github/workflows/publish.yml +0 -0
  17. {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/.gitignore +0 -0
  18. {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/README.md +0 -0
  19. {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/requirements.txt +0 -0
  20. {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/GraphSeeder.py +0 -0
  21. {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/extraction/BFS/BFS.py +0 -0
  22. {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/extraction/ExtractionStrategy.py +0 -0
  23. {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/extraction/Hop/HopExpansion.py +0 -0
  24. {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/utils/Factory.py +0 -0
  25. {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/utils/URIManager.py +0 -0
  26. {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/wrapper/NeighborhoodWrapper.py +0 -0
  27. {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/wrapper/hashmap/HashMapWrapper.py +0 -0
  28. {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/wrapper/sparql/BaseClient.py +0 -0
  29. {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/wrapper/sparql/GraphWrapper.py +0 -0
  30. {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/wrapper/sparql/SparqlQueryBuilder.py +0 -0
  31. {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/wrapper/sparql/client/SparqlClient.py +0 -0
  32. {graph_seeder-1.0.0.dev2 → graph_seeder-1.0.0.dev3}/src/graph_seeder/wrapper/sparql/client/TurtleClient.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: graph-seeder
3
- Version: 1.0.0.dev2
3
+ Version: 1.0.0.dev3
4
4
  Summary: A powerful tool to extract and densify subgraphs from Knowledge Graphs via SPARQL or LMDB, with different extraction strategies.
5
5
  Requires-Python: >=3.9
6
6
  Requires-Dist: lmdb>=2.2.0
@@ -7,7 +7,7 @@ packages = ["src/graph_seeder"]
7
7
 
8
8
  [project]
9
9
  name = "graph-seeder"
10
- version = "1.0.0.dev2"
10
+ version = "1.0.0.dev3"
11
11
  description = "A powerful tool to extract and densify subgraphs from Knowledge Graphs via SPARQL or LMDB, with different extraction strategies."
12
12
  readme = "README.md"
13
13
  requires-python = ">=3.9"
@@ -102,9 +102,9 @@ class SubgraphExtractor:
102
102
  raise ValueError("Data config requires an 'input_path' to a CSV file.")
103
103
 
104
104
  output_format = data_cfg.get("output_format", {})
105
- if output_format not in ["csv", "json", "ttl"]:
105
+ if output_format not in ["json", "ttl"]:
106
106
  raise ValueError(
107
- "Data config 'output_format' must be one of: 'csv', 'json', 'ttl'."
107
+ "Data config 'output_format' must be one of: 'json', 'ttl'."
108
108
  )
109
109
 
110
110
  # Validate client config
@@ -242,10 +242,8 @@ class SubgraphExtractor:
242
242
 
243
243
  return []
244
244
 
245
- def extract_subgraph(
246
- self, seeds: list[str]
247
- ) -> tuple[list[tuple[str, str, str]], set[str]]:
248
- all_triplets: list[tuple[str, str, str]] = []
245
+ def extract_subgraph(self, seeds: list[str]) -> tuple[list[dict], set[str]]:
246
+ all_paths: list[dict] = []
249
247
  seeds_found: set[str] = set()
250
248
 
251
249
  with self.ui.create_progress_bar() as progress:
@@ -266,24 +264,28 @@ class SubgraphExtractor:
266
264
  if not triplets:
267
265
  self.stats["not_found"] += 1
268
266
  else:
269
- all_triplets.extend(triplets)
267
+ all_paths.append(
268
+ {
269
+ "seed": clean_nodes[0],
270
+ "target": clean_nodes[1] if len(clean_nodes) > 1 else None,
271
+ "triples": triplets,
272
+ }
273
+ )
270
274
  seeds_found.update(clean_nodes)
271
275
  self.stats["found"] += 1
272
276
 
273
- self.exporter.save_triplets(all_triplets, self.uri_manager.namespaces)
277
+ self.exporter.save_results(all_paths, self.uri_manager)
274
278
  self.exporter.save_graph(self.extractor_strategy.graph)
275
279
 
276
280
  logger.info("Computing final graph statistics...")
277
281
 
278
282
  self.print_final_summary(
279
- all_triplets, self.extractor_strategy.graph, "Extraction summary"
283
+ all_paths, self.extractor_strategy.graph, "Extraction summary"
280
284
  )
281
285
 
282
- return all_triplets, seeds_found
286
+ return all_paths, seeds_found
283
287
 
284
- def densify_graph(
285
- self, triplets: list[tuple[str, str, str]], seeds_found: set[str]
286
- ):
288
+ def densify_graph(self, all_paths: list[dict], seeds_found: set[str]):
287
289
  explored_nodes = self.extractor_strategy.explored_nodes
288
290
 
289
291
  graph_connector: GraphConnector = GraphConnector(
@@ -294,22 +296,20 @@ class SubgraphExtractor:
294
296
  self.ui,
295
297
  self.cfg,
296
298
  )
297
- nb_components = get_connected_components(triplets)
299
+ nb_components = get_connected_components(all_paths)
298
300
  if len(nb_components) > 1:
299
301
  logger.warning(
300
302
  f"The extracted graph has {len(nb_components)} disconnected components. Starting densification to connect them...\n"
301
303
  )
302
304
 
303
- new_triplets = graph_connector.connect(seeds_found, triplets)
305
+ densified_paths = graph_connector.connect(all_paths, seeds_found)
304
306
 
305
- logger.info(
306
- f"Found {len(new_triplets) - len(triplets)} new triplets during densification."
307
+ self.save(
308
+ densified_paths, graph_connector.bfs.graph, name_suffix="_densified"
307
309
  )
308
310
 
309
- self.save(new_triplets, graph_connector.bfs.graph, name_suffix="_densified")
310
-
311
311
  self.print_final_summary(
312
- new_triplets,
312
+ densified_paths,
313
313
  graph_connector.bfs.graph,
314
314
  "Densification summary",
315
315
  "_densified",
@@ -321,23 +321,23 @@ class SubgraphExtractor:
321
321
 
322
322
  def print_final_summary(
323
323
  self,
324
- all_triplets: list[tuple[str, str, str]],
324
+ paths: list[dict],
325
325
  graph: nx.MultiGraph,
326
326
  table_title: str,
327
327
  name_suffix: str = "",
328
328
  ) -> None:
329
- detailed_stats = GraphStatistics.compute(all_triplets)
329
+ detailed_stats = GraphStatistics.compute(paths)
330
330
  self.ui.print_summary(self.stats, detailed_stats, graph, table_title)
331
331
 
332
332
  self.exporter.save_stats(self.stats, detailed_stats, name_suffix)
333
333
 
334
334
  def save(
335
335
  self,
336
- triplets: list[tuple[str, str, str]],
336
+ paths: list[dict],
337
337
  graph: nx.MultiGraph,
338
338
  name_suffix: str = "",
339
339
  ) -> None:
340
- self.exporter.save_triplets(triplets, self.uri_manager.namespaces, name_suffix)
340
+ self.exporter.save_results(paths, self.uri_manager, name_suffix)
341
341
  self.exporter.save_graph(graph, name_suffix)
342
342
 
343
343
  def run(self) -> None:
@@ -352,7 +352,9 @@ class SubgraphExtractor:
352
352
  )
353
353
  return
354
354
 
355
- unique_seeds = list(set([str(seed).strip() for row in seeds for seed in row]))
355
+ unique_seeds = list(
356
+ dict.fromkeys([str(seed).strip() for row in seeds for seed in row])
357
+ )
356
358
 
357
359
  if self.check_seeds_validity:
358
360
  invalid_seeds = self._check_seeds_validity(unique_seeds)
@@ -389,13 +391,13 @@ class SubgraphExtractor:
389
391
  seeds = [list(pair) for pair in itertools.combinations(unique_seeds, 2)]
390
392
  logger.info(f"Created {len(seeds)} pairs to explore.\n")
391
393
 
392
- all_triplets, seeds_found = self.extract_subgraph(seeds)
394
+ all_paths, seeds_found = self.extract_subgraph(seeds)
393
395
 
394
- if not all_triplets:
395
- logger.error("No triplets were extracted. Exiting without saving.")
396
+ if not all_paths:
397
+ logger.error("No paths were extracted. Exiting without saving.")
396
398
  return
397
399
 
398
400
  if self.skip_densification:
399
401
  return
400
402
 
401
- self.densify_graph(all_triplets, seeds_found)
403
+ self.densify_graph(all_paths, seeds_found)
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "data": {
3
3
  "input_path": "seed.csv",
4
- "output_format": "csv",
4
+ "output_format": "json",
5
5
  "output_path": "output/result",
6
6
  "stats_output_path": "output/stats.json"
7
7
  },
@@ -40,7 +40,7 @@
40
40
  "extraction": {
41
41
  "strategy": "bfs",
42
42
  "create_all_pairs": false,
43
- "batch_size": 15,
43
+ "batch_size": 30,
44
44
  "max_hops": 6,
45
45
  "hub_pagination_threshold": 70000,
46
46
  "max_neighbors_threshold": 300000,
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "data": {
3
3
  "input_path": "seeds.csv",
4
- "output_format": "csv",
4
+ "output_format": "json",
5
5
  "output_path": "output/result",
6
6
  "stats_output_path": "output/stats.json"
7
7
  },
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "data": {
3
3
  "input_path": "seed.csv",
4
- "output_format": "csv",
4
+ "output_format": "json",
5
5
  "output_path": "output/result",
6
6
  "stats_output_path": "output/stats.json"
7
7
  },
@@ -31,7 +31,7 @@
31
31
  "extraction": {
32
32
  "strategy": "bfs",
33
33
  "create_all_pairs": false,
34
- "batch_size": 15,
34
+ "batch_size": 30,
35
35
  "max_hops": 6,
36
36
  "hub_pagination_threshold": 60000,
37
37
  "max_neighbors_threshold": 150000,
@@ -1,9 +1,9 @@
1
1
  {
2
2
  "data": {
3
3
  "input_path": "seed.csv",
4
- "output_format": "csv",
4
+ "output_format": "json",
5
5
  "output_path": "output/result",
6
- "stats_output_path": "output/stats.json"
6
+ "stats_output_path": "output/stats.json"
7
7
  },
8
8
  "client": {
9
9
  "type": "SPARQL",
@@ -19,16 +19,16 @@
19
19
  "include_uri_prefixes": [],
20
20
  "exclude_uri_prefixes": [],
21
21
  "exclude_nodes": [],
22
- "exclude_properties": [],
23
- "namespaces": {
24
- "pharmgkb": "http://bio2rdf.org/pharmgkb",
25
- "pgxlod": "http://pgxlod.loria.fr/resource/"
26
- }
22
+ "exclude_properties": [],
23
+ "namespaces": {
24
+ "pharmgkb": "http://bio2rdf.org/pharmgkb",
25
+ "pgxlod": "http://pgxlod.loria.fr/resource/"
26
+ }
27
27
  },
28
28
  "extraction": {
29
29
  "strategy": "bfs",
30
30
  "create_all_pairs": false,
31
- "batch_size": 15,
31
+ "batch_size": 30,
32
32
  "max_hops": 6,
33
33
  "hub_pagination_threshold": 60000,
34
34
  "max_neighbors_threshold": 150000,
@@ -46,4 +46,4 @@
46
46
  "debug_enabled": false,
47
47
  "request_logging": false
48
48
  }
49
- }
49
+ }
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "data": {
3
3
  "input_path": "seed.csv",
4
- "output_format": "csv",
4
+ "output_format": "json",
5
5
  "output_path": "output/result",
6
6
  "stats_output_path": "output/stats.json"
7
7
  },
@@ -47,16 +47,21 @@ class GraphConnector:
47
47
  return comp_seeds[0]
48
48
 
49
49
  def connect(
50
- self, found_seeds: set[str], triplets: list[tuple[str, str, str]]
51
- ) -> list[tuple[str, str, str]]:
52
- new_triplets: list[tuple[str, str, str]] = list(triplets)
50
+ self,
51
+ initial_paths: list[dict],
52
+ found_seeds: set[str],
53
+ ) -> list[dict]:
54
+
55
+ current_paths: list[dict] = list(initial_paths)
53
56
  failed_component_pairs = set()
54
57
 
58
+ initial_triplets_count = sum(len(p.get("triples", [])) for p in current_paths)
59
+
55
60
  with self.ui.create_progress_bar() as progress:
56
61
  task = progress.add_task("Components densification", total=None)
57
62
 
58
63
  while True:
59
- components = get_connected_components(new_triplets)
64
+ components = get_connected_components(current_paths)
60
65
 
61
66
  if len(components) <= 1:
62
67
  current_completed = progress.tasks[0].completed
@@ -96,19 +101,30 @@ class GraphConnector:
96
101
  source = self._pick_representative(list(seeds_a))
97
102
  target = self._pick_representative(list(seeds_b))
98
103
 
99
- triplets = self.bfs.execute_task(
104
+ path_triplets = self.bfs.execute_task(
100
105
  [source, target],
101
106
  progress,
102
107
  task,
103
108
  )
104
109
 
105
- if triplets:
106
- new_triplets.extend(triplets)
107
- for s, p, o in triplets:
110
+ if path_triplets:
111
+ new_path = {
112
+ "seed": source,
113
+ "target": target,
114
+ "triples": path_triplets,
115
+ }
116
+ current_paths.append(new_path)
117
+
118
+ for s, p, o in path_triplets:
108
119
  self.graph.add_edge(s, o)
109
120
  else:
110
121
  failed_component_pairs.add(
111
122
  tuple(sorted([tuple(seeds_a), tuple(seeds_b)]))
112
123
  )
113
124
 
114
- return new_triplets
125
+ final_triplets_count = sum(len(p.get("triples", [])) for p in current_paths)
126
+ logger.info(
127
+ f"Found {final_triplets_count - initial_triplets_count} new triplets during densification."
128
+ )
129
+
130
+ return current_paths
@@ -251,7 +251,7 @@ class ConsoleUI:
251
251
 
252
252
  table.add_section()
253
253
  table.add_row("Graph nodes", str(len(graph.nodes)))
254
- table.add_row("Graph edges", str(len(graph.edges)))
254
+ table.add_row("Unique triples in graph", str(len(graph.edges)))
255
255
 
256
256
  self.console.print(
257
257
  Panel(table, title=f"[bold green]{table_title}[/]", expand=False)
@@ -1,24 +1,18 @@
1
- import csv
2
1
  import json
3
2
  import logging
4
3
  import pickle
5
4
  from pathlib import Path
6
-
7
5
  import networkx as nx
8
- from rdflib import Graph, Namespace, URIRef
6
+ from rdflib import Graph, Namespace, URIRef, Literal
7
+ from graph_seeder.utils.URIManager import URIManager
9
8
 
10
9
  logger = logging.getLogger("subgraph")
11
10
 
12
11
 
13
12
  class GraphExporter:
14
- """Export triplets and graphs to disk in various formats."""
13
+ """Export paths and graphs to disk in various formats."""
15
14
 
16
15
  def __init__(self, data_cfg: dict) -> None:
17
- """Initialize exporter settings.
18
-
19
- Args:
20
- output_format: Target format for triplet export.
21
- """
22
16
  self.output_format = data_cfg.get("output_format")
23
17
  self.output_path = Path(data_cfg.get("output_path", ".")).resolve()
24
18
  self.output_path.parent.mkdir(parents=True, exist_ok=True)
@@ -28,51 +22,67 @@ class GraphExporter:
28
22
  if self.stats_output_path:
29
23
  self.stats_output_path.parent.mkdir(parents=True, exist_ok=True)
30
24
 
31
- def save_triplets(
25
+ def save_results(
32
26
  self,
33
- triplets: list[tuple[str, str, str]],
34
- namespaces: dict[str, str],
27
+ extracted_paths: list[dict],
28
+ uri_manager: URIManager,
35
29
  name_suffix: str = "",
36
30
  ) -> None:
37
- """Write triplets to disk using the configured output format.
38
-
39
- Args:
40
- triplets: Sequence of ``(subject, predicate, object)`` identifiers.
41
- namespaces: Dictionary mapping namespace prefixes to URIs.
42
- name_suffix: Optional suffix for the output file name (before extension).
43
-
44
- Raises:
45
- ValueError: If the configured output format is not supported.
46
- """
31
+ """Write extracted paths to disk using the configured output format."""
47
32
  path = self.output_path.with_name(
48
33
  f"{self.output_path.stem}{name_suffix}.{self.output_format}"
49
34
  )
50
35
  fmt = self.output_format
51
- if fmt == "csv":
52
- with open(path, "w", newline="", encoding="utf-8") as f:
53
- writer = csv.writer(f)
54
- writer.writerow(["subject", "predicate", "object"])
55
- writer.writerows(triplets)
56
-
57
- elif fmt == "json":
58
- data = [{"subject": s, "predicate": p, "object": o} for s, p, o in triplets]
36
+
37
+ if fmt == "json":
38
+ compressed_paths = []
39
+ for path_data in extracted_paths:
40
+ compressed_paths.append(
41
+ {
42
+ "seed": uri_manager.compress_uri(path_data["seed"]),
43
+ "target": uri_manager.compress_uri(path_data["target"]),
44
+ "triples": [
45
+ [
46
+ uri_manager.compress_uri(s),
47
+ uri_manager.compress_uri(p),
48
+ uri_manager.compress_uri(o),
49
+ ]
50
+ for s, p, o in path_data.get("triples", [])
51
+ ],
52
+ }
53
+ )
54
+
55
+ final_json = {"@context": uri_manager.namespaces, "paths": compressed_paths}
56
+
59
57
  with open(path, "w", encoding="utf-8") as f:
60
- json.dump(data, f, indent=2)
58
+ json.dump(final_json, f, indent=2)
61
59
 
62
60
  elif fmt == "ttl":
63
61
  rdf = Graph()
64
62
 
65
- for prefix, uri in namespaces.items():
63
+ for prefix, uri in uri_manager.namespaces.items():
66
64
  rdf.bind(prefix, Namespace(uri))
67
65
 
68
- for s, p, o in triplets:
69
- rdf.add((URIRef(s), URIRef(p), URIRef(o)))
66
+ added_triples = set()
67
+
68
+ for path_data in extracted_paths:
69
+ for s, p, o in path_data.get("triples", []):
70
+ if (s, p, o) not in added_triples:
71
+ obj_node = (
72
+ URIRef(o) if str(o).startswith("http") else Literal(o)
73
+ )
74
+
75
+ rdf.add((URIRef(s), URIRef(p), obj_node))
76
+ added_triples.add((s, p, o))
70
77
 
71
78
  rdf.serialize(destination=str(path), format="turtle")
79
+
72
80
  else:
73
- raise ValueError(f"Unsupported output format: {fmt!r}")
81
+ raise ValueError(
82
+ f"Unsupported output format: '{fmt}'. Please use 'json' or 'ttl'."
83
+ )
74
84
 
75
- logger.info(f"Triplets saved → [bold]{path}[/]")
85
+ logger.info(f"Results saved → [bold]{path}[/]")
76
86
 
77
87
  def save_graph(self, graph: nx.MultiGraph, name_suffix: str = "") -> None:
78
88
  """Serialize a NetworkX graph to a gpickle file.
@@ -0,0 +1,41 @@
1
+ import statistics
2
+ from graph_seeder.utils.utils import get_connected_components
3
+
4
+
5
+ class GraphStatistics:
6
+ """Utility class for computing statistics on a graph."""
7
+
8
+ @staticmethod
9
+ def compute(paths: list[dict]) -> dict:
10
+ """Compute statistics on the graph given the extracted paths."""
11
+ subjects = set()
12
+ predicates = set()
13
+ objects = set()
14
+ triplets = []
15
+
16
+ for path_data in paths:
17
+ for s, p, o in path_data.get("triples", []):
18
+ triplets.append((s, p, o))
19
+ subjects.add(s)
20
+ predicates.add(p)
21
+ objects.add(o)
22
+
23
+ unique_entities = len(subjects | objects)
24
+
25
+ components = get_connected_components(paths)
26
+
27
+ comp_sizes = [len(comp) for comp in components]
28
+ mean_size = statistics.mean(comp_sizes) if comp_sizes else 0
29
+ stdev_size = statistics.stdev(comp_sizes) if len(comp_sizes) > 1 else 0
30
+
31
+ return {
32
+ "Traversed triples": len(triplets),
33
+ "Unique triples": len(set(triplets)),
34
+ "Unique subjects": len(subjects),
35
+ "Unique predicates": len(predicates),
36
+ "Unique objects": len(objects),
37
+ "Unique entities": unique_entities,
38
+ "Connected components": len(components),
39
+ "Mean component size": round(mean_size, 2),
40
+ "Std dev component size": round(stdev_size, 2),
41
+ }
@@ -221,7 +221,9 @@ def load_config(config_path: str | None, overrides: dict) -> dict:
221
221
  return cfg
222
222
 
223
223
 
224
- def get_connected_components(triplets: list[tuple[str, str, str]]) -> list[set[str]]:
224
+ def get_connected_components(paths: list[dict]) -> list[set[str]]:
225
+ """Get the connected components from a list of structured paths."""
225
226
  graph: nx.Graph = nx.Graph()
226
- graph.add_edges_from((s, o) for s, p, o in triplets)
227
+ for path_data in paths:
228
+ graph.add_edges_from((s, o) for s, p, o in path_data.get("triples", []))
227
229
  return list(nx.connected_components(graph))
@@ -147,7 +147,7 @@ wheels = [
147
147
 
148
148
  [[package]]
149
149
  name = "graph-seeder"
150
- version = "1.0.0.dev2"
150
+ version = "1.0.0.dev3"
151
151
  source = { editable = "." }
152
152
  dependencies = [
153
153
  { name = "lmdb" },
@@ -1,32 +0,0 @@
1
- import networkx as nx
2
-
3
-
4
- class GraphStatistics:
5
- """Utility class for computing statistics on a graph."""
6
-
7
- @staticmethod
8
- def compute(triplets: list[tuple[str, str, str]]) -> dict:
9
- """Compute statistics on the graph given a list of triplets and the graph itself."""
10
- subjects = set()
11
- predicates = set()
12
- objects = set()
13
- triplets_graph = nx.Graph()
14
-
15
- for s, p, o in triplets:
16
- subjects.add(s)
17
- predicates.add(p)
18
- objects.add(o)
19
- triplets_graph.add_edge(s, o)
20
-
21
- nb_components = nx.number_connected_components(triplets_graph)
22
-
23
- unique_entities = len(subjects | objects)
24
-
25
- return {
26
- "total_triplets": len(triplets),
27
- "unique_subjects": len(subjects),
28
- "unique_predicates": len(predicates),
29
- "unique_objects": len(objects),
30
- "unique_entities": unique_entities,
31
- "connected_components": nb_components,
32
- }