graph-seeder 1.0.0.dev5__tar.gz → 1.0.0.dev8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/PKG-INFO +1 -1
  2. {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/pyproject.toml +1 -1
  3. {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/SubgraphExtractor.py +17 -14
  4. {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/densification/GraphConnector.py +14 -17
  5. {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/extraction/BFS/BFS.py +30 -14
  6. {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/extraction/ExtractionStrategy.py +5 -2
  7. {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/extraction/Hop/HopExpansion.py +33 -15
  8. graph_seeder-1.0.0.dev8/src/graph_seeder/models/RDFNode.py +45 -0
  9. {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/utils/GraphExporter.py +41 -31
  10. {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/utils/GraphStatistics.py +9 -8
  11. {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/utils/utils.py +14 -2
  12. {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/wrapper/NeighborhoodWrapper.py +2 -1
  13. {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/wrapper/hashmap/HashMapWrapper.py +16 -4
  14. {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/wrapper/sparql/GraphWrapper.py +17 -15
  15. {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/wrapper/sparql/client/TurtleClient.py +6 -11
  16. {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/uv.lock +1 -1
  17. {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/.github/workflows/publish.yml +0 -0
  18. {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/.gitignore +0 -0
  19. {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/README.md +0 -0
  20. {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/requirements.txt +0 -0
  21. {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/GraphSeeder.py +0 -0
  22. {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/configs/dbpedia_default.json +0 -0
  23. {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/configs/default.json +0 -0
  24. {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/configs/europeana_default.json +0 -0
  25. {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/configs/pgxlod_default.json +0 -0
  26. {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/configs/wikidata_default.json +0 -0
  27. {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/utils/ConsoleUI.py +0 -0
  28. {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/utils/Factory.py +0 -0
  29. {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/utils/URIManager.py +0 -0
  30. {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/wrapper/sparql/BaseClient.py +0 -0
  31. {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/wrapper/sparql/SparqlQueryBuilder.py +0 -0
  32. {graph_seeder-1.0.0.dev5 → graph_seeder-1.0.0.dev8}/src/graph_seeder/wrapper/sparql/client/SparqlClient.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: graph-seeder
3
- Version: 1.0.0.dev5
3
+ Version: 1.0.0.dev8
4
4
  Summary: A powerful tool to extract and densify subgraphs from Knowledge Graphs via SPARQL or LMDB, with different extraction strategies.
5
5
  Requires-Python: >=3.9
6
6
  Requires-Dist: lmdb>=2.2.0
@@ -7,7 +7,7 @@ packages = ["src/graph_seeder"]
7
7
 
8
8
  [project]
9
9
  name = "graph-seeder"
10
- version = "1.0.0.dev5"
10
+ version = "1.0.0.dev8"
11
11
  description = "A powerful tool to extract and densify subgraphs from Knowledge Graphs via SPARQL or LMDB, with different extraction strategies."
12
12
  readme = "README.md"
13
13
  requires-python = ">=3.9"
@@ -8,6 +8,7 @@ from rich.logging import RichHandler
8
8
  from rich.prompt import Confirm
9
9
  from graph_seeder.extraction.ExtractionStrategy import ExtractionStrategy
10
10
  from graph_seeder.densification.GraphConnector import GraphConnector
11
+ from graph_seeder.models.RDFNode import ExtractionResult
11
12
  from graph_seeder.utils.Factory import ComponentFactory
12
13
  from graph_seeder.utils.ConsoleUI import ConsoleUI
13
14
  from graph_seeder.utils.GraphExporter import GraphExporter
@@ -242,8 +243,10 @@ class SubgraphExtractor:
242
243
 
243
244
  return []
244
245
 
245
- def extract_subgraph(self, seeds: list[str]) -> tuple[list[dict], set[str]]:
246
- all_paths: list[dict] = []
246
+ def extract_subgraph(
247
+ self, seeds: list[str]
248
+ ) -> tuple[list[ExtractionResult], set[str]]:
249
+ all_results: list[ExtractionResult] = []
247
250
  seeds_found: set[str] = set()
248
251
 
249
252
  with self.ui.create_progress_bar() as progress:
@@ -264,7 +267,7 @@ class SubgraphExtractor:
264
267
  if not triplets:
265
268
  self.stats["not_found"] += 1
266
269
  else:
267
- all_paths.append(
270
+ all_results.append(
268
271
  {
269
272
  "seed": clean_nodes[0],
270
273
  "target": clean_nodes[1] if len(clean_nodes) > 1 else None,
@@ -274,18 +277,18 @@ class SubgraphExtractor:
274
277
  seeds_found.update(clean_nodes)
275
278
  self.stats["found"] += 1
276
279
 
277
- self.exporter.save_results(all_paths, self.uri_manager)
280
+ self.exporter.save_results(all_results, self.uri_manager)
278
281
  self.exporter.save_graph(self.extractor_strategy.graph)
279
282
 
280
283
  logger.info("Computing final graph statistics...")
281
284
 
282
285
  self.print_final_summary(
283
- all_paths, self.extractor_strategy.graph, "Extraction summary"
286
+ all_results, self.extractor_strategy.graph, "Extraction summary"
284
287
  )
285
288
 
286
- return all_paths, seeds_found
289
+ return all_results, seeds_found
287
290
 
288
- def densify_graph(self, all_paths: list[dict], seeds_found: set[str]):
291
+ def densify_graph(self, all_results: list[ExtractionResult], seeds_found: set[str]):
289
292
  explored_nodes = self.extractor_strategy.explored_nodes
290
293
 
291
294
  graph_connector: GraphConnector = GraphConnector(
@@ -296,13 +299,13 @@ class SubgraphExtractor:
296
299
  self.ui,
297
300
  self.cfg,
298
301
  )
299
- nb_components = get_connected_components(all_paths)
302
+ nb_components = get_connected_components(all_results)
300
303
  if len(nb_components) > 1:
301
304
  logger.warning(
302
305
  f"The extracted graph has {len(nb_components)} disconnected components. Starting densification to connect them...\n"
303
306
  )
304
307
 
305
- densified_paths = graph_connector.connect(all_paths, seeds_found)
308
+ densified_paths = graph_connector.connect(all_results, seeds_found)
306
309
 
307
310
  self.save(
308
311
  densified_paths, graph_connector.bfs.graph, name_suffix="_densified"
@@ -321,7 +324,7 @@ class SubgraphExtractor:
321
324
 
322
325
  def print_final_summary(
323
326
  self,
324
- paths: list[dict],
327
+ paths: list[ExtractionResult],
325
328
  graph: nx.MultiGraph,
326
329
  table_title: str,
327
330
  name_suffix: str = "",
@@ -333,7 +336,7 @@ class SubgraphExtractor:
333
336
 
334
337
  def save(
335
338
  self,
336
- paths: list[dict],
339
+ paths: list[ExtractionResult],
337
340
  graph: nx.MultiGraph,
338
341
  name_suffix: str = "",
339
342
  ) -> None:
@@ -391,13 +394,13 @@ class SubgraphExtractor:
391
394
  seeds = [list(pair) for pair in itertools.combinations(unique_seeds, 2)]
392
395
  logger.info(f"Created {len(seeds)} pairs to explore.\n")
393
396
 
394
- all_paths, seeds_found = self.extract_subgraph(seeds)
397
+ all_results, seeds_found = self.extract_subgraph(seeds)
395
398
 
396
- if not all_paths:
399
+ if not all_results:
397
400
  logger.error("No paths were extracted. Exiting without saving.")
398
401
  return
399
402
 
400
403
  if self.skip_densification:
401
404
  return
402
405
 
403
- self.densify_graph(all_paths, seeds_found)
406
+ self.densify_graph(all_results, seeds_found)
@@ -8,6 +8,7 @@ from graph_seeder.utils.URIManager import URIManager
8
8
  from graph_seeder.utils.utils import get_connected_components
9
9
  from graph_seeder.utils.Factory import ComponentFactory
10
10
  from graph_seeder.wrapper.NeighborhoodWrapper import NeighborhoodWrapper
11
+ from graph_seeder.models.RDFNode import ExtractionResult
11
12
 
12
13
  logger = logging.getLogger("subgraph")
13
14
 
@@ -50,20 +51,20 @@ class GraphConnector:
50
51
 
51
52
  def connect(
52
53
  self,
53
- initial_paths: list[dict],
54
+ initial_results: list[ExtractionResult],
54
55
  found_seeds: set[str],
55
- ) -> list[dict]:
56
+ ) -> list[ExtractionResult]:
56
57
 
57
- current_paths: list[dict] = list(initial_paths)
58
+ current_results: list[ExtractionResult] = list(initial_results)
58
59
  failed_component_pairs = set()
59
60
 
60
- initial_triplets_count = sum(len(p.get("triples", [])) for p in current_paths)
61
+ initial_triplets_count = sum(len(p.get("triples", [])) for p in current_results)
61
62
 
62
63
  with self.ui.create_progress_bar() as progress:
63
64
  task = progress.add_task("Components densification", total=None)
64
65
 
65
66
  while True:
66
- components = get_connected_components(current_paths)
67
+ components = get_connected_components(current_results)
67
68
 
68
69
  if len(components) <= 1:
69
70
  current_completed = progress.tasks[0].completed
@@ -110,23 +111,19 @@ class GraphConnector:
110
111
  )
111
112
 
112
113
  if path_triplets:
113
- new_path = {
114
- "seed": source,
115
- "target": target,
116
- "triples": path_triplets,
117
- }
118
- current_paths.append(new_path)
114
+ new_result = ExtractionResult(
115
+ seed=source, target=target, triples=path_triplets
116
+ )
117
+ current_results.append(new_result)
119
118
 
120
119
  for s, p, o in path_triplets:
121
- self.graph.add_edge(s, o)
120
+ self.graph.add_edge(s.value, o.value)
122
121
  else:
123
- failed_component_pairs.add(
124
- tuple(sorted([tuple(seeds_a), tuple(seeds_b)]))
125
- )
122
+ failed_component_pairs.add(pair_id)
126
123
 
127
- final_triplets_count = sum(len(p.get("triples", [])) for p in current_paths)
124
+ final_triplets_count = sum(len(p.get("triples", [])) for p in current_results)
128
125
  logger.info(
129
126
  f"Found {final_triplets_count - initial_triplets_count} new triplets during densification."
130
127
  )
131
128
 
132
- return current_paths
129
+ return current_results
@@ -1,4 +1,5 @@
1
1
  import logging
2
+ from graph_seeder.models.RDFNode import RDFNode
2
3
  from graph_seeder.utils.URIManager import URIManager
3
4
  from graph_seeder.wrapper.NeighborhoodWrapper import NeighborhoodWrapper
4
5
  from graph_seeder.extraction.ExtractionStrategy import ExtractionStrategy
@@ -46,7 +47,9 @@ class BidirectionalBFS(ExtractionStrategy):
46
47
  ) -> str:
47
48
  return f"Extracting path for {self.format_progress_description(nodes)}"
48
49
 
49
- def extract(self, nodes: list[str]) -> tuple[list[tuple[str, str, str]], str]:
50
+ def extract(
51
+ self, nodes: list[str]
52
+ ) -> tuple[list[tuple[RDFNode, RDFNode, RDFNode]], str]:
50
53
  """Extract a subgraph connecting the given seed nodes using bidirectional BFS.
51
54
  Returns:
52
55
  Tuple of (list of path triplets, result message)"""
@@ -60,7 +63,7 @@ class BidirectionalBFS(ExtractionStrategy):
60
63
 
61
64
  def _find_path(
62
65
  self, source: str, target: str
63
- ) -> tuple[list[tuple[str, str, str]], str]:
66
+ ) -> tuple[list[tuple[RDFNode, RDFNode, RDFNode]], str]:
64
67
  """Find a path between two nodes within the configured hop limit.
65
68
  Returns:
66
69
  Tuple of (path_triplets, result message)
@@ -160,19 +163,28 @@ class BidirectionalBFS(ExtractionStrategy):
160
163
  self.graph.add_node(node)
161
164
 
162
165
  for triplets in self.wrapper.get_neighborhood(nodes_to_query):
163
- for subj, predicate, obj in triplets:
166
+ for subj_node, pred_node, obj_node in triplets:
167
+ subj_val = subj_node.value
168
+ obj_val = obj_node.value
169
+ pred_val = pred_node.value
170
+
164
171
  if (
165
- subj in self._excluded_nodes
166
- or obj in self._excluded_nodes
167
- or predicate in self._excluded_properties
172
+ subj_val in self._excluded_nodes
173
+ or obj_val in self._excluded_nodes
174
+ or pred_val in self._excluded_properties
168
175
  ):
169
176
  continue
170
177
 
171
178
  self.graph.add_edge(
172
- subj, obj, predicate=predicate, original_subj=subj, original_obj=obj
179
+ subj_val,
180
+ obj_val,
181
+ predicate=pred_val,
182
+ subj_node=subj_node,
183
+ pred_node=pred_node,
184
+ obj_node=obj_node,
173
185
  )
174
186
 
175
- for n in (subj, obj):
187
+ for n in (subj_val, obj_val):
176
188
  if n not in nodes_visited:
177
189
  next_level.add(n)
178
190
 
@@ -184,19 +196,23 @@ class BidirectionalBFS(ExtractionStrategy):
184
196
 
185
197
  def _extract_path_triplets(
186
198
  self, source: str, target: str
187
- ) -> list[tuple[str, str, str]]:
199
+ ) -> list[tuple[RDFNode, RDFNode, RDFNode]]:
188
200
  """Build a triple sequence for the shortest path currently in the graph."""
189
201
  path_nodes: list[str] = nx.shortest_path(
190
202
  self.graph, source=source, target=target
191
203
  )
192
- triplets: list[tuple[str, str, str]] = []
204
+ triplets: list[tuple[RDFNode, RDFNode, RDFNode]] = []
193
205
 
194
206
  for u, v in zip(path_nodes, path_nodes[1:]):
195
207
  edges = self.graph[u][v]
196
208
  edge_data = edges[next(iter(edges))]
197
- predicate = edge_data.get("predicate", "unknown_property")
198
- subj = edge_data.get("original_subj", u)
199
- obj = edge_data.get("original_obj", v)
200
- triplets.append((subj, predicate, obj))
209
+
210
+ subj_node = edge_data.get("subj_node", RDFNode(u, "uri"))
211
+ pred_node = edge_data.get(
212
+ "pred_node", RDFNode(edge_data.get("predicate", "unknown"), "uri")
213
+ )
214
+ obj_node = edge_data.get("obj_node", RDFNode(v, "uri"))
215
+
216
+ triplets.append((subj_node, pred_node, obj_node))
201
217
 
202
218
  return triplets
@@ -1,5 +1,6 @@
1
1
  from abc import ABC, abstractmethod
2
2
  from networkx import MultiGraph
3
+ from graph_seeder.models.RDFNode import RDFNode
3
4
  from graph_seeder.utils.URIManager import URIManager
4
5
  from graph_seeder.wrapper.NeighborhoodWrapper import NeighborhoodWrapper
5
6
  from rich.progress import (
@@ -28,7 +29,9 @@ class ExtractionStrategy(ABC):
28
29
  self.explored_nodes = set() if explored_nodes is None else explored_nodes
29
30
 
30
31
  @abstractmethod
31
- def extract(self, nodes: list[str]) -> tuple[list[tuple[str, str, str]], str]:
32
+ def extract(
33
+ self, nodes: list[str]
34
+ ) -> tuple[list[tuple[RDFNode, RDFNode, RDFNode]], str]:
32
35
  """Extract a subgraph given a list of seed nodes.
33
36
  Returns:
34
37
  Tuple of (list of triplets, result message)"""
@@ -46,7 +49,7 @@ class ExtractionStrategy(ABC):
46
49
 
47
50
  def execute_task(
48
51
  self, nodes: list[str], progress: Progress, task: TaskID
49
- ) -> list[tuple[str, str, str]]:
52
+ ) -> list[tuple[RDFNode, RDFNode, RDFNode]]:
50
53
  """Execute the extraction task with progress bar updates and error handling."""
51
54
  task_description = self.format_progress_description(nodes)
52
55
  start_message = self.format_start_message(nodes)
@@ -1,6 +1,7 @@
1
1
  import logging
2
2
  from graph_seeder.extraction.ExtractionStrategy import ExtractionStrategy
3
3
 
4
+ from graph_seeder.models.RDFNode import RDFNode
4
5
  from graph_seeder.utils.URIManager import URIManager
5
6
  from graph_seeder.wrapper.NeighborhoodWrapper import NeighborhoodWrapper
6
7
 
@@ -34,7 +35,9 @@ class HopExpansion(ExtractionStrategy):
34
35
  ) -> str:
35
36
  return f"Expanding {self.max_hops} hops for {self.format_progress_description(nodes)}"
36
37
 
37
- def extract(self, nodes: list[str]) -> tuple[list[tuple[str, str, str]], str]:
38
+ def extract(
39
+ self, nodes: list[str]
40
+ ) -> tuple[list[tuple[RDFNode, RDFNode, RDFNode]], str]:
38
41
  """
39
42
  Extract a subgraph by expanding from the given seed nodes up to max_hops.
40
43
  Returns:
@@ -50,7 +53,9 @@ class HopExpansion(ExtractionStrategy):
50
53
  f"[green]✓[/] Extracted {len(triplets)} triplets within {self.max_hops} hops.",
51
54
  )
52
55
 
53
- def _expand(self, nodes: list[str]) -> list[tuple[str, str, str]] | None:
56
+ def _expand(
57
+ self, nodes: list[str]
58
+ ) -> list[tuple[RDFNode, RDFNode, RDFNode]] | None:
54
59
  """
55
60
  Expand a list of nodes radially up to max_hops.
56
61
  Returns a list of all discovered unique triplets.
@@ -62,7 +67,7 @@ class HopExpansion(ExtractionStrategy):
62
67
  visited_nodes: set[str] = set(valid_nodes)
63
68
  current_level_nodes: set[str] = set(valid_nodes)
64
69
  self.graph.add_nodes_from(valid_nodes)
65
- all_triplets: set[tuple[str, str, str]] = set()
70
+ all_triplets: set[tuple[RDFNode, RDFNode, RDFNode]] = set()
66
71
 
67
72
  for hop in range(self.max_hops):
68
73
  logger.info(
@@ -76,11 +81,14 @@ class HopExpansion(ExtractionStrategy):
76
81
  if node in self.explored_nodes:
77
82
  if node in self.graph:
78
83
  for u, v, data in self.graph.edges(node, data=True):
79
- pred = data.get("key")
80
84
  neighbor = v if u == node else u
81
85
 
82
86
  if neighbor not in self._excluded_nodes:
83
- all_triplets.add((u, pred, v))
87
+ subj_node = data.get("subj_node")
88
+ pred_node = data.get("pred_node")
89
+ obj_node = data.get("obj_node")
90
+
91
+ all_triplets.add((subj_node, pred_node, obj_node))
84
92
  if neighbor not in visited_nodes:
85
93
  next_level_nodes.add(neighbor)
86
94
  else:
@@ -88,23 +96,33 @@ class HopExpansion(ExtractionStrategy):
88
96
 
89
97
  if nodes_to_query:
90
98
  for triplets in self.wrapper.get_neighborhood(nodes_to_query):
91
- for subj, pred, obj in triplets:
99
+ for subj_node, pred_node, obj_node in triplets:
100
+ subj_val = subj_node.value
101
+ obj_val = obj_node.value
102
+ pred_val = pred_node.value
103
+
92
104
  if (
93
- subj in self._excluded_nodes
94
- or obj in self._excluded_nodes
95
- or pred in self._excluded_properties
105
+ subj_val in self._excluded_nodes
106
+ or obj_val in self._excluded_nodes
107
+ or pred_val in self._excluded_properties
96
108
  ):
97
109
  continue
110
+
98
111
  self.graph.add_edge(
99
- subj, obj, key=pred, original_subj=subj, original_obj=obj
112
+ subj_val,
113
+ obj_val,
114
+ key=pred_val,
115
+ subj_node=subj_node,
116
+ pred_node=pred_node,
117
+ obj_node=obj_node,
100
118
  )
101
119
 
102
- all_triplets.add((subj, pred, obj))
120
+ all_triplets.add((subj_node, pred_node, obj_node))
103
121
 
104
- if subj not in visited_nodes:
105
- next_level_nodes.add(subj)
106
- if obj not in visited_nodes:
107
- next_level_nodes.add(obj)
122
+ if subj_val not in visited_nodes:
123
+ next_level_nodes.add(subj_val)
124
+ if obj_val not in visited_nodes:
125
+ next_level_nodes.add(obj_val)
108
126
 
109
127
  self.explored_nodes.update(nodes_to_query)
110
128
 
@@ -0,0 +1,45 @@
1
+ from dataclasses import dataclass
2
+ from typing import Optional, TypedDict
3
+ from rdflib import URIRef, Literal
4
+ from graph_seeder.utils.URIManager import URIManager
5
+
6
+
7
+ @dataclass(frozen=True)
8
+ class RDFNode:
9
+ """Represents a node in an RDF graph, which can be a URI or a literal value."""
10
+
11
+ value: str
12
+ node_type: str
13
+ datatype: Optional[str] = None
14
+ language: Optional[str] = None
15
+
16
+ @property
17
+ def is_uri(self) -> bool:
18
+ return self.node_type == "uri"
19
+
20
+ @property
21
+ def is_literal(self) -> bool:
22
+ return self.node_type in ("literal", "typed-literal")
23
+
24
+ def __str__(self) -> str:
25
+ return self.value
26
+
27
+ def to_rdflib(self, uri_manager: URIManager):
28
+ """Convert internal string representation to proper rdflib Nodes."""
29
+ if self.is_uri:
30
+ if ":" in self.value and not self.value.startswith("http"):
31
+ prefix, local_name = self.value.split(":", 1)
32
+ if prefix in uri_manager.namespaces:
33
+ return URIRef(f"{uri_manager.namespaces[prefix]}{local_name}")
34
+ return URIRef(self.value)
35
+
36
+ else:
37
+ return Literal(self.value, lang=self.language, datatype=self.datatype)
38
+
39
+
40
+ class ExtractionResult(TypedDict):
41
+ """Type hinting for dictionaries of extracted paths."""
42
+
43
+ seed: str
44
+ target: Optional[str]
45
+ triples: list[tuple[RDFNode, RDFNode, RDFNode]]
@@ -3,7 +3,8 @@ import logging
3
3
  import pickle
4
4
  from pathlib import Path
5
5
  import networkx as nx
6
- from rdflib import Graph, Namespace, URIRef, Literal
6
+ from rdflib import Graph, Namespace
7
+ from graph_seeder.models.RDFNode import ExtractionResult
7
8
  from graph_seeder.utils.URIManager import URIManager
8
9
 
9
10
  logger = logging.getLogger("subgraph")
@@ -24,7 +25,7 @@ class GraphExporter:
24
25
 
25
26
  def save_results(
26
27
  self,
27
- extracted_paths: list[dict],
28
+ extraction_results: list[ExtractionResult],
28
29
  uri_manager: URIManager,
29
30
  name_suffix: str = "",
30
31
  ) -> None:
@@ -35,45 +36,54 @@ class GraphExporter:
35
36
  fmt = self.output_format
36
37
 
37
38
  if fmt == "json":
38
- compressed_paths = []
39
- for path_data in extracted_paths:
40
- compressed_paths.append(
41
- {
42
- "seed": uri_manager.compress_uri(path_data["seed"]),
43
- "target": uri_manager.compress_uri(path_data["target"]),
44
- "triples": [
45
- [
46
- uri_manager.compress_uri(s),
47
- uri_manager.compress_uri(p),
48
- uri_manager.compress_uri(o),
49
- ]
50
- for s, p, o in path_data.get("triples", [])
51
- ],
52
- }
53
- )
54
-
55
- final_json = {"@context": uri_manager.namespaces, "paths": compressed_paths}
39
+ results_list = []
40
+ for result_data in extraction_results:
41
+ extracted_item = {"seed": uri_manager.compress_uri(result_data["seed"])}
42
+
43
+ if result_data.get("target"):
44
+ extracted_item["target"] = uri_manager.compress_uri(
45
+ result_data["target"]
46
+ )
47
+
48
+ extracted_item["triples"] = [
49
+ [
50
+ uri_manager.compress_uri(s.value),
51
+ uri_manager.compress_uri(p.value),
52
+ uri_manager.compress_uri(o.value) if o.is_uri else o.value,
53
+ ]
54
+ for s, p, o in result_data.get("triples", [])
55
+ ]
56
+
57
+ results_list.append(extracted_item)
58
+
59
+ final_json = {"@context": uri_manager.namespaces, "results": results_list}
56
60
 
57
61
  with open(path, "w", encoding="utf-8") as f:
58
62
  json.dump(final_json, f, indent=2)
59
63
 
60
64
  elif fmt == "ttl":
61
65
  rdf = Graph()
62
-
63
- for prefix, uri in uri_manager.namespaces.items():
64
- rdf.bind(prefix, Namespace(uri))
65
-
66
66
  added_triples = set()
67
67
 
68
- for path_data in extracted_paths:
69
- for s, p, o in path_data.get("triples", []):
70
- if (s, p, o) not in added_triples:
71
- obj_node = (
72
- URIRef(o) if str(o).startswith("http") else Literal(o)
68
+ for result_data in extraction_results:
69
+ for s, p, o in result_data.get("triples", []):
70
+ if (s.value, p.value, o.value) not in added_triples:
71
+ uri_manager.compress_uri(s.value)
72
+ uri_manager.compress_uri(p.value)
73
+ if o.is_uri:
74
+ uri_manager.compress_uri(o.value)
75
+
76
+ rdf.add(
77
+ (
78
+ s.to_rdflib(uri_manager),
79
+ p.to_rdflib(uri_manager),
80
+ o.to_rdflib(uri_manager),
81
+ )
73
82
  )
83
+ added_triples.add((s.value, p.value, o.value))
74
84
 
75
- rdf.add((URIRef(s), URIRef(p), obj_node))
76
- added_triples.add((s, p, o))
85
+ for prefix, uri in uri_manager.namespaces.items():
86
+ rdf.bind(prefix, Namespace(uri))
77
87
 
78
88
  rdf.serialize(destination=str(path), format="turtle")
79
89
 
@@ -1,4 +1,5 @@
1
1
  import statistics
2
+ from graph_seeder.models.RDFNode import ExtractionResult
2
3
  from graph_seeder.utils.utils import get_connected_components
3
4
 
4
5
 
@@ -6,23 +7,23 @@ class GraphStatistics:
6
7
  """Utility class for computing statistics on a graph."""
7
8
 
8
9
  @staticmethod
9
- def compute(paths: list[dict]) -> dict:
10
- """Compute statistics on the graph given the extracted paths."""
10
+ def compute(results: list[ExtractionResult]) -> dict:
11
+ """Compute statistics on the graph given the extracted results."""
11
12
  subjects = set()
12
13
  predicates = set()
13
14
  objects = set()
14
15
  triplets = []
15
16
 
16
- for path_data in paths:
17
+ for path_data in results:
17
18
  for s, p, o in path_data.get("triples", []):
18
- triplets.append((s, p, o))
19
- subjects.add(s)
20
- predicates.add(p)
21
- objects.add(o)
19
+ triplets.append((s.value, p.value, o.value))
20
+ subjects.add(s.value)
21
+ predicates.add(p.value)
22
+ objects.add(o.value)
22
23
 
23
24
  unique_entities = len(subjects | objects)
24
25
 
25
- components = get_connected_components(paths)
26
+ components = get_connected_components(results)
26
27
 
27
28
  comp_sizes = [len(comp) for comp in components]
28
29
  mean_size = statistics.mean(comp_sizes) if comp_sizes else 0
@@ -2,6 +2,7 @@ import networkx as nx
2
2
  import json
3
3
  from importlib import resources
4
4
  from pathlib import Path
5
+ from graph_seeder.models.RDFNode import ExtractionResult, RDFNode
5
6
 
6
7
  BUILTIN_CONFIGS = [
7
8
  "dbpedia_default",
@@ -285,9 +286,20 @@ def generate_config_template(output_path: str = "config_template.json") -> None:
285
286
  print(f"Configuration template successfully generated at: {path}")
286
287
 
287
288
 
288
- def get_connected_components(paths: list[dict]) -> list[set[str]]:
289
+ def get_connected_components(paths: list[ExtractionResult]) -> list[set[str]]:
289
290
  """Get the connected components from a list of structured paths."""
290
291
  graph: nx.Graph = nx.Graph()
291
292
  for path_data in paths:
292
- graph.add_edges_from((s, o) for s, p, o in path_data.get("triples", []))
293
+ graph.add_edges_from(
294
+ (s.value, o.value) for s, p, o in path_data.get("triples", [])
295
+ )
293
296
  return list(nx.connected_components(graph))
297
+
298
+
299
+ def parse_node(binding: dict) -> RDFNode:
300
+ return RDFNode(
301
+ value=binding["value"],
302
+ node_type=binding["type"],
303
+ datatype=binding.get("datatype"),
304
+ language=binding.get("xml:lang"),
305
+ )
@@ -1,5 +1,6 @@
1
1
  from abc import ABC, abstractmethod
2
2
  from collections.abc import Generator
3
+ from graph_seeder.models.RDFNode import RDFNode
3
4
  from graph_seeder.utils.URIManager import URIManager
4
5
 
5
6
 
@@ -40,7 +41,7 @@ class NeighborhoodWrapper(ABC):
40
41
  @abstractmethod
41
42
  def get_neighborhood(
42
43
  self, nodes: list[str]
43
- ) -> Generator[list[tuple[str, str, str]], None, None]:
44
+ ) -> Generator[list[tuple[RDFNode, RDFNode, RDFNode]], None, None]:
44
45
  """
45
46
  Yields the neighborhood of a list of nodes in batches.
46
47
  Allows the consumer to break the loop to stop early.
@@ -1,6 +1,7 @@
1
1
  from collections.abc import Generator
2
2
  import lmdb
3
3
  import logging
4
+ from graph_seeder.models.RDFNode import RDFNode
4
5
  from graph_seeder.wrapper.NeighborhoodWrapper import NeighborhoodWrapper
5
6
  from graph_seeder.utils.URIManager import URIManager
6
7
  import json
@@ -48,8 +49,8 @@ class HashMapWrapper(NeighborhoodWrapper):
48
49
 
49
50
  def get_neighborhood(
50
51
  self, nodes: list[str]
51
- ) -> Generator[list[tuple[str, str, str]], None, None]:
52
- triplets: list[tuple[str, str, str]] = []
52
+ ) -> Generator[list[tuple[RDFNode, RDFNode, RDFNode]], None, None]:
53
+ triplets: list[tuple[RDFNode, RDFNode, RDFNode]] = []
53
54
  skipped_nodes: set[str] = set()
54
55
 
55
56
  with self.env.begin() as txn:
@@ -109,10 +110,21 @@ class HashMapWrapper(NeighborhoodWrapper):
109
110
  ):
110
111
  continue
111
112
 
113
+ subj_node = RDFNode(original_node, "uri")
114
+ pred_node = RDFNode(clean_prop, "uri")
115
+
116
+ is_uri = neighbor.startswith("http") or (
117
+ ":" in neighbor and " " not in neighbor
118
+ )
119
+ obj_type = "uri" if is_uri else "literal"
120
+ obj_node = RDFNode(neighbor, obj_type)
121
+
112
122
  if is_inverse:
113
- triplets.append((neighbor, clean_prop, original_node))
123
+ inv_subj = RDFNode(neighbor, "uri")
124
+ inv_obj = RDFNode(original_node, "uri")
125
+ triplets.append((inv_subj, pred_node, inv_obj))
114
126
  else:
115
- triplets.append((original_node, clean_prop, neighbor))
127
+ triplets.append((subj_node, pred_node, obj_node))
116
128
 
117
129
  if triplets:
118
130
  yield triplets
@@ -1,10 +1,11 @@
1
1
  from collections.abc import Generator
2
-
3
- from graph_seeder.wrapper.NeighborhoodWrapper import NeighborhoodWrapper
4
2
  import logging
3
+ from graph_seeder.models.RDFNode import RDFNode
4
+ from graph_seeder.wrapper.NeighborhoodWrapper import NeighborhoodWrapper
5
5
  from graph_seeder.wrapper.sparql.SparqlQueryBuilder import SparqlQueryBuilder
6
- from graph_seeder.utils.URIManager import URIManager
7
6
  from graph_seeder.wrapper.sparql.BaseClient import BaseClient
7
+ from graph_seeder.utils.URIManager import URIManager
8
+ from graph_seeder.utils.utils import parse_node
8
9
 
9
10
  logger = logging.getLogger("subgraph")
10
11
 
@@ -135,7 +136,7 @@ class GraphWrapper(NeighborhoodWrapper):
135
136
 
136
137
  def get_neighborhood(
137
138
  self, nodes: list[str]
138
- ) -> Generator[list[tuple[str, str, str]], None, None]:
139
+ ) -> Generator[list[tuple[RDFNode, RDFNode, RDFNode]], None, None]:
139
140
  """Fetch one-hop neighbors using property occurrence to decide strategy."""
140
141
  if not nodes:
141
142
  return None
@@ -178,9 +179,9 @@ class GraphWrapper(NeighborhoodWrapper):
178
179
  ):
179
180
  yield [
180
181
  (
181
- r["subject"]["value"],
182
- r["property"]["value"],
183
- r["object"]["value"],
182
+ parse_node(r["subject"]),
183
+ parse_node(r["property"]),
184
+ parse_node(r["object"]),
184
185
  )
185
186
  for r in raw_rows
186
187
  ]
@@ -192,9 +193,9 @@ class GraphWrapper(NeighborhoodWrapper):
192
193
  ):
193
194
  yield [
194
195
  (
195
- r["subject"]["value"],
196
- r["property"]["value"],
197
- r["object"]["value"],
196
+ parse_node(r["subject"]),
197
+ parse_node(r["property"]),
198
+ parse_node(r["object"]),
198
199
  )
199
200
  for r in raw_rows
200
201
  ]
@@ -212,16 +213,17 @@ class GraphWrapper(NeighborhoodWrapper):
212
213
  for raw_rows in self._execute_with_dichotomy(
213
214
  nodes, self.query_builder.build_prop_occurrence_query
214
215
  ):
215
- all_stats.extend(
216
- [
216
+ for r in raw_rows:
217
+ if not r or "entity" not in r:
218
+ continue
219
+
220
+ all_stats.append(
217
221
  (
218
222
  r["entity"]["value"],
219
223
  r["property"]["value"],
220
224
  int(r["count"]["value"]),
221
225
  )
222
- for r in raw_rows
223
- ]
224
- )
226
+ )
225
227
  return all_stats
226
228
 
227
229
  def _chunk_hub_properties(
@@ -1,6 +1,7 @@
1
1
  import logging
2
2
  import rdflib
3
3
  from graph_seeder.wrapper.sparql.BaseClient import BaseClient
4
+ import json
4
5
 
5
6
  logger = logging.getLogger("subgraph")
6
7
 
@@ -23,25 +24,19 @@ class TurtleClient(BaseClient):
23
24
  self.graph.parse(self.file_path, format="turtle")
24
25
  logger.info(f"Successfully loaded {len(self.graph)} triples.")
25
26
 
26
- self.optimal_batch_size = 500
27
-
28
27
  def query(
29
28
  self, sparql_query: str, silent: bool = False, retries: int = None
30
29
  ) -> list[dict]:
31
30
  """Execute the SPARQL query on the local rdflib graph and format the output."""
32
31
  try:
33
32
  results = self.graph.query(sparql_query)
34
- bindings = []
35
33
 
36
- for row in results:
37
- binding = {}
38
- for var in results.vars:
39
- val = row[var]
40
- if val is not None:
41
- binding[str(var)] = {"value": str(val)}
42
- bindings.append(binding)
34
+ json_bytes = results.serialize(format="json")
35
+
36
+ json_dict = json.loads(json_bytes)
37
+
38
+ return json_dict["results"]["bindings"]
43
39
 
44
- return bindings
45
40
  except Exception as e:
46
41
  logger.error(f"Failed to execute local Turtle query: {e}")
47
42
  raise RuntimeError(f"Turtle query failed: {e}") from e
@@ -147,7 +147,7 @@ wheels = [
147
147
 
148
148
  [[package]]
149
149
  name = "graph-seeder"
150
- version = "1.0.0.dev5"
150
+ version = "1.0.0.dev8"
151
151
  source = { editable = "." }
152
152
  dependencies = [
153
153
  { name = "lmdb" },