graph-seeder 1.0.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,273 @@
1
+ from typing import Mapping, Any
2
+ import networkx as nx
3
+ from rich.console import Console
4
+ from rich.panel import Panel
5
+ from rich.progress import (
6
+ BarColumn,
7
+ MofNCompleteColumn,
8
+ Progress,
9
+ ProgressColumn,
10
+ SpinnerColumn,
11
+ TextColumn,
12
+ TimeElapsedColumn,
13
+ )
14
+ from rich.text import Text
15
+ from rich.table import Table
16
+
17
+
18
class SimplifiedEMAColumn(ProgressColumn):
    """Progress column showing a remaining-time estimate based on an EMA.

    The column keeps an exponential moving average (EMA) of the time one step
    takes and displays ``elapsed_in_current_step + (remaining_steps - 1) * ema``.
    The EMA used for display is blended with the time already spent on the
    step in progress, so the value keeps moving between two progress updates.

    State is kept per task id: a single column instance renders every task of
    the ``Progress`` it belongs to, so one shared EMA would mix the timings of
    unrelated tasks.
    """

    def __init__(self, alpha: float = 0.2):
        """Create the column.

        Args:
            alpha: Smoothing factor of the moving average. Higher values give
                more weight to the most recently measured step.
        """
        super().__init__()
        self._alpha = alpha
        # task id -> [ema (seconds per step, None until a step was measured),
        #             completed count at the last sample, time of the last sample]
        self._samples: dict = {}

    def render(self, task) -> Text:
        """Return the remaining-time text for ``task``.

        Args:
            task: The Rich task being rendered.

        Returns:
            ``--:--:--`` while no estimate is possible (unknown or zero total,
            or nothing completed yet), ``00:00:00`` once the task is complete,
            otherwise the estimate formatted as ``H:MM:SS``.
        """
        now = task.get_time()

        sample = self._samples.get(task.id)
        if sample is None or task.completed < sample[1]:
            # First time this task is seen, or the task was reset: start over
            # instead of measuring against a stale counter.
            sample = self._samples[task.id] = [None, task.completed, now]

        if task.total is None or task.completed == 0 or task.total == 0:
            return Text("--:--:--", style="progress.remaining")

        if task.completed >= task.total:
            return Text("00:00:00", style="progress.remaining")

        ema, last_completed, last_time = sample
        if task.completed > last_completed:
            # Several steps may have finished since the last render, so spread
            # the elapsed time over all of them.
            per_step = (now - last_time) / (task.completed - last_completed)
            if ema is None:
                # Seed with the first measurement; starting from 0.0 would
                # underestimate the first displayed values by roughly 1/alpha.
                ema = per_step
            else:
                ema = self._alpha * per_step + (1 - self._alpha) * ema
            last_completed, last_time = task.completed, now
            sample[:] = [ema, last_completed, last_time]

        remaining_steps = task.total - last_completed
        elapsed_current = now - last_time
        if ema is None:
            # No finished step measured yet: the running step is all we know.
            virtual_ema = elapsed_current
        else:
            virtual_ema = self._alpha * elapsed_current + (1 - self._alpha) * ema

        # Clamp so an unexpected counter state can never display a negative time.
        remaining_secs = max(
            0.0, elapsed_current + (remaining_steps - 1) * virtual_ema
        )

        hours, rem = divmod(int(remaining_secs), 3600)
        minutes, seconds = divmod(rem, 60)

        return Text(f"{hours}:{minutes:02}:{seconds:02}", style="progress.remaining")
53
+
54
+
55
class ConsoleUI:
    """Helper class for displaying configuration and extraction summaries using Rich."""

    def __init__(self, console: Console) -> None:
        """Initialize the Rich console used by the UI helper."""
        self.console = console

    def _add_row_if_exists(
        self, table: Table, label: str, value: Any, suffix: str = ""
    ) -> None:
        """Add a ``label``/``value`` row to ``table`` unless ``value`` is None.

        Args:
            table: Table receiving the row.
            label: Text of the first column.
            value: Value to display; the row is skipped when it is None.
            suffix: Text appended to the rendered value (for example a unit).
        """
        if value is not None:
            table.add_row(label, f"{value}{suffix}")

    def _format_threshold(self, value: Any, is_lower_bound: bool = False) -> str:
        """Format a threshold for display.

        Args:
            value: Configured threshold, possibly None or infinite.
            is_lower_bound: When True, a value of 0 is reported as disabled.

        Returns:
            ``"Disabled"`` for None (and for 0 on a lower bound),
            ``"Disabled (No limit)"`` for infinity, otherwise ``str(value)``.
        """
        if value is None:
            return "Disabled"
        if value == float("inf"):
            return "Disabled (No limit)"
        if is_lower_bound and value == 0:
            return "Disabled"
        return str(value)

    def _config_rows(
        self,
        cfg: dict,
        excluded_nodes_display: list[str],
        excluded_properties_display: list[str],
    ) -> list[tuple[str, Any, str]]:
        """Collect the configuration rows as ``(label, value, suffix)`` tuples.

        A value of None means "not provided"; such rows are dropped when the
        table is built. Sections that are missing or explicitly empty (None)
        are treated as empty dictionaries.
        """
        data_config = cfg.get("data") or {}
        client_config = cfg.get("client") or {}
        extraction_config = cfg.get("extraction") or {}
        densification_config = cfg.get("densification") or {}
        filters_config = cfg.get("graph_filters") or {}
        debug_config = cfg.get("debug") or {}

        include_uris = filters_config.get("include_uri_prefixes", [])
        exclude_uris = filters_config.get("exclude_uri_prefixes", [])
        namespaces = "\n".join(
            f"{prefix}: {uri}"
            for prefix, uri in (filters_config.get("namespaces") or {}).items()
        )

        return [
            # Data settings
            ("Input path", data_config.get("input_path"), ""),
            ("Output format", data_config.get("output_format"), ""),
            ("Output path", data_config.get("output_path"), ""),
            # Client settings
            ("Client Type", client_config.get("type"), ""),
            ("Endpoint", client_config.get("endpoint"), ""),
            ("Database Path", client_config.get("path"), ""),
            ("User-Agent", client_config.get("user_agent"), ""),
            ("Request delay", client_config.get("request_delay"), "s"),
            ("Retry attempts", client_config.get("retry_attempts"), ""),
            ("Retry delay", client_config.get("retry_delay"), "s"),
            ("Rate limit wait", client_config.get("rate_limit_wait"), "s"),
            ("Timeout", client_config.get("timeout"), "s"),
            # Extraction settings
            ("Strategy", extraction_config.get("strategy"), ""),
            ("Batch size", extraction_config.get("batch_size"), ""),
            (
                "Max neighbors threshold",
                self._format_threshold(
                    extraction_config.get("max_neighbors_threshold")
                ),
                "",
            ),
            (
                "Max hops limit",
                self._format_threshold(extraction_config.get("max_hops")),
                "",
            ),
            (
                "Hub pairs batch size",
                extraction_config.get("hub_pairs_batch_size"),
                "",
            ),
            (
                "Hub pagination threshold",
                self._format_threshold(
                    extraction_config.get("hub_pagination_threshold")
                ),
                "",
            ),
            (
                "Min triplets per property",
                self._format_threshold(
                    extraction_config.get("min_triplets_per_property"),
                    is_lower_bound=True,
                ),
                "",
            ),
            # The raw values are passed on purpose: wrapping them in str()
            # would turn a missing option into the text "None" and the row
            # would never be skipped.
            (
                "Check seeds validity",
                extraction_config.get("check_seeds_validity"),
                "",
            ),
            # Hub seeds settings
            ("Check hub seeds", extraction_config.get("check_hub_seeds"), ""),
            ("Keep hub seeds", extraction_config.get("keep_hub_seeds"), ""),
            # Densification settings
            (
                "Skip densification",
                densification_config.get("skip_densification", False),
                "",
            ),
            ("Densification mode", densification_config.get("mode"), ""),
            # Filters settings
            (
                "Included URI prefixes",
                "\n".join(include_uris) if include_uris else "None",
                "",
            ),
            (
                "Excluded URI prefixes",
                "\n".join(exclude_uris) if exclude_uris else "None",
                "",
            ),
            ("Excluded nodes", ", ".join(excluded_nodes_display) or "None", ""),
            (
                "Excluded properties",
                ", ".join(excluded_properties_display) or "None",
                "",
            ),
            ("Loaded namespaces", namespaces if namespaces else "Default only", ""),
            # Debug settings
            ("Debug mode", debug_config.get("debug_enabled"), ""),
            ("Request logging", debug_config.get("request_logging"), ""),
        ]

    def print_config(
        self,
        cfg: dict,
        excluded_nodes_display: list[str],
        excluded_properties_display: list[str],
    ) -> None:
        """Display the configuration settings dynamically based on what is provided.

        Args:
            cfg: Configuration dictionary; the ``data``, ``client``,
                ``extraction``, ``densification``, ``graph_filters`` and
                ``debug`` sections are read.
            excluded_nodes_display: Display names of the excluded nodes.
            excluded_properties_display: Display names of the excluded properties.
        """
        table = Table(show_header=False, box=None, padding=(0, 2))
        table.add_column(style="bold cyan")
        table.add_column(style="white")

        rows = self._config_rows(
            cfg, excluded_nodes_display, excluded_properties_display
        )
        for label, value, suffix in rows:
            self._add_row_if_exists(table, label, value, suffix)

        self.console.print(
            Panel(table, title="[bold green]Configuration[/]", expand=False)
        )

    def print_summary(
        self,
        found_stats: Mapping[str, int],
        stats: Mapping[str, int],
        graph: nx.Graph,
        table_title: str = "Extraction Summary",
    ) -> None:
        """Display a dynamic summary of extraction results.

        Args:
            found_stats: Statistics for found triplets. Keys containing
                ``not``, ``fail`` or ``error`` are shown in yellow, all
                others in green.
            stats: Aggregated extraction counters (dynamically rendered).
            graph: In-memory graph built during extraction.
            table_title: Title for the summary table.
        """
        table = Table(show_header=False, box=None, padding=(0, 2))
        table.add_column(style="bold cyan")
        table.add_column(style="white")

        for key, value in found_stats.items():
            display_key = key.replace("_", " ").capitalize()
            lowered = key.lower()
            if "not" in lowered or "fail" in lowered or "error" in lowered:
                color = "yellow"
            else:
                color = "green"
            table.add_row(display_key, f"[{color}]{value}[/]")

        for key, value in stats.items():
            display_key = key.replace("_", " ").capitalize()
            table.add_row(display_key, f"{value}")

        table.add_section()
        table.add_row("Graph nodes", str(len(graph.nodes)))
        table.add_row("Graph edges", str(len(graph.edges)))

        self.console.print(
            Panel(table, title=f"[bold green]{table_title}[/]", expand=False)
        )

    def create_progress_bar(self) -> Progress:
        """Create a Rich progress bar for tracking extraction progress."""
        return Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            MofNCompleteColumn(),
            TextColumn("•"),
            TimeElapsedColumn(),
            TextColumn("•"),
            SimplifiedEMAColumn(),
            console=self.console,
            refresh_per_second=1,
        )
@@ -0,0 +1,64 @@
1
+ from graph_seeder.wrapper.NeighborhoodWrapper import NeighborhoodWrapper
2
+ from graph_seeder.wrapper.sparql.GraphWrapper import GraphWrapper
3
+ from graph_seeder.wrapper.sparql.client.SparqlClient import SparqlClient
4
+ from graph_seeder.wrapper.sparql.client.TurtleClient import TurtleClient
5
+ from graph_seeder.wrapper.hashmap.HashMapWrapper import HashMapWrapper
6
+ from graph_seeder.extraction.BFS.BFS import BidirectionalBFS
7
+ from graph_seeder.extraction.Hop.HopExpansion import HopExpansion
8
+
9
+
10
class ComponentFactory:
    """Build clients, wrappers and extraction strategies from a configuration dict."""

    @staticmethod
    def create_client(config: dict):
        """Instantiate the client selected by ``config["client"]["type"]``.

        The type is matched case-insensitively and defaults to ``"sparql"``.

        Raises:
            ValueError: If the client type is not registered.
        """
        client_type = config.get("client", {}).get("type", "sparql").lower()

        client_classes = {
            "sparql": SparqlClient,
            "turtle": TurtleClient,
        }

        try:
            client_class = client_classes[client_type]
        except KeyError:
            raise ValueError(f"Unknown client type: {client_type}") from None

        return client_class(config)

    @staticmethod
    def create_wrapper(uri_manager, config: dict, client=None) -> NeighborhoodWrapper:
        """Instantiate the wrapper selected by ``config["client"]["type"]``.

        ``GraphWrapper`` receives a client: the one passed in, or a new one
        built with :meth:`create_client` when ``client`` is None. Other
        wrappers are built from ``uri_manager`` and ``config`` only.

        Raises:
            ValueError: If the wrapper type is not registered.
        """
        wrapper_type = config.get("client", {}).get("type", "sparql").lower()

        wrapper_classes = {
            "sparql": GraphWrapper,
            "turtle": GraphWrapper,
            "hashmap": HashMapWrapper,
        }

        try:
            wrapper_class = wrapper_classes[wrapper_type]
        except KeyError:
            raise ValueError(f"Unknown wrapper type: {wrapper_type}") from None

        if wrapper_class != GraphWrapper:
            return wrapper_class(uri_manager, config)

        if client is None:
            client = ComponentFactory.create_client(config)
        return wrapper_class(uri_manager, config, client=client)

    @staticmethod
    def create_strategy(wrapper, uri_manager, config: dict):
        """Instantiate the strategy selected by ``config["extraction"]["strategy"]``.

        The name is matched case-insensitively and defaults to ``"bfs"``.

        Raises:
            ValueError: If the strategy is not registered; the message lists
                the valid options.
        """
        strategy_type = config.get("extraction", {}).get("strategy", "bfs").lower()

        strategy_classes = {
            "bfs": BidirectionalBFS,
            "hop": HopExpansion,
        }

        strategy_class = strategy_classes.get(strategy_type)
        if strategy_class is None:
            raise ValueError(
                f"Unknown strategy type: '{strategy_type}'. Valid options: {list(strategy_classes)}"
            )

        return strategy_class(wrapper, uri_manager, config)
@@ -0,0 +1,84 @@
1
+ import csv
2
+ import json
3
+ import logging
4
+ import pickle
5
+ from pathlib import Path
6
+
7
+ import networkx as nx
8
+ from rdflib import Graph, Namespace, URIRef
9
+
10
+ # Module-level logger; it uses the fixed name "subgraph" rather than __name__.
+ logger = logging.getLogger("subgraph")
11
+
12
+
13
class GraphExporter:
    """Export triplets and graphs to disk in various formats."""

    def __init__(self, data_cfg: dict) -> None:
        """Read the export settings and make sure the target directory exists.

        Args:
            data_cfg: Data section of the configuration. ``output_format``
                selects the triplet format and ``output_path`` (default
                ``"."``) is the base path of the written files.
        """
        configured_path = data_cfg.get("output_path", ".")
        self.output_format = data_cfg.get("output_format")
        self.output_path = Path(configured_path).resolve()
        self.output_path.parent.mkdir(parents=True, exist_ok=True)

    def _sibling_path(self, name_suffix: str, extension: str) -> Path:
        """Return ``<stem><name_suffix>.<extension>`` next to the output path."""
        return self.output_path.with_name(
            f"{self.output_path.stem}{name_suffix}.{extension}"
        )

    def save_triplets(
        self,
        triplets: list[tuple[str, str, str]],
        namespaces: dict[str, str],
        name_suffix: str = "",
    ) -> None:
        """Write triplets to disk using the configured output format.

        Args:
            triplets: Sequence of ``(subject, predicate, object)`` identifiers.
            namespaces: Dictionary mapping namespace prefixes to URIs; only
                used by the ``ttl`` format.
            name_suffix: Optional suffix for the output file name (before extension).

        Raises:
            ValueError: If the configured output format is not supported.
        """
        fmt = self.output_format
        path = self._sibling_path(name_suffix, fmt)

        if fmt == "csv":
            with open(path, "w", newline="", encoding="utf-8") as handle:
                csv_writer = csv.writer(handle)
                csv_writer.writerow(["subject", "predicate", "object"])
                csv_writer.writerows(triplets)
        elif fmt == "json":
            records = []
            for s, p, o in triplets:
                records.append({"subject": s, "predicate": p, "object": o})
            with open(path, "w", encoding="utf-8") as handle:
                json.dump(records, handle, indent=2)
        elif fmt == "ttl":
            rdf_graph = Graph()
            for prefix, uri in namespaces.items():
                rdf_graph.bind(prefix, Namespace(uri))
            # Every term, including the object, is written as a URI reference.
            for s, p, o in triplets:
                rdf_graph.add((URIRef(s), URIRef(p), URIRef(o)))
            rdf_graph.serialize(destination=str(path), format="turtle")
        else:
            raise ValueError(f"Unsupported output format: {fmt!r}")

        logger.info(f"Triplets saved → [bold]{path}[/]")

    def save_graph(self, graph: nx.MultiGraph, name_suffix: str = "") -> None:
        """Pickle a NetworkX graph to a ``.gpickle`` file next to the output path.

        Args:
            graph: In-memory graph built during extraction.
            name_suffix: Optional suffix for the output file name (before extension).
        """
        graph_path = self._sibling_path(name_suffix, "gpickle")
        with open(graph_path, "wb") as handle:
            pickle.dump(graph, handle, protocol=pickle.HIGHEST_PROTOCOL)
        logger.info(f"Graph saved → [bold]{graph_path}[/]\n")
@@ -0,0 +1,32 @@
1
+ import networkx as nx
2
+
3
+
4
class GraphStatistics:
    """Utility class for computing statistics on a graph."""

    @staticmethod
    def compute(triplets: list[tuple[str, str, str]]) -> dict:
        """Compute summary statistics for a list of triplets.

        Connected components are counted on an undirected graph that links
        the subject and object of every triplet.

        Args:
            triplets: ``(subject, predicate, object)`` tuples.

        Returns:
            A dict with the keys ``total_triplets``, ``unique_subjects``,
            ``unique_predicates``, ``unique_objects``, ``unique_entities``
            and ``connected_components``.
        """
        subjects = {s for s, _, _ in triplets}
        predicates = {p for _, p, _ in triplets}
        objects = {o for _, _, o in triplets}

        entity_graph = nx.Graph()
        entity_graph.add_edges_from((s, o) for s, _, o in triplets)

        return {
            "total_triplets": len(triplets),
            "unique_subjects": len(subjects),
            "unique_predicates": len(predicates),
            "unique_objects": len(objects),
            "unique_entities": len(subjects.union(objects)),
            "connected_components": nx.number_connected_components(entity_graph),
        }
@@ -0,0 +1,95 @@
1
+ import json
2
+ import re
3
+ import time
4
+ from pathlib import Path
5
+ from typing import Any
6
+ import requests
7
+ import urllib3
8
+ from importlib import resources
9
+
10
+ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
11
+
12
+ # Location of the cached prefix context, under the user's home directory.
+ CACHE_DIR = Path.home() / ".cache" / "graph_seeder"
13
+ CACHE_FILE = CACHE_DIR / "context.json"
14
+ # Maximum age, in seconds, for which the cached file is reused.
+ CACHE_TTL = 60 * 60 * 24 * 7 # 1 week
15
+
16
+
17
+ class URIManager:
18
+ """Utility class to manage URI compression and namespace resolution."""
19
+
20
+ def __init__(self, custom_namespaces: dict[str, str] = None):
21
+ self.namespaces = custom_namespaces or {}
22
+
23
+ self._update_sorted_namespaces()
24
+
25
+ context_data = self._get_cached_or_fresh_context()
26
+ if not context_data:
27
+ try:
28
+ fallback_path = resources.files("graph_seeder.utils").joinpath(
29
+ "context.json"
30
+ )
31
+ with fallback_path.open("r", encoding="utf-8") as f:
32
+ context_data = json.load(f)
33
+ except Exception:
34
+ context_data = {}
35
+
36
+ self.context = context_data.get("@context", {})
37
+
38
+ def _update_sorted_namespaces(self):
39
+ self.sorted_namespaces = sorted(
40
+ self.namespaces.items(),
41
+ key=lambda item: len(item[1]),
42
+ reverse=True,
43
+ )
44
+
45
+ def _get_context_fallback_prefix(
46
+ self, uri: str
47
+ ) -> tuple[Any, Any] | tuple[None, None]:
48
+ """Return the prefix and namespace for a given URI from the context."""
49
+ for prefix, namespace in self.context.items():
50
+ if uri.startswith(namespace):
51
+ return prefix, namespace
52
+ return None, None
53
+
54
+ def compress_uri(self, full_uri: str) -> str:
55
+ """Compress a full URI into a prefix:local_id format if possible."""
56
+ full_uri = full_uri.strip()
57
+
58
+ for prefix, namespace_uri in self.sorted_namespaces:
59
+ if full_uri.startswith(namespace_uri):
60
+ local_id = full_uri[len(namespace_uri) :]
61
+ if re.match(r"^[\w\-\.:]+$", local_id):
62
+ return f"{prefix}:{local_id}"
63
+
64
+ prefix, namespace = self._get_context_fallback_prefix(full_uri)
65
+ if prefix:
66
+ local_id = full_uri[len(namespace) :]
67
+ if re.match(r"^[\w\-\.:]+$", local_id):
68
+ if prefix not in self.namespaces:
69
+ self.namespaces[prefix] = namespace
70
+ self._update_sorted_namespaces()
71
+ return f"{prefix}:{local_id}"
72
+
73
+ return f"<{full_uri}>"
74
+
75
+ def _get_cached_or_fresh_context(self) -> dict | None:
76
+ if CACHE_FILE.exists():
77
+ file_age = time.time() - CACHE_FILE.stat().st_mtime
78
+ if file_age < CACHE_TTL:
79
+ try:
80
+ with open(CACHE_FILE, "r") as f:
81
+ return json.load(f)
82
+ except json.JSONDecodeError:
83
+ pass
84
+ try:
85
+ response = requests.get(
86
+ "https://prefix.cc/context", verify=False, timeout=5
87
+ )
88
+ response.raise_for_status()
89
+ context = response.json()
90
+ CACHE_DIR.mkdir(parents=True, exist_ok=True)
91
+ with open(CACHE_FILE, "w") as f:
92
+ json.dump(context, f)
93
+ return context
94
+ except (requests.RequestException, json.JSONDecodeError):
95
+ return None