graph-seeder 1.0.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,47 @@
1
+ from graph_seeder.SubgraphExtractor import SubgraphExtractor
2
+ from graph_seeder.utils.utils import OVERRIDE_MAP
3
+ import argparse
4
+
5
+
6
def _describe_type(expected_type):
    """Return a short, human-readable label for a type or a tuple of types."""
    if isinstance(expected_type, tuple):
        return " | ".join(_describe_type(member) for member in expected_type)
    # Fall back to str() for objects that do not expose __name__.
    return getattr(expected_type, "__name__", str(expected_type))


def main(argv=None):
    """Run the graph-seeder command-line interface.

    Builds an argument parser with a ``--config`` option plus one option per
    entry of ``OVERRIDE_MAP`` (grouped by config section), then passes every
    explicitly provided value to ``SubgraphExtractor`` and runs it.

    Args:
        argv: Optional list of argument strings to parse. When ``None`` (the
            default) argparse reads ``sys.argv[1:]``, so existing callers and
            console-script entry points are unaffected.
    """
    parser = argparse.ArgumentParser(
        description="Extract subgraph paths from Knowledge Graphs."
    )
    parser.add_argument(
        "--config", default=None, help="Built-in template name OR path to a JSON file."
    )

    # One argparse group per config section, created on first use.
    groups = {}

    for key, mapping in OVERRIDE_MAP.items():
        # Each mapping starts with (section, expected type, description).
        section, expected_type, description = mapping[:3]

        if section not in groups:
            groups[section] = parser.add_argument_group(
                f"{section.replace('_', ' ').title()} Options"
            )

        kwargs = {
            "dest": key,
            "help": f"{description} (Type: {_describe_type(expected_type)})",
        }

        if isinstance(expected_type, tuple) or expected_type is list:
            # Multi-valued options are collected as a list of strings.
            kwargs["type"] = str
            kwargs["nargs"] = "+"
        elif expected_type is bool:
            # Generates both --flag and --no-flag; stays None when omitted.
            kwargs["action"] = argparse.BooleanOptionalAction
        else:
            kwargs["type"] = expected_type

        groups[section].add_argument(f"--{key.replace('_', '-')}", **kwargs)

    args = parser.parse_args(argv)

    # Only forward options the user actually supplied; None means "not given".
    overrides = {
        name: value
        for name, value in vars(args).items()
        if name != "config" and value is not None
    }

    extractor = SubgraphExtractor(config_path=args.config, **overrides)
    extractor.run()


if __name__ == "__main__":
    main()
@@ -0,0 +1,377 @@
1
+ import logging
2
+ from pathlib import Path
3
+ import pandas as pd
4
+ import networkx as nx
5
+ from rich.console import Console
6
+ from rich.logging import RichHandler
7
+ from rich.prompt import Confirm
8
+ from graph_seeder.extraction.ExtractionStrategy import ExtractionStrategy
9
+ from graph_seeder.densification.GraphConnector import GraphConnector
10
+ from graph_seeder.utils.Factory import ComponentFactory
11
+ from graph_seeder.utils.ConsoleUI import ConsoleUI
12
+ from graph_seeder.utils.GraphExporter import GraphExporter
13
+ from graph_seeder.utils.URIManager import URIManager
14
+ from graph_seeder.utils.GraphStatistics import GraphStatistics
15
+ from graph_seeder.utils.utils import get_connected_components, load_config
16
+
17
# Console shared by the log handler below and by the UI helpers of this module.
console = Console(emoji=False)

# Logging: route records through Rich so log messages can carry console
# markup (e.g. "[red]Error:[/]"); the format is the bare message only.
logging.basicConfig(
    handlers=[
        RichHandler(show_path=False, markup=True, rich_tracebacks=True, console=console)
    ],
    format="%(message)s",
    level=logging.INFO,
)
logger = logging.getLogger("subgraph")
28
+
29
+
30
class SubgraphExtractor:
    """Extract subgraphs connecting seed nodes based on a specified strategy and export the results.

    The extractor loads and validates a configuration, builds its collaborating
    components through ``ComponentFactory`` and exposes ``run`` as the
    end-to-end entry point (read seeds, optional checks, extraction, optional
    densification).
    """

    def __init__(self, config_path: str = None, **kwargs) -> None:
        """Initialize the extractor and all collaborating components.

        Args:
            config_path: Built-in template name or path to a JSON file;
                forwarded to ``load_config``.
            **kwargs: Configuration overrides, forwarded to ``load_config``.

        Raises:
            ValueError: If the loaded configuration fails validation.
        """
        self.cfg = load_config(config_path, kwargs)

        self._validate_and_clean_config()

        if self.cfg["debug"]["debug_enabled"]:
            logger.setLevel(logging.DEBUG)
            logger.debug("Debug mode enabled.")

        graph_filters = self.cfg["graph_filters"]
        self.uri_manager = URIManager(graph_filters.get("namespaces", {}))

        self.ui = ConsoleUI(console)
        self.exporter = GraphExporter(self.cfg["data"])

        # Compressed forms are only used for displaying the configuration.
        excluded_properties_display = [
            self.uri_manager.compress_uri(prop)
            for prop in graph_filters["exclude_properties"]
        ]
        excluded_nodes_display = [
            self.uri_manager.compress_uri(node)
            for node in graph_filters["exclude_nodes"]
        ]

        self.ui.print_config(
            self.cfg,
            excluded_nodes_display,
            excluded_properties_display,
        )

        # A missing or null client type falls back to "sparql", matching the
        # default applied during validation.
        client_type = str(self.cfg.get("client", {}).get("type") or "sparql").lower()
        client = None
        if client_type in ["sparql", "turtle"]:
            client = ComponentFactory.create_client(self.cfg)

        self.wrapper = ComponentFactory.create_wrapper(
            self.uri_manager, self.cfg, client
        )

        self.extractor_strategy: ExtractionStrategy = ComponentFactory.create_strategy(
            self.wrapper, self.uri_manager, self.cfg
        )

        extraction_cfg = self.cfg["extraction"]
        self.check_seeds_validity = extraction_cfg.get("check_seeds_validity", True)
        self.check_hub_seeds = extraction_cfg.get("check_hub_seeds", True)
        # None means "ask the user interactively" (see _handle_hub_seeds).
        self.keep_hub_seeds = extraction_cfg.get("keep_hub_seeds", None)
        self.max_neighbors_threshold = self.wrapper.max_neighbors_threshold

        self.skip_densification = self.cfg.get("densification", {}).get(
            "skip_densification", False
        )

        # Per-row counters: rows that yielded triplets vs. rows that did not.
        self.stats = {
            "found": 0,
            "not_found": 0,
        }

    @staticmethod
    def _check_number(
        section: str, param: str, val, *, minimum: float, requirement: str
    ) -> None:
        """Raise ``ValueError`` unless ``val`` is ``None`` or a number >= ``minimum``."""
        if val is None:
            return
        # bool is a subclass of int; reject it explicitly so `true` in the
        # config is not silently accepted as the number 1.
        if isinstance(val, bool) or not isinstance(val, (int, float)):
            raise ValueError(f"{section} parameter '{param}' must be a number.")
        if val < minimum:
            raise ValueError(f"{section} parameter '{param}' must be {requirement}.")

    def _validate_and_clean_config(self) -> None:
        """Validate config values and apply defaults where necessary.

        Raises:
            ValueError: If a required value is missing or a value is invalid.
        """

        # Validate data config
        data_cfg = self.cfg.get("data", {})
        if not data_cfg.get("input_path"):
            raise ValueError("Data config requires an 'input_path' to a CSV file.")

        if data_cfg.get("output_format") not in ["csv", "json", "ttl"]:
            raise ValueError(
                "Data config 'output_format' must be one of: 'csv', 'json', 'ttl'."
            )

        # Validate client config
        client_cfg = self.cfg.get("client", {})
        # Same default as __init__: a missing/null type means "sparql".
        client_type = str(client_cfg.get("type") or "sparql").lower()
        if client_type == "sparql":
            if not client_cfg.get("endpoint"):
                raise ValueError(
                    "SPARQL client requires an 'endpoint' URL in the config."
                )

            user_agent = client_cfg.get("user_agent") or ""
            if not user_agent.strip() or "YOUR_PROJECT_NAME" in user_agent:
                logger.warning(
                    "SPARQL user agent has not been set or is using the default "
                    "placeholder. Note that this may lead to some endpoints (e.g. "
                    "Wikidata) blocking your requests or reducing your rate limits. "
                    'Set a custom user agent (e.g. "your-project/1.0 '
                    '(your-email@example.com)") in the config to avoid this issue '
                    "and to help endpoint operators identify your traffic."
                )

        client_parameters = [
            "request_delay",
            "retry_attempts",
            "retry_delay",
            "rate_limit_wait",
            "timeout",
        ]
        for param in client_parameters:
            self._check_number(
                "Client",
                param,
                client_cfg.get(param),
                minimum=0,
                requirement="non-negative",
            )

        # Validate extraction config
        extraction_cfg = self.cfg.get("extraction", {})

        # Writing through extraction_cfg updates self.cfg when the section
        # exists and cannot raise KeyError when it does not (the strategy
        # check below then reports the problem).
        if extraction_cfg.get("batch_size") is None:
            extraction_cfg["batch_size"] = 15

        extraction_parameters = [
            "batch_size",
            "max_hops",
            "hub_pagination_threshold",
            "max_neighbors_threshold",
            "min_triplets_per_property",
        ]
        for param in extraction_parameters:
            self._check_number(
                "Extraction",
                param,
                extraction_cfg.get(param),
                minimum=1,
                requirement="greater than 0",
            )

        strategy = str(extraction_cfg.get("strategy") or "").lower()
        available_strategies = ["bfs", "hop"]
        if strategy not in available_strategies:
            raise ValueError(
                f"Extraction strategy '{strategy}' is not supported. Available strategies are: {available_strategies}."
            )

    def _check_seeds_validity(self, seeds: list[str]) -> list[str]:
        """Check if the given seeds are valid and return a list of invalid seeds."""
        logger.info(f"Checking validity of {len(seeds)} unique seed nodes...")
        validity_dict = self.wrapper.check_seeds_validity(seeds)
        return [seed for seed, is_valid in validity_dict.items() if not is_valid]

    def _handle_hub_seeds(
        self, seeds: list[list[str]], unique_seeds: list[str]
    ) -> list[list[str]]:
        """Check if any seed nodes exceed the max neighbors threshold and ask the user how to handle them.

        Args:
            seeds: Seed rows as read from the input CSV.
            unique_seeds: De-duplicated, stripped seed identifiers.

        Returns:
            The rows to extract: all rows when no hub is found or hubs are
            kept, the rows without any hub seed when hubs are dropped, or an
            empty list when the neighbor count could not be obtained.
        """
        logger.info(
            f"Checking number of neighbors for {len(unique_seeds)} unique seed nodes..."
        )
        # Only the wrapper call is guarded, so unrelated ValueErrors raised
        # further down are not misreported as invalid seeds.
        try:
            seed_totals = self.wrapper.count_neighbors(unique_seeds)
        except ValueError as e:
            faulty_nodes = e.args[0] if e.args else []
            if isinstance(faulty_nodes, str):
                # A plain message must be reported once, not per character.
                faulty_nodes = [faulty_nodes]

            logger.error(
                "[red]Error:[/] The following seed nodes could not be checked for neighbors (most likely because they are not valid URIs): "
            )
            for node in faulty_nodes:
                logger.error(f" . '{node}'")

            return []

        hub_seeds = {
            seed: total
            for seed, total in seed_totals.items()
            if total >= self.max_neighbors_threshold
        }

        if not hub_seeds:
            logger.info("No seed nodes exceed the max neighbors threshold.\n")
            return seeds

        logger.warning(
            f"Found {len(hub_seeds)} seed(s) that are massive hubs "
            f"(>={self.max_neighbors_threshold} neighbors):"
        )

        for hub, total in hub_seeds.items():
            logger.warning(
                f" - {self.uri_manager.compress_uri(hub)} : {total} neighbors"
            )

        if self.keep_hub_seeds is None:
            # No preference configured: ask once and remember the answer.
            self.keep_hub_seeds = Confirm.ask(
                "Do you want to keep these hubs ? (Answering 'yes' will force pagination for these nodes, which may increase extraction time)"
            )

        if self.keep_hub_seeds:
            self.wrapper.forced_hubs.update(hub_seeds.keys())
            logger.info(
                "Pagination forced for seed hubs. They will not be skipped during extraction.\n"
            )
            return seeds

        logger.info(
            "These hubs will be skipped during extraction, which may lead to missing paths but will speed up the process.\n"
        )
        # Hub keys are stripped strings, so normalise each cell the same way
        # before the membership test.
        return [
            row
            for row in seeds
            if not any(str(cell).strip() in hub_seeds for cell in row)
        ]

    def extract_subgraph(
        self, seeds: list[str]
    ) -> tuple[list[tuple[str, str, str]], set[str]]:
        """Run the extraction strategy on every seed row and export the result.

        Args:
            seeds: Seed rows; missing (NaN) and blank cells are ignored.

        Returns:
            A tuple ``(all_triplets, seeds_found)`` where ``seeds_found`` holds
            the cleaned nodes of every row for which triplets were returned.
        """
        all_triplets: list[tuple[str, str, str]] = []
        seeds_found: set[str] = set()

        with self.ui.create_progress_bar() as progress:
            task = progress.add_task("Processing rows", total=len(seeds))
            for row in seeds:
                clean_nodes = [
                    str(n).strip() for n in row if pd.notna(n) and str(n).strip()
                ]

                if not clean_nodes:
                    # Nothing usable in this row; still count it as processed.
                    progress.advance(task)
                    continue

                triplets = self.extractor_strategy.execute_task(
                    clean_nodes, progress, task
                )

                if not triplets:
                    self.stats["not_found"] += 1
                else:
                    all_triplets.extend(triplets)
                    seeds_found.update(clean_nodes)
                    self.stats["found"] += 1

        self.exporter.save_triplets(all_triplets, self.uri_manager.namespaces)
        self.exporter.save_graph(self.extractor_strategy.graph)

        logger.info("Computing final graph statistics...")

        self.print_final_summary(
            all_triplets, self.extractor_strategy.graph, "Extraction summary"
        )

        return all_triplets, seeds_found

    def densify_graph(
        self, triplets: list[tuple[str, str, str]], seeds_found: set[str]
    ) -> None:
        """Connect disconnected components of the extracted graph, if any.

        When the triplets form more than one connected component, the
        ``GraphConnector`` is asked to connect them and the result is saved
        with the ``_densified`` suffix and summarised.
        """
        graph_connector: GraphConnector = GraphConnector(
            self.wrapper,
            self.uri_manager,
            self.extractor_strategy.graph,
            self.ui,
            self.cfg,
        )
        components = get_connected_components(triplets)
        if len(components) <= 1:
            logger.info(
                "The extracted graph is already fully connected. No densification needed."
            )
            return

        logger.warning(
            f"The extracted graph has {len(components)} disconnected components. Starting densification to connect them...\n"
        )

        new_triplets = graph_connector.connect(seeds_found, triplets)

        logger.info(
            f"Found {len(new_triplets) - len(triplets)} new triplets during densification."
        )

        self.save(new_triplets, graph_connector.bfs.graph, name_suffix="_densified")

        self.print_final_summary(
            new_triplets, graph_connector.bfs.graph, "Densification summary"
        )

    def print_final_summary(
        self,
        all_triplets: list[tuple[str, str, str]],
        graph: "nx.MultiGraph",
        table_title: str,
    ) -> None:
        """Compute statistics for ``all_triplets`` and print them with the row counters."""
        detailed_stats = GraphStatistics.compute(all_triplets)
        self.ui.print_summary(self.stats, detailed_stats, graph, table_title)

    def save(
        self,
        triplets: list[tuple[str, str, str]],
        graph: "nx.MultiGraph",
        name_suffix: str = "",
    ) -> None:
        """Export the triplets and the graph, passing ``name_suffix`` to the exporter."""
        self.exporter.save_triplets(triplets, self.uri_manager.namespaces, name_suffix)
        self.exporter.save_graph(graph, name_suffix)

    def run(self) -> None:
        """Process all node seeds from a CSV file and export the results."""
        input_path = Path(self.cfg["data"]["input_path"]).resolve()

        # pandas treats the first CSV line as a header by default; a file with
        # no content at all raises EmptyDataError, handled as "no seeds".
        try:
            seeds = pd.read_csv(input_path).values.tolist()
        except pd.errors.EmptyDataError:
            seeds = []

        if not seeds:
            logger.error(
                "No seeds found in the input file. Please provide a valid CSV with seed nodes."
            )
            return

        # Same cleaning rule as extract_subgraph (skip NaN/blank cells), so
        # padding cells of ragged rows are never checked as the seed "nan".
        # dict.fromkeys de-duplicates while keeping first-seen order.
        unique_seeds = list(
            dict.fromkeys(
                str(cell).strip()
                for row in seeds
                for cell in row
                if pd.notna(cell) and str(cell).strip()
            )
        )

        if self.check_seeds_validity:
            invalid_seeds = self._check_seeds_validity(unique_seeds)
            if invalid_seeds:
                logger.error(
                    "[red]Error:[/] The following seed nodes are not valid (most likely because they are not valid URIs or do not exist in the graph): "
                )
                for node in invalid_seeds:
                    logger.error(f" . '{node}'")

                logger.error("Please fix the invalid seeds and try again.")
                return

            logger.info("All seeds are valid.\n")

        max_neighbors_threshold = self.wrapper.max_neighbors_threshold

        # An infinite threshold means no node can ever qualify as a hub.
        if max_neighbors_threshold < float("inf") and self.check_hub_seeds:
            seeds = self._handle_hub_seeds(seeds, unique_seeds)

            if not seeds:
                logger.error("Stopping execution due to errors in seed checking.")
                return

        all_triplets, seeds_found = self.extract_subgraph(seeds)

        if not all_triplets:
            # NOTE(review): extract_subgraph has already called the exporter
            # (with an empty triplet list) by this point — confirm whether
            # "without saving" is still accurate.
            logger.error("No triplets were extracted. Exiting without saving.")
            return

        if self.skip_densification:
            return

        self.densify_graph(all_triplets, seeds_found)
@@ -0,0 +1,59 @@
1
+ {
2
+ "data": {
3
+ "input_path": "seed.csv",
4
+ "output_format": "csv",
5
+ "output_path": "output/result"
6
+ },
7
+ "client": {
8
+ "type": "SPARQL",
9
+ "user_agent": "YOUR_PROJECT_NAME (contact: YOUR_EMAIL)",
10
+ "endpoint": "https://dbpedia.org/sparql",
11
+ "request_delay": 1,
12
+ "retry_attempts": 3,
13
+ "retry_delay": 3.0,
14
+ "rate_limit_wait": 60.0,
15
+ "timeout": 40.0
16
+ },
17
+ "graph_filters": {
18
+ "include_uri_prefixes": [
19
+ "http://dbpedia.org/resource/",
20
+ "http://xmlns.com/foaf/0.1/"
21
+ ],
22
+ "exclude_uri_prefixes": [
23
+ "http://dbpedia.org/resource/Category:",
24
+ "http://dbpedia.org/resource/Template:"
25
+ ],
26
+ "exclude_nodes": [],
27
+ "exclude_properties": [
28
+ "wikiPageUsesTemplate"
29
+ ],
30
+ "namespaces": {
31
+ "dbr": "http://dbpedia.org/resource/",
32
+ "dbo": "http://dbpedia.org/ontology/",
33
+ "dbp": "http://dbpedia.org/property/",
34
+ "foaf": "http://xmlns.com/foaf/0.1/",
35
+ "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
36
+ "rdfs": "http://www.w3.org/2000/01/rdf-schema#"
37
+ }
38
+ },
39
+ "extraction": {
40
+ "strategy": "bfs",
41
+ "batch_size": 15,
42
+ "max_hops": 6,
43
+ "hub_pagination_threshold": 70000,
44
+ "max_neighbors_threshold": 300000,
45
+ "hub_pairs_batch_size": 100,
46
+ "min_triplets_per_property": 2,
47
+ "check_seeds_validity": true,
48
+ "check_hub_seeds": false,
49
+ "keep_hub_seeds": null
50
+ },
51
+ "densification": {
52
+ "mode": "most_connected",
53
+ "skip_densification": false
54
+ },
55
+ "debug": {
56
+ "debug_enabled": false,
57
+ "request_logging": false
58
+ }
59
+ }
@@ -0,0 +1,47 @@
1
+ {
2
+ "data": {
3
+ "input_path": "seeds.csv",
4
+ "output_format": "csv",
5
+ "output_path": "output"
6
+ },
7
+ "client": {
8
+ "type": "SPARQL",
9
+ "user_agent": "graph-seeder-default (contact: unknown)",
10
+ "endpoint": "http://localhost:7200/repositories/my-repo",
11
+ "request_delay": 0.5,
12
+ "retry_attempts": 3,
13
+ "retry_delay": 2.0,
14
+ "rate_limit_wait": 30.0,
15
+ "timeout": 30.0
16
+ },
17
+ "graph_filters": {
18
+ "include_uri_prefixes": [],
19
+ "exclude_uri_prefixes": [],
20
+ "exclude_nodes": [],
21
+ "exclude_properties": [],
22
+ "namespaces": {
23
+ "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
24
+ "rdfs": "http://www.w3.org/2000/01/rdf-schema#"
25
+ }
26
+ },
27
+ "extraction": {
28
+ "strategy": "bfs",
29
+ "batch_size": 15,
30
+ "max_hops": 6,
31
+ "hub_pagination_threshold": 50000,
32
+ "max_neighbors_threshold": 100000,
33
+ "hub_pairs_batch_size": 100,
34
+ "min_triplets_per_property": 2,
35
+ "check_seeds_validity": false,
36
+ "check_hub_seeds": false,
37
+ "keep_hub_seeds": null
38
+ },
39
+ "densification": {
40
+ "mode": "most_connected",
41
+ "skip_densification": false
42
+ },
43
+ "debug": {
44
+ "debug_enabled": false,
45
+ "request_logging": false
46
+ }
47
+ }
@@ -0,0 +1,50 @@
1
+ {
2
+ "data": {
3
+ "input_path": "seed.csv",
4
+ "output_format": "csv",
5
+ "output_path": "output/result"
6
+ },
7
+ "client": {
8
+ "type": "SPARQL",
9
+ "user_agent": "YOUR_PROJECT_NAME (contact: YOUR_EMAIL)",
10
+ "endpoint": "https://sparql.europeana.eu/",
11
+ "request_delay": 1,
12
+ "retry_attempts": 3,
13
+ "retry_delay": 3.0,
14
+ "rate_limit_wait": 60.0,
15
+ "timeout": 40.0
16
+ },
17
+ "graph_filters": {
18
+ "include_uri_prefixes": [],
19
+ "exclude_uri_prefixes": [],
20
+ "exclude_nodes": [],
21
+ "exclude_properties": [],
22
+ "namespaces": {
23
+ "edm": "http://www.europeana.eu/schemas/edm/",
24
+ "ore": "http://www.openarchives.org/ore/terms/",
25
+ "dc": "http://purl.org/dc/elements/1.1/",
26
+ "dcterms": "http://purl.org/dc/terms/",
27
+ "skos": "http://www.w3.org/2004/02/skos/core#"
28
+ }
29
+ },
30
+ "extraction": {
31
+ "strategy": "bfs",
32
+ "batch_size": 15,
33
+ "max_hops": 6,
34
+ "hub_pagination_threshold": 60000,
35
+ "max_neighbors_threshold": 150000,
36
+ "hub_pairs_batch_size": 100,
37
+ "min_triplets_per_property": 2,
38
+ "check_seeds_validity": false,
39
+ "check_hub_seeds": true,
40
+ "keep_hub_seeds": null
41
+ },
42
+ "densification": {
43
+ "mode": "most_connected",
44
+ "skip_densification": false
45
+ },
46
+ "debug": {
47
+ "debug_enabled": false,
48
+ "request_logging": false
49
+ }
50
+ }
@@ -0,0 +1,47 @@
1
+ {
2
+ "data": {
3
+ "input_path": "seed.csv",
4
+ "output_format": "csv",
5
+ "output_path": "output/result"
6
+ },
7
+ "client": {
8
+ "type": "SPARQL",
9
+ "user_agent": "YOUR_PROJECT_NAME (contact: YOUR_EMAIL)",
10
+ "endpoint": "https://pgxlod.loria.fr/sparql",
11
+ "request_delay": 1,
12
+ "retry_attempts": 3,
13
+ "retry_delay": 3.0,
14
+ "rate_limit_wait": 60.0,
15
+ "timeout": 40.0
16
+ },
17
+ "graph_filters": {
18
+ "include_uri_prefixes": [],
19
+ "exclude_uri_prefixes": [],
20
+ "exclude_nodes": [],
21
+ "exclude_properties": [],
22
+ "namespaces": {
23
+ "pharmgkb": "http://bio2rdf.org/pharmgkb",
24
+ "pgxlod": "http://pgxlod.loria.fr/resource/"
25
+ }
26
+ },
27
+ "extraction": {
28
+ "strategy": "bfs",
29
+ "batch_size": 15,
30
+ "max_hops": 6,
31
+ "hub_pagination_threshold": 60000,
32
+ "max_neighbors_threshold": 150000,
33
+ "hub_pairs_batch_size": 100,
34
+ "min_triplets_per_property": 2,
35
+ "check_seeds_validity": false,
36
+ "check_hub_seeds": true,
37
+ "keep_hub_seeds": null
38
+ },
39
+ "densification": {
40
+ "mode": "most_connected",
41
+ "skip_densification": false
42
+ },
43
+ "debug": {
44
+ "debug_enabled": false,
45
+ "request_logging": false
46
+ }
47
+ }