PyPI - graph-seeder - Versions diffs - 1.0.0.dev0__py3-none-any.whl - Mend

graph-seeder 1.0.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

graph_seeder/GraphSeeder.py +47 -0
graph_seeder/SubgraphExtractor.py +377 -0
graph_seeder/configs/dbpedia_default.json +59 -0
graph_seeder/configs/default.json +47 -0
graph_seeder/configs/europeana_default.json +50 -0
graph_seeder/configs/pgxlod_default.json +47 -0
graph_seeder/configs/wikidata_default.json +70 -0
graph_seeder/densification/GraphConnector.py +113 -0
graph_seeder/extraction/BFS/BFS.py +192 -0
graph_seeder/extraction/ExtractionStrategy.py +70 -0
graph_seeder/extraction/Hop/HopExpansion.py +92 -0
graph_seeder/utils/ConsoleUI.py +273 -0
graph_seeder/utils/Factory.py +64 -0
graph_seeder/utils/GraphExporter.py +84 -0
graph_seeder/utils/GraphStatistics.py +32 -0
graph_seeder/utils/URIManager.py +95 -0
graph_seeder/utils/utils.py +217 -0
graph_seeder/wrapper/NeighborhoodWrapper.py +47 -0
graph_seeder/wrapper/hashmap/HashMapWrapper.py +124 -0
graph_seeder/wrapper/sparql/BaseClient.py +23 -0
graph_seeder/wrapper/sparql/GraphWrapper.py +269 -0
graph_seeder/wrapper/sparql/SparqlQueryBuilder.py +175 -0
graph_seeder/wrapper/sparql/client/SparqlClient.py +118 -0
graph_seeder/wrapper/sparql/client/TurtleClient.py +47 -0
graph_seeder-1.0.0.dev0.dist-info/METADATA +191 -0
graph_seeder-1.0.0.dev0.dist-info/RECORD +28 -0
graph_seeder-1.0.0.dev0.dist-info/WHEEL +4 -0
graph_seeder-1.0.0.dev0.dist-info/entry_points.txt +2 -0

graph_seeder/GraphSeeder.py ADDED Viewed

@@ -0,0 +1,47 @@
+from graph_seeder.SubgraphExtractor import SubgraphExtractor
+from graph_seeder.utils.utils import OVERRIDE_MAP
+import argparse
+def main():
+    """Parse the command line and run one subgraph extraction.
+
+    Besides ``--config``, one option is generated for every entry of
+    ``OVERRIDE_MAP``. Every parsed option whose value is not ``None`` is
+    forwarded to ``SubgraphExtractor`` as a keyword override.
+    """
+    cli = argparse.ArgumentParser(
+        description="Extract subgraph paths from Knowledge Graphs."
+    )
+    cli.add_argument(
+        "--config", default=None, help="Built-in template name OR path to a JSON file."
+    )
+    # One argparse group per config section, created the first time it is seen.
+    section_groups = {}
+    for option, spec in OVERRIDE_MAP.items():
+        section, expected_type, description = spec[0], spec[1], spec[2]
+        group = section_groups.get(section)
+        if group is None:
+            heading = section.replace("_", " ").title()
+            group = cli.add_argument_group(f"{heading} Options")
+            section_groups[section] = group
+        flag = "--" + option.replace("_", "-")
+        help_text = f"{description} (Type: {expected_type})"
+        if isinstance(expected_type, tuple) or expected_type == list:
+            # Multi-valued settings are collected as a list of strings.
+            group.add_argument(
+                flag, dest=option, help=help_text, type=str, nargs="+"
+            )
+        elif expected_type is bool:
+            # Yields paired --flag / --no-flag switches.
+            group.add_argument(
+                flag,
+                dest=option,
+                help=help_text,
+                action=argparse.BooleanOptionalAction,
+            )
+        else:
+            group.add_argument(flag, dest=option, help=help_text, type=expected_type)
+    parsed = vars(cli.parse_args())
+    overrides = {}
+    for name, value in parsed.items():
+        # "config" is passed separately; None means the option was left unset.
+        if name == "config" or value is None:
+            continue
+        overrides[name] = value
+    SubgraphExtractor(config_path=parsed["config"], **overrides).run()
+if __name__ == "__main__":
+    main()

graph_seeder/SubgraphExtractor.py ADDED Viewed

@@ -0,0 +1,377 @@
+import logging
+from pathlib import Path
+import pandas as pd
+import networkx as nx
+from rich.console import Console
+from rich.logging import RichHandler
+from rich.prompt import Confirm
+from graph_seeder.extraction.ExtractionStrategy import ExtractionStrategy
+from graph_seeder.densification.GraphConnector import GraphConnector
+from graph_seeder.utils.Factory import ComponentFactory
+from graph_seeder.utils.ConsoleUI import ConsoleUI
+from graph_seeder.utils.GraphExporter import GraphExporter
+from graph_seeder.utils.URIManager import URIManager
+from graph_seeder.utils.GraphStatistics import GraphStatistics
+from graph_seeder.utils.utils import get_connected_components, load_config
+# Console shared by the logger and the UI helpers; emoji rendering is turned off.
+console = Console(emoji=False)
+# Send every log record through Rich so log output shares that console.
+logging.basicConfig(
+    handlers=[
+        RichHandler(show_path=False, markup=True, rich_tracebacks=True, console=console)
+    ],
+    format="%(message)s",
+    level=logging.INFO,
+)
+# Logger used throughout this module.
+logger = logging.getLogger("subgraph")
+class SubgraphExtractor:
+    """Extract subgraphs connecting seed nodes and export the results.
+
+    The configuration is loaded and validated at construction time. ``run``
+    then reads the seed CSV, optionally checks the seeds, extracts triplets
+    with the configured strategy and, unless disabled, densifies the graph.
+    """
+    # Values accepted for ``data.output_format``.
+    _OUTPUT_FORMATS = ("csv", "json", "ttl")
+    # Values accepted for ``extraction.strategy`` (compared case-insensitively).
+    _STRATEGIES = ["bfs", "hop"]
+    # Numeric ``client`` settings; each must be >= 0 when present.
+    _CLIENT_NUMERIC_PARAMS = (
+        "request_delay",
+        "retry_attempts",
+        "retry_delay",
+        "rate_limit_wait",
+        "timeout",
+    )
+    # Numeric ``extraction`` settings; each must be >= 1 when present.
+    _EXTRACTION_NUMERIC_PARAMS = (
+        "batch_size",
+        "max_hops",
+        "hub_pagination_threshold",
+        "max_neighbors_threshold",
+        "min_triplets_per_property",
+    )
+    # Batch size applied when the config does not provide one.
+    _DEFAULT_BATCH_SIZE = 15
+    def __init__(self, config_path: str = None, **kwargs) -> None:
+        """Load the configuration and build all collaborating components.
+
+        Args:
+            config_path: Passed to ``load_config``; ``None`` is allowed.
+            **kwargs: Configuration overrides, passed to ``load_config``.
+
+        Raises:
+            ValueError: If the loaded configuration fails validation.
+        """
+        self.cfg = load_config(config_path, kwargs)
+        self._validate_and_clean_config()
+        if self.cfg["debug"]["debug_enabled"]:
+            logger.setLevel(logging.DEBUG)
+            logger.debug("Debug mode enabled.")
+        graph_filters = self.cfg["graph_filters"]
+        self.uri_manager = URIManager(graph_filters.get("namespaces", {}))
+        self.ui = ConsoleUI(console)
+        self.exporter = GraphExporter(self.cfg["data"])
+        # Compressed forms are only used for display in the config summary.
+        excluded_properties_display = [
+            self.uri_manager.compress_uri(prop)
+            for prop in graph_filters["exclude_properties"]
+        ]
+        excluded_nodes_display = [
+            self.uri_manager.compress_uri(node)
+            for node in graph_filters["exclude_nodes"]
+        ]
+        self.ui.print_config(
+            self.cfg,
+            excluded_nodes_display,
+            excluded_properties_display,
+        )
+        client_type = self.cfg.get("client", {}).get("type", "sparql").lower()
+        # Other client types get no client object; the wrapper receives None.
+        client = None
+        if client_type in ["sparql", "turtle"]:
+            client = ComponentFactory.create_client(self.cfg)
+        self.wrapper = ComponentFactory.create_wrapper(
+            self.uri_manager, self.cfg, client
+        )
+        self.extractor_strategy: ExtractionStrategy = ComponentFactory.create_strategy(
+            self.wrapper, self.uri_manager, self.cfg
+        )
+        extraction_cfg = self.cfg["extraction"]
+        self.check_seeds_validity = extraction_cfg.get("check_seeds_validity", True)
+        self.check_hub_seeds = extraction_cfg.get("check_hub_seeds", True)
+        # None means "ask the user" when hub seeds are found.
+        self.keep_hub_seeds = extraction_cfg.get("keep_hub_seeds", None)
+        self.max_neighbors_threshold = self.wrapper.max_neighbors_threshold
+        self.skip_densification = self.cfg.get("densification", {}).get(
+            "skip_densification", False
+        )
+        # Number of seed rows that did / did not yield any triplet.
+        self.stats = {
+            "found": 0,
+            "not_found": 0,
+        }
+    @staticmethod
+    def _check_numeric_params(
+        label: str, section: dict, names, minimum: int, bound_message: str
+    ) -> None:
+        """Raise ``ValueError`` for any named setting that is not a number >= *minimum*.
+
+        Settings that are absent or ``None`` are skipped.
+        """
+        for name in names:
+            value = section.get(name)
+            if value is None:
+                continue
+            # bool is a subclass of int, so it has to be rejected explicitly.
+            if isinstance(value, bool) or not isinstance(value, (int, float)):
+                raise ValueError(f"{label} parameter '{name}' must be a number.")
+            if value < minimum:
+                raise ValueError(f"{label} parameter '{name}' {bound_message}.")
+    @staticmethod
+    def _clean_row(row) -> list[str]:
+        """Return the stripped, non-empty string form of every non-NaN cell of *row*."""
+        cleaned = []
+        for cell in row:
+            if pd.isna(cell):
+                continue
+            text = str(cell).strip()
+            if text:
+                cleaned.append(text)
+        return cleaned
+    def _validate_and_clean_config(self) -> None:
+        """Validate ``self.cfg`` and fill in the default batch size.
+
+        Raises:
+            ValueError: If a required value is missing, has the wrong type or
+                is out of range.
+        """
+        # Validate data config
+        data_cfg = self.cfg.get("data", {})
+        if not data_cfg.get("input_path"):
+            raise ValueError("Data config requires an 'input_path' to a CSV file.")
+        if data_cfg.get("output_format") not in self._OUTPUT_FORMATS:
+            raise ValueError(
+                "Data config 'output_format' must be one of: 'csv', 'json', 'ttl'."
+            )
+        # Validate client config
+        client_cfg = self.cfg.get("client", {})
+        # Same default as __init__, so a missing "type" does not crash here.
+        client_type = client_cfg.get("type", "sparql")
+        if not isinstance(client_type, str):
+            raise ValueError("Client config 'type' must be a string.")
+        if client_type.lower() == "sparql":
+            if not client_cfg.get("endpoint"):
+                raise ValueError(
+                    "SPARQL client requires an 'endpoint' URL in the config."
+                )
+            # A null user agent is treated like an empty one.
+            user_agent = client_cfg.get("user_agent") or ""
+            if not user_agent.strip() or "YOUR_PROJECT_NAME" in user_agent:
+                logger.warning(
+                    """SPARQL user agent has not been set or is using the default placeholder. Note that this may lead to some endpoints (e.g. Wikidata) blocking your requests or reducing your rate limits. Set a custom user agent (e.g. "your-project/1.0 (your-email@example.com)") in the config to avoid this issue and to help endpoint operators identify your traffic.
+                    """
+                )
+        self._check_numeric_params(
+            "Client", client_cfg, self._CLIENT_NUMERIC_PARAMS, 0, "must be non-negative"
+        )
+        # Validate extraction config
+        # setdefault keeps a missing section from raising KeyError below, so the
+        # user gets the strategy ValueError instead.
+        extraction_cfg = self.cfg.setdefault("extraction", {})
+        if extraction_cfg.get("batch_size") is None:
+            extraction_cfg["batch_size"] = self._DEFAULT_BATCH_SIZE
+        self._check_numeric_params(
+            "Extraction",
+            extraction_cfg,
+            self._EXTRACTION_NUMERIC_PARAMS,
+            1,
+            "must be greater than 0",
+        )
+        strategy = str(extraction_cfg.get("strategy") or "").lower()
+        available_strategies = self._STRATEGIES
+        if strategy not in available_strategies:
+            raise ValueError(
+                f"Extraction strategy '{strategy}' is not supported. Available strategies are: {available_strategies}."
+            )
+    def _check_seeds_validity(self, seeds: list[str]) -> list[str]:
+        """Return the seeds that ``self.wrapper.check_seeds_validity`` reports as invalid."""
+        logger.info(f"Checking validity of {len(seeds)} unique seed nodes...")
+        validity_dict = self.wrapper.check_seeds_validity(seeds)
+        return [seed for seed, is_valid in validity_dict.items() if not is_valid]
+    def _handle_hub_seeds(
+        self, seeds: list[list[str]], unique_seeds: list[str]
+    ) -> list[list[str]]:
+        """Detect seeds at or above the max neighbors threshold and apply the hub policy.
+
+        When ``self.keep_hub_seeds`` is ``None`` the user is asked interactively
+        and the answer is stored on the instance.
+
+        Args:
+            seeds: Seed rows as read from the input file.
+            unique_seeds: Distinct cleaned seed values found in *seeds*.
+
+        Returns:
+            The rows to process: all rows when there are no hubs or hubs are
+            kept, otherwise only rows containing no hub. An empty list is also
+            returned when the neighbor count fails.
+        """
+        logger.info(
+            f"Checking number of neighbors for {len(unique_seeds)} unique seed nodes..."
+        )
+        # Only the neighbor count is expected to signal faulty nodes this way.
+        try:
+            seed_totals = self.wrapper.count_neighbors(unique_seeds)
+        except ValueError as e:
+            # presumably args[0] is the collection of faulty nodes; verify against the wrapper
+            faulty_nodes = e.args[0] if e.args else []
+            if isinstance(faulty_nodes, str):
+                # Avoid iterating a plain message character by character.
+                faulty_nodes = [faulty_nodes]
+            logger.error(
+                "[red]Error:[/] The following seed nodes could not be checked for neighbors (most likely because they are not valid URIs): "
+            )
+            for node in faulty_nodes:
+                logger.error(f" . '{node}'")
+            return []
+        hub_seeds = {
+            seed: total
+            for seed, total in seed_totals.items()
+            if total >= self.max_neighbors_threshold
+        }
+        if not hub_seeds:
+            logger.info("No seed nodes exceed the max neighbors threshold.\n")
+            return seeds
+        logger.warning(
+            f"Found {len(hub_seeds)} seed(s) that are massive hubs "
+            f"(>= {self.max_neighbors_threshold} neighbors):"
+        )
+        for hub, total in hub_seeds.items():
+            logger.warning(
+                f" - {self.uri_manager.compress_uri(hub)} : {total} neighbors"
+            )
+        if self.keep_hub_seeds is None:
+            self.keep_hub_seeds = Confirm.ask(
+                "Do you want to keep these hubs ? (Answering 'yes' will force pagination for these nodes, which may increase extraction time)"
+            )
+        if self.keep_hub_seeds:
+            self.wrapper.forced_hubs.update(hub_seeds.keys())
+            logger.info(
+                "Pagination forced for seed hubs. They will not be skipped during extraction.\n"
+            )
+            return seeds
+        logger.info(
+            "These hubs will be skipped during extraction, which may lead to missing paths but will speed up the process.\n"
+        )
+        # Hub names are cleaned values, so rows are cleaned the same way before comparing.
+        remaining = [
+            row
+            for row in seeds
+            if not any(seed in hub_seeds for seed in self._clean_row(row))
+        ]
+        if not remaining:
+            logger.warning("Every seed row contains a hub seed; no rows are left.")
+        return remaining
+    def extract_subgraph(
+        self, seeds: list[list[str]]
+    ) -> tuple[list[tuple[str, str, str]], set[str]]:
+        """Run the extraction strategy on every seed row and export the result.
+
+        Args:
+            seeds: Seed rows; NaN and blank cells are ignored.
+
+        Returns:
+            All extracted triplets and the set of seeds from rows that yielded
+            at least one triplet.
+        """
+        all_triplets: list[tuple[str, str, str]] = []
+        seeds_found: set[str] = set()
+        with self.ui.create_progress_bar() as progress:
+            task = progress.add_task("Processing rows", total=len(seeds))
+            for row in seeds:
+                clean_nodes = self._clean_row(row)
+                if not clean_nodes:
+                    progress.advance(task)
+                    continue
+                # presumably the strategy advances the progress bar itself — verify
+                triplets = self.extractor_strategy.execute_task(
+                    clean_nodes, progress, task
+                )
+                if not triplets:
+                    self.stats["not_found"] += 1
+                else:
+                    all_triplets.extend(triplets)
+                    seeds_found.update(clean_nodes)
+                    self.stats["found"] += 1
+        # NOTE(review): files are exported even when nothing was extracted, while
+        # run() then logs "Exiting without saving" — confirm which is intended.
+        self.exporter.save_triplets(all_triplets, self.uri_manager.namespaces)
+        self.exporter.save_graph(self.extractor_strategy.graph)
+        logger.info("Computing final graph statistics...")
+        self.print_final_summary(
+            all_triplets, self.extractor_strategy.graph, "Extraction summary"
+        )
+        return all_triplets, seeds_found
+    def densify_graph(
+        self, triplets: list[tuple[str, str, str]], seeds_found: set[str]
+    ) -> None:
+        """Connect the extracted graph's components and export the densified result.
+
+        Nothing is exported when the triplets already form a single component.
+        """
+        graph_connector: GraphConnector = GraphConnector(
+            self.wrapper,
+            self.uri_manager,
+            self.extractor_strategy.graph,
+            self.ui,
+            self.cfg,
+        )
+        components = get_connected_components(triplets)
+        if len(components) <= 1:
+            logger.info(
+                "The extracted graph is already fully connected. No densification needed."
+            )
+            return
+        logger.warning(
+            f"The extracted graph has {len(components)} disconnected components. Starting densification to connect them...\n"
+        )
+        new_triplets = graph_connector.connect(seeds_found, triplets)
+        logger.info(
+            f"Found {len(new_triplets) - len(triplets)} new triplets during densification."
+        )
+        self.save(new_triplets, graph_connector.bfs.graph, name_suffix="_densified")
+        self.print_final_summary(
+            new_triplets, graph_connector.bfs.graph, "Densification summary"
+        )
+    def print_final_summary(
+        self,
+        all_triplets: list[tuple[str, str, str]],
+        graph: nx.MultiGraph,
+        table_title: str,
+    ) -> None:
+        """Compute statistics for *all_triplets* and print them with the run counters."""
+        detailed_stats = GraphStatistics.compute(all_triplets)
+        self.ui.print_summary(self.stats, detailed_stats, graph, table_title)
+    def save(
+        self,
+        triplets: list[tuple[str, str, str]],
+        graph: nx.MultiGraph,
+        name_suffix: str = "",
+    ) -> None:
+        """Export *triplets* and *graph* through the exporter, passing *name_suffix* on."""
+        self.exporter.save_triplets(triplets, self.uri_manager.namespaces, name_suffix)
+        self.exporter.save_graph(graph, name_suffix)
+    def run(self) -> None:
+        """Process all node seeds from a CSV file and export the results."""
+        input_path = Path(self.cfg["data"]["input_path"]).resolve()
+        seeds = pd.read_csv(input_path).values.tolist()
+        # Clean cells exactly as extract_subgraph does, so NaN padding never
+        # becomes a "nan" seed; dict.fromkeys de-duplicates in first-seen order.
+        unique_seeds = list(
+            dict.fromkeys(seed for row in seeds for seed in self._clean_row(row))
+        )
+        if not unique_seeds:
+            logger.error(
+                "No seeds found in the input file. Please provide a valid CSV with seed nodes."
+            )
+            return
+        if self.check_seeds_validity:
+            invalid_seeds = self._check_seeds_validity(unique_seeds)
+            if invalid_seeds:
+                logger.error(
+                    "[red]Error:[/] The following seed nodes are not valid (most likely because they are not valid URIs or do not exist in the graph): "
+                )
+                for node in invalid_seeds:
+                    logger.error(f" . '{node}'")
+                logger.error("Please fix the invalid seeds and try again.")
+                return
+            logger.info("All seeds are valid.\n")
+        max_neighbors_threshold = self.wrapper.max_neighbors_threshold
+        # An infinite threshold means no seed can ever count as a hub.
+        if max_neighbors_threshold < float("inf") and self.check_hub_seeds:
+            seeds = self._handle_hub_seeds(seeds, unique_seeds)
+            if not seeds:
+                logger.error("Stopping execution due to errors in seed checking.")
+                return
+        all_triplets, seeds_found = self.extract_subgraph(seeds)
+        if not all_triplets:
+            logger.error("No triplets were extracted. Exiting without saving.")
+            return
+        if self.skip_densification:
+            return
+        self.densify_graph(all_triplets, seeds_found)

graph_seeder/configs/dbpedia_default.json ADDED Viewed

@@ -0,0 +1,59 @@
+{
+    "data": {
+        "input_path": "seed.csv",
+        "output_format": "csv",
+        "output_path": "output/result"
+    },
+    "client": {
+        "type": "SPARQL",
+        "user_agent": "YOUR_PROJECT_NAME (contact: YOUR_EMAIL)",
+        "endpoint": "https://dbpedia.org/sparql",
+        "request_delay": 1,
+        "retry_attempts": 3,
+        "retry_delay": 3.0,
+        "rate_limit_wait": 60.0,
+        "timeout": 40.0
+    },
+    "graph_filters": {
+        "include_uri_prefixes": [
+            "http://dbpedia.org/resource/",
+            "http://xmlns.com/foaf/0.1/"
+        ],
+        "exclude_uri_prefixes": [
+            "http://dbpedia.org/resource/Category:",
+            "http://dbpedia.org/resource/Template:"
+        ],
+        "exclude_nodes": [],
+        "exclude_properties": [
+            "wikiPageUsesTemplate"
+        ],
+      "namespaces": {
+        "dbr": "http://dbpedia.org/resource/",
+        "dbo": "http://dbpedia.org/ontology/",
+        "dbp": "http://dbpedia.org/property/",
+        "foaf": "http://xmlns.com/foaf/0.1/",
+        "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
+        "rdfs": "http://www.w3.org/2000/01/rdf-schema#"
+      }
+    },
+    "extraction": {
+        "strategy": "bfs",
+        "batch_size": 15,
+        "max_hops": 6,
+        "hub_pagination_threshold": 70000,
+        "max_neighbors_threshold": 300000,
+        "hub_pairs_batch_size": 100,
+        "min_triplets_per_property": 2,
+        "check_seeds_validity": true,
+        "check_hub_seeds": false,
+        "keep_hub_seeds": null
+    },
+    "densification": {
+        "mode": "most_connected",
+        "skip_densification": false
+    },
+    "debug": {
+        "debug_enabled": false,
+        "request_logging": false
+    }
+}

graph_seeder/configs/default.json ADDED Viewed

@@ -0,0 +1,47 @@
+{
+  "data": {
+    "input_path": "seeds.csv",
+    "output_format": "csv",
+    "output_path": "output"
+  },
+  "client": {
+    "type": "SPARQL",
+    "user_agent": "graph-seeder-default (contact: unknown)",
+    "endpoint": "http://localhost:7200/repositories/my-repo",
+    "request_delay": 0.5,
+    "retry_attempts": 3,
+    "retry_delay": 2.0,
+    "rate_limit_wait": 30.0,
+    "timeout": 30.0
+  },
+  "graph_filters": {
+    "include_uri_prefixes": [],
+    "exclude_uri_prefixes": [],
+    "exclude_nodes": [],
+    "exclude_properties": [],
+    "namespaces": {
+      "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
+      "rdfs": "http://www.w3.org/2000/01/rdf-schema#"
+    }
+  },
+  "extraction": {
+    "strategy": "bfs",
+    "batch_size": 15,
+    "max_hops": 6,
+    "hub_pagination_threshold": 50000,
+    "max_neighbors_threshold": 100000,
+    "hub_pairs_batch_size": 100,
+    "min_triplets_per_property": 2,
+    "check_seeds_validity": false,
+    "check_hub_seeds": false,
+    "keep_hub_seeds": null
+  },
+  "densification": {
+    "mode": "most_connected",
+    "skip_densification": false
+  },
+  "debug": {
+    "debug_enabled": false,
+    "request_logging": false
+  }
+}

graph_seeder/configs/europeana_default.json ADDED Viewed

@@ -0,0 +1,50 @@
+{
+    "data": {
+        "input_path": "seed.csv",
+        "output_format": "csv",
+        "output_path": "output/result"
+    },
+    "client": {
+        "type": "SPARQL",
+        "user_agent": "YOUR_PROJECT_NAME (contact: YOUR_EMAIL)",
+        "endpoint": "https://sparql.europeana.eu/",
+        "request_delay": 1,
+        "retry_attempts": 3,
+        "retry_delay": 3.0,
+        "rate_limit_wait": 60.0,
+        "timeout": 40.0
+    },
+    "graph_filters": {
+        "include_uri_prefixes": [],
+        "exclude_uri_prefixes": [],
+        "exclude_nodes": [],
+      "exclude_properties": [],
+      "namespaces": {
+        "edm": "http://www.europeana.eu/schemas/edm/",
+        "ore": "http://www.openarchives.org/ore/terms/",
+        "dc": "http://purl.org/dc/elements/1.1/",
+        "dcterms": "http://purl.org/dc/terms/",
+        "skos": "http://www.w3.org/2004/02/skos/core#"
+      }
+    },
+    "extraction": {
+        "strategy": "bfs",
+        "batch_size": 15,
+        "max_hops": 6,
+        "hub_pagination_threshold": 60000,
+        "max_neighbors_threshold": 150000,
+        "hub_pairs_batch_size": 100,
+        "min_triplets_per_property": 2,
+        "check_seeds_validity": false,
+        "check_hub_seeds": true,
+        "keep_hub_seeds": null
+    },
+    "densification": {
+        "mode": "most_connected",
+        "skip_densification": false
+    },
+    "debug": {
+        "debug_enabled": false,
+        "request_logging": false
+    }
+}

graph_seeder/configs/pgxlod_default.json ADDED Viewed

@@ -0,0 +1,47 @@
+{
+    "data": {
+        "input_path": "seed.csv",
+        "output_format": "csv",
+        "output_path": "output/result"
+    },
+    "client": {
+        "type": "SPARQL",
+        "user_agent": "YOUR_PROJECT_NAME (contact: YOUR_EMAIL)",
+        "endpoint": "https://pgxlod.loria.fr/sparql",
+        "request_delay": 1,
+        "retry_attempts": 3,
+        "retry_delay": 3.0,
+        "rate_limit_wait": 60.0,
+        "timeout": 40.0
+    },
+    "graph_filters": {
+        "include_uri_prefixes": [],
+        "exclude_uri_prefixes": [],
+        "exclude_nodes": [],
+      "exclude_properties": [],
+      "namespaces": {
+        "pharmgkb": "http://bio2rdf.org/pharmgkb",
+        "pgxlod": "http://pgxlod.loria.fr/resource/"
+      }
+    },
+    "extraction": {
+        "strategy": "bfs",
+        "batch_size": 15,
+        "max_hops": 6,
+        "hub_pagination_threshold": 60000,
+        "max_neighbors_threshold": 150000,
+        "hub_pairs_batch_size": 100,
+        "min_triplets_per_property": 2,
+        "check_seeds_validity": false,
+        "check_hub_seeds": true,
+        "keep_hub_seeds": null
+    },
+    "densification": {
+        "mode": "most_connected",
+        "skip_densification": false
+    },
+    "debug": {
+        "debug_enabled": false,
+        "request_logging": false
+    }
+}