PyPI - graph-seeder - Versions diffs - 1.0.0.dev0__py3-none-any.whl - Mend

graph-seeder 1.0.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28)

graph_seeder/GraphSeeder.py +47 -0
graph_seeder/SubgraphExtractor.py +377 -0
graph_seeder/configs/dbpedia_default.json +59 -0
graph_seeder/configs/default.json +47 -0
graph_seeder/configs/europeana_default.json +50 -0
graph_seeder/configs/pgxlod_default.json +47 -0
graph_seeder/configs/wikidata_default.json +70 -0
graph_seeder/densification/GraphConnector.py +113 -0
graph_seeder/extraction/BFS/BFS.py +192 -0
graph_seeder/extraction/ExtractionStrategy.py +70 -0
graph_seeder/extraction/Hop/HopExpansion.py +92 -0
graph_seeder/utils/ConsoleUI.py +273 -0
graph_seeder/utils/Factory.py +64 -0
graph_seeder/utils/GraphExporter.py +84 -0
graph_seeder/utils/GraphStatistics.py +32 -0
graph_seeder/utils/URIManager.py +95 -0
graph_seeder/utils/utils.py +217 -0
graph_seeder/wrapper/NeighborhoodWrapper.py +47 -0
graph_seeder/wrapper/hashmap/HashMapWrapper.py +124 -0
graph_seeder/wrapper/sparql/BaseClient.py +23 -0
graph_seeder/wrapper/sparql/GraphWrapper.py +269 -0
graph_seeder/wrapper/sparql/SparqlQueryBuilder.py +175 -0
graph_seeder/wrapper/sparql/client/SparqlClient.py +118 -0
graph_seeder/wrapper/sparql/client/TurtleClient.py +47 -0
graph_seeder-1.0.0.dev0.dist-info/METADATA +191 -0
graph_seeder-1.0.0.dev0.dist-info/RECORD +28 -0
graph_seeder-1.0.0.dev0.dist-info/WHEEL +4 -0
graph_seeder-1.0.0.dev0.dist-info/entry_points.txt +2 -0

graph_seeder/configs/wikidata_default.json ADDED Viewed

@@ -0,0 +1,70 @@
+{
+    "data": {
+        "input_path": "seed.csv",
+        "output_format": "csv",
+        "output_path": "output/result"
+    },
+    "client": {
+        "type": "SPARQL",
+        "user_agent": "YOUR_PROJECT_NAME (contact: YOUR_EMAIL)",
+        "endpoint": "https://query.wikidata.org/sparql",
+        "request_delay": 1,
+        "retry_attempts": 3,
+        "retry_delay": 3.0,
+        "rate_limit_wait": 60.0,
+        "timeout": 40.0
+    },
+    "graph_filters": {
+        "include_uri_prefixes": [
+            "http://www.wikidata.org/entity/Q"
+        ],
+        "exclude_uri_prefixes": [],
+        "exclude_nodes": [
+            "http://www.wikidata.org/entity/Q5",
+            "http://www.wikidata.org/entity/Q30",
+            "http://www.wikidata.org/entity/Q145",
+            "http://www.wikidata.org/entity/Q215627",
+            "http://www.wikidata.org/entity/Q43229",
+            "http://www.wikidata.org/entity/Q6256",
+            "http://www.wikidata.org/entity/Q11424",
+            "http://www.wikidata.org/entity/Q17"
+        ],
+        "exclude_properties": [
+            "http://www.wikidata.org/prop/direct/P31",
+            "http://www.wikidata.org/prop/direct/P279",
+            "http://www.wikidata.org/prop/direct/P361",
+            "http://www.wikidata.org/prop/direct/P527",
+            "http://www.wikidata.org/prop/direct/P155",
+            "http://www.wikidata.org/prop/direct/P156",
+            "http://www.wikidata.org/prop/direct/P21",
+            "http://www.wikidata.org/prop/direct/P17",
+            "http://www.wikidata.org/prop/direct/P27",
+            "http://www.wikidata.org/prop/direct/P1412",
+            "http://www.wikidata.org/prop/direct/P407"
+        ],
+        "namespaces": {
+            "wdt": "http://www.wikidata.org/prop/direct/",
+            "wd": "http://www.wikidata.org/entity/"
+        }
+    },
+    "extraction": {
+        "strategy": "bfs",
+        "batch_size": 15,
+        "max_hops": 6,
+        "hub_pagination_threshold": 70000,
+        "max_neighbors_threshold": 300000,
+        "hub_pairs_batch_size": 100,
+        "min_triplets_per_property": 2,
+        "check_seeds_validity": false,
+        "check_hub_seeds": false,
+        "keep_hub_seeds": null
+    },
+    "densification": {
+        "mode": "most_connected",
+        "skip_densification": false
+    },
+    "debug": {
+        "debug_enabled": false,
+        "request_logging": false
+    }
+}

graph_seeder/densification/GraphConnector.py ADDED Viewed

@@ -0,0 +1,113 @@
+import logging
+import networkx as nx
+import random
+import itertools
+from graph_seeder.extraction.BFS.BFS import BidirectionalBFS
+from graph_seeder.utils.ConsoleUI import ConsoleUI
+from graph_seeder.utils.URIManager import URIManager
+from graph_seeder.utils.utils import get_connected_components
+from graph_seeder.utils.Factory import ComponentFactory
+from graph_seeder.wrapper.NeighborhoodWrapper import NeighborhoodWrapper
+# Module-level logger; it is looked up under the fixed name "subgraph"
+# rather than __name__.
+logger = logging.getLogger("subgraph")
+class GraphConnector:
+    def __init__(
+        self,
+        wrapper: NeighborhoodWrapper,
+        uri_manager: URIManager,
+        graph: nx.MultiGraph,
+        ui: ConsoleUI,
+        config: dict,
+    ):
+        self.wrapper = wrapper
+        self.uri_manager = uri_manager
+        self.graph = graph
+        self.ui = ui
+        self.bfs_config = dict(config)
+        self.bfs_config["extraction"]["strategy"] = "bfs"
+        self.mode = (
+            config.get("densification", {}).get("mode", "most_connected").lower()
+        )
+        self.bfs: BidirectionalBFS = ComponentFactory.create_strategy(
+            wrapper, uri_manager, self.bfs_config
+        )
+        self.bfs.load_graph(graph)
+    def _pick_representative(self, comp_seeds: list[str]) -> str:
+        if self.mode == "most_connected":
+            return max(comp_seeds, key=lambda n: self.graph.degree(n))
+        elif self.mode == "random":
+            return random.choice(comp_seeds)
+        else:
+            return comp_seeds[0]
+    def connect(
+        self, found_seeds: set[str], triplets: list[tuple[str, str, str]]
+    ) -> list[tuple[str, str, str]]:
+        new_triplets: list[tuple[str, str, str]] = list(triplets)
+        failed_component_pairs = set()
+        with self.ui.create_progress_bar() as progress:
+            task = progress.add_task("Components densification", total=None)
+            while True:
+                components = get_connected_components(new_triplets)
+                if len(components) <= 1:
+                    current_completed = progress.tasks[0].completed
+                    progress.update(task, total=current_completed)
+                    logger.info("[green]✓[/] All components are now connected.")
+                    break
+                for ca, cb in itertools.combinations(components, 2):
+                    seeds_a = found_seeds & ca
+                    seeds_b = found_seeds & cb
+                    if not seeds_a or not seeds_b:
+                        continue
+                    sorted_a = tuple(sorted(seeds_a))
+                    sorted_b = tuple(sorted(seeds_b))
+                    pair_id = tuple(sorted([sorted_a, sorted_b]))
+                    if pair_id not in failed_component_pairs:
+                        break
+                else:
+                    current_completed = progress.tasks[0].completed
+                    progress.update(task, total=current_completed)
+                    logger.warning(
+                        "[red]✗[/] No valid pairs left to connect. Densification aborted."
+                    )
+                    break
+                if not seeds_a or not seeds_b:
+                    logger.warning(
+                        "[red]✗[/] No seeds available to connect components "
+                    )
+                    break
+                source = self._pick_representative(list(seeds_a))
+                target = self._pick_representative(list(seeds_b))
+                triplets = self.bfs.execute_task(
+                    [source, target],
+                    progress,
+                    task,
+                )
+                if triplets:
+                    new_triplets.extend(triplets)
+                    for s, p, o in triplets:
+                        self.graph.add_edge(s, o)
+                else:
+                    failed_component_pairs.add(
+                        tuple(sorted([tuple(seeds_a), tuple(seeds_b)]))
+                    )
+        return new_triplets

graph_seeder/extraction/BFS/BFS.py ADDED Viewed

@@ -0,0 +1,192 @@
+import logging
+from graph_seeder.utils.URIManager import URIManager
+from graph_seeder.wrapper.NeighborhoodWrapper import NeighborhoodWrapper
+from graph_seeder.extraction.ExtractionStrategy import ExtractionStrategy
+import networkx as nx
+# Module-level logger; it is looked up under the fixed name "subgraph"
+# rather than __name__.
+logger = logging.getLogger("subgraph")
+class BidirectionalBFS(ExtractionStrategy):
+    """Bidirectional BFS over a knowledge graph.
+    The graph is grown on demand: nodes are only fetched from the SPARQL endpoint
+    when the BFS frontier reaches them.  Hub nodes (high-degree connectors) are
+    detected and skipped before each expansion to avoid timeout storms.
+    """
+    def __init__(
+        self,
+        wrapper: NeighborhoodWrapper,
+        uri_manager: URIManager,
+        cfg: dict,
+    ) -> None:
+        super().__init__(wrapper, uri_manager, cfg)
+        self.uri_manager = uri_manager
+        self.cfg = cfg
+        self.explored_nodes: set[str] = set()
+        self._has_path: bool = False
+        self._excluded_nodes = set(cfg["graph_filters"]["exclude_nodes"])
+        max_hops_val = cfg["extraction"].get("max_hops")
+        self.max_hops = max_hops_val if max_hops_val is not None else float("inf")
+    def format_progress_description(self, nodes: list[str]) -> str:
+        source_uri = self.uri_manager.compress_uri(nodes[0])
+        target_uri = self.uri_manager.compress_uri(nodes[1])
+        return f"{source_uri} → {target_uri}"
+    def format_start_message(
+        self,
+        nodes: list[str],
+    ) -> str:
+        return f"Extracting path for {self.format_progress_description(nodes)}"
+    def extract(self, nodes: list[str]) -> tuple[list[tuple[str, str, str]], str]:
+        """Extract a subgraph connecting the given seed nodes using bidirectional BFS.
+        Returns:
+             Tuple of (list of path triplets, result message)"""
+        if len(nodes) != 2:
+            raise ValueError(
+                "BidirectionalBFS extraction requires exactly 2 seed nodes."
+            )
+        source, target = nodes
+        return self._find_path(source, target)
+    def _find_path(
+        self, source: str, target: str
+    ) -> tuple[list[tuple[str, str, str]], str]:
+        """Find a path between two nodes within the configured hop limit.
+        Returns:
+             Tuple of (path_triplets, result message)
+        """
+        self.graph.add_node(source)
+        self.graph.add_node(target)
+        source_uri = self.uri_manager.compress_uri(source)
+        target_uri = self.uri_manager.compress_uri(target)
+        if source == target:
+            return [], f"[red]✗[/] Source and target are the same node: {source_uri}"
+        q_src: set[str] = {source}
+        q_tgt: set[str] = {target}
+        visited_src: set[str] = {source}
+        visited_tgt: set[str] = {target}
+        p_src = p_tgt = 0
+        self._has_path = False
+        result_message = None
+        while q_src and q_tgt and p_src + p_tgt <= self.max_hops:
+            # We expand the smaller frontier to balance the search
+            if len(q_src) <= len(q_tgt):
+                logger.info(
+                    f"Expanding from source {source_uri!r} at depth {p_src} "
+                    f"({len(q_src)} nodes)"
+                )
+                q_src = self._expand_level(q_src, visited_src, visited_tgt)
+                visited_src.update(q_src)
+                p_src += 1
+            else:
+                logger.info(
+                    f"Expanding from target {target_uri!r} at depth {p_tgt} "
+                    f"({len(q_tgt)} nodes)"
+                )
+                q_tgt = self._expand_level(q_tgt, visited_tgt, visited_src)
+                visited_tgt.update(q_tgt)
+                p_tgt += 1
+            if self._has_path:
+                break
+            if p_src + p_tgt >= self.max_hops:
+                result_message = f"[yellow]✗[/] Path not found for {source_uri} → {target_uri} : Max hops limit ({self.max_hops}) reached before finding a connection."
+                break
+        if not self._has_path:
+            if not result_message:
+                result_message = f"[yellow]✗[/] Path not found for {source_uri} → {target_uri} : No path exists (search space exhausted, nodes may be isolated due to filters)."
+            return [], result_message
+        path_triplets = self._extract_path_triplets(source, target)
+        if len(path_triplets) > self.max_hops:
+            result_message = f"[yellow]✗[/] Path found for {source_uri} → {target_uri} but discarded: length {len(path_triplets)} exceeds max_hops ({self.max_hops})."
+            logger.warning(result_message)
+            return [], result_message
+        return (
+            path_triplets,
+            f"[green]✓[/] Path found for {source_uri} → {target_uri} : {len(path_triplets)} hops.",
+        )
+    def _expand_level(
+        self,
+        current_level: set[str],
+        nodes_visited: set[str],
+        visited_other_side: set[str],
+    ) -> set[str]:
+        """Expand one BFS frontier level using cached and remote neighborhoods."""
+        next_level: set[str] = set()
+        nodes_to_query: list[str] = []
+        for node in current_level:
+            if node in self.explored_nodes:
+                for neighbor in self.graph.neighbors(node):
+                    if neighbor not in nodes_visited:
+                        next_level.add(neighbor)
+            else:
+                nodes_to_query.append(node)
+        if next_level & visited_other_side:
+            self._has_path = True
+            return next_level
+        if not nodes_to_query:
+            return next_level
+        self.explored_nodes.update(nodes_to_query)
+        for node in nodes_to_query:
+            self.graph.add_node(node)
+        for triplets in self.wrapper.get_neighborhood(nodes_to_query):
+            for subj, predicate, obj in triplets:
+                if subj in self._excluded_nodes or obj in self._excluded_nodes:
+                    continue
+                self.graph.add_edge(
+                    subj, obj, predicate=predicate, original_subj=subj, original_obj=obj
+                )
+                for n in (subj, obj):
+                    if n not in nodes_visited:
+                        next_level.add(n)
+            if next_level & visited_other_side:
+                self._has_path = True
+                break
+        return next_level
+    def _extract_path_triplets(
+        self, source: str, target: str
+    ) -> list[tuple[str, str, str]]:
+        """Build a triple sequence for the shortest path currently in the graph."""
+        path_nodes: list[str] = nx.shortest_path(
+            self.graph, source=source, target=target
+        )
+        triplets: list[tuple[str, str, str]] = []
+        for u, v in zip(path_nodes, path_nodes[1:]):
+            edges = self.graph[u][v]
+            edge_data = edges[next(iter(edges))]
+            predicate = edge_data.get("predicate", "unknown_property")
+            subj = edge_data.get("original_subj", u)
+            obj = edge_data.get("original_obj", v)
+            triplets.append((subj, predicate, obj))
+        return triplets

graph_seeder/extraction/ExtractionStrategy.py ADDED Viewed

@@ -0,0 +1,70 @@
+from abc import ABC, abstractmethod
+from networkx import MultiGraph
+from graph_seeder.utils.URIManager import URIManager
+from graph_seeder.wrapper.NeighborhoodWrapper import NeighborhoodWrapper
+from rich.progress import (
+    Progress,
+    TaskID,
+)
+import logging
+from time import time
+# Module-level logger; it is looked up under the fixed name "subgraph"
+# rather than __name__.
+logger = logging.getLogger("subgraph")
+class ExtractionStrategy(ABC):
+    def __init__(
+        self, wrapper: NeighborhoodWrapper, uri_manager: URIManager, config: dict
+    ):
+        """Base class for extraction strategies that define how to extract a subgraph given a set of seed nodes."""
+        self.wrapper = wrapper
+        self.uri_manager = uri_manager
+        self.config = config
+        self.graph: MultiGraph = MultiGraph()
+    @abstractmethod
+    def extract(self, nodes: list[str]) -> tuple[list[tuple[str, str, str]], str]:
+        """Extract a subgraph given a list of seed nodes.
+        Returns:
+            Tuple of (list of triplets, result message)"""
+    @abstractmethod
+    def format_progress_description(self, nodes: list[str]) -> str:
+        """Short message logged in the progress bar(ex: A -> B or just A for single node extractions)"""
+    @abstractmethod
+    def format_start_message(
+        self,
+        nodes: list[str],
+    ) -> str:
+        """Full message logged when starting the extraction of a row"""
+    def execute_task(
+        self, nodes: list[str], progress: Progress, task: TaskID
+    ) -> list[tuple[str, str, str]]:
+        """Execute the extraction task with progress bar updates and error handling."""
+        task_description = self.format_progress_description(nodes)
+        start_message = self.format_start_message(nodes)
+        progress.update(task, description=f"[cyan]{task_description}[/]")
+        logger.info(f"[bold blue]Starting:[/] {start_message}")
+        start_time = time()
+        triplets, result_message = self.extract(nodes)
+        duration = time() - start_time
+        if not triplets:
+            logger.warning(f"{result_message} - Took {duration:.2f} sec\n")
+        else:
+            logger.info(f"{result_message} - Took {duration:.2f} sec\n")
+        progress.advance(task)
+        return triplets
+    def load_graph(self, graph: MultiGraph) -> None:
+        """Load an existing graph that will be used during extraction"""
+        self.graph = graph

graph_seeder/extraction/Hop/HopExpansion.py ADDED Viewed

@@ -0,0 +1,92 @@
+import logging
+from graph_seeder.extraction.ExtractionStrategy import ExtractionStrategy
+from graph_seeder.utils.URIManager import URIManager
+from graph_seeder.wrapper.NeighborhoodWrapper import NeighborhoodWrapper
+# Module-level logger; it is looked up under the fixed name "subgraph"
+# rather than __name__.
+logger = logging.getLogger("subgraph")
+class HopExpansion(ExtractionStrategy):
+    """Simple expansion strategy that expands each node level by level up to max_hops."""
+    def __init__(
+        self, wrapper: NeighborhoodWrapper, uri_manager: URIManager, config: dict
+    ):
+        super().__init__(wrapper, uri_manager, config)
+        self.max_hops = config["extraction"]["max_hops"]
+        self.excluded_nodes = set(
+            config.get("graph_filters", {}).get("exclude_nodes", [])
+        )
+    def format_progress_description(self, nodes: list[str]) -> str:
+        return self.uri_manager.compress_uri(nodes[0])
+    def format_start_message(
+        self,
+        nodes: list[str],
+    ) -> str:
+        return f"Expanding {self.max_hops} hops for {self.format_progress_description(nodes)}"
+    def extract(self, nodes: list[str]) -> tuple[list[tuple[str, str, str]], str]:
+        """
+        Extract a subgraph by expanding from the given seed nodes up to max_hops.
+        Returns:
+             Tuple of (list of path triplets, result message)
+        """
+        triplets = self._expand(nodes)
+        if not triplets:
+            return [], "[yellow]✗[/] No triplets found within the specified hop limit."
+        return (
+            triplets,
+            f"[green]✓[/] Extracted {len(triplets)} triplets within {self.max_hops} hops.",
+        )
+    def _expand(self, nodes: list[str]) -> list[tuple[str, str, str]] | None:
+        """
+        Expand a list of nodes radially up to max_hops.
+        Returns a list of all discovered unique triplets.
+        """
+        valid_nodes = [n for n in nodes if n not in self.excluded_nodes]
+        if not valid_nodes:
+            return []
+        visited_nodes: set[str] = set(valid_nodes)
+        current_level_nodes: set[str] = set(valid_nodes)
+        self.graph.add_nodes_from(valid_nodes)
+        all_triplets: set[tuple[str, str, str]] = set()
+        for hop in range(self.max_hops):
+            logger.info(
+                f"Expanding hop {hop + 1}/{self.max_hops} with {len(current_level_nodes)} nodes..."
+            )
+            next_level_nodes: set[str] = set()
+            for triplets in self.wrapper.get_neighborhood(list(current_level_nodes)):
+                for subj, pred, obj in triplets:
+                    self.graph.add_edge(subj, obj, key=pred)
+                    if subj in self.excluded_nodes or obj in self.excluded_nodes:
+                        continue
+                    all_triplets.add((subj, pred, obj))
+                    if subj not in visited_nodes:
+                        next_level_nodes.add(subj)
+                    if obj not in visited_nodes:
+                        next_level_nodes.add(obj)
+            if not next_level_nodes:
+                logger.info("No more nodes to expand. Graph is fully explored.")
+                break
+            visited_nodes.update(next_level_nodes)
+            current_level_nodes = next_level_nodes
+            self.graph.add_nodes_from(current_level_nodes)
+            logger.info(
+                f"Hop {hop + 1} expansion complete. Discovered {len(all_triplets)} unique triplets"
+            )
+        return list(all_triplets)