PyPI - graph-seeder - Versions diffs - 1.0.0.dev0__py3-none-any.whl - Mend

graph-seeder 1.0.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

graph_seeder/GraphSeeder.py +47 -0
graph_seeder/SubgraphExtractor.py +377 -0
graph_seeder/configs/dbpedia_default.json +59 -0
graph_seeder/configs/default.json +47 -0
graph_seeder/configs/europeana_default.json +50 -0
graph_seeder/configs/pgxlod_default.json +47 -0
graph_seeder/configs/wikidata_default.json +70 -0
graph_seeder/densification/GraphConnector.py +113 -0
graph_seeder/extraction/BFS/BFS.py +192 -0
graph_seeder/extraction/ExtractionStrategy.py +70 -0
graph_seeder/extraction/Hop/HopExpansion.py +92 -0
graph_seeder/utils/ConsoleUI.py +273 -0
graph_seeder/utils/Factory.py +64 -0
graph_seeder/utils/GraphExporter.py +84 -0
graph_seeder/utils/GraphStatistics.py +32 -0
graph_seeder/utils/URIManager.py +95 -0
graph_seeder/utils/utils.py +217 -0
graph_seeder/wrapper/NeighborhoodWrapper.py +47 -0
graph_seeder/wrapper/hashmap/HashMapWrapper.py +124 -0
graph_seeder/wrapper/sparql/BaseClient.py +23 -0
graph_seeder/wrapper/sparql/GraphWrapper.py +269 -0
graph_seeder/wrapper/sparql/SparqlQueryBuilder.py +175 -0
graph_seeder/wrapper/sparql/client/SparqlClient.py +118 -0
graph_seeder/wrapper/sparql/client/TurtleClient.py +47 -0
graph_seeder-1.0.0.dev0.dist-info/METADATA +191 -0
graph_seeder-1.0.0.dev0.dist-info/RECORD +28 -0
graph_seeder-1.0.0.dev0.dist-info/WHEEL +4 -0
graph_seeder-1.0.0.dev0.dist-info/entry_points.txt +2 -0

graph_seeder/utils/ConsoleUI.py ADDED Viewed

@@ -0,0 +1,273 @@
+from typing import Mapping, Any
+import networkx as nx
+from rich.console import Console
+from rich.panel import Panel
+from rich.progress import (
+    BarColumn,
+    MofNCompleteColumn,
+    Progress,
+    ProgressColumn,
+    SpinnerColumn,
+    TextColumn,
+    TimeElapsedColumn,
+)
+from rich.text import Text
+from rich.table import Table
+class SimplifiedEMAColumn(ProgressColumn):
+    """Progress column showing an ETA based on an exponential moving average.
+    The average tracks the observed seconds per completed step. The time already
+    spent on the step in flight is blended into the estimate, so the displayed
+    value keeps growing while a step is stalled.
+    Args:
+        alpha: Weight given to the newest sample in the moving average.
+    """
+    def __init__(self, alpha: float = 0.2):
+        super().__init__()
+        self._alpha = alpha
+        # A Progress renders all of its tasks through the same column object, so
+        # the smoothing state is kept per task id as
+        # (ema_seconds_per_step, last_completed, last_time).
+        self._state: dict[Any, tuple[Any, float, float]] = {}
+    def render(self, task) -> Text:
+        """Return the estimated remaining time for ``task``.
+        Returns:
+            ``--:--:--`` while no estimate is possible (unknown/zero total or no
+            completed step yet), ``00:00:00`` once the task is complete, and an
+            ``h:mm:ss`` estimate otherwise.
+        """
+        now = task.get_time()
+        ema, last_completed, last_time = self._state.get(task.id, (None, 0, now))
+        # Record the first sighting of the task so the first step gets timed.
+        self._state[task.id] = (ema, last_completed, last_time)
+        if task.total is None or task.completed == 0 or task.total == 0:
+            return Text("--:--:--", style="progress.remaining")
+        if task.completed >= task.total:
+            return Text("00:00:00", style="progress.remaining")
+        if task.completed > last_completed:
+            # Several steps may have completed since the last sample; normalise
+            # so the average stays expressed in seconds per single step.
+            per_step = (now - last_time) / (task.completed - last_completed)
+            if ema is None:
+                # Seed with the first observation instead of 0.0, which would
+                # bias the early estimates towards zero.
+                ema = per_step
+            else:
+                ema = self._alpha * per_step + (1 - self._alpha) * ema
+            last_completed, last_time = task.completed, now
+            self._state[task.id] = (ema, last_completed, last_time)
+        remaining_steps = task.total - last_completed
+        elapsed_current = now - last_time
+        # Blend the time spent on the current step into the average so a slow
+        # step raises the estimate before it actually finishes.
+        virtual_ema = self._alpha * elapsed_current + (1 - self._alpha) * (ema or 0.0)
+        remaining_secs = elapsed_current + (remaining_steps - 1) * virtual_ema
+        # Fractional totals could push the estimate below zero; clamp it.
+        hours, rem = divmod(max(0, int(remaining_secs)), 3600)
+        minutes, seconds = divmod(rem, 60)
+        return Text(f"{hours}:{minutes:02}:{seconds:02}", style="progress.remaining")
+class ConsoleUI:
+    """Helper class for displaying configuration and extraction summaries using Rich."""
+    def __init__(self, console: Console) -> None:
+        """Store the Rich console that every panel and progress bar is rendered on."""
+        self.console = console
+    def _add_row_if_exists(
+        self, table: Table, label: str, value: Any, suffix: str = ""
+    ) -> None:
+        """Add a ``label``/``value`` row to ``table`` unless ``value`` is ``None``."""
+        if value is not None:
+            table.add_row(label, f"{value}{suffix}")
+    def _format_threshold(self, value: Any, is_lower_bound: bool = False) -> str:
+        """Format a threshold for display.
+        ``None`` is shown as ``Disabled``, infinity as ``Disabled (No limit)`` and,
+        for lower bounds only, ``0`` as ``Disabled``; anything else is ``str(value)``.
+        """
+        if value is None:
+            return "Disabled"
+        if value == float("inf"):
+            return "Disabled (No limit)"
+        if is_lower_bound and value == 0:
+            return "Disabled"
+        return str(value)
+    def _config_rows(
+        self,
+        cfg: dict,
+        excluded_nodes_display: list[str],
+        excluded_properties_display: list[str],
+    ) -> list[tuple[str, Any, str]]:
+        """Build the ``(label, value, suffix)`` rows shown by ``print_config``.
+        A ``None`` value means the setting is absent from ``cfg`` and its row is
+        skipped when the table is rendered.
+        """
+        data_config = cfg.get("data", {})
+        client_config = cfg.get("client", {})
+        extraction_config = cfg.get("extraction", {})
+        densification_config = cfg.get("densification", {})
+        filters_config = cfg.get("graph_filters", {})
+        debug_config = cfg.get("debug", {})
+        include_uris = filters_config.get("include_uri_prefixes", [])
+        exclude_uris = filters_config.get("exclude_uri_prefixes", [])
+        namespaces = "\n".join(
+            f"{prefix}: {uri}"
+            for prefix, uri in filters_config.get("namespaces", {}).items()
+        )
+        threshold = self._format_threshold
+        return [
+            # Data settings
+            ("Input path", data_config.get("input_path"), ""),
+            ("Output format", data_config.get("output_format"), ""),
+            ("Output path", data_config.get("output_path"), ""),
+            # Client settings
+            ("Client Type", client_config.get("type"), ""),
+            ("Endpoint", client_config.get("endpoint"), ""),
+            ("Database Path", client_config.get("path"), ""),
+            ("User-Agent", client_config.get("user_agent"), ""),
+            ("Request delay", client_config.get("request_delay"), "s"),
+            ("Retry attempts", client_config.get("retry_attempts"), ""),
+            ("Retry delay", client_config.get("retry_delay"), "s"),
+            ("Rate limit wait", client_config.get("rate_limit_wait"), "s"),
+            ("Timeout", client_config.get("timeout"), "s"),
+            # Extraction settings
+            ("Strategy", extraction_config.get("strategy"), ""),
+            ("Batch size", extraction_config.get("batch_size"), ""),
+            (
+                "Max neighbors threshold",
+                threshold(extraction_config.get("max_neighbors_threshold")),
+                "",
+            ),
+            ("Max hops limit", threshold(extraction_config.get("max_hops")), ""),
+            ("Hub pairs batch size", extraction_config.get("hub_pairs_batch_size"), ""),
+            (
+                "Hub pagination threshold",
+                threshold(extraction_config.get("hub_pagination_threshold")),
+                "",
+            ),
+            (
+                "Min triplets per property",
+                threshold(
+                    extraction_config.get("min_triplets_per_property"),
+                    is_lower_bound=True,
+                ),
+                "",
+            ),
+            # The raw values are passed on purpose: wrapping them in str() turned
+            # a missing option into the text "None", which was always displayed.
+            ("Check seeds validity", extraction_config.get("check_seeds_validity"), ""),
+            # Hub seeds settings
+            ("Check hub seeds", extraction_config.get("check_hub_seeds"), ""),
+            ("Keep hub seeds", extraction_config.get("keep_hub_seeds"), ""),
+            # Densification settings
+            (
+                "Skip densification",
+                densification_config.get("skip_densification", False),
+                "",
+            ),
+            ("Densification mode", densification_config.get("mode"), ""),
+            # Filters settings
+            (
+                "Included URI prefixes",
+                "\n".join(include_uris) if include_uris else "None",
+                "",
+            ),
+            (
+                "Excluded URI prefixes",
+                "\n".join(exclude_uris) if exclude_uris else "None",
+                "",
+            ),
+            ("Excluded nodes", ", ".join(excluded_nodes_display) or "None", ""),
+            (
+                "Excluded properties",
+                ", ".join(excluded_properties_display) or "None",
+                "",
+            ),
+            ("Loaded namespaces", namespaces if namespaces else "Default only", ""),
+            # Debug settings
+            ("Debug mode", debug_config.get("debug_enabled"), ""),
+            ("Request logging", debug_config.get("request_logging"), ""),
+        ]
+    def print_config(
+        self,
+        cfg: dict,
+        excluded_nodes_display: list[str],
+        excluded_properties_display: list[str],
+    ) -> None:
+        """Display the configuration settings dynamically based on what is provided.
+        Args:
+            cfg: Configuration dict; the ``data``, ``client``, ``extraction``,
+                ``densification``, ``graph_filters`` and ``debug`` sections are read.
+            excluded_nodes_display: Display names of the excluded nodes.
+            excluded_properties_display: Display names of the excluded properties.
+        """
+        table = Table(show_header=False, box=None, padding=(0, 2))
+        table.add_column(style="bold cyan")
+        table.add_column(style="white")
+        rows = self._config_rows(
+            cfg, excluded_nodes_display, excluded_properties_display
+        )
+        for label, value, suffix in rows:
+            self._add_row_if_exists(table, label, value, suffix)
+        self.console.print(
+            Panel(table, title="[bold green]Configuration[/]", expand=False)
+        )
+    def print_summary(
+        self,
+        found_stats: Mapping[str, int],
+        stats: Mapping[str, int],
+        graph: nx.Graph,
+        table_title: str = "Extraction Summary",
+    ) -> None:
+        """Display a dynamic summary of extraction results.
+        Args:
+            found_stats: Statistics for found triplets; keys containing ``not``,
+                ``fail`` or ``error`` are rendered in yellow, the others in green.
+            stats: Aggregated extraction counters (dynamically rendered).
+            graph: In-memory graph built during extraction.
+            table_title: Title for the summary table.
+        """
+        table = Table(show_header=False, box=None, padding=(0, 2))
+        table.add_column(style="bold cyan")
+        table.add_column(style="white")
+        for key, value in found_stats.items():
+            display_key = key.replace("_", " ").capitalize()
+            lowered = key.lower()
+            is_problem = any(marker in lowered for marker in ("not", "fail", "error"))
+            color = "yellow" if is_problem else "green"
+            table.add_row(display_key, f"[{color}]{value}[/]")
+        for key, value in stats.items():
+            display_key = key.replace("_", " ").capitalize()
+            table.add_row(display_key, f"{value}")
+        table.add_section()
+        table.add_row("Graph nodes", str(len(graph.nodes)))
+        table.add_row("Graph edges", str(len(graph.edges)))
+        self.console.print(
+            Panel(table, title=f"[bold green]{table_title}[/]", expand=False)
+        )
+    def create_progress_bar(self) -> Progress:
+        """Create a Rich progress bar for tracking extraction progress."""
+        return Progress(
+            SpinnerColumn(),
+            TextColumn("[progress.description]{task.description}"),
+            BarColumn(),
+            MofNCompleteColumn(),
+            TextColumn("•"),
+            TimeElapsedColumn(),
+            TextColumn("•"),
+            SimplifiedEMAColumn(),
+            console=self.console,
+            refresh_per_second=1,
+        )

graph_seeder/utils/Factory.py ADDED Viewed

@@ -0,0 +1,64 @@
+from graph_seeder.wrapper.NeighborhoodWrapper import NeighborhoodWrapper
+from graph_seeder.wrapper.sparql.GraphWrapper import GraphWrapper
+from graph_seeder.wrapper.sparql.client.SparqlClient import SparqlClient
+from graph_seeder.wrapper.sparql.client.TurtleClient import TurtleClient
+from graph_seeder.wrapper.hashmap.HashMapWrapper import HashMapWrapper
+from graph_seeder.extraction.BFS.BFS import BidirectionalBFS
+from graph_seeder.extraction.Hop.HopExpansion import HopExpansion
+class ComponentFactory:
+    """Build clients, wrappers and extraction strategies from a configuration dict."""
+    @staticmethod
+    def _lookup(kind: str, requested, registry: dict):
+        """Return the class registered under ``requested`` (case-insensitive).
+        Args:
+            kind: Component family name used in the error message.
+            requested: Value read from the configuration.
+            registry: Mapping of lower-case type names to classes.
+        Raises:
+            ValueError: If ``requested`` is not registered; the message lists the
+                valid options.
+        """
+        # str() keeps a non-string value (e.g. JSON null) from failing on .lower()
+        # with an AttributeError instead of the intended ValueError.
+        name = str(requested).lower()
+        if name not in registry:
+            raise ValueError(
+                f"Unknown {kind} type: '{name}'. Valid options: {list(registry)}"
+            )
+        return registry[name]
+    @staticmethod
+    def create_client(config: dict):
+        """Create a SPARQL client based on the configuration.
+        Reads ``config["client"]["type"]`` (default ``"sparql"``).
+        Raises:
+            ValueError: If the client type is unknown.
+        """
+        registered_clients = {
+            "sparql": SparqlClient,
+            "turtle": TurtleClient,
+        }
+        client_type = config.get("client", {}).get("type", "sparql")
+        client_class = ComponentFactory._lookup(
+            "client", client_type, registered_clients
+        )
+        return client_class(config)
+    @staticmethod
+    def create_wrapper(uri_manager, config: dict, client=None) -> NeighborhoodWrapper:
+        """Create a graph wrapper based on the configuration.
+        Reads ``config["client"]["type"]`` (default ``"sparql"``). ``GraphWrapper``
+        receives ``client``, which is created from ``config`` when not supplied;
+        ``HashMapWrapper`` is built without a client.
+        Raises:
+            ValueError: If the wrapper type is unknown.
+        """
+        registered_wrappers = {
+            "sparql": GraphWrapper,
+            "turtle": GraphWrapper,
+            "hashmap": HashMapWrapper,
+        }
+        wrapper_type = config.get("client", {}).get("type", "sparql")
+        wrapper_class = ComponentFactory._lookup(
+            "wrapper", wrapper_type, registered_wrappers
+        )
+        if wrapper_class is GraphWrapper:
+            if client is None:
+                client = ComponentFactory.create_client(config)
+            return wrapper_class(uri_manager, config, client=client)
+        return wrapper_class(uri_manager, config)
+    @staticmethod
+    def create_strategy(wrapper, uri_manager, config: dict):
+        """Create an extraction strategy based on the configuration.
+        Reads ``config["extraction"]["strategy"]`` (default ``"bfs"``).
+        Raises:
+            ValueError: If the strategy type is unknown.
+        """
+        registered_strategies = {
+            "bfs": BidirectionalBFS,
+            "hop": HopExpansion,
+        }
+        strategy_type = config.get("extraction", {}).get("strategy", "bfs")
+        strategy_class = ComponentFactory._lookup(
+            "strategy", strategy_type, registered_strategies
+        )
+        return strategy_class(wrapper, uri_manager, config)

graph_seeder/utils/GraphExporter.py ADDED Viewed

@@ -0,0 +1,84 @@
+import csv
+import json
+import logging
+import pickle
+from pathlib import Path
+import networkx as nx
+from rdflib import Graph, Namespace, URIRef
+logger = logging.getLogger("subgraph")
+class GraphExporter:
+    """Export triplets and graphs to disk in various formats."""
+    def __init__(self, data_cfg: dict) -> None:
+        """Initialize exporter settings.
+        Args:
+            data_cfg: Data section of the configuration. ``output_format`` selects
+                the triplet format and ``output_path`` (default ``"."``) provides
+                the directory and base name of every written file.
+        """
+        target = Path(data_cfg.get("output_path", "."))
+        self.output_format = data_cfg.get("output_format")
+        self.output_path = target.resolve()
+        # Make sure the destination directory exists before anything is written.
+        self.output_path.parent.mkdir(parents=True, exist_ok=True)
+    def _write_csv(self, path, triplets, namespaces) -> None:
+        """Write the triplets as CSV rows below a header line."""
+        with open(path, "w", newline="", encoding="utf-8") as handle:
+            rows = csv.writer(handle)
+            rows.writerow(["subject", "predicate", "object"])
+            rows.writerows(triplets)
+    def _write_json(self, path, triplets, namespaces) -> None:
+        """Write the triplets as a JSON list of subject/predicate/object objects."""
+        records = []
+        for subj, pred, obj in triplets:
+            records.append({"subject": subj, "predicate": pred, "object": obj})
+        with open(path, "w", encoding="utf-8") as handle:
+            json.dump(records, handle, indent=2)
+    def _write_ttl(self, path, triplets, namespaces) -> None:
+        """Write the triplets as Turtle, binding the given namespace prefixes."""
+        rdf = Graph()
+        for prefix, uri in namespaces.items():
+            rdf.bind(prefix, Namespace(uri))
+        for subj, pred, obj in triplets:
+            rdf.add((URIRef(subj), URIRef(pred), URIRef(obj)))
+        rdf.serialize(destination=str(path), format="turtle")
+    def save_triplets(
+        self,
+        triplets: list[tuple[str, str, str]],
+        namespaces: dict[str, str],
+        name_suffix: str = "",
+    ) -> None:
+        """Write triplets to disk using the configured output format.
+        Args:
+            triplets: Sequence of ``(subject, predicate, object)`` identifiers.
+            namespaces: Dictionary mapping namespace prefixes to URIs.
+            name_suffix: Optional suffix for the output file name (before extension).
+        Raises:
+            ValueError: If the configured output format is not supported.
+        """
+        fmt = self.output_format
+        file_name = f"{self.output_path.stem}{name_suffix}.{fmt}"
+        path = self.output_path.with_name(file_name)
+        writers = {
+            "csv": self._write_csv,
+            "json": self._write_json,
+            "ttl": self._write_ttl,
+        }
+        writer = writers.get(fmt) if isinstance(fmt, str) else None
+        if writer is None:
+            raise ValueError(f"Unsupported output format: {fmt!r}")
+        writer(path, triplets, namespaces)
+        logger.info(f"Triplets saved → [bold]{path}[/]")
+    def save_graph(self, graph: nx.MultiGraph, name_suffix: str = "") -> None:
+        """Serialize a NetworkX graph to a gpickle file.
+        Args:
+            graph: In-memory graph built during extraction.
+            name_suffix: Optional suffix for the output file name (before extension).
+        """
+        file_name = f"{self.output_path.stem}{name_suffix}.gpickle"
+        graph_path = self.output_path.with_name(file_name)
+        with open(graph_path, "wb") as handle:
+            pickle.dump(graph, handle, protocol=pickle.HIGHEST_PROTOCOL)
+        logger.info(f"Graph saved   → [bold]{graph_path}[/]\n")

graph_seeder/utils/GraphStatistics.py ADDED Viewed

@@ -0,0 +1,32 @@
+import networkx as nx
+class GraphStatistics:
+    """Utility class for computing statistics on a graph."""
+    @staticmethod
+    def compute(triplets: list[tuple[str, str, str]]) -> dict:
+        """Compute summary statistics for a list of triplets.
+        Args:
+            triplets: ``(subject, predicate, object)`` tuples.
+        Returns:
+            A dict with the triplet count, the number of distinct subjects,
+            predicates, objects and entities (subjects and objects combined), and
+            the number of connected components of the undirected graph that links
+            each subject to its object.
+        """
+        seen_subjects = {subj for subj, _, _ in triplets}
+        seen_predicates = {pred for _, pred, _ in triplets}
+        seen_objects = {obj for _, _, obj in triplets}
+        # Predicates are ignored for connectivity: only subject-object links count.
+        undirected = nx.Graph()
+        undirected.add_edges_from((subj, obj) for subj, _, obj in triplets)
+        return {
+            "total_triplets": len(triplets),
+            "unique_subjects": len(seen_subjects),
+            "unique_predicates": len(seen_predicates),
+            "unique_objects": len(seen_objects),
+            "unique_entities": len(seen_subjects | seen_objects),
+            "connected_components": nx.number_connected_components(undirected),
+        }

graph_seeder/utils/URIManager.py ADDED Viewed

@@ -0,0 +1,95 @@
+import json
+import re
+import time
+from pathlib import Path
+from typing import Any
+import requests
+import urllib3
+from importlib import resources
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+CACHE_DIR = Path.home() / ".cache" / "graph_seeder"
+CACHE_FILE = CACHE_DIR / "context.json"
+CACHE_TTL = 60 * 60 * 24 * 7  # 1 week
+class URIManager:
+    """Utility class to manage URI compression and namespace resolution."""
+    # Characters accepted in the local part of a ``prefix:local_id`` form.
+    _LOCAL_ID_RE = re.compile(r"^[\w\-\.:]+$")
+    def __init__(
+        self,
+        custom_namespaces: dict[str, str] | None = None,
+        *,
+        verify_tls: bool = True,
+    ):
+        """Load the namespace tables used for compression.
+        The prefix context comes from the local cache or from prefix.cc; when
+        neither yields data, the ``context.json`` resource of
+        ``graph_seeder.utils`` is tried.
+        Args:
+            custom_namespaces: Prefix-to-namespace mapping that takes precedence
+                over the downloaded context. The mapping is kept by reference and
+                gains the context prefixes that ``compress_uri`` ends up using.
+            verify_tls: Verify the TLS certificate when downloading the context.
+                Pass ``False`` only when the download must go through an
+                endpoint whose certificate cannot be verified.
+        """
+        self._verify_tls = verify_tls
+        self.namespaces = custom_namespaces or {}
+        self._update_sorted_namespaces()
+        context_data = self._get_cached_or_fresh_context()
+        if not context_data:
+            try:
+                fallback_path = resources.files("graph_seeder.utils").joinpath(
+                    "context.json"
+                )
+                with fallback_path.open("r", encoding="utf-8") as f:
+                    context_data = json.load(f)
+            except Exception:
+                # Deliberate best effort: without any context only the custom
+                # namespaces are used for compression.
+                context_data = {}
+        # A cache file holding something other than a JSON object is ignored.
+        if isinstance(context_data, dict):
+            self.context = context_data.get("@context", {})
+        else:
+            self.context = {}
+    def _update_sorted_namespaces(self):
+        """Rebuild ``sorted_namespaces``, longest namespace URI first."""
+        self.sorted_namespaces = sorted(
+            self.namespaces.items(),
+            key=lambda item: len(item[1]),
+            reverse=True,
+        )
+    def _local_id(self, full_uri: str, namespace: str) -> str | None:
+        """Return the part of ``full_uri`` after ``namespace`` if it is a valid local id."""
+        if not full_uri.startswith(namespace):
+            return None
+        local_id = full_uri[len(namespace) :]
+        return local_id if self._LOCAL_ID_RE.match(local_id) else None
+    def _context_matches(self, uri: str) -> list[tuple[Any, Any]]:
+        """Return the context entries whose namespace prefixes ``uri``, longest first."""
+        matches = [
+            (prefix, namespace)
+            for prefix, namespace in self.context.items()
+            # Skip non-string and empty entries: the former cannot be compared
+            # with startswith, the latter would match every URI.
+            if isinstance(namespace, str) and namespace and uri.startswith(namespace)
+        ]
+        matches.sort(key=lambda item: len(item[1]), reverse=True)
+        return matches
+    def _get_context_fallback_prefix(
+        self, uri: str
+    ) -> tuple[Any, Any] | tuple[None, None]:
+        """Return the prefix and namespace for a given URI from the context.
+        When several namespaces match, the longest (most specific) one wins;
+        ``(None, None)`` is returned when nothing matches.
+        """
+        matches = self._context_matches(uri)
+        if matches:
+            return matches[0]
+        return None, None
+    def compress_uri(self, full_uri: str) -> str:
+        """Compress a full URI into a prefix:local_id format if possible.
+        Registered namespaces are tried first, then the downloaded context, each
+        from the longest namespace to the shortest. A context prefix that gets
+        used is registered in ``self.namespaces``.
+        Returns:
+            ``prefix:local_id`` when a namespace matches and the remainder is a
+            valid local id, otherwise the stripped URI wrapped in ``<`` and ``>``.
+        """
+        full_uri = full_uri.strip()
+        for prefix, namespace_uri in self.sorted_namespaces:
+            local_id = self._local_id(full_uri, namespace_uri)
+            if local_id is not None:
+                return f"{prefix}:{local_id}"
+        for prefix, namespace in self._context_matches(full_uri):
+            if not prefix:
+                continue
+            registered = self.namespaces.get(prefix)
+            if registered is not None and registered != namespace:
+                # The prefix is already bound to another namespace; reusing it
+                # would produce a compact form that expands to the wrong URI.
+                continue
+            local_id = self._local_id(full_uri, namespace)
+            if local_id is None:
+                continue
+            if registered is None:
+                self.namespaces[prefix] = namespace
+                self._update_sorted_namespaces()
+            return f"{prefix}:{local_id}"
+        return f"<{full_uri}>"
+    def _read_cache(self) -> dict | None:
+        """Return the cached context if it is younger than ``CACHE_TTL``, else ``None``."""
+        try:
+            file_age = time.time() - CACHE_FILE.stat().st_mtime
+            if file_age >= CACHE_TTL:
+                return None
+            with open(CACHE_FILE, "r", encoding="utf-8") as f:
+                return json.load(f)
+        except (OSError, ValueError):
+            # Missing, unreadable or corrupt cache: behave as if there was none.
+            return None
+    def _get_cached_or_fresh_context(self) -> dict | None:
+        """Return the prefix context from the cache or from prefix.cc.
+        A successful download is written to ``CACHE_FILE``. ``None`` is returned
+        when the cache is unusable and the download fails.
+        """
+        cached = self._read_cache()
+        if cached is not None:
+            return cached
+        try:
+            response = requests.get(
+                "https://prefix.cc/context",
+                # getattr keeps instances created without __init__ on the safe default.
+                verify=getattr(self, "_verify_tls", True),
+                timeout=5,
+            )
+            response.raise_for_status()
+            context = response.json()
+        except (requests.RequestException, ValueError):
+            return None
+        try:
+            CACHE_DIR.mkdir(parents=True, exist_ok=True)
+            with open(CACHE_FILE, "w", encoding="utf-8") as f:
+                json.dump(context, f)
+        except OSError:
+            # The cache is only an optimisation; a read-only home directory
+            # must not discard a context that was fetched successfully.
+            pass
+        return context