graph-seeder 1.0.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- graph_seeder/GraphSeeder.py +47 -0
- graph_seeder/SubgraphExtractor.py +377 -0
- graph_seeder/configs/dbpedia_default.json +59 -0
- graph_seeder/configs/default.json +47 -0
- graph_seeder/configs/europeana_default.json +50 -0
- graph_seeder/configs/pgxlod_default.json +47 -0
- graph_seeder/configs/wikidata_default.json +70 -0
- graph_seeder/densification/GraphConnector.py +113 -0
- graph_seeder/extraction/BFS/BFS.py +192 -0
- graph_seeder/extraction/ExtractionStrategy.py +70 -0
- graph_seeder/extraction/Hop/HopExpansion.py +92 -0
- graph_seeder/utils/ConsoleUI.py +273 -0
- graph_seeder/utils/Factory.py +64 -0
- graph_seeder/utils/GraphExporter.py +84 -0
- graph_seeder/utils/GraphStatistics.py +32 -0
- graph_seeder/utils/URIManager.py +95 -0
- graph_seeder/utils/utils.py +217 -0
- graph_seeder/wrapper/NeighborhoodWrapper.py +47 -0
- graph_seeder/wrapper/hashmap/HashMapWrapper.py +124 -0
- graph_seeder/wrapper/sparql/BaseClient.py +23 -0
- graph_seeder/wrapper/sparql/GraphWrapper.py +269 -0
- graph_seeder/wrapper/sparql/SparqlQueryBuilder.py +175 -0
- graph_seeder/wrapper/sparql/client/SparqlClient.py +118 -0
- graph_seeder/wrapper/sparql/client/TurtleClient.py +47 -0
- graph_seeder-1.0.0.dev0.dist-info/METADATA +191 -0
- graph_seeder-1.0.0.dev0.dist-info/RECORD +28 -0
- graph_seeder-1.0.0.dev0.dist-info/WHEEL +4 -0
- graph_seeder-1.0.0.dev0.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from graph_seeder.utils.URIManager import URIManager
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class SparqlQueryBuilder:
    """Assemble the SPARQL query strings used to explore a graph.

    The builder turns node identifiers into SPARQL terms (prefixed names or
    ``<IRI>`` references) and renders the configured exclusion/inclusion
    rules as ``FILTER`` clauses that are spliced into the query templates.
    """

    # Local names made only of these characters can be emitted as a prefixed
    # name (``prefix:local``) without any escaping.
    _SAFE_LOCAL_NAME = re.compile(r"[A-Za-z0-9_-]+")

    def __init__(
        self,
        uri_manager: URIManager,
        graph_filters: dict,
        context: dict = None,
    ):
        """Store the configuration and pre-render the static filters.

        Args:
            uri_manager: Object exposing ``namespaces`` (prefix -> URI mapping)
                and ``compress_uri(str)``; both are used when formatting terms.
            graph_filters: Filter configuration. The keys read here are
                ``include_uri_prefixes``, ``exclude_uri_prefixes``,
                ``exclude_properties`` and ``exclude_nodes``; each defaults to
                an empty list when missing.
            context: Optional extra settings, stored as-is (``{}`` if omitted).
        """
        # Must be set first: the format_* helpers below rely on it.
        self.uri_manager = uri_manager
        self.context = context or {}
        self.include_uri_prefixes = graph_filters.get("include_uri_prefixes", [])
        self.exclude_uri_prefixes = graph_filters.get("exclude_uri_prefixes", [])

        self.prop_filter = self.format_prop_filter(
            set(graph_filters.get("exclude_properties", []))
        )

        # The same exclusion set applies to both ends of a triple.
        excluded_nodes = set(graph_filters.get("exclude_nodes", []))
        self.node_filter_obj = self.format_node_filter(excluded_nodes, "object")
        self.node_filter_subj = self.format_node_filter(excluded_nodes, "subject")

    def build_checking_validity_query(self, seeds: list[str]) -> str:
        """Return a query selecting the seeds that appear as a triple subject."""
        values_str = self.format_values(seeds)
        return f"""
        {self.format_namespaces()}
        SELECT DISTINCT ?node
        WHERE {{
            VALUES ?node {{ {values_str} }}
            ?node ?p ?o .
        }}
        """

    def build_total_neighbors_query(self, nodes: list[str]) -> str:
        """Build a SPARQL query to count total neighbors for each node."""
        values_str = self.format_values(nodes)
        return f"""
        {self.format_namespaces()}
        SELECT ?node (COUNT(?property) AS ?total) WHERE {{
            VALUES ?node {{ {values_str} }}
            {{
                ?node ?property ?object .
            }}
            UNION
            {{
                ?subject ?property ?node .
            }}
        }} GROUP BY ?node
        """

    def _build_prop_occurrence_query(self, values_str: str) -> str:
        """Return a query counting, per entity, how often each property occurs.

        Both outgoing and incoming triples are counted; rows are ordered by
        entity, then by descending count.
        """
        return f"""
        {self.format_namespaces()}
        SELECT ?entity ?property (COUNT(?property) AS ?count) WHERE {{
            VALUES ?entity {{ {values_str} }}
            {{ ?entity ?property ?object . }}
            UNION
            {{ ?subject ?property ?entity . }}
        }}
        GROUP BY ?entity ?property
        ORDER BY ?entity DESC(?count)
        """

    def _build_neighborhood_query(self, values_str: str) -> str:
        """Return a query fetching filtered triples touching the given nodes.

        The first UNION branch covers triples where the node is the subject,
        the second those where it is the object; filters are applied to the
        *other* end of the triple.
        """
        object_filter = self.format_entity_filter("object")
        subject_filter = self.format_entity_filter("subject")
        return f"""
        {self.format_namespaces()}
        SELECT ?subject ?property ?object WHERE {{
            VALUES ?node {{ {values_str} }}
            {{
                ?node ?property ?object .
                BIND (?node AS ?subject)
                {self.node_filter_obj}
                {self.prop_filter}
                {object_filter}
            }}
            UNION
            {{
                ?subject ?property ?node .
                BIND (?node AS ?object)
                {self.node_filter_subj}
                {self.prop_filter}
                {subject_filter}
            }}
        }}
        """

    def _build_hub_neighborhood_query(self, values_pairs_str: str) -> str:
        """Return a neighborhood query restricted to given (node, property) pairs."""
        object_filter = self.format_entity_filter("object")
        subject_filter = self.format_entity_filter("subject")
        return f"""
        {self.format_namespaces()}
        SELECT ?subject ?property ?object WHERE {{
            VALUES (?node ?property) {{ {values_pairs_str} }}
            {{
                ?node ?property ?object .
                BIND (?node AS ?subject)
                {self.node_filter_obj} {self.prop_filter} {object_filter}
            }}
            UNION
            {{
                ?subject ?property ?node .
                BIND (?node AS ?object)
                {self.node_filter_subj} {self.prop_filter} {subject_filter}
            }}
        }}
        """

    def build_neighborhood_query(self, nodes: list[str]) -> str:
        """Build the neighborhood query for a list of node identifiers."""
        return self._build_neighborhood_query(self.format_values(nodes))

    def build_prop_occurrence_query(self, nodes: list[str]) -> str:
        """Build the property-occurrence query for a list of node identifiers."""
        return self._build_prop_occurrence_query(self.format_values(nodes))

    def build_hub_neighborhood_query(self, pairs: list[tuple[str, str]]) -> str:
        """Build the hub neighborhood query for ``(node, property)`` pairs."""
        return self._build_hub_neighborhood_query(self.format_values_pairs(pairs))

    def format_namespaces(self) -> str:
        """Render one ``PREFIX`` declaration per namespace known to the URI manager."""
        return "\n".join(
            f"PREFIX {p}: <{uri}>" for p, uri in self.uri_manager.namespaces.items()
        )

    def format_node(self, node_id: str) -> str:
        """Convert a node identifier into a SPARQL term.

        Resolution order:
            1. Already wrapped in ``<...>``: returned unchanged.
            2. The URI manager's compressed form starts with ``<``: that form.
            3. The compressed form is ``prefix:local`` with a safe local name:
               that prefixed name.
            4. An ``http(s)://`` identifier: wrapped in ``<...>``.
            5. Anything else: the unescaped identifier, as-is.
        """
        if node_id.startswith("<") and node_id.endswith(">"):
            return node_id

        # Undo backslash-escaped quotes before handing the ID to the manager.
        clean_node = node_id.replace("\\'", "'").replace('\\"', '"')
        compressed_uri = self.uri_manager.compress_uri(clean_node)

        if compressed_uri.startswith("<"):
            return compressed_uri

        if ":" in compressed_uri and not compressed_uri.startswith("http"):
            _prefix, local_name = compressed_uri.split(":", 1)
            # fullmatch (rather than match + "$") also rejects a trailing
            # newline, which "$" would silently tolerate.
            if self._SAFE_LOCAL_NAME.fullmatch(local_name):
                return compressed_uri

        if clean_node.startswith(("http://", "https://")):
            return f"<{clean_node}>"

        return clean_node

    def format_values_pairs(self, pairs: list[tuple[str, str]]) -> str:
        """Render ``(node, property)`` pairs as rows of a two-variable VALUES block."""
        return " ".join(
            f"({self.format_node(n)} {self.format_node(p)})" for n, p in pairs
        )

    def format_values(self, entity_ids: list[str]) -> str:
        """Render identifiers as a space-separated list of SPARQL terms."""
        return " ".join(self.format_node(e) for e in entity_ids)

    def format_prop_filter(self, properties: set[str]) -> str:
        """Return a ``FILTER`` excluding the given properties, or ``""`` if none.

        Terms are emitted in sorted order so that the generated query text is
        the same on every run (set iteration order is not stable).
        """
        if not properties:
            return ""
        terms = ", ".join(self.format_node(p) for p in sorted(properties))
        return f"FILTER (?property NOT IN ({terms}))"

    def format_node_filter(self, nodes: set[str], var_name: str) -> str:
        """Return a ``FILTER`` excluding ``nodes`` from ``?var_name``, or ``""``.

        Terms are emitted in sorted order for a reproducible query text.
        """
        if not nodes:
            return ""
        terms = ", ".join(self.format_node(n) for n in sorted(nodes))
        return f"FILTER (?{var_name} NOT IN ({terms}))"

    @staticmethod
    def _escape_literal(text: str) -> str:
        """Escape ``text`` for use inside a double-quoted SPARQL string literal."""
        return (
            str(text)
            .replace("\\", "\\\\")  # backslash first, so later escapes survive
            .replace('"', '\\"')
            .replace("\n", "\\n")
            .replace("\r", "\\r")
        )

    def format_entity_filter(self, var_name: str) -> str:
        """Return the ``FILTER`` restricting ``?var_name`` to allowed IRIs.

        The variable must be a non-blank IRI, must start with one of the
        include prefixes (when any are configured) and must start with none of
        the exclude prefixes. Prefixes are escaped before being embedded in
        the query so that a quote or backslash cannot break the literal.
        """
        conditions = [f"isIRI(?{var_name})", f"!isBlank(?{var_name})"]
        if self.include_uri_prefixes:
            checks = [
                f'STRSTARTS(STR(?{var_name}), "{self._escape_literal(b)}")'
                for b in self.include_uri_prefixes
            ]
            conditions.append(f"({' || '.join(checks)})")
        if self.exclude_uri_prefixes:
            exclusions = [
                f'!STRSTARTS(STR(?{var_name}), "{self._escape_literal(b)}")'
                for b in self.exclude_uri_prefixes
            ]
            conditions.append(f"({' && '.join(exclusions)})")
        return f"FILTER({' && '.join(conditions)})"
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import time
|
|
3
|
+
from urllib.error import HTTPError
|
|
4
|
+
from SPARQLWrapper import JSON, SPARQLWrapper
|
|
5
|
+
from SPARQLWrapper.SPARQLExceptions import QueryBadFormed, URITooLong
|
|
6
|
+
from graph_seeder.wrapper.sparql.BaseClient import BaseClient
|
|
7
|
+
|
|
8
|
+
logger = logging.getLogger("subgraph")
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class SparqlClient(BaseClient):
    """Client that sends SPARQL queries to a remote endpoint, with retries."""

    def __init__(self, cfg: dict) -> None:
        """Configure the underlying ``SPARQLWrapper`` from ``cfg["client"]``.

        Keys read from ``cfg["client"]``: ``endpoint`` (required),
        ``user_agent``, ``timeout`` (default 50), ``retry_attempts``
        (default 3), ``retry_delay`` (default 3), ``request_delay``
        (default 1) and ``rate_limit_wait`` (default 60).

        Raises:
            ValueError: If no endpoint is configured.
        """
        super().__init__(cfg)
        client_cfg = cfg.get("client", {})

        endpoint = client_cfg.get("endpoint")
        if not endpoint:
            raise ValueError("SPARQL endpoint URL must be provided in config.")

        user_agent = client_cfg.get("user_agent")
        if not user_agent:
            # Let SPARQLWrapper fall back to its own default agent string.
            self.sparql = SPARQLWrapper(endpoint)
        else:
            self.sparql = SPARQLWrapper(endpoint, agent=user_agent)
        self.sparql.setTimeout(client_cfg.get("timeout", 50))
        self.sparql.setReturnFormat(JSON)

        self.retry_attempts = client_cfg.get("retry_attempts", 3)
        self.retry_delay = client_cfg.get("retry_delay", 3)
        self.request_delay = client_cfg.get("request_delay", 1)
        self.rate_limit_wait = client_cfg.get("rate_limit_wait", 60)

    def _log_failed_query(self, query: str, error: Exception) -> None:
        """Warn about a failed query and append it to ``failed_queries.log``.

        A failure to write the log file is reported through the logger and
        otherwise ignored, so that it never masks the original query error.
        """
        err_str = str(error)

        # Prefer the line mentioning Virtuoso when there is one; otherwise
        # use the first line of the error text as the short message.
        error_lines = err_str.split("\n")
        short_msg = next(
            (line for line in error_lines if "Virtuoso" in line), error_lines[0]
        )

        logger.warning(
            f"SPARQL Error: {short_msg}\n See failed_queries.log for more details."
        )
        logger.debug(f"Full error response:\n{error}")
        try:
            with open("failed_queries.log", "a", encoding="utf-8") as f:
                timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
                f.write(
                    f"=== {timestamp} | {type(error).__name__} ===\n{error}\n{query}\n{'=' * 40}\n\n"
                )
        except Exception as file_err:
            logger.error(f"Could not write to failed_queries.log: {file_err}")

    def query(
        self, sparql_query: str, silent: bool = False, retries: int = None
    ) -> list[dict]:
        """Execute a SPARQL query with retries.

        Args:
            sparql_query: Full SPARQL query string.
            silent: If True, suppress logging of failed queries.
            retries: Number of attempts; defaults to ``self.retry_attempts``.

        Returns:
            List of binding dicts from the ``results.bindings`` array.

        Raises:
            RuntimeError: When the endpoint rejects the query as malformed or
                too long (no retry), or when all attempts are exhausted.
        """
        retries = retries if retries is not None else self.retry_attempts
        self.sparql.setQuery(sparql_query)

        # Last HTTP error seen, kept so the terminal failure can report it.
        last_http_error = None

        for attempt in range(1, retries + 1):
            try:
                results = self.sparql.query().convert()
                if self.request_delay > 0:
                    # Throttle successful requests to stay polite to the endpoint.
                    time.sleep(self.request_delay)
                return results["results"]["bindings"]

            except (URITooLong, QueryBadFormed) as e:
                # Retrying cannot help: the query itself is unacceptable.
                if not silent:
                    self._log_failed_query(sparql_query, e)
                raise RuntimeError(
                    f"Query rejected by endpoint ({type(e).__name__}): {e}. "
                    f"See failed_queries.log for the full query."
                ) from e

            except HTTPError as e:
                last_http_error = e
                if attempt >= retries:
                    # No attempt left: do not announce a retry or sleep.
                    break
                if e.code == 429:
                    logger.warning(
                        f"Rate limited (429). Waiting {self.rate_limit_wait}s…"
                    )
                    time.sleep(self.rate_limit_wait)
                    continue
                logger.warning(
                    f"HTTP {e.code} on attempt {attempt}/{retries}: {e}. "
                    f"Retrying in {self.retry_delay}s…"
                )
                time.sleep(self.retry_delay)

            except Exception as e:
                if attempt < retries:
                    logger.warning(
                        f"Attempt {attempt}/{retries} failed: {e}. Retrying…"
                    )
                    time.sleep(self.retry_delay)
                else:
                    if not silent:
                        self._log_failed_query(sparql_query, e)
                    raise RuntimeError(
                        f"SPARQL query failed after {retries} attempts: {e}. "
                        f"See failed_queries.log for the full query."
                    ) from e

        # Reached when the HTTP retries ran out (or when retries <= 0).
        if last_http_error is not None and not silent:
            self._log_failed_query(sparql_query, last_http_error)
        raise RuntimeError(
            f"SPARQL query failed after {retries} attempts (retries exhausted). "
            f"See failed_queries.log for details."
        ) from last_http_error
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import rdflib
|
|
3
|
+
from graph_seeder.wrapper.sparql.BaseClient import BaseClient
|
|
4
|
+
|
|
5
|
+
logger = logging.getLogger("subgraph")
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class TurtleClient(BaseClient):
    """Client that loads a local Turtle file into memory and allows SPARQL querying over it."""

    def __init__(self, cfg: dict) -> None:
        """Parse the Turtle file named by ``cfg["data"]["turtle_path"]``.

        Raises:
            ValueError: If ``data.turtle_path`` is missing or empty.
        """
        super().__init__(cfg)
        data: dict = cfg.get("data", {})
        self.file_path = data.get("turtle_path")

        if not self.file_path:
            # Name the key that is actually read, so the message is actionable.
            raise ValueError(
                "data.turtle_path must be provided in config for TurtleClient."
            )

        logger.info(
            f"Loading Turtle graph from {self.file_path}. This may take a moment..."
        )
        self.graph = rdflib.Graph()
        self.graph.parse(self.file_path, format="turtle")
        logger.info(f"Successfully loaded {len(self.graph)} triples.")

        self.optimal_batch_size = 500

    def query(
        self, sparql_query: str, silent: bool = False, retries: int = None
    ) -> list[dict]:
        """Execute the SPARQL query on the local rdflib graph and format the output.

        Args:
            sparql_query: Full SPARQL query string.
            silent: If True, suppress logging of failed queries.
            retries: Accepted for signature compatibility; not used here.

        Returns:
            One dict per result row, mapping each bound variable name to
            ``{"value": <string form of the term>}``. Unbound variables are
            omitted from the row's dict.

        Raises:
            RuntimeError: If the query cannot be executed or converted.
        """
        try:
            results = self.graph.query(sparql_query)
            bindings = []

            for row in results:
                binding = {}
                for var in results.vars:
                    val = row[var]
                    if val is not None:
                        binding[str(var)] = {"value": str(val)}
                bindings.append(binding)

            return bindings
        except Exception as e:
            if not silent:
                logger.error(f"Failed to execute local Turtle query: {e}")
            raise RuntimeError(f"Turtle query failed: {e}") from e
|
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: graph-seeder
|
|
3
|
+
Version: 1.0.0.dev0
|
|
4
|
+
Summary: A powerful tool to extract and densify subgraphs from Knowledge Graphs via SPARQL or LMDB, with different extraction strategies.
|
|
5
|
+
Requires-Python: >=3.9
|
|
6
|
+
Requires-Dist: lmdb>=2.2.0
|
|
7
|
+
Requires-Dist: networkx<4.0.0,>=3.2.1
|
|
8
|
+
Requires-Dist: pandas<3.0.0,>=2.3.3
|
|
9
|
+
Requires-Dist: rdflib>=7.6.0
|
|
10
|
+
Requires-Dist: requests>=2.32.5
|
|
11
|
+
Requires-Dist: rich>=15.0.0
|
|
12
|
+
Requires-Dist: sparqlwrapper>=2.0.0
|
|
13
|
+
Requires-Dist: urllib3>=2.6.3
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
|
|
16
|
+
# Graph densifier
|
|
17
|
+
|
|
18
|
+
Graph Densifier is a collection of tools that can be used for enriching, analyzing, and extracting subgraphs from
|
|
19
|
+
knowledge graphs represented as triplet datasets.
|
|
20
|
+
|
|
21
|
+
It offers the following functions:
|
|
22
|
+
|
|
23
|
+
- **Densify graphs** by enriching an existing knowledge graph with additional Wikidata triplets between known entities.
|
|
24
|
+
- **Compute statistics** to analyze the graph's composition and connectivity.
|
|
25
|
+
- **Extract paths from Wikidata** dynamically by finding connections between pairs of entities.
|
|
26
|
+
- **Extract local subgraphs** from an existing dataset using shortest paths or neighborhood expansion around seed
|
|
27
|
+
entities.
|
|
28
|
+
|
|
29
|
+
## Installation
|
|
30
|
+
|
|
31
|
+
Follow these steps to set up the project locally:
|
|
32
|
+
|
|
33
|
+
### 1. Clone the repository
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
git clone https://github.com/Wimmics/graph-densifier.git
|
|
37
|
+
cd graph-densifier
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
### 2. Install dependencies
|
|
41
|
+
|
|
42
|
+
#### Option A (recommended): using uv
|
|
43
|
+
|
|
44
|
+
We recommend using uv for fast and reliable dependency management.
|
|
45
|
+
|
|
46
|
+
- [Install uv](https://docs.astral.sh/uv/#installation) by following the official guide
|
|
47
|
+
- Then run:
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
uv sync
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
#### Option B: using pip
|
|
54
|
+
|
|
55
|
+
If you prefer not to use uv, you can install dependencies with pip:
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
pip install -r requirements.txt
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
### 3. Environment configuration
|
|
62
|
+
|
|
63
|
+
> [!note]
|
|
64
|
+
> To avoid being rate-limited or blocked when querying Wikidata, you should configure a user identity.
|
|
65
|
+
> - Create a `.env` file at the root of the project
|
|
66
|
+
> - Add the following line to the file with your Wikidata username:
|
|
67
|
+
> ```bash
|
|
68
|
+
> USER_AGENT="graph_densify/1.0 (contact: wikidata_username)"
|
|
69
|
+
> ```
|
|
70
|
+
> While this step is not strictly required to run the project, it is **recommended**. Without it, requests to Wikidata
|
|
71
|
+
> may be throttled or blocked during large runs, which can interrupt the graph densification and path extraction
|
|
72
|
+
> processes.
|
|
73
|
+
|
|
74
|
+
## Usage
|
|
75
|
+
|
|
76
|
+
The project provides four main scripts:
|
|
77
|
+
|
|
78
|
+
1. **graph_densify.py** – enrich a local graph with additional Wikidata triplets.
|
|
79
|
+
2. **statistics.py** – compute statistics for a triplet dataset.
|
|
80
|
+
3. **subgraph_extract.py** – query Wikidata to find paths between entity pairs.
|
|
81
|
+
4. **hashmap_extract_subgraph.py** – extract relevant subgraphs from a local CSV graph.
|
|
82
|
+
|
|
83
|
+
## 1. Graph Densification (`graph_densify.py`)
|
|
84
|
+
|
|
85
|
+
This script enriches the input graph by querying Wikidata for additional relationships between entities already present
|
|
86
|
+
in the graph. It identifies all unique entities in the `subject` and `object` columns and adds any newly discovered
|
|
87
|
+
direct relations to the dataset.
|
|
88
|
+
|
|
89
|
+
### Command
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
python src/graph_densify.py --input path/to/input.csv --output path/to/output.csv
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
## 2. Graph Statistics (`statistics.py`)
|
|
96
|
+
|
|
97
|
+
This script computes descriptive statistics for a triplet dataset and generates a summary CSV file in the `stat/`
|
|
98
|
+
directory.
|
|
99
|
+
|
|
100
|
+
### Command
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
python src/statistics.py --input path/to/graph.csv
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### Computed Statistics
|
|
107
|
+
|
|
108
|
+
| Metric | Description |
|
|
109
|
+
|-------------------------------|----------------------------------------------------|
|
|
110
|
+
| `total_triplets` | Total number of triplets |
|
|
111
|
+
| `unique_subjects` | Number of unique subjects |
|
|
112
|
+
| `unique_predicates` | Number of unique predicates |
|
|
113
|
+
| `unique_objects` | Number of unique objects |
|
|
114
|
+
| `unique_entities` | Unique entities across subjects and objects |
|
|
115
|
+
| `unique_subject_object_pairs` | Distinct `(subject, object)` pairs |
|
|
116
|
+
| `connected_components` | Number of weakly connected components in the graph |
|
|
117
|
+
|
|
118
|
+
## 3. Wikidata Path Extraction (`subgraph_extract.py`)
|
|
119
|
+
|
|
120
|
+
This script takes a list of entity pairs and dynamically queries Wikidata to find a short path (not necessarily the
|
|
121
|
+
shortest) between them. It outputs the discovered path triplets as a CSV and saves the explored network as a `.gpickle`
|
|
122
|
+
graph file.
|
|
123
|
+
|
|
124
|
+
### Command
|
|
125
|
+
|
|
126
|
+
```bash
|
|
127
|
+
python src/subgraph_extract.py --input path/to/pairs.csv --output path/to/extracted_paths.csv
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
---
|
|
131
|
+
|
|
132
|
+
## 4. Local Subgraph Extraction (`hashmap_extract_subgraph.py`)
|
|
133
|
+
|
|
134
|
+
This script extracts subgraphs from a **local** graph dataset (CSV) using one of the two modes:
|
|
135
|
+
|
|
136
|
+
### Mode A — Shortest paths between seed/target pairs
|
|
137
|
+
|
|
138
|
+
Extracts all shortest paths between specified source-target entity pairs.
|
|
139
|
+
|
|
140
|
+
```bash
|
|
141
|
+
python src/hashmap_extract_subgraph.py \
|
|
142
|
+
--sub_graph path/to/main_graph.csv \
|
|
143
|
+
--seed_target_pairs path/to/pairs.csv
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
### Mode B — Radius around seed nodes
|
|
147
|
+
|
|
148
|
+
Extracts all nodes within a specified number of hops (default: 2) from a list of seed entities.
|
|
149
|
+
|
|
150
|
+
```bash
|
|
151
|
+
python src/hashmap_extract_subgraph.py \
|
|
152
|
+
--sub_graph path/to/main_graph.csv \
|
|
153
|
+
--seeds_only path/to/seeds.csv \
|
|
154
|
+
--max_length 2
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
**Output:** The extracted subgraph is saved by default to `data/extracted_subgraph.csv`.
|
|
158
|
+
|
|
159
|
+
---
|
|
160
|
+
|
|
161
|
+
## Dataset Structure
|
|
162
|
+
|
|
163
|
+
All datasets are expected to be provided as CSV files.
|
|
164
|
+
|
|
165
|
+
### Main Graph Dataset
|
|
166
|
+
|
|
167
|
+
Must contain three columns representing a knowledge graph triplet:
|
|
168
|
+
|
|
169
|
+
| subject | predicate | object |
|
|
170
|
+
|---------|-----------|--------|
|
|
171
|
+
| Q937 | P36 | Q90 |
|
|
172
|
+
| Q90 | P17 | Q142 |
|
|
173
|
+
|
|
174
|
+
### Seed-Target Pair Dataset
|
|
175
|
+
|
|
176
|
+
Used for finding paths between specific entities. Must contain two columns:
|
|
177
|
+
|
|
178
|
+
| subject | object |
|
|
179
|
+
|---------|--------|
|
|
180
|
+
| Q937 | Q304 |
|
|
181
|
+
| Q90 | Q183 |
|
|
182
|
+
|
|
183
|
+
### Seed-Only Dataset
|
|
184
|
+
|
|
185
|
+
Used for neighborhood expansion. Must contain one column representing the seed entity (the column can be named `seed` or
|
|
186
|
+
be the first column):
|
|
187
|
+
|
|
188
|
+
| seed |
|
|
189
|
+
|------|
|
|
190
|
+
| Q937 |
|
|
191
|
+
| Q90 |
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
graph_seeder/GraphSeeder.py,sha256=3ODCN8IyUZt3NB04n7FUKTgNu9_56IMomuqYLLHzRSA,1457
|
|
2
|
+
graph_seeder/SubgraphExtractor.py,sha256=v5ajBQldM9u2BChTPKpeqqO61OH4fuLhRL9A3p36nJQ,14539
|
|
3
|
+
graph_seeder/configs/dbpedia_default.json,sha256=Ds-42oRLvWW2f0LP0m9yf3n_XvDwym4ol9_2BxMQ3sI,1802
|
|
4
|
+
graph_seeder/configs/default.json,sha256=7d2DUfcMvodMcxEIoI0CBLZareJ3aSet9pakT0NHV4Y,1214
|
|
5
|
+
graph_seeder/configs/europeana_default.json,sha256=mS_c9CBz1CDvSUZt_N4pwLs-M7CWoX9PqrKZMl_6hSU,1501
|
|
6
|
+
graph_seeder/configs/pgxlod_default.json,sha256=YGTGXM02uw87LR85JMW1uDfoQMgh7KyjWzXpOSLs_xc,1339
|
|
7
|
+
graph_seeder/configs/wikidata_default.json,sha256=lZ4q5_RPnwK_dduuYSCvm4E4_yjynPJOjwHGl0occAY,2474
|
|
8
|
+
graph_seeder/densification/GraphConnector.py,sha256=1SsXuKNHRZbIOixerEHgPTREWvuy8_OC9EPuQwExzsY,4067
|
|
9
|
+
graph_seeder/extraction/ExtractionStrategy.py,sha256=IIlnk_7RxLdquyNQSJIhuqL_P2BDSQfBp5WDLiXq21k,2409
|
|
10
|
+
graph_seeder/extraction/BFS/BFS.py,sha256=r-QidFmV9oR1phYnVqDgim4KS8RcnrteQa6c6N80oJw,7301
|
|
11
|
+
graph_seeder/extraction/Hop/HopExpansion.py,sha256=bKikLNwrL0sseXcPPI8brb5p64yN7u3exEhg7Nw2Q8I,3514
|
|
12
|
+
graph_seeder/utils/ConsoleUI.py,sha256=FIggOlXAecfZvSM361VlqHpQTD_YtXrK0Pgz_iyOPhg,10153
|
|
13
|
+
graph_seeder/utils/Factory.py,sha256=iz1TI0YHkvKz9VRhR4cfpxNZAVzjCvbJNZBM_0yZfgY,2549
|
|
14
|
+
graph_seeder/utils/GraphExporter.py,sha256=GAF_MLv3AnGNZEQyMjLXpE4qVGaufnHanjAE5etFHaY,3016
|
|
15
|
+
graph_seeder/utils/GraphStatistics.py,sha256=rczbakVcSjt9itTBq6WnYDIv9OhfFiqCiqAfK954cSk,1023
|
|
16
|
+
graph_seeder/utils/URIManager.py,sha256=j-5HD1oMDyrS7dtdXTWWHyo-M-iETp3hRYBTTa6R198,3445
|
|
17
|
+
graph_seeder/utils/utils.py,sha256=AQXXCcUGSP_4eLYUU6u_EKsFZhoQh7y22TsA-IM4OF8,6759
|
|
18
|
+
graph_seeder/wrapper/NeighborhoodWrapper.py,sha256=VYrJHQOhXchgO0YqK3zldHeM3KJUNxR_CYNMCuI9zxU,1766
|
|
19
|
+
graph_seeder/wrapper/hashmap/HashMapWrapper.py,sha256=9KOZ61XmNv0XC_hn9VcdhBXmRL4iU1aw4LZG33pJyms,4823
|
|
20
|
+
graph_seeder/wrapper/sparql/BaseClient.py,sha256=3HShDXLEYwk7QARVWVQwFA7kbVuakKN1_96Yj-3UzII,684
|
|
21
|
+
graph_seeder/wrapper/sparql/GraphWrapper.py,sha256=6_FLk8nDpQBNxZ7AY-U39OvuAViZl5K4IBNFOF5yUQM,10136
|
|
22
|
+
graph_seeder/wrapper/sparql/SparqlQueryBuilder.py,sha256=L7mLnq1k5hV8xIfqt5NlgCHP6dY5p3lWgN0G3dRIGgk,6542
|
|
23
|
+
graph_seeder/wrapper/sparql/client/SparqlClient.py,sha256=HMUBwIbMrHj-bdVW8zFi9hwsJcZP8-P_PuK9t3YpzT0,4670
|
|
24
|
+
graph_seeder/wrapper/sparql/client/TurtleClient.py,sha256=x7RXsn0zrC9sagZGIUTM97SorX5ELqRHIF-BXfrpuVs,1685
|
|
25
|
+
graph_seeder-1.0.0.dev0.dist-info/METADATA,sha256=E9IKc2FwiqGrx8KmSfq4n8NDf55qDZT7KIF46hbXs44,6133
|
|
26
|
+
graph_seeder-1.0.0.dev0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
27
|
+
graph_seeder-1.0.0.dev0.dist-info/entry_points.txt,sha256=eM5WRu0uS9fNN_nmZ09zHX1p5UbsHrFi8A9yE-B3X9w,63
|
|
28
|
+
graph_seeder-1.0.0.dev0.dist-info/RECORD,,
|