graph-seeder 1.0.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- graph_seeder/GraphSeeder.py +47 -0
- graph_seeder/SubgraphExtractor.py +377 -0
- graph_seeder/configs/dbpedia_default.json +59 -0
- graph_seeder/configs/default.json +47 -0
- graph_seeder/configs/europeana_default.json +50 -0
- graph_seeder/configs/pgxlod_default.json +47 -0
- graph_seeder/configs/wikidata_default.json +70 -0
- graph_seeder/densification/GraphConnector.py +113 -0
- graph_seeder/extraction/BFS/BFS.py +192 -0
- graph_seeder/extraction/ExtractionStrategy.py +70 -0
- graph_seeder/extraction/Hop/HopExpansion.py +92 -0
- graph_seeder/utils/ConsoleUI.py +273 -0
- graph_seeder/utils/Factory.py +64 -0
- graph_seeder/utils/GraphExporter.py +84 -0
- graph_seeder/utils/GraphStatistics.py +32 -0
- graph_seeder/utils/URIManager.py +95 -0
- graph_seeder/utils/utils.py +217 -0
- graph_seeder/wrapper/NeighborhoodWrapper.py +47 -0
- graph_seeder/wrapper/hashmap/HashMapWrapper.py +124 -0
- graph_seeder/wrapper/sparql/BaseClient.py +23 -0
- graph_seeder/wrapper/sparql/GraphWrapper.py +269 -0
- graph_seeder/wrapper/sparql/SparqlQueryBuilder.py +175 -0
- graph_seeder/wrapper/sparql/client/SparqlClient.py +118 -0
- graph_seeder/wrapper/sparql/client/TurtleClient.py +47 -0
- graph_seeder-1.0.0.dev0.dist-info/METADATA +191 -0
- graph_seeder-1.0.0.dev0.dist-info/RECORD +28 -0
- graph_seeder-1.0.0.dev0.dist-info/WHEEL +4 -0
- graph_seeder-1.0.0.dev0.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
import networkx as nx
|
|
2
|
+
import json
|
|
3
|
+
from importlib import resources
|
|
4
|
+
|
|
5
|
+
# Names of the configuration templates looked up in the "graph_seeder.configs" package.
BUILTIN_CONFIGS = [
    "dbpedia_default",
    "wikidata_default",
    "pgxlod_default",
    "europeana_default",
    "default",
]


def _section_entries(section, entries):
    """Expand ``key -> (type, help)`` into ``key -> (section, type, help)``."""
    return {
        key: (section, expected_type, help_text)
        for key, (expected_type, help_text) in entries.items()
    }


# Maps every override key to (config section, expected type, help text).
# Built section by section; the resulting key order matches the declaration order.
OVERRIDE_MAP = {
    **_section_entries(
        "data",
        {
            "input_path": (str, "Path to the input csv file containing seed nodes."),
            "output_format": (
                str,
                "Format of the extracted graph output ('csv', 'json', 'ttl').",
            ),
            "output_path": (
                str,
                "Destination path and base filename for the extracted graph (e.g., 'output/result').",
            ),
            "turtle_path": (
                str,
                "Path to a local Turtle file (if using local extraction instead of a SPARQL endpoint).",
            ),
        },
    ),
    **_section_entries(
        "client",
        {
            "endpoint": (
                str,
                "URL of the SPARQL endpoint to query (e.g., 'https://dbpedia.org/sparql').",
            ),
            "user_agent": (
                str,
                "HTTP User-Agent header to identify your requests to the server.",
            ),
            "request_delay": (
                float,
                "Delay in seconds between consecutive requests to avoid overloading the server.",
            ),
            "retry_attempts": (
                int,
                "Number of times to retry a failed HTTP request.",
            ),
            "retry_delay": (
                float,
                "Delay in seconds before retrying a failed request.",
            ),
            "rate_limit_wait": (
                float,
                "Time to wait in seconds when a rate limit (HTTP 429) is encountered.",
            ),
            "timeout": (
                float,
                "Maximum time in seconds to wait for a server response.",
            ),
        },
    ),
    **_section_entries(
        "extraction",
        {
            "strategy": (
                str,
                "Graph extraction algorithm to use ('bfs', 'hop').",
            ),
            "batch_size": (
                int,
                "Number of entities to process in a single SPARQL query.",
            ),
            "max_hops": (
                int,
                "Maximum depth or distance from the seed nodes to explore.",
            ),
            "hub_pagination_threshold": (
                int,
                "Number of neighbors at which the extractor will start paginating queries for a node.",
            ),
            "max_neighbors_threshold": (
                int,
                "Maximum number of neighbors allowed before a node is considered a massive hub.",
            ),
            "min_triplets_per_property": (
                int,
                "Minimum number of triplets required per property to be kept when paginating.",
            ),
            "check_seeds_validity": (
                bool,
                "Verify if seed nodes have valid URIs.",
            ),
            "check_hub_seeds": (
                bool,
                "Check the degree of seed nodes beforehand to identify massive hubs.",
            ),
            "keep_hub_seeds": (
                bool,
                "Whether to keep (True), skip (False), or prompt the user (None) about massive hub seeds.",
            ),
        },
    ),
    **_section_entries(
        "densification",
        {
            "mode": (
                str,
                "Strategy used to connect disconnected components during densification ('most_connected', 'random').",
            ),
            "skip_densification": (
                bool,
                "Skip the post-extraction step that attempts to connect isolated subgraphs.",
            ),
        },
    ),
    **_section_entries(
        "graph_filters",
        {
            "include_uri_prefixes": (
                list,
                "Only explore nodes whose URIs start with one of these prefixes.",
            ),
            "exclude_uri_prefixes": (
                list,
                "Ignore nodes whose URIs start with any of these prefixes.",
            ),
            "exclude_properties": (
                list,
                "Specific properties (URIs) to completely ignore during extraction.",
            ),
            "exclude_nodes": (
                list,
                "Specific nodes (URIs) to completely ignore during extraction.",
            ),
            # Accepted either as a ready dict or as a list of 'prefix=URI' strings.
            "namespaces": (
                (dict, list),
                "Custom namespaces in 'prefix=URI' format (e.g., ex=http://example.com/).",
            ),
        },
    ),
    **_section_entries(
        "debug",
        {
            "debug_enabled": (bool, "Enable verbose debug-level logging."),
            "request_logging": (
                bool,
                "Log details of all SPARQL queries and HTTP requests.",
            ),
        },
    ),
}
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def load_config(config_path: str | None, overrides: dict) -> dict:
|
|
161
|
+
"""Load configuration from a JSON file and apply overrides."""
|
|
162
|
+
cfg: dict = {}
|
|
163
|
+
|
|
164
|
+
if config_path is not None:
|
|
165
|
+
config_name = config_path.lower().replace(".json", "")
|
|
166
|
+
|
|
167
|
+
if config_name in BUILTIN_CONFIGS:
|
|
168
|
+
template_path = resources.files("graph_seeder.configs").joinpath(
|
|
169
|
+
f"{config_name}.json"
|
|
170
|
+
)
|
|
171
|
+
with template_path.open("r", encoding="utf-8") as f:
|
|
172
|
+
cfg = json.load(f)
|
|
173
|
+
else:
|
|
174
|
+
with open(config_path, "r", encoding="utf-8") as f:
|
|
175
|
+
cfg = json.load(f)
|
|
176
|
+
|
|
177
|
+
for key, value in overrides.items():
|
|
178
|
+
if key not in OVERRIDE_MAP:
|
|
179
|
+
raise ValueError(
|
|
180
|
+
f"Unknown override key: '{key}'. Valid keys: {list(OVERRIDE_MAP)}"
|
|
181
|
+
)
|
|
182
|
+
|
|
183
|
+
mapping = OVERRIDE_MAP[key]
|
|
184
|
+
section = mapping[0]
|
|
185
|
+
expected_type = mapping[1]
|
|
186
|
+
|
|
187
|
+
if expected_type == (dict, list):
|
|
188
|
+
expected_type = dict
|
|
189
|
+
|
|
190
|
+
# If the value is already a dict (e.g., from a config file), we keep it as is. If it's a list (from command-line), we parse it into a dict.
|
|
191
|
+
if isinstance(value, list):
|
|
192
|
+
parsed_dict = {}
|
|
193
|
+
for item in value:
|
|
194
|
+
if "=" not in item:
|
|
195
|
+
raise ValueError(
|
|
196
|
+
f"Namespace override '{item}' is invalid. Use 'prefix=URI' format."
|
|
197
|
+
)
|
|
198
|
+
pref, uri = item.split("=", 1)
|
|
199
|
+
parsed_dict[pref] = uri
|
|
200
|
+
value = parsed_dict
|
|
201
|
+
|
|
202
|
+
if not isinstance(value, expected_type):
|
|
203
|
+
raise TypeError(
|
|
204
|
+
f"Override '{key}' must be {expected_type}, got {type(value).__name__}"
|
|
205
|
+
)
|
|
206
|
+
|
|
207
|
+
target_section = cfg.setdefault(section, {})
|
|
208
|
+
|
|
209
|
+
target_section[key] = value
|
|
210
|
+
|
|
211
|
+
return cfg
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def get_connected_components(triplets: list[tuple[str, str, str]]) -> list[set[str]]:
    """Return the connected components of the undirected graph formed by the triplets.

    Each (subject, property, object) triplet contributes one undirected edge
    between subject and object; the property is ignored.
    """
    undirected: nx.Graph = nx.Graph()
    for subject, _prop, obj in triplets:
        undirected.add_edge(subject, obj)
    components = nx.connected_components(undirected)
    return list(components)
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from collections.abc import Generator
|
|
3
|
+
from graph_seeder.utils.URIManager import URIManager
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class NeighborhoodWrapper(ABC):
    """Abstract interface for back-ends that return the neighborhood of graph nodes.

    The constructor reads the neighbor threshold and the node/property/prefix
    filters from the configuration so that every implementation shares them.
    """

    def __init__(self, uri_manager: "URIManager", config: dict):
        """Store the collaborators and read the shared settings from ``config``.

        Args:
            uri_manager: URI manager, kept on the instance as ``uri_manager``.
            config: Configuration dict. The ``extraction`` and ``graph_filters``
                sections are both optional; a missing or null entry means
                "no limit" for the threshold and "no filter" for the lists.
        """
        self.uri_manager = uri_manager
        self.cfg = config

        # A missing "extraction" section is treated like an empty one, the same
        # way "graph_filters" is handled below (previously this raised KeyError).
        extraction_cfg = config.get("extraction") or {}
        skip_val = extraction_cfg.get("max_neighbors_threshold")
        self.max_neighbors_threshold = (
            skip_val if skip_val is not None else float("inf")
        )

        # `or ()` keeps a JSON null from crashing the set construction.
        filters = config.get("graph_filters") or {}
        self.excluded_nodes = set(filters.get("exclude_nodes") or ())
        self.excluded_properties = set(filters.get("exclude_properties") or ())

        self.included_uri_prefixes = set(filters.get("include_uri_prefixes") or ())
        self.excluded_uri_prefixes = set(filters.get("exclude_uri_prefixes") or ())

        # Starts empty; nothing in this class adds to it.
        self.forced_hubs: set[str] = set()

    @abstractmethod
    def check_seeds_validity(self, seeds: list[str]) -> dict[str, bool]:
        """Check if the given seeds are valid and return a dict mapping each seed to a boolean indicating if it's valid."""

    @abstractmethod
    def count_neighbors(self, seeds: list[str]) -> dict[str, int]:
        """Return a dict mapping each seed to its total number of neighbors."""

    @abstractmethod
    def get_neighborhood(
        self, nodes: list[str]
    ) -> Generator[list[tuple[str, str, str]], None, None]:
        """
        Yields the neighborhood of a list of nodes in batches.
        Allows the consumer to break the loop to stop early.
        """
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
from collections.abc import Generator
|
|
2
|
+
import lmdb
|
|
3
|
+
import logging
|
|
4
|
+
from graph_seeder.wrapper.NeighborhoodWrapper import NeighborhoodWrapper
|
|
5
|
+
from graph_seeder.utils.URIManager import URIManager
|
|
6
|
+
import json
|
|
7
|
+
|
|
8
|
+
logger = logging.getLogger("subgraph")
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class HashMapWrapper(NeighborhoodWrapper):
    """
    Local graph wrapper backed by a read-only LMDB key-value store.

    Keys are UTF-8 encoded node identifiers. Each value is a UTF-8 JSON object
    mapping a property to the list of neighbors reached through it; a property
    prefixed with ``(-)`` is an inverse edge (neighbor -> node).
    """

    def __init__(self, uri_manager: URIManager, config: dict):
        """Open the LMDB environment and cache the URI prefix filters.

        Args:
            uri_manager: Passed through to the base class.
            config: Configuration dict; ``data.hashmap_path`` gives the LMDB
                location and defaults to ``"data"``.
        """
        super().__init__(uri_manager, config)

        db_path = config.get("data", {}).get("hashmap_path", "data")
        self.env = lmdb.open(db_path, readonly=True, lock=False)

        # str.startswith accepts a tuple of prefixes, so convert the sets once.
        self._included_prefixes_tuple = tuple(self.included_uri_prefixes)
        self._excluded_prefixes_tuple = tuple(self.excluded_uri_prefixes)

    @staticmethod
    def _decode_neighbors(node: str, raw: bytes) -> dict[str, list[str]] | None:
        """Decode a raw stored value into a property -> neighbors mapping.

        Returns ``None``, after logging the reason, when the value is not valid
        UTF-8, not valid JSON, or not a JSON object.
        """
        try:
            text = raw.decode("utf-8")
        except UnicodeDecodeError:
            logger.error(f"Failed to decode value for node {node}. Skipping.")
            return None

        try:
            data = json.loads(text)
        except json.JSONDecodeError:
            logger.error(f"Failed to parse JSON for node {node}. Skipping.")
            return None

        if not isinstance(data, dict):
            logger.error(f"Unexpected JSON structure for node {node}. Skipping.")
            return None

        return data

    def check_seeds_validity(self, seeds: list[str]) -> dict[str, bool]:
        """Return a dict mapping each seed to a boolean indicating if it's valid.

        A seed is valid when its UTF-8 encoding exists as a key in the store.
        """
        results = {}
        with self.env.begin() as txn:
            for seed in seeds:
                results[seed] = txn.get(seed.encode("utf-8")) is not None
        return results

    def count_neighbors(self, seeds: list[str]) -> dict[str, int]:
        """Return a dict mapping each seed to its total number of neighbors.

        Seeds that are missing, empty, or whose value cannot be decoded are
        left out of the result (decoding problems are logged).
        """
        seed_totals = {}
        with self.env.begin() as txn:
            for node in seeds:
                val = txn.get(node.encode("utf-8"))
                if not val:
                    continue

                data = self._decode_neighbors(node, val)
                if data is None:
                    continue

                try:
                    seed_totals[node] = sum(len(n) for n in data.values())
                except TypeError:
                    # A property value without a length (e.g. a number).
                    logger.error(
                        f"Unexpected neighbor list for node {node}. Skipping."
                    )
        return seed_totals

    def get_neighborhood(
        self, nodes: list[str]
    ) -> Generator[list[tuple[str, str, str]], None, None]:
        """Yield the filtered one-hop triplets of ``nodes`` as a single list.

        Nodes that are missing, undecodable, or at/above the neighbor threshold
        (unless listed in ``forced_hubs``) are skipped. Excluded properties,
        excluded nodes, already skipped nodes and neighbors failing the prefix
        filters are dropped. Nothing is yielded when no triplet remains.
        """
        triplets: list[tuple[str, str, str]] = []
        skipped_nodes: set[str] = set()

        with self.env.begin() as txn:
            for original_node in nodes:
                value = txn.get(original_node.encode("utf-8"))

                if value is None:
                    logger.warning(f"Node {original_node} not found in hashmap.")
                    continue

                # Previously a decode failure was logged but execution fell
                # through to json.loads with an unbound or stale value; the
                # helper returns None so the node is really skipped.
                neighbors_data = self._decode_neighbors(original_node, value)
                if neighbors_data is None:
                    continue

                total_neighbors = sum(
                    len(neighbors) for neighbors in neighbors_data.values()
                )
                # NOTE(review): this uses ">=" while the message says "exceeds"
                # and GraphWrapper compares with ">" - confirm the intended
                # boundary behavior.
                if (
                    total_neighbors >= self.max_neighbors_threshold
                    and original_node not in self.forced_hubs
                ):
                    logger.warning(
                        f"Node {original_node} has {total_neighbors} neighbors, which exceeds the skip threshold of {self.max_neighbors_threshold}. Skipping."
                    )
                    skipped_nodes.add(original_node)
                    continue

                for prop, neighbors in neighbors_data.items():
                    is_inverse = prop.startswith("(-)")
                    clean_prop = prop[3:] if is_inverse else prop

                    if clean_prop in self.excluded_properties:
                        continue

                    for neighbor in neighbors:
                        if neighbor in self.excluded_nodes or neighbor in skipped_nodes:
                            continue

                        if self._included_prefixes_tuple and not neighbor.startswith(
                            self._included_prefixes_tuple
                        ):
                            continue

                        if self._excluded_prefixes_tuple and neighbor.startswith(
                            self._excluded_prefixes_tuple
                        ):
                            continue

                        # Inverse properties are stored on the object side, so
                        # the neighbor is the subject of the emitted triplet.
                        if is_inverse:
                            triplets.append((neighbor, clean_prop, original_node))
                        else:
                            triplets.append((original_node, clean_prop, neighbor))

        if triplets:
            yield triplets

    def __del__(self):
        # Best-effort cleanup: `env` does not exist if lmdb.open raised in
        # __init__, and a finalizer must never propagate an exception.
        try:
            self.env.close()
        except Exception:
            pass
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class BaseClient(ABC):
    """Common base for every client able to query a Knowledge Graph."""

    def __init__(self, config: dict):
        """Read the client settings that live in the configuration.

        Args:
            config: Configuration dict; ``extraction.hub_pairs_batch_size`` is
                used when present, otherwise 128.
        """
        extraction_section = config.get("extraction", {})
        self.hub_pairs_batch_size = extraction_section.get("hub_pairs_batch_size", 128)

    @abstractmethod
    def query(
        self, query_string: str, silent: bool = False, retries: int = None
    ) -> list[dict]:
        """
        Run a query and return its rows in the standard SPARQL JSON bindings format:
        [
            {"subject": {"value": "http://..."}, "property": {"value": "..."}},
            ...
        ]

        The meaning of ``silent`` and ``retries`` is defined by the concrete
        client; this base class only fixes the signature.
        """
|
|
@@ -0,0 +1,269 @@
|
|
|
1
|
+
from collections.abc import Generator
|
|
2
|
+
|
|
3
|
+
from graph_seeder.wrapper.NeighborhoodWrapper import NeighborhoodWrapper
|
|
4
|
+
import logging
|
|
5
|
+
from graph_seeder.wrapper.sparql.SparqlQueryBuilder import SparqlQueryBuilder
|
|
6
|
+
from graph_seeder.utils.URIManager import URIManager
|
|
7
|
+
from graph_seeder.wrapper.sparql.BaseClient import BaseClient
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger("subgraph")
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class GraphWrapper(NeighborhoodWrapper):
    """Neighborhood wrapper that answers through a query client.

    Queries are produced by a ``SparqlQueryBuilder`` and executed by the given
    ``BaseClient``. A query that fails with ``RuntimeError`` is retried on the
    two halves of its input until it succeeds or a single item remains.
    """

    def __init__(self, uri_manager: URIManager, config: dict, client: BaseClient):
        """Read thresholds and batching settings and build the query builder.

        Args:
            uri_manager: Passed to the base class and to the query builder.
            config: Configuration dict; the ``extraction``, ``debug`` and
                ``graph_filters`` sections are read.
            client: Client used to execute every query.

        Raises:
            ValueError: If ``extraction.batch_size`` is smaller than 1.
        """
        super().__init__(uri_manager, config)

        self.client = client

        extraction_config: dict = config.get("extraction", {})

        # A missing or null threshold means "no limit".
        max_neighbors_val = extraction_config.get("max_neighbors_threshold")
        self.max_neighbors_threshold = (
            max_neighbors_val if max_neighbors_val is not None else float("inf")
        )

        pagination_val = extraction_config.get("hub_pagination_threshold")
        self.hub_pagination_threshold = (
            pagination_val if pagination_val is not None else float("inf")
        )

        prop_skip_val = extraction_config.get("min_triplets_per_property")
        self.min_triplets_per_property = (
            prop_skip_val if prop_skip_val is not None else 0
        )

        self.request_logging = config.get("debug", {}).get("request_logging", False)

        # The base class treats "graph_filters" as optional, so do the same
        # here instead of raising KeyError when the section is absent.
        self.query_builder = SparqlQueryBuilder(
            self.uri_manager,
            config.get("graph_filters", {}),
        )

        # Treat a null batch size like a missing one, and reject values that
        # would make range() fail (0) or silently iterate nothing (negative).
        batch_size = extraction_config.get("batch_size")
        if batch_size is None:
            batch_size = 20
        if batch_size < 1:
            raise ValueError(f"extraction.batch_size must be >= 1, got {batch_size}")
        self.batch_size = batch_size

    @staticmethod
    def _rows_to_triplets(rows: list[dict]) -> list[tuple[str, str, str]]:
        """Convert binding rows into (subject, property, object) value tuples."""
        return [
            (
                r["subject"]["value"],
                r["property"]["value"],
                r["object"]["value"],
            )
            for r in rows
        ]

    def _execute_with_dichotomy(
        self, items, build_query
    ) -> Generator[list[dict], None, None]:
        """Recursively execute a query with dichotomy splitting on failure.

        Args:
            items: List of items to include in the query (e.g., nodes or node-property pairs).
            build_query: Function that takes a list of items and returns a query string.

        Yields:
            The non-empty list of raw result rows of each query that succeeded.
            Items whose single-item query still fails are logged and dropped.
        """
        if not items:
            return

        query = build_query(items)
        if self.request_logging:
            logger.info(f"Query for {len(items)} items:\n{query.strip()}")

        # Only the client call is guarded; results are yielded outside the try.
        try:
            rows = self.client.query(query)
        except RuntimeError:
            if len(items) <= 1:
                logger.error(f"Cannot split further ({len(items)} item(s)), skipping.")
                return
            mid = len(items) // 2
            logger.warning(
                f"Query failed for {len(items)} items, splitting into halves."
            )
            yield from self._execute_with_dichotomy(items[:mid], build_query)
            yield from self._execute_with_dichotomy(items[mid:], build_query)
        else:
            if rows:
                yield rows

    def check_seeds_validity(self, seeds: list[str]) -> dict[str, bool]:
        """Return a dict mapping seeds to a boolean indicating if they are valid.

        Seeds are checked in batches of ``batch_size``.
        """
        results = {}

        for i in range(0, len(seeds), self.batch_size):
            results.update(self._check_batch_strict(seeds[i : i + self.batch_size]))

        return results

    def _check_batch_strict(self, batch: list[str]) -> dict[str, bool]:
        """Check one batch, halving it whenever the validity query fails.

        Returns:
            ``{node: True}`` for every node value returned by the query, and
            ``{seed: False}`` for a single seed whose own query fails.
            NOTE(review): seeds the query does not return are absent from the
            dict rather than mapped to False - confirm callers expect that.
        """
        if not batch:
            return {}

        query = self.query_builder.build_checking_validity_query(batch)
        if self.request_logging:
            logger.info(f"Checking seeds with query:\n{query.strip()}")

        try:
            rows = self.client.query(query, silent=True, retries=1)
            return {r["node"]["value"]: True for r in rows}

        except RuntimeError:
            if len(batch) == 1:
                return {batch[0]: False}

            mid = len(batch) // 2
            # Same evaluation and merge order as before: second half first,
            # then the first half is merged on top of it.
            merged = self._check_batch_strict(batch[mid:])
            merged.update(self._check_batch_strict(batch[:mid]))

            return merged

    def count_neighbors(self, seeds: list[str]) -> dict[str, int]:
        """Return a dict mapping each seed to its total number of neighbors.

        Only nodes present in the query results appear in the returned dict.
        """
        results = {}

        for i in range(0, len(seeds), self.batch_size):
            batch = seeds[i : i + self.batch_size]

            for single_total in self._get_hub_seeds(batch):
                results.update(single_total)

        return results

    def _get_hub_seeds(self, seeds: list[str]) -> Generator[dict[str, int], None, None]:
        """Yield a one-entry ``{node: total}`` dict per row of the total-neighbors query."""
        for raw_rows in self._execute_with_dichotomy(
            seeds, self.query_builder.build_total_neighbors_query
        ):
            for r in raw_rows:
                yield {r["node"]["value"]: int(r["total"]["value"])}

    def get_neighborhood(
        self, nodes: list[str]
    ) -> Generator[list[tuple[str, str, str]], None, None]:
        """Fetch one-hop neighbors using property occurrence to decide strategy.

        For each batch of ``batch_size`` nodes the per-property neighbor counts
        are fetched first. A node whose total exceeds ``max_neighbors_threshold``
        is skipped unless it is in ``forced_hubs``. A node above
        ``hub_pagination_threshold`` (or a forced hub) is queried per
        (node, property) pair, keeping only properties with at least
        ``min_triplets_per_property`` triplets. Every other node goes through
        the plain neighborhood query.

        Yields:
            Lists of (subject, property, object) tuples, one per successful query.
        """
        if not nodes:
            return

        for i in range(0, len(nodes), self.batch_size):
            batch_nodes = nodes[i : i + self.batch_size]

            node_totals: dict[str, int] = {}
            node_to_props: dict[str, list[tuple[str, int]]] = {}

            for node, prop, count in self._get_properties_statistics(batch_nodes):
                node_totals[node] = node_totals.get(node, 0) + count
                node_to_props.setdefault(node, []).append((prop, count))

            safe_nodes = []
            hub_pairs_to_chunk: list[tuple[str, str, int]] = []

            for node in batch_nodes:
                # Nodes without statistics count as 0 and are treated as safe.
                total = node_totals.get(node, 0)
                is_force_paginate = node in self.forced_hubs

                if total > self.max_neighbors_threshold and not is_force_paginate:
                    logger.warning(
                        f"Node {node} has total {total} neighbors, which exceeds the skip threshold of {self.max_neighbors_threshold}. Skipping."
                    )
                elif total > self.hub_pagination_threshold or is_force_paginate:
                    for prop, count in node_to_props.get(node, []):
                        if count >= self.min_triplets_per_property:
                            hub_pairs_to_chunk.append((node, prop, count))
                else:
                    safe_nodes.append(node)

            if safe_nodes:
                for raw_rows in self._execute_with_dichotomy(
                    safe_nodes, self.query_builder.build_neighborhood_query
                ):
                    yield self._rows_to_triplets(raw_rows)

            if hub_pairs_to_chunk:
                for batch in self._chunk_hub_properties(hub_pairs_to_chunk):
                    for raw_rows in self._execute_with_dichotomy(
                        batch, self.query_builder.build_hub_neighborhood_query
                    ):
                        yield self._rows_to_triplets(raw_rows)

    def _get_properties_statistics(
        self, nodes: list[str]
    ) -> list[tuple[str, str, int]]:
        """Execute the prop occurrence query and return its results.

        Args:
            nodes: List of node IDs to analyze.
        Returns:
            List of (node, property, count) tuples from the query results.
        """
        all_stats = []
        for raw_rows in self._execute_with_dichotomy(
            nodes, self.query_builder.build_prop_occurrence_query
        ):
            all_stats.extend(
                (
                    r["entity"]["value"],
                    r["property"]["value"],
                    int(r["count"]["value"]),
                )
                for r in raw_rows
            )
        return all_stats

    def _chunk_hub_properties(
        self, prop_counts: list[tuple[str, str, int]]
    ) -> list[list[tuple[str, str]]]:
        """
        Group (node, prop) pairs into batches using Largest-Smallest strategy.

        Each batch starts with the largest remaining pair and is then filled
        with the smallest remaining pairs while the summed count stays within
        ``max_neighbors_threshold`` and the batch holds fewer than
        ``client.hub_pairs_batch_size`` pairs.

        Args:
            prop_counts: List of (node, property, count) tuples in any order;
                they are sorted by count here.
        Returns:
            List of batches, where each batch is a list of (node, property) pairs.
        """
        if not prop_counts:
            return []

        max_pairs_per_batch = self.client.hub_pairs_batch_size

        sorted_props = sorted(prop_counts, key=lambda x: x[2], reverse=True)
        batches = []

        left = 0
        right = len(sorted_props) - 1

        while left <= right:
            # The largest remaining pair always opens a batch, even when its
            # count alone is above the threshold.
            node_l, prop_l, count_l = sorted_props[left]
            current_batch = [(node_l, prop_l)]
            current_sum = count_l
            left += 1

            while left <= right and len(current_batch) < max_pairs_per_batch:
                node_r, prop_r, count_r = sorted_props[right]
                if current_sum + count_r > self.max_neighbors_threshold:
                    break
                current_batch.append((node_r, prop_r))
                current_sum += count_r
                right -= 1

            batches.append(current_batch)

        return batches
|