graph-seeder 1.0.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,217 @@
1
+ import networkx as nx
2
+ import json
3
+ from importlib import resources
4
+
5
# Names of the configuration templates bundled with the package. load_config
# resolves these against the "graph_seeder.configs" resources rather than
# treating them as filesystem paths.
BUILTIN_CONFIGS = [
    f"{source}_default"
    for source in ("dbpedia", "wikidata", "pgxlod", "europeana")
] + ["default"]
12
+
13
# Override options grouped by the configuration section they are written to.
# Each option maps to (expected type, help text). Section order and option
# order are significant: they define the key order of OVERRIDE_MAP.
_OVERRIDES_BY_SECTION = {
    "data": {
        "input_path": (str, "Path to the input csv file containing seed nodes."),
        "output_format": (
            str,
            "Format of the extracted graph output ('csv', 'json', 'ttl').",
        ),
        "output_path": (
            str,
            "Destination path and base filename for the extracted graph (e.g., 'output/result').",
        ),
        "turtle_path": (
            str,
            "Path to a local Turtle file (if using local extraction instead of a SPARQL endpoint).",
        ),
    },
    "client": {
        "endpoint": (
            str,
            "URL of the SPARQL endpoint to query (e.g., 'https://dbpedia.org/sparql').",
        ),
        "user_agent": (
            str,
            "HTTP User-Agent header to identify your requests to the server.",
        ),
        "request_delay": (
            float,
            "Delay in seconds between consecutive requests to avoid overloading the server.",
        ),
        "retry_attempts": (int, "Number of times to retry a failed HTTP request."),
        "retry_delay": (float, "Delay in seconds before retrying a failed request."),
        "rate_limit_wait": (
            float,
            "Time to wait in seconds when a rate limit (HTTP 429) is encountered.",
        ),
        "timeout": (float, "Maximum time in seconds to wait for a server response."),
    },
    "extraction": {
        "strategy": (str, "Graph extraction algorithm to use ('bfs', 'hop')."),
        "batch_size": (
            int,
            "Number of entities to process in a single SPARQL query.",
        ),
        "max_hops": (
            int,
            "Maximum depth or distance from the seed nodes to explore.",
        ),
        "hub_pagination_threshold": (
            int,
            "Number of neighbors at which the extractor will start paginating queries for a node.",
        ),
        "max_neighbors_threshold": (
            int,
            "Maximum number of neighbors allowed before a node is considered a massive hub.",
        ),
        "min_triplets_per_property": (
            int,
            "Minimum number of triplets required per property to be kept when paginating.",
        ),
        "check_seeds_validity": (bool, "Verify if seed nodes have valid URIs."),
        "check_hub_seeds": (
            bool,
            "Check the degree of seed nodes beforehand to identify massive hubs.",
        ),
        "keep_hub_seeds": (
            bool,
            "Whether to keep (True), skip (False), or prompt the user (None) about massive hub seeds.",
        ),
    },
    "densification": {
        "mode": (
            str,
            "Strategy used to connect disconnected components during densification ('most_connected', 'random').",
        ),
        "skip_densification": (
            bool,
            "Skip the post-extraction step that attempts to connect isolated subgraphs.",
        ),
    },
    "graph_filters": {
        "include_uri_prefixes": (
            list,
            "Only explore nodes whose URIs start with one of these prefixes.",
        ),
        "exclude_uri_prefixes": (
            list,
            "Ignore nodes whose URIs start with any of these prefixes.",
        ),
        "exclude_properties": (
            list,
            "Specific properties (URIs) to completely ignore during extraction.",
        ),
        "exclude_nodes": (
            list,
            "Specific nodes (URIs) to completely ignore during extraction.",
        ),
        "namespaces": (
            (dict, list),
            "Custom namespaces in 'prefix=URI' format (e.g., ex=http://example.com/).",
        ),
    },
    "debug": {
        "debug_enabled": (bool, "Enable verbose debug-level logging."),
        "request_logging": (
            bool,
            "Log details of all SPARQL queries and HTTP requests.",
        ),
    },
}

# Flat lookup used when applying overrides:
# option name -> (config section, expected type, help text).
OVERRIDE_MAP = {
    option: (section, expected_type, help_text)
    for section, options in _OVERRIDES_BY_SECTION.items()
    for option, (expected_type, help_text) in options.items()
}
158
+
159
+
160
+ def load_config(config_path: str | None, overrides: dict) -> dict:
161
+ """Load configuration from a JSON file and apply overrides."""
162
+ cfg: dict = {}
163
+
164
+ if config_path is not None:
165
+ config_name = config_path.lower().replace(".json", "")
166
+
167
+ if config_name in BUILTIN_CONFIGS:
168
+ template_path = resources.files("graph_seeder.configs").joinpath(
169
+ f"{config_name}.json"
170
+ )
171
+ with template_path.open("r", encoding="utf-8") as f:
172
+ cfg = json.load(f)
173
+ else:
174
+ with open(config_path, "r", encoding="utf-8") as f:
175
+ cfg = json.load(f)
176
+
177
+ for key, value in overrides.items():
178
+ if key not in OVERRIDE_MAP:
179
+ raise ValueError(
180
+ f"Unknown override key: '{key}'. Valid keys: {list(OVERRIDE_MAP)}"
181
+ )
182
+
183
+ mapping = OVERRIDE_MAP[key]
184
+ section = mapping[0]
185
+ expected_type = mapping[1]
186
+
187
+ if expected_type == (dict, list):
188
+ expected_type = dict
189
+
190
+ # If the value is already a dict (e.g., from a config file), we keep it as is. If it's a list (from command-line), we parse it into a dict.
191
+ if isinstance(value, list):
192
+ parsed_dict = {}
193
+ for item in value:
194
+ if "=" not in item:
195
+ raise ValueError(
196
+ f"Namespace override '{item}' is invalid. Use 'prefix=URI' format."
197
+ )
198
+ pref, uri = item.split("=", 1)
199
+ parsed_dict[pref] = uri
200
+ value = parsed_dict
201
+
202
+ if not isinstance(value, expected_type):
203
+ raise TypeError(
204
+ f"Override '{key}' must be {expected_type}, got {type(value).__name__}"
205
+ )
206
+
207
+ target_section = cfg.setdefault(section, {})
208
+
209
+ target_section[key] = value
210
+
211
+ return cfg
212
+
213
+
214
def get_connected_components(triplets: list[tuple[str, str, str]]) -> list[set[str]]:
    """Group the nodes of a triplet list into connected components.

    Every (subject, property, object) triplet contributes one undirected
    subject-object edge; the property itself is ignored.
    """
    undirected: nx.Graph = nx.Graph()
    for subject, _prop, obj in triplets:
        undirected.add_edge(subject, obj)
    components = nx.connected_components(undirected)
    return list(components)
@@ -0,0 +1,47 @@
1
+ from abc import ABC, abstractmethod
2
+ from collections.abc import Generator
3
+ from graph_seeder.utils.URIManager import URIManager
4
+
5
+
6
class NeighborhoodWrapper(ABC):
    """Abstract interface for backends that expose node neighborhoods.

    The constructor reads the neighbor threshold and the graph filters from
    the configuration; subclasses implement the three lookup methods.
    """

    def __init__(self, uri_manager: "URIManager", config: dict):
        """Store the collaborators and read the threshold and filter settings.

        Args:
            uri_manager: URI helper kept on the instance for subclasses.
            config: Full configuration dict. The "extraction" and
                "graph_filters" sections are optional, and a missing or null
                value inside them falls back to "no limit" / "no filter".
        """
        self.uri_manager = uri_manager
        self.cfg = config

        # Read "extraction" as defensively as "graph_filters": a config
        # without that section must not fail with KeyError here.
        extraction_cfg = config.get("extraction") or {}
        threshold = extraction_cfg.get("max_neighbors_threshold")
        # No configured threshold means no node is ever treated as too large.
        self.max_neighbors_threshold = float("inf") if threshold is None else threshold

        # `or ()` also covers filters explicitly set to null in the JSON file.
        filters = config.get("graph_filters") or {}
        self.excluded_nodes: set[str] = set(filters.get("exclude_nodes") or ())
        self.excluded_properties: set[str] = set(
            filters.get("exclude_properties") or ()
        )
        self.included_uri_prefixes: set[str] = set(
            filters.get("include_uri_prefixes") or ()
        )
        self.excluded_uri_prefixes: set[str] = set(
            filters.get("exclude_uri_prefixes") or ()
        )

        # Starts empty; filled from outside the constructor. Subclasses
        # consult it to exempt nodes from the max-neighbors skip.
        self.forced_hubs: set[str] = set()

    @abstractmethod
    def check_seeds_validity(self, seeds: list[str]) -> dict[str, bool]:
        """Check if the given seeds are valid and return a dict mapping each seed to a boolean indicating if it's valid."""

    @abstractmethod
    def count_neighbors(self, seeds: list[str]) -> dict[str, int]:
        """Return a dict mapping each seed to its total number of neighbors."""

    @abstractmethod
    def get_neighborhood(
        self, nodes: list[str]
    ) -> Generator[list[tuple[str, str, str]], None, None]:
        """
        Yields the neighborhood of a list of nodes in batches.
        Allows the consumer to break the loop to stop early.
        """
@@ -0,0 +1,124 @@
1
+ from collections.abc import Generator
2
+ import lmdb
3
+ import logging
4
+ from graph_seeder.wrapper.NeighborhoodWrapper import NeighborhoodWrapper
5
+ from graph_seeder.utils.URIManager import URIManager
6
+ import json
7
+
8
+ logger = logging.getLogger("subgraph")
9
+
10
+
11
class HashMapWrapper(NeighborhoodWrapper):
    """
    Local graph wrapper backed by a read-only LMDB key-value store.

    Keys are UTF-8 encoded node identifiers. Each value is a UTF-8 JSON
    object mapping a property to the list of neighbors reached through it.
    A property prefixed with "(-)" denotes an inverse edge (neighbor -> node).
    """

    def __init__(self, uri_manager: URIManager, config: dict):
        """Open the LMDB environment and precompute the prefix filters.

        Args:
            uri_manager: URI helper forwarded to the base class.
            config: Full configuration dict; ``data.hashmap_path`` gives the
                LMDB location and defaults to "data".
        """
        super().__init__(uri_manager, config)

        db_path = config.get("data", {}).get("hashmap_path", "data")
        self.env = lmdb.open(db_path, readonly=True, lock=False)

        # str.startswith accepts a tuple of candidates, not a set.
        self._included_prefixes_tuple = tuple(self.included_uri_prefixes)
        self._excluded_prefixes_tuple = tuple(self.excluded_uri_prefixes)

    def check_seeds_validity(self, seeds: list[str]) -> dict[str, bool]:
        """Return a dict mapping each seed to whether it has an entry in the store."""
        with self.env.begin() as txn:
            return {
                seed: txn.get(seed.encode("utf-8")) is not None for seed in seeds
            }

    def count_neighbors(self, seeds: list[str]) -> dict[str, int]:
        """Return a dict mapping each seed to its total number of neighbors.

        Seeds that are missing from the store, have an empty value, or hold a
        payload that cannot be read are left out of the result.
        """
        seed_totals: dict[str, int] = {}
        with self.env.begin() as txn:
            for node in seeds:
                val = txn.get(node.encode("utf-8"))
                if not val:
                    continue
                try:
                    data = json.loads(val.decode("utf-8"))
                    seed_totals[node] = sum(len(n) for n in data.values())
                except (ValueError, AttributeError, TypeError):
                    # Best effort: an unreadable entry is skipped, but logged
                    # rather than silently ignored. ValueError covers both
                    # UnicodeDecodeError and json.JSONDecodeError.
                    logger.warning(
                        f"Could not count neighbors for node {node}: unreadable entry. Skipping."
                    )
        return seed_totals

    def get_neighborhood(
        self, nodes: list[str]
    ) -> Generator[list[tuple[str, str, str]], None, None]:
        """Yield the filtered one-hop triplets of ``nodes`` as a single batch.

        Nodes that are missing, unreadable, or at/above the max-neighbors
        threshold (unless listed in ``forced_hubs``) contribute nothing.
        Nothing is yielded when no triplet survives the filters.

        Args:
            nodes: Node identifiers to look up.

        Yields:
            One list of (subject, property, object) tuples.
        """
        triplets: list[tuple[str, str, str]] = []
        skipped_nodes: set[str] = set()

        with self.env.begin() as txn:
            for original_node in nodes:
                value = txn.get(original_node.encode("utf-8"))

                if value is None:
                    logger.warning(f"Node {original_node} not found in hashmap.")
                    continue

                try:
                    neighbors_data = json.loads(value.decode("utf-8"))
                except UnicodeDecodeError:
                    logger.error(
                        f"Failed to decode value for node {original_node}. Skipping."
                    )
                    # Without this `continue` the loop would carry on with an
                    # unbound or stale `neighbors_data`.
                    continue
                except json.JSONDecodeError:
                    logger.error(
                        f"Failed to parse JSON for node {original_node}. Skipping."
                    )
                    continue

                if not isinstance(neighbors_data, dict):
                    logger.error(
                        f"Unexpected payload for node {original_node} (expected a JSON object). Skipping."
                    )
                    continue

                total_neighbors = sum(
                    len(neighbors) for neighbors in neighbors_data.values()
                )
                # NOTE(review): `>=` skips a node sitting exactly on the
                # threshold although the message says "exceeds" - confirm the
                # intended boundary.
                if (
                    total_neighbors >= self.max_neighbors_threshold
                    and original_node not in self.forced_hubs
                ):
                    logger.warning(
                        f"Node {original_node} has {total_neighbors} neighbors, which exceeds the skip threshold of {self.max_neighbors_threshold}. Skipping."
                    )
                    skipped_nodes.add(original_node)
                    continue

                for prop, neighbors in neighbors_data.items():
                    is_inverse = prop.startswith("(-)")
                    clean_prop = prop[3:] if is_inverse else prop

                    if clean_prop in self.excluded_properties:
                        continue

                    for neighbor in neighbors:
                        # Only nodes skipped earlier in this same call are
                        # known here, so this filter depends on input order.
                        if neighbor in self.excluded_nodes or neighbor in skipped_nodes:
                            continue

                        if self._included_prefixes_tuple and not neighbor.startswith(
                            self._included_prefixes_tuple
                        ):
                            continue

                        if self._excluded_prefixes_tuple and neighbor.startswith(
                            self._excluded_prefixes_tuple
                        ):
                            continue

                        if is_inverse:
                            triplets.append((neighbor, clean_prop, original_node))
                        else:
                            triplets.append((original_node, clean_prop, neighbor))

        # Yield after the read transaction is closed, so a suspended generator
        # does not hold it open.
        if triplets:
            yield triplets

    def __del__(self):
        # Best-effort cleanup: `env` may be missing if lmdb.open() failed, and
        # errors during interpreter shutdown must not propagate.
        try:
            self.env.close()
        except Exception:
            pass
@@ -0,0 +1,23 @@
1
+ from abc import ABC, abstractmethod
2
+
3
+
4
+ class BaseClient(ABC):
5
+ """Abstract base class for Knowledge Graph querying clients."""
6
+
7
+ def __init__(self, config: dict):
8
+ self.hub_pairs_batch_size = config.get("extraction", {}).get(
9
+ "hub_pairs_batch_size", 128
10
+ )
11
+
12
+ @abstractmethod
13
+ def query(
14
+ self, query_string: str, silent: bool = False, retries: int = None
15
+ ) -> list[dict]:
16
+ """
17
+ Execute a query and return results in the standard SPARQL JSON bindings format:
18
+ [
19
+ {"subject": {"value": "http://..."}, "property": {"value": "..."}},
20
+ ...
21
+ ]
22
+ """
23
+ pass
@@ -0,0 +1,269 @@
1
+ from collections.abc import Generator
2
+
3
+ from graph_seeder.wrapper.NeighborhoodWrapper import NeighborhoodWrapper
4
+ import logging
5
+ from graph_seeder.wrapper.sparql.SparqlQueryBuilder import SparqlQueryBuilder
6
+ from graph_seeder.utils.URIManager import URIManager
7
+ from graph_seeder.wrapper.sparql.BaseClient import BaseClient
8
+
9
+ logger = logging.getLogger("subgraph")
10
+
11
+
12
class GraphWrapper(NeighborhoodWrapper):
    """Neighborhood wrapper that answers lookups by querying a client.

    Queries are built by ``SparqlQueryBuilder`` and sent through the supplied
    ``BaseClient``. A batch whose query fails is split in halves and retried.
    """

    def __init__(self, uri_manager: URIManager, config: dict, client: BaseClient):
        """Read the extraction settings and build the query builder.

        Args:
            uri_manager: URI helper shared with the query builder.
            config: Full configuration dict.
            client: Client used to execute every query.
        """
        super().__init__(uri_manager, config)

        self.client = client

        extraction_config: dict = config.get("extraction") or {}

        def setting(name: str, default):
            # A key that is absent or explicitly null falls back to the default.
            value = extraction_config.get(name)
            return default if value is None else value

        self.max_neighbors_threshold = setting("max_neighbors_threshold", float("inf"))
        self.hub_pagination_threshold = setting(
            "hub_pagination_threshold", float("inf")
        )
        self.min_triplets_per_property = setting("min_triplets_per_property", 0)
        # Used as the step of range(); a null value must not reach it.
        self.batch_size = setting("batch_size", 20)

        self.request_logging = (config.get("debug") or {}).get(
            "request_logging", False
        )

        # Same tolerance as the base class: a config without "graph_filters"
        # must not fail with KeyError.
        graph_filters = config.get("graph_filters")
        self.query_builder = SparqlQueryBuilder(
            self.uri_manager,
            {} if graph_filters is None else graph_filters,
        )

    @staticmethod
    def _rows_to_triplets(rows: list[dict]) -> list[tuple[str, str, str]]:
        """Convert result rows into (subject, property, object) tuples."""
        return [
            (
                r["subject"]["value"],
                r["property"]["value"],
                r["object"]["value"],
            )
            for r in rows
        ]

    def _execute_with_dichotomy(
        self, items, build_query
    ) -> Generator[list[dict], None, None]:
        """Recursively execute a query with dichotomy splitting on failure.

        Args:
            items: List of items to include in the query (e.g., nodes or node-property pairs).
            build_query: Function that takes a list of items and returns a SPARQL query string

        Yields:
            The non-empty row lists returned by the client, one per successful
            query. A single item whose query still fails is logged and dropped.
        """
        if not items:
            return

        query = build_query(items)
        if self.request_logging:
            logger.info(f"Query for {len(items)} items:\n{query.strip()}")

        try:
            rows = self.client.query(query)
        except RuntimeError:
            if len(items) <= 1:
                logger.error(f"Cannot split further ({len(items)} item(s)), skipping.")
                return
            mid = len(items) // 2
            logger.warning(
                f"Query failed for {len(items)} items, splitting into halves."
            )
            yield from self._execute_with_dichotomy(items[:mid], build_query)
            yield from self._execute_with_dichotomy(items[mid:], build_query)
        else:
            # Yield outside the `try` so only the client call is guarded.
            if rows:
                yield rows

    def check_seeds_validity(self, seeds: list[str]) -> dict[str, bool]:
        """Return a dict of validity flags for the given seeds.

        Keys mapped to True are the node values returned by the endpoint. A
        seed is mapped to False only when its own single-seed query fails.
        Seeds for which a successful query returns no row are absent.
        """
        results: dict[str, bool] = {}

        for start in range(0, len(seeds), self.batch_size):
            batch = seeds[start : start + self.batch_size]
            results.update(self._check_batch_strict(batch))

        return results

    def _check_batch_strict(self, batch: list[str]) -> dict[str, bool]:
        """Check one batch, splitting it in halves whenever its query fails."""
        if not batch:
            return {}

        query = self.query_builder.build_checking_validity_query(batch)
        if self.request_logging:
            logger.info(f"Checking seeds with query:\n{query.strip()}")

        try:
            rows = self.client.query(query, silent=True, retries=1)
        except RuntimeError:
            if len(batch) == 1:
                return {batch[0]: False}

            mid = len(batch) // 2
            # First half first, so the result keeps the order of the seeds.
            merged = self._check_batch_strict(batch[:mid])
            merged.update(self._check_batch_strict(batch[mid:]))
            return merged

        return {r["node"]["value"]: True for r in rows}

    def count_neighbors(self, seeds: list[str]) -> dict[str, int]:
        """Return a dict mapping each node reported by the endpoint to its total number of neighbors."""
        results: dict[str, int] = {}

        for start in range(0, len(seeds), self.batch_size):
            batch = seeds[start : start + self.batch_size]
            for node_total in self._get_hub_seeds(batch):
                results.update(node_total)

        return results

    def _get_hub_seeds(self, seeds: list[str]) -> Generator[dict[str, int], None, None]:
        """Yield one single-entry ``{node: total}`` dict per row of the total-neighbors query."""
        for raw_rows in self._execute_with_dichotomy(
            seeds, self.query_builder.build_total_neighbors_query
        ):
            for r in raw_rows:
                yield {r["node"]["value"]: int(r["total"]["value"])}

    def get_neighborhood(
        self, nodes: list[str]
    ) -> Generator[list[tuple[str, str, str]], None, None]:
        """Fetch one-hop neighbors using property occurrence to decide strategy.

        Per batch of ``batch_size`` nodes, each node is handled by its total
        neighbor count:
        - above ``max_neighbors_threshold`` and not forced: skipped;
        - above ``hub_pagination_threshold`` or forced: fetched per
          (node, property) pair, keeping only properties with at least
          ``min_triplets_per_property`` occurrences;
        - otherwise: fetched together with the other regular nodes.

        Yields:
            Lists of (subject, property, object) tuples, one per successful query.
        """
        if not nodes:
            return

        for start in range(0, len(nodes), self.batch_size):
            batch_nodes = nodes[start : start + self.batch_size]

            node_totals: dict[str, int] = {}
            node_to_props: dict[str, list[tuple[str, int]]] = {}
            for node, prop, count in self._get_properties_statistics(batch_nodes):
                node_totals[node] = node_totals.get(node, 0) + count
                node_to_props.setdefault(node, []).append((prop, count))

            safe_nodes: list[str] = []
            hub_pairs_to_chunk: list[tuple[str, str, int]] = []

            for node in batch_nodes:
                # A node without statistics counts as 0 and is treated as safe.
                total = node_totals.get(node, 0)
                is_force_paginate = node in self.forced_hubs

                if total > self.max_neighbors_threshold and not is_force_paginate:
                    logger.warning(
                        f"Node {node} has total {total} neighbors, which exceeds the skip threshold of {self.max_neighbors_threshold}. Skipping."
                    )
                elif total > self.hub_pagination_threshold or is_force_paginate:
                    hub_pairs_to_chunk.extend(
                        (node, prop, count)
                        for prop, count in node_to_props.get(node, [])
                        if count >= self.min_triplets_per_property
                    )
                else:
                    safe_nodes.append(node)

            if safe_nodes:
                for raw_rows in self._execute_with_dichotomy(
                    safe_nodes, self.query_builder.build_neighborhood_query
                ):
                    yield self._rows_to_triplets(raw_rows)

            if hub_pairs_to_chunk:
                for hub_batch in self._chunk_hub_properties(hub_pairs_to_chunk):
                    for raw_rows in self._execute_with_dichotomy(
                        hub_batch, self.query_builder.build_hub_neighborhood_query
                    ):
                        yield self._rows_to_triplets(raw_rows)

    def _get_properties_statistics(
        self, nodes: list[str]
    ) -> list[tuple[str, str, int]]:
        """Execute the prop occurrence query and return raw results.

        Args:
            nodes: List of node IDs to analyze.
        Returns:
            List of (node, property, count) tuples from the query results.
        """
        all_stats: list[tuple[str, str, int]] = []
        for raw_rows in self._execute_with_dichotomy(
            nodes, self.query_builder.build_prop_occurrence_query
        ):
            all_stats.extend(
                (
                    r["entity"]["value"],
                    r["property"]["value"],
                    int(r["count"]["value"]),
                )
                for r in raw_rows
            )
        return all_stats

    def _chunk_hub_properties(
        self, prop_counts: list[tuple[str, str, int]]
    ) -> list[list[tuple[str, str]]]:
        """
        Group (node, prop) pairs into batches using Largest-Smallest strategy.

        Each batch starts with the largest remaining pair and is then filled
        with the smallest remaining pairs. Filling stops when the summed count
        would exceed ``max_neighbors_threshold`` or the batch reaches the
        client's ``hub_pairs_batch_size``.

        Args:
            prop_counts: List of (node, property, count) tuples, in any order
                (sorted by count here).
        Returns:
            List of batches, where each batch is a list of (node, property) pairs.
        """
        if not prop_counts:
            return []

        max_pairs_per_batch = self.client.hub_pairs_batch_size

        sorted_props = sorted(prop_counts, key=lambda x: x[2], reverse=True)
        batches: list[list[tuple[str, str]]] = []

        left = 0
        right = len(sorted_props) - 1

        while left <= right:
            # The largest remaining pair always opens a batch, even when it is
            # over the budget on its own.
            node_l, prop_l, count_l = sorted_props[left]
            current_batch = [(node_l, prop_l)]
            current_sum = count_l
            left += 1

            while left <= right and len(current_batch) < max_pairs_per_batch:
                node_r, prop_r, count_r = sorted_props[right]
                if current_sum + count_r > self.max_neighbors_threshold:
                    break
                current_batch.append((node_r, prop_r))
                current_sum += count_r
                right -= 1

            batches.append(current_batch)

        return batches