semnet 0.1.3__py3-none-any.whl
- semnet/__init__.py +15 -0
- semnet/semnet.py +461 -0
- semnet-0.1.3.dist-info/METADATA +209 -0
- semnet-0.1.3.dist-info/RECORD +7 -0
- semnet-0.1.3.dist-info/WHEEL +5 -0
- semnet-0.1.3.dist-info/licenses/LICENSE +21 -0
- semnet-0.1.3.dist-info/top_level.txt +1 -0
semnet/__init__.py
ADDED
@@ -0,0 +1,15 @@
+"""Semnet: Semantic Network Deduplication
+
+A Python package for building semantic networks using embeddings and graph clustering
+to perform intelligent deduplication of text data.
+"""
+
+__version__ = "0.1.0"
+__author__ = "Ian Goodrich"
+__email__ = "ian@igdr.ch"
+
+from .semnet import SemanticNetwork
+
+__all__ = [
+    "SemanticNetwork",
+]
semnet/semnet.py
ADDED
@@ -0,0 +1,461 @@
+import logging
+from typing import Dict, List, Literal, Optional, Tuple
+
+import networkx as nx
+import numpy as np
+import pandas as pd
+from annoy import AnnoyIndex
+from tqdm.auto import tqdm
+
+logger = logging.getLogger(__name__)
+
+MetricType = Literal["angular", "euclidean", "manhattan", "hamming", "dot"]
+
+
+class SemanticNetwork:
+    """
+    A semantic network builder for creating graphs from document embeddings.
+
+    This class follows the scikit-learn pattern with fit() and transform() methods.
+    Users must provide pre-computed embeddings during the fit process.
+
+    The fitting process builds an approximate nearest neighbor index from embeddings.
+    The transformation process constructs a graph where edges represent semantic similarity.
+
+    Key Methods:
+        fit(): Build the similarity index from provided embeddings
+        transform(): Construct and return a networkx object
+        fit_transform(): Combined fit and transform in one step
+        to_pandas(): Export graph structure to pandas DataFrames for analysis
+
+    Attributes:
+        metric: Distance metric for the Annoy index
+        n_trees: Number of trees for the Annoy index
+        thresh: Similarity threshold for connecting documents
+        top_k: Maximum neighbors to check per document
+        verbose: Whether to show progress bars and detailed logging
+        is_fitted_: Whether the model has been fitted
+        embeddings_: Document embeddings array (available after fitting)
+        index_: Annoy index for similarity search (available after fitting)
+    """
+
+    def __init__(
+        self,
+        metric: MetricType = "angular",
+        n_trees: int = 10,
+        thresh: float = 0.7,
+        top_k: int = 100,
+        verbose: bool = False,
+    ) -> None:
+        """
+        Initialize the SemanticNetwork.
+
+        Args:
+            metric: Distance metric for Annoy index ('angular', 'euclidean', etc.)
+            n_trees: Number of trees for Annoy index (more = better accuracy, slower build)
+            thresh: Similarity threshold for connecting documents (0.0 to 1.0)
+            top_k: Maximum number of neighbors to check per document
+            verbose: Whether to show progress bars and detailed logging
+        """
+        self.metric = metric
+        self.n_trees = n_trees
+        self.thresh = thresh
+        self.top_k = top_k
+        self.verbose = verbose
+
+        # Fitted state
+        self.is_fitted_ = False
+        self.embeddings_: Optional[np.ndarray] = None
+        self.index_: Optional[AnnoyIndex] = None
+
+        # Training data (stored during fit)
+        self._labels: Optional[List[str]] = None
+        self._node_data: Optional[Dict] = None
+
+    def fit(
+        self,
+        embeddings: np.ndarray,
+    ) -> "SemanticNetwork":
+        """
+        Build the index from document embeddings.
+
+        This method uses provided embeddings to create an Annoy index for
+        fast nearest neighbor search.
+
+        Args:
+            embeddings: Pre-computed embeddings array with shape (n_docs, embedding_dim).
+
+        Returns:
+            self: Returns the fitted estimator
+        """
+
+        self.embeddings_ = embeddings
+
+        if self.verbose:
+            logger.info(
+                f"Using provided embeddings with shape: {self.embeddings_.shape}"
+            )
+            logger.info(
+                f"Fitting SemanticNetwork on {len(embeddings)} documents"
+            )
+
+        # Build the vector index
+        self._build_vector_index()
+
+        self.is_fitted_ = True
+
+        if self.verbose:
+            logger.info("Fitting complete")
+
+        return self
+
+    def transform(
+        self,
+        thresh: Optional[float] = None,
+        top_k: Optional[int] = None,
+        labels: Optional[List[str]] = None,
+        node_data: Optional[Dict] = None,
+    ) -> nx.Graph:
+        """
+        Build and return a weighted graph from the fitted embeddings.
+
+        Args:
+            thresh: The similarity threshold for edge inclusion.
+                If None, uses the threshold from initialization.
+            top_k: Optional max neighbors override for this transform.
+                If None, uses the top_k from initialization.
+            labels: Optional list of text labels/documents for the embeddings.
+                If not provided, string indices are used as labels.
+            node_data: Optional dictionary containing additional data to attach to nodes.
+                Format: {node_index: {attribute_name: value, ...}, ...}
+                OR {node_index: single_value, ...} (stored as {'value': single_value}).
+                Only nodes present in the dictionary get additional attributes.
+
+        Returns:
+            NetworkX graph where nodes represent documents and edges represent
+            similarities above the threshold.
+
+        Raises:
+            ValueError: If the model hasn't been fitted yet
+            ValueError: If labels or node_data don't match the fitted embeddings
+        """
+        if not self.is_fitted_:
+            raise ValueError(
+                "This SemanticNetwork instance is not fitted yet. Call 'fit' first."
+            )
+
+        n_docs = self.embeddings_.shape[0]
+
+        if labels is not None and len(labels) != n_docs:
+            raise ValueError(
+                f"Labels length ({len(labels)}) must match embeddings length ({n_docs})"
+            )
+
+        if node_data is not None:
+            # Validate node_data format: should be {node_index: {attribute_dict}} or {node_index: value}
+            if not isinstance(node_data, dict):
+                raise ValueError("Node data must be a dictionary")
+
+            # Check if all keys are integers (node indices)
+            non_integer_keys = [
+                k
+                for k in node_data.keys()
+                if not isinstance(k, (int, np.integer))
+            ]
+            if non_integer_keys:
+                raise ValueError(
+                    f"Node data keys must be integer node indices, got: {non_integer_keys}"
+                )
+
+            # Validate that node_data keys are valid node indices
+            invalid_indices = [
+                idx for idx in node_data.keys() if idx >= n_docs or idx < 0
+            ]
+            if invalid_indices:
+                raise ValueError(
+                    f"Node data contains invalid indices {invalid_indices}. Indices must be 0 <= idx < {n_docs}"
+                )
+
+            # Convert single values to dictionary format for consistency
+            # If values are not dictionaries, wrap them in a dictionary with 'value' key
+            processed_node_data = {}
+            for k, v in node_data.items():
+                if isinstance(v, dict):
+                    processed_node_data[k] = v
+                else:
+                    processed_node_data[k] = {"value": v}
+            node_data = processed_node_data
+
+        # Store training data
+        self._labels = (
+            labels if labels is not None else [str(i) for i in range(n_docs)]
+        )
+        self._node_data = node_data
+
+        # Use provided thresholds or fall back to instance defaults
+        effective_thresh = thresh if thresh is not None else self.thresh
+        effective_top_k = top_k if top_k is not None else self.top_k
+
+        # Get pairwise similarities
+        neighbor_data = self._get_pairwise_similarities(
+            effective_thresh, effective_top_k
+        )
+
+        # Build and return the graph
+        return self._build_graph(neighbor_data)
+
+    def fit_transform(
+        self,
+        embeddings: np.ndarray,
+        labels: Optional[List[str]] = None,
+        node_data: Optional[Dict] = None,
+        thresh: Optional[float] = None,
+        top_k: Optional[int] = None,
+    ) -> nx.Graph:
+        """
+        Fit the model and transform the embeddings in one step.
+
+        Args:
+            embeddings: Pre-computed embeddings array with shape (n_docs, embedding_dim).
+            labels: Optional list of text labels/documents for the embeddings.
+            node_data: Optional dictionary containing additional data to attach to nodes.
+            thresh: Optional similarity threshold override for this transform.
+            top_k: Optional max neighbors override for this transform.
+
+        Returns:
+            NetworkX graph representing the semantic network
+        """
+        return self.fit(embeddings=embeddings).transform(
+            thresh=thresh, top_k=top_k, labels=labels, node_data=node_data
+        )
+
+    def to_pandas(
+        self, graph: Optional[nx.Graph] = None
+    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
+        """
+        Export a NetworkX graph to pandas DataFrames.
+
+        Accepts any NetworkX graph produced by transform(), including subgraphs
+        or modified copies. The class does not store graphs itself, so a graph
+        must be passed explicitly.
+
+        Args:
+            graph: NetworkX graph to export. Raises a ValueError if None.
+
+        Returns:
+            Tuple[pd.DataFrame, pd.DataFrame]: A tuple containing:
+                - nodes (pd.DataFrame): Node attributes with index as node ID.
+                  Columns include all node attributes from the graph.
+                - edges (pd.DataFrame): Edge list with columns 'source', 'target',
+                  and any edge attributes (e.g., 'weight').
+
+        Raises:
+            ValueError: If no graph is provided
+
+        Examples:
+            >>> # Build and export a graph
+            >>> network = SemanticNetwork(thresh=0.8)
+            >>> graph = network.fit_transform(embeddings, labels=docs)
+            >>> nodes, edges = network.to_pandas(graph)
+
+            >>> # Export a subgraph
+            >>> subgraph = graph.subgraph([0, 1, 2])
+            >>> sub_nodes, sub_edges = network.to_pandas(subgraph)
+        """
+        if graph is None:
+            raise ValueError(
+                "No graph provided. Call transform() to get a graph, then pass it to to_pandas()."
+            )
+
+        # Convert nodes to DataFrame
+        nodes = pd.DataFrame.from_dict(
+            dict(graph.nodes(data=True)), orient="index"
+        )
+
+        # Convert edges to DataFrame
+        if graph.number_of_edges() > 0:
+            edges = nx.to_pandas_edgelist(graph)
+        else:
+            # Create empty DataFrame with expected columns if no edges
+            edges = pd.DataFrame(columns=["source", "target"])
+
+        return nodes, edges
+
+    def _build_vector_index(self) -> AnnoyIndex:
+        """
+        Build an Annoy index for fast approximate nearest neighbor search.
+
+        Returns:
+            The built Annoy index
+
+        Raises:
+            ValueError: If embeddings haven't been provided yet
+
+        Note:
+            The index is stored in self.index_ and also returned.
+        """
+        if self.embeddings_ is None:
+            raise ValueError(
+                "Embeddings not found. Please provide embeddings in fit() method."
+            )
+
+        embedding_dim = self.embeddings_.shape[1]
+        self.index_ = AnnoyIndex(embedding_dim, self.metric)  # type: ignore
+
+        if self.verbose:
+            logger.info(
+                f"Building Annoy index with {self.n_trees} trees for {len(self.embeddings_)} embeddings"
+            )
+            iterator = tqdm(
+                enumerate(self.embeddings_),
+                total=len(self.embeddings_),
+                desc="Adding embeddings to index",
+            )
+        else:
+            iterator = enumerate(self.embeddings_)
+
+        for i, embedding_vector in iterator:
+            self.index_.add_item(i, embedding_vector)
+
+        if self.verbose:
+            logger.info("Building index trees...")
+        self.index_.build(self.n_trees)
+
+        if self.verbose:
+            logger.info("Vector index built successfully")
+
+        return self.index_
+
+    def _get_pairwise_similarities(
+        self, thresh: float, top_k: int
+    ) -> pd.DataFrame:
+        """
+        Find pairwise similarities between documents above a threshold.
+
+        Uses the Annoy index to efficiently find nearest neighbors for each document,
+        then converts the returned distances to similarities and filters by threshold.
+
+        Args:
+            thresh: Similarity threshold for including edges
+            top_k: Maximum number of neighbors to check per document
+
+        Returns:
+            DataFrame of similarities with columns: source_idx, target_idx, weight, source_name, target_name
+
+        Raises:
+            ValueError: If embeddings or index haven't been built yet
+        """
+        if self.embeddings_ is None or self.index_ is None:
+            raise ValueError(
+                "Embeddings or index not found. Please provide embeddings in fit() method and run _build_vector_index() first."
+            )
+
+        if self._labels is None:
+            raise ValueError("No training documents found. Call fit() first.")
+
+        if self.verbose:
+            logger.info(
+                f"Finding pairwise similarities with threshold {thresh}, checking top {top_k} neighbors"
+            )
+
+        results = []
+
+        if self.verbose:
+            iterator = tqdm(
+                range(len(self.embeddings_)), desc="Finding similarities"
+            )
+        else:
+            iterator = range(len(self.embeddings_))
+
+        for idx_source in iterator:
+            neighbors = self.index_.get_nns_by_item(
+                idx_source, top_k, include_distances=True
+            )
+
+            for idx_target, dist in zip(*neighbors):
+                similarity = 1 - dist  # Convert angular distance to similarity
+
+                if idx_source != idx_target and similarity >= thresh:
+                    result_dict = {
+                        "source_idx": idx_source,
+                        "target_idx": idx_target,
+                        "weight": similarity,
+                        "source_name": self._labels[idx_source],
+                        "target_name": self._labels[idx_target],
+                    }
+                    results.append(result_dict)
+
+        neighbor_data = pd.DataFrame(results)
+
+        if self.verbose:
+            logger.info(
+                f"Found {len(neighbor_data)} similarity pairs above threshold {thresh}"
+            )
+
+        return neighbor_data
+
+    def _build_graph(self, neighbor_data: pd.DataFrame) -> nx.Graph:
+        """
+        Build a NetworkX graph from pairwise similarities.
+
+        Creates a graph where:
+        - Nodes represent documents
+        - Edges represent similarities above the threshold (with 'weight' attribute representing similarity)
+
+        Args:
+            neighbor_data: DataFrame of pairwise similarities
+
+        Returns:
+            The constructed NetworkX graph
+
+        Raises:
+            ValueError: If training data hasn't been set
+
+        Note:
+            The graph includes all documents as nodes, even if they have no similarities above threshold.
+        """
+
+        if self._labels is None:
+            raise ValueError("No training documents found. Call fit() first.")
+
+        if self.verbose:
+            logger.info(
+                f"Building graph from {len(neighbor_data)} similarity edges"
+            )
+
+        # Instantiate undirected graph
+        G = nx.Graph()
+
+        # Add all nodes with their attributes
+        for i in range(len(self._labels)):
+            # Set basic attributes
+            attrs = {
+                "label": self._labels[i],
+                "id": i,
+            }
+
+            # Add custom node data if provided for this specific node
+            if self._node_data is not None and i in self._node_data:
+                attrs.update(self._node_data[i])
+
+            G.add_node(i, **attrs)
+
+        # Add edges from neighbor data
+        for _, row in neighbor_data.iterrows():
+            G.add_edge(
+                row["source_idx"],
+                row["target_idx"],
+                weight=row["weight"],
+            )
+
+        if self.verbose:
+            num_components = nx.number_connected_components(G)
+            logger.info(
+                f"Built graph with {G.number_of_nodes()} nodes, {G.number_of_edges()} edges, {num_components} components"
+            )
+
+        return G
semnet-0.1.3.dist-info/METADATA
ADDED
@@ -0,0 +1,209 @@
+Metadata-Version: 2.4
+Name: semnet
+Version: 0.1.3
+Summary: Semantic Networks from Embeddings
+Author-email: Ian Goodrich <ian@igdr.ch>
+License: MIT
+Project-URL: Homepage, https://github.com/specialprocedures/semnet
+Project-URL: Documentation, https://semnetdocs.readthedocs.io
+Project-URL: Repository, https://github.com/specialprocedures/semnet
+Project-URL: Bug Tracker, https://github.com/specialprocedures/semnet/issues
+Keywords: semantic networks,embeddings,graph analysis,nlp,similarity,networkx
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: networkx>=2.5
+Requires-Dist: annoy>=1.17.0
+Requires-Dist: numpy>=1.19.0
+Requires-Dist: pandas>=1.2.0
+Requires-Dist: tqdm>=4.60.0
+Provides-Extra: dev
+Requires-Dist: pytest>=6.0; extra == "dev"
+Requires-Dist: pytest-cov>=2.10; extra == "dev"
+Requires-Dist: black>=21.0; extra == "dev"
+Requires-Dist: isort>=5.0; extra == "dev"
+Requires-Dist: flake8>=3.8; extra == "dev"
+Requires-Dist: mypy>=0.800; extra == "dev"
+Requires-Dist: sentence-transformers>=2.0.0; extra == "dev"
+Provides-Extra: docs
+Requires-Dist: sphinx>=4.0; extra == "docs"
+Requires-Dist: sphinx-rtd-theme>=1.0; extra == "docs"
+Requires-Dist: myst-parser>=0.17; extra == "docs"
+Provides-Extra: examples
+Requires-Dist: sentence-transformers>=2.0.0; extra == "examples"
+Requires-Dist: matplotlib>=3.3.0; extra == "examples"
+Requires-Dist: jupyter>=1.0.0; extra == "examples"
+Dynamic: license-file
+
+
+# Semnet: Graph structures from embeddings
+
+
+_Embeddings of Guardian headlines represented as a network by Semnet and visualised in [Cosmograph](https://cosmograph.app)_
+
+Semnet constructs graph structures from embeddings, enabling graph-based analysis and operations over embedded documents, images, and more.
+
+Semnet uses [Annoy](https://github.com/spotify/annoy) to perform efficient pair-wise distance calculations across all embeddings in the dataset, then constructs [NetworkX](https://networkx.org) graphs representing relationships between embeddings.
+
+## Use cases
+Semnet may be used for:
+- **Deduplication**: remove duplicate records (e.g., "Donald Trump", "Donald J. Trump") from datasets (see the sketch after this list)
+- **Clustering**: find groups of similar documents via [community detection](https://networkx.org/documentation/stable/reference/algorithms/community.html) algorithms
+- **Recommendation systems**: account for relationships, and take advantage of graph structures such as communities and paths in search and RAG
+- **Knowledge graph construction**: build networks of related concepts or entities; because the output is a regular NetworkX graph, it is easy to add further entities
+- **Exploratory data analysis and visualisation**: [Cosmograph](https://cosmograph.app/) works brilliantly for large corpora
+
+Exposing the full NetworkX and Annoy APIs, Semnet offers plenty of opportunity for experimentation depending on your use-case. Check out the examples for inspiration.
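+
+For example, deduplication and clustering both reduce to standard NetworkX operations on the graph. The sketch below is illustrative rather than part of the package API: it assumes a graph `G` already built with `fit_transform` (as in the Quick Start below), where each node carries a `label` attribute and each edge a `weight`.
+
+```python
+import networkx as nx
+from networkx.algorithms import community
+
+# Deduplication: treat each connected component as a group of near-duplicates
+# and keep a single representative label per component.
+representatives = [
+    G.nodes[min(component)]["label"] for component in nx.connected_components(G)
+]
+
+# Clustering: weighted community detection over the similarity graph.
+clusters = community.greedy_modularity_communities(G, weight="weight")
+for i, cluster in enumerate(clusters):
+    print(i, [G.nodes[n]["label"] for n in cluster])
+```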
+
+
+## Quick Start
+
+```python
+from semnet import SemanticNetwork
+from sentence_transformers import SentenceTransformer
+import networkx as nx
+
+# Your documents
+docs = [
+    "The cat sat on the mat",
+    "A cat was sitting on a mat",
+    "The dog ran in the park",
+    "I love Python",
+    "Python is a great programming language",
+]
+
+# Generate embeddings (use any embedding provider)
+embedding_model = SentenceTransformer("BAAI/bge-base-en-v1.5")
+embeddings = embedding_model.encode(docs)
+
+# Create and configure semantic network
+sem = SemanticNetwork(thresh=0.3, verbose=True)  # Larger values give sparser networks
+
+# Build the semantic graph from your embeddings
+G = sem.fit_transform(embeddings, labels=docs)
+
+# Analyze the graph
+print(f"Nodes: {G.number_of_nodes()}")
+print(f"Edges: {G.number_of_edges()}")
+print(f"Connected components: {nx.number_connected_components(G)}")
+
+# Find similar document groups
+for component in nx.connected_components(G):
+    if len(component) > 1:
+        similar_docs = [G.nodes[i]["label"] for i in component]
+        print(f"Similar documents: {similar_docs}")
+
+# Calculate centrality measures and store them as node attributes
+# (degree centrality is not very interesting on this tiny example, but shown for demonstration)
+centrality = nx.degree_centrality(G)
+for node, cent_value in centrality.items():
+    print(f"Document: {G.nodes[node]['label']}, Degree Centrality: {cent_value:.4f}")
+    G.nodes[node]["degree_centrality"] = cent_value
+
+# Export to pandas
+nodes_df, edges_df = sem.to_pandas(G)
+```
+
+## Installation
+
+```bash
+pip install semnet
+```
+
+For development:
+
+```bash
+git clone https://github.com/specialprocedures/semnet.git
+cd semnet
+pip install -e ".[dev]"
+```
+
+## Configuration Options
+
+### SemanticNetwork Parameters
+
+- **metric**: Distance metric for Annoy index ('angular', 'euclidean', etc.) (default: 'angular')
+- **n_trees**: Number of trees for Annoy index (more = better accuracy, slower) (default: 10)
+- **thresh**: Similarity threshold (0.0 to 1.0) (default: 0.7)
+- **top_k**: Maximum neighbors to check per document (default: 100)
+- **verbose**: Show progress bars and logging (default: False)
+
+### Method Parameters
+
+- **fit(embeddings)**:
+  - embeddings: required pre-computed embeddings array with shape (n_docs, embedding_dim)
+- **transform(thresh=None, top_k=None, labels=None, node_data=None)**:
+  - thresh, top_k: optional threshold and neighbor-count overrides for this transform
+  - labels: optional text labels/documents for the embeddings
+  - node_data: optional dictionary of additional data to attach to nodes
+- **fit_transform(embeddings, labels=None, node_data=None, thresh=None, top_k=None)**: Combined fit and transform
+- **to_pandas(graph)**: Export a NetworkX graph to pandas DataFrames
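+
+As a minimal sketch of these parameters (assuming `embeddings` and `docs` from the Quick Start above; the `category` attribute is purely illustrative), labels and per-node attributes are supplied at transform time, and the threshold can be overridden without re-fitting:
+
+```python
+sem = SemanticNetwork(metric="angular", n_trees=10, thresh=0.7, top_k=100)
+sem.fit(embeddings)
+
+# Attach extra attributes to selected nodes; plain values are stored under "value".
+node_data = {0: {"category": "animals"}, 3: "programming"}
+
+# Re-use the fitted index at different thresholds to compare graph densities.
+G_sparse = sem.transform(thresh=0.8, labels=docs, node_data=node_data)
+G_dense = sem.transform(thresh=0.5, labels=docs, node_data=node_data)
+
+nodes_df, edges_df = sem.to_pandas(G_dense)
+```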
+
+## Performance Tips
+
+- Use `"angular"` metric for cosine similarity (default and recommended)
+- Increase `n_trees` for better accuracy (try 50-100 for large datasets)
+- Decrease `top_k` if you have memory constraints
+- Use smaller embedding models for speed: `"all-MiniLM-L6-v2"`
+- Use larger models for accuracy: `"BAAI/bge-large-en-v1.5"`
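+
+For instance, a configuration tuned for a larger corpus might look like the following (a sketch with illustrative values; `corpus` stands in for your own list of documents):
+
+```python
+from sentence_transformers import SentenceTransformer
+from semnet import SemanticNetwork
+
+# Smaller model for speed; more trees for index accuracy; fewer neighbours per query.
+model = SentenceTransformer("all-MiniLM-L6-v2")
+embeddings = model.encode(corpus)
+
+sem = SemanticNetwork(metric="angular", n_trees=50, top_k=50)
+G = sem.fit_transform(embeddings, labels=corpus)
+```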
+
+## Requirements
+
+- Python 3.8+
+- networkx
+- annoy
+- numpy
+- pandas
+- tqdm
+
+
+## Project origin and statement on the use of AI
+
+I love network analysis, and have explored embedding-derived [semantic networks](https://en.wikipedia.org/wiki/Semantic_network) in the past as an alternative approach to representing, clustering and querying news data.
+
+Whilst using semantic networks for graph analysis on some forthcoming research, I decided to package some of my code for others to use.
+
+I kicked off the project by hand-refactoring my initial code into the class-based structure that forms the core functionality of the current module.
+
+I then used GitHub Copilot in VS Code to:
+- Bootstrap scaffolding, tests, documentation, examples and typing
+- Refactor the core methods in the style of the scikit-learn API
+- Add additional functionality for convenient analysis of graph structures and to allow the use of custom embeddings
+
+## Roadmap
+
+Semnet is a relatively simple project focused on core graph construction functionality. Potential future additions:
+- Better examples showcasing network analysis on large corpora
+- Integration with graph visualization tools
+- Performance optimizations for very large datasets
+
+## License
+
+MIT License
+
+## Citation
+
+If you use Semnet in academic work, please cite:
+
+```bibtex
+@software{semnet,
+  title={Semnet: Semantic Networks from Embeddings},
+  author={Ian Goodrich},
+  year={2025},
+  url={https://github.com/specialprocedures/semnet}
+}
+```
semnet-0.1.3.dist-info/RECORD
ADDED
@@ -0,0 +1,7 @@
+semnet/__init__.py,sha256=_zSU-x86hBwF-ZC9a4p8f5DUZpvoJalhTYXXaQKYYM8,336
+semnet/semnet.py,sha256=OEqOh9pB17MAiPInK77BUout0EVaKlR4I87EMyQxYNg,16481
+semnet-0.1.3.dist-info/licenses/LICENSE,sha256=0TzAvDoYO4STeUC-y5HUHwD__mh19DrcV1u3pFHM1Uc,1068
+semnet-0.1.3.dist-info/METADATA,sha256=GNnLnEI68S6eyhR4CXW194h0CwIj3z-5IeTOU69SddU,8364
+semnet-0.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+semnet-0.1.3.dist-info/top_level.txt,sha256=L9LrXn-MHrZUilHd5D4NIPeKusKYrKzD3m9KRD-rRBY,7
+semnet-0.1.3.dist-info/RECORD,,
semnet-0.1.3.dist-info/licenses/LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 Ian Goodrich
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
semnet-0.1.3.dist-info/top_level.txt
ADDED
@@ -0,0 +1 @@
+semnet