semnet 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
semnet/__init__.py ADDED
@@ -0,0 +1,15 @@
"""Semnet: Semantic Network Deduplication

A Python package for building semantic networks using embeddings and graph clustering
to perform intelligent deduplication of text data.
"""

__version__ = "0.1.0"
__author__ = "Ian Goodrich"
__email__ = "ian@igdr.ch"

from .semnet import SemanticNetwork

__all__ = [
    "SemanticNetwork",
]
semnet/semnet.py ADDED
@@ -0,0 +1,461 @@
import logging
from typing import Dict, List, Literal, Optional, Tuple

import networkx as nx
import numpy as np
import pandas as pd
from annoy import AnnoyIndex
from tqdm.auto import tqdm

logger = logging.getLogger(__name__)

MetricType = Literal["angular", "euclidean", "manhattan", "hamming", "dot"]


class SemanticNetwork:
    """
    A semantic network builder for creating graphs from document embeddings.

    This class follows the scikit-learn pattern with fit() and transform() methods.
    Users must provide pre-computed embeddings during the fit process.

    The fitting process builds an approximate nearest neighbor index from embeddings.
    The transformation process constructs a graph where edges represent semantic similarity.

    Key Methods:
        fit(): Build the similarity index from provided embeddings
        transform(): Construct and return a networkx object
        fit_transform(): Combined fit and transform in one step
        to_pandas(): Export graph structure to pandas DataFrames for analysis

    Attributes:
        metric: Distance metric for the Annoy index
        n_trees: Number of trees for the Annoy index
        thresh: Similarity threshold for connecting documents
        top_k: Maximum neighbors to check per document
        verbose: Whether to show progress bars and detailed logging
        is_fitted_: Whether the model has been fitted
        embeddings_: Document embeddings array (available after fitting)
        index_: Annoy index for similarity search (available after fitting)
    """

    def __init__(
        self,
        metric: MetricType = "angular",
        n_trees: int = 10,
        thresh: float = 0.7,
        top_k: int = 100,
        verbose: bool = False,
    ) -> None:
        """
        Initialize the SemanticNetwork.

        Args:
            metric: Distance metric for Annoy index ('angular', 'euclidean', etc.)
            n_trees: Number of trees for Annoy index (more = better accuracy, slower build)
            thresh: Similarity threshold for connecting documents (0.0 to 1.0)
            top_k: Maximum number of neighbors to check per document
            verbose: Whether to show progress bars and detailed logging
        """
        self.metric = metric
        self.n_trees = n_trees
        self.thresh = thresh
        self.top_k = top_k
        self.verbose = verbose

        # Fitted state
        self.is_fitted_ = False
        self.embeddings_: Optional[np.ndarray] = None
        self.index_: Optional[AnnoyIndex] = None

        # Training data (stored during fit)
        self._labels: Optional[List[str]] = None
        self._node_data: Optional[Dict] = None

    def fit(
        self,
        embeddings: np.ndarray,
    ) -> "SemanticNetwork":
        """
        Build the index from document embeddings.

        This method uses provided embeddings to create an Annoy index for
        fast nearest neighbor search.

        Args:
            embeddings: Pre-computed embeddings array with shape (n_docs, embedding_dim).

        Returns:
            self: Returns the fitted estimator
        """

        self.embeddings_ = embeddings

        if self.verbose:
            logger.info(
                f"Using provided embeddings with shape: {self.embeddings_.shape}"
            )
            logger.info(
                f"Fitting SemanticNetwork on {len(embeddings)} documents"
            )

        # Build the vector index
        self._build_vector_index()

        self.is_fitted_ = True

        if self.verbose:
            logger.info("Fitting complete")

        return self

    def transform(
        self,
        thresh: Optional[float] = None,
        top_k: Optional[int] = None,
        labels: Optional[List[str]] = None,
        node_data: Optional[Dict] = None,
    ) -> nx.Graph:
        """
        Build and return a weighted graph from the fitted embeddings.

        Args:
            thresh: The similarity threshold for edge inclusion.
                If None, uses the threshold from initialization.
            top_k: Optional max neighbors override for this transform.
                If None, uses the top_k from initialization.
            labels: Optional list of text labels/documents for the embeddings.
                If not provided, string indices are used as labels.
            node_data: Optional dictionary of additional data to attach to nodes.
                Format: {node_index: {attribute_name: value, ...}, ...}
                OR {node_index: single_value, ...} (stored as {'value': single_value}).
                Only nodes present in the dictionary will get additional attributes.

        Returns:
            NetworkX graph where nodes represent documents and edges represent
            similarities above the threshold.

        Raises:
            ValueError: If the model hasn't been fitted yet
            ValueError: If labels are provided but their length doesn't match the embeddings
            ValueError: If node_data is malformed or contains invalid node indices
        """
        if not self.is_fitted_:
            raise ValueError(
                "This SemanticNetwork instance is not fitted yet. Call 'fit' first."
            )

        n_docs = self.embeddings_.shape[0]

        if labels is not None and len(labels) != n_docs:
            raise ValueError(
                f"Labels length ({len(labels)}) must match embeddings length ({n_docs})"
            )

        if node_data is not None:
            # Validate node_data format: should be {node_index: {attribute_dict}} or {node_index: value}
            if not isinstance(node_data, dict):
                raise ValueError("Node data must be a dictionary")

            # Check if all keys are integers (node indices)
            non_integer_keys = [
                k
                for k in node_data.keys()
                if not isinstance(k, (int, np.integer))
            ]
            if non_integer_keys:
                raise ValueError(
                    f"Node data keys must be integer node indices, got: {non_integer_keys}"
                )

            # Validate that node_data keys are valid node indices
            invalid_indices = [
                idx for idx in node_data.keys() if idx >= n_docs or idx < 0
            ]
            if invalid_indices:
                raise ValueError(
                    f"Node data contains invalid indices {invalid_indices}. Indices must be 0 <= idx < {n_docs}"
                )

            # Convert single values to dictionary format for consistency
            # If values are not dictionaries, wrap them in a dictionary with 'value' key
            processed_node_data = {}
            for k, v in node_data.items():
                if isinstance(v, dict):
                    processed_node_data[k] = v
                else:
                    processed_node_data[k] = {"value": v}
            node_data = processed_node_data

        # Store training data
        self._labels = (
            labels if labels is not None else [str(i) for i in range(n_docs)]
        )
        self._node_data = node_data

        # Use provided thresholds or fall back to instance defaults
        effective_thresh = thresh if thresh is not None else self.thresh
        effective_top_k = top_k if top_k is not None else self.top_k

        # Get pairwise similarities
        neighbor_data = self._get_pairwise_similarities(
            effective_thresh, effective_top_k
        )

        # Build and return the graph
        return self._build_graph(neighbor_data)

    def fit_transform(
        self,
        embeddings: np.ndarray,
        labels: Optional[List[str]] = None,
        node_data: Optional[Dict] = None,
        thresh: Optional[float] = None,
        top_k: Optional[int] = None,
    ) -> nx.Graph:
        """
        Fit the model and transform the embeddings in one step.

        Args:
            embeddings: Pre-computed embeddings array with shape (n_docs, embedding_dim).
            labels: Optional list of text labels/documents for the embeddings.
            node_data: Optional dictionary containing additional data to attach to nodes.
            thresh: Optional similarity threshold override for this transform.
            top_k: Optional max neighbors override for this transform.

        Returns:
            NetworkX graph representing the semantic network
        """
        return self.fit(embeddings=embeddings).transform(
            thresh=thresh, top_k=top_k, labels=labels, node_data=node_data
        )

    def to_pandas(
        self, graph: Optional[nx.Graph] = None
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Export a NetworkX graph to pandas DataFrames.

        Accepts any NetworkX graph, which is useful for subgraphs or modified
        graphs as well as the graph returned by transform().

        Args:
            graph: NetworkX graph to export. If None, an error is raised, since
                no graph is stored on the estimator after transform.

        Returns:
            Tuple[pd.DataFrame, pd.DataFrame]: A tuple containing:
                - nodes (pd.DataFrame): Node attributes with index as node ID.
                  Columns include all node attributes from the graph.
                - edges (pd.DataFrame): Edge list with columns 'source', 'target',
                  and any edge attributes (e.g., 'weight').

        Raises:
            ValueError: If no graph is provided

        Examples:
            >>> # Build and export a graph
            >>> network = SemanticNetwork(thresh=0.8)
            >>> graph = network.fit_transform(embeddings, labels=docs)
            >>> nodes, edges = network.to_pandas(graph)

            >>> # Export a subgraph
            >>> subgraph = graph.subgraph([0, 1, 2])
            >>> sub_nodes, sub_edges = network.to_pandas(subgraph)
        """
        if graph is None:
            raise ValueError(
                "No graph provided. Call transform() to get a graph, then pass it to to_pandas()."
            )

        # Convert nodes to DataFrame
        nodes = pd.DataFrame.from_dict(
            dict(graph.nodes(data=True)), orient="index"
        )

        # Convert edges to DataFrame
        if graph.number_of_edges() > 0:
            edges = nx.to_pandas_edgelist(graph)
        else:
            # Create empty DataFrame with expected columns if no edges
            edges = pd.DataFrame(columns=["source", "target"])

        return nodes, edges

    def _build_vector_index(self) -> AnnoyIndex:
        """
        Build an Annoy index for fast approximate nearest neighbor search.

        Returns:
            The built Annoy index

        Raises:
            ValueError: If embeddings haven't been provided yet

        Note:
            The index is stored in self.index_ and also returned.
        """
        if self.embeddings_ is None:
            raise ValueError(
                "Embeddings not found. Please provide embeddings in fit() method."
            )

        embedding_dim = self.embeddings_.shape[1]
        self.index_ = AnnoyIndex(embedding_dim, self.metric)  # type: ignore

        if self.verbose:
            logger.info(
                f"Building Annoy index with {self.n_trees} trees for {len(self.embeddings_)} embeddings"
            )
            iterator = tqdm(
                enumerate(self.embeddings_),
                total=len(self.embeddings_),
                desc="Adding embeddings to index",
            )
        else:
            iterator = enumerate(self.embeddings_)

        for i, embedding_vector in iterator:
            self.index_.add_item(i, embedding_vector)

        if self.verbose:
            logger.info("Building index trees...")
        self.index_.build(self.n_trees)

        if self.verbose:
            logger.info("Vector index built successfully")

        return self.index_

    def _get_pairwise_similarities(
        self, thresh: float, top_k: int
    ) -> pd.DataFrame:
        """
        Find pairwise similarities between documents above a threshold.

        Uses the Annoy index to efficiently find nearest neighbors for each document,
        then calculates exact similarities and filters by threshold.

        Args:
            thresh: Similarity threshold for including edges
            top_k: Maximum number of neighbors to check per document

        Returns:
            DataFrame of similarities with columns: source_idx, target_idx, weight, source_name, target_name

        Raises:
            ValueError: If embeddings or index haven't been built yet
        """
        if self.embeddings_ is None or self.index_ is None:
            raise ValueError(
                "Embeddings or index not found. Please provide embeddings in fit() method and run _build_vector_index() first."
            )

        if self._labels is None:
            raise ValueError("No training documents found. Call fit() first.")

        if self.verbose:
            logger.info(
                f"Finding pairwise similarities with threshold {thresh}, checking top {top_k} neighbors"
            )

        results = []

        if self.verbose:
            iterator = tqdm(
                range(len(self.embeddings_)), desc="Finding similarities"
            )
        else:
            iterator = range(len(self.embeddings_))

        for idx_source in iterator:
            neighbors = self.index_.get_nns_by_item(
                idx_source, top_k, include_distances=True
            )

            for idx_target, dist in zip(*neighbors):
                similarity = 1 - dist  # Convert angular distance to similarity

                if idx_source != idx_target and similarity >= thresh:
                    result_dict = {
                        "source_idx": idx_source,
                        "target_idx": idx_target,
                        "weight": similarity,
                        "source_name": self._labels[idx_source],
                        "target_name": self._labels[idx_target],
                    }
                    results.append(result_dict)

        neighbor_data = pd.DataFrame(results)

        if self.verbose:
            logger.info(
                f"Found {len(neighbor_data)} similarity pairs above threshold {thresh}"
            )

        return neighbor_data

    def _build_graph(self, neighbor_data: pd.DataFrame) -> nx.Graph:
        """
        Build a NetworkX graph from pairwise similarities.

        Creates a graph where:
        - Nodes represent documents
        - Edges represent similarities above the threshold (with 'weight' attribute representing similarity)

        Args:
            neighbor_data: DataFrame of pairwise similarities

        Returns:
            The constructed NetworkX graph

        Raises:
            ValueError: If training data hasn't been set

        Note:
            The graph includes all documents as nodes, even if they have no similarities above threshold.
        """

        if self._labels is None:
            raise ValueError("No training documents found. Call fit() first.")

        if self.verbose:
            logger.info(
                f"Building graph from {len(neighbor_data)} similarity edges"
            )

        # Instantiate undirected graph
        G = nx.Graph()

        # Add all nodes with their attributes
        for i in range(len(self._labels)):
            # Set basic attributes
            attrs = {
                "label": self._labels[i],
                "id": i,
            }

            # Add custom node data if provided for this specific node
            if self._node_data is not None and i in self._node_data:
                attrs.update(self._node_data[i])

            G.add_node(i, **attrs)

        # Add edges from neighbor data
        for _, row in neighbor_data.iterrows():
            G.add_edge(
                row["source_idx"],
                row["target_idx"],
                weight=row["weight"],
            )

        if self.verbose:
            num_components = nx.number_connected_components(G)
            logger.info(
                f"Built graph with {G.number_of_nodes()} nodes, {G.number_of_edges()} edges, {num_components} components"
            )

        return G
semnet-0.1.3.dist-info/METADATA ADDED
@@ -0,0 +1,209 @@
Metadata-Version: 2.4
Name: semnet
Version: 0.1.3
Summary: Semantic Networks from Embeddings
Author-email: Ian Goodrich <ian@igdr.ch>
License: MIT
Project-URL: Homepage, https://github.com/specialprocedures/semnet
Project-URL: Documentation, https://semnetdocs.readthedocs.io
Project-URL: Repository, https://github.com/specialprocedures/semnet
Project-URL: Bug Tracker, https://github.com/specialprocedures/semnet/issues
Keywords: semantic networks,embeddings,graph analysis,nlp,similarity,networkx
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: Intended Audience :: Science/Research
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Requires-Python: >=3.8
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: networkx>=2.5
Requires-Dist: annoy>=1.17.0
Requires-Dist: numpy>=1.19.0
Requires-Dist: pandas>=1.2.0
Requires-Dist: tqdm>=4.60.0
Provides-Extra: dev
Requires-Dist: pytest>=6.0; extra == "dev"
Requires-Dist: pytest-cov>=2.10; extra == "dev"
Requires-Dist: black>=21.0; extra == "dev"
Requires-Dist: isort>=5.0; extra == "dev"
Requires-Dist: flake8>=3.8; extra == "dev"
Requires-Dist: mypy>=0.800; extra == "dev"
Requires-Dist: sentence-transformers>=2.0.0; extra == "dev"
Provides-Extra: docs
Requires-Dist: sphinx>=4.0; extra == "docs"
Requires-Dist: sphinx-rtd-theme>=1.0; extra == "docs"
Requires-Dist: myst-parser>=0.17; extra == "docs"
Provides-Extra: examples
Requires-Dist: sentence-transformers>=2.0.0; extra == "examples"
Requires-Dist: matplotlib>=3.3.0; extra == "examples"
Requires-Dist: jupyter>=1.0.0; extra == "examples"
Dynamic: license-file

# Semnet: Graph structures from embeddings

![Embeddings of Guardian headlines represented as a network structure by Semnet and visualised by Cosmograph](img/cosmo_semnet.png)
_Embeddings of Guardian headlines represented as a network by Semnet and visualised in [Cosmograph](https://cosmograph.app/)_

Semnet constructs graph structures from embeddings, enabling graph-based analysis and operations over embedded documents, images, and more.

Semnet uses [Annoy](https://github.com/spotify/annoy) to perform efficient pair-wise distance calculations across all embeddings in the dataset, then constructs [NetworkX](https://networkx.org) graphs representing relationships between embeddings.

## Use cases
Semnet may be used for:
- **Deduplication**: remove duplicate records (e.g., "Donald Trump", "Donald J. Trump") from datasets
- **Clustering**: find groups of similar documents via [community detection](https://networkx.org/documentation/stable/reference/algorithms/community.html) algorithms (see the sketch below)
- **Recommendation systems**: account for relationships between documents, and take advantage of graph structures such as communities and paths in search and RAG
- **Knowledge graph construction**: build networks of related concepts or entities; because the output is a regular NetworkX graph, it's easy to add further entities
- **Exploratory data analysis and visualisation**: [Cosmograph](https://cosmograph.app/) works brilliantly for large corpora

Because it exposes the full NetworkX and Annoy APIs, Semnet offers plenty of opportunity for experimentation depending on your use case. Check out the examples for inspiration.

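As a taste of the clustering use case, here is a minimal sketch that runs NetworkX community detection over a Semnet graph. It assumes a graph `G` built with `fit_transform` as in the Quick Start below; the algorithm choice and variable names are illustrative rather than part of the Semnet API.

```python
from networkx.algorithms.community import greedy_modularity_communities

# Assumes G was built as in the Quick Start, e.g.
#   G = SemanticNetwork(thresh=0.3).fit_transform(embeddings, labels=docs)
# Edges carry the similarity in their 'weight' attribute, which the
# modularity-based algorithm can use directly.
communities = greedy_modularity_communities(G, weight="weight")

for i, community in enumerate(communities):
    members = [G.nodes[node]["label"] for node in community]
    print(f"Community {i}: {members}")
```
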
## Quick Start

```python
from semnet import SemanticNetwork
from sentence_transformers import SentenceTransformer
import networkx as nx

# Your documents
docs = [
    "The cat sat on the mat",
    "A cat was sitting on a mat",
    "The dog ran in the park",
    "I love Python",
    "Python is a great programming language",
]

# Generate embeddings (use any embedding provider)
embedding_model = SentenceTransformer("BAAI/bge-base-en-v1.5")
embeddings = embedding_model.encode(docs)

# Create and configure the semantic network
sem = SemanticNetwork(thresh=0.3, verbose=True)  # Larger values give sparser networks

# Build the semantic graph from your embeddings
G = sem.fit_transform(embeddings, labels=docs)

# Analyze the graph
print(f"Nodes: {G.number_of_nodes()}")
print(f"Edges: {G.number_of_edges()}")
print(f"Connected components: {nx.number_connected_components(G)}")

# Find similar document groups
for component in nx.connected_components(G):
    if len(component) > 1:
        similar_docs = [G.nodes[i]["label"] for i in component]
        print(f"Similar documents: {similar_docs}")

# Calculate centrality measures.
# Degree centrality isn't very interesting on this small example, but is shown for demonstration.
centrality = nx.degree_centrality(G)
for node, cent_value in centrality.items():
    print(f"Document: {G.nodes[node]['label']}, Degree Centrality: {cent_value:.4f}")
    G.nodes[node]["degree_centrality"] = cent_value

# Export to pandas
nodes_df, edges_df = sem.to_pandas(G)
```

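To turn the graph above into a deduplicated dataset, one simple recipe (a sketch built on the Quick Start variables, not a method of the package) is to treat each connected component as a group of duplicates and keep a single representative per group:

```python
# Keep one representative document per connected component.
# The choice of representative here (lowest node id) is arbitrary but deterministic;
# swap in whatever tie-breaking rule suits your data.
representatives = []
for component in nx.connected_components(G):
    keep = min(component)
    representatives.append(G.nodes[keep]["label"])

print(f"Kept {len(representatives)} of {G.number_of_nodes()} documents")
```
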
## Installation

```bash
pip install semnet
```

For development:

```bash
git clone https://github.com/specialprocedures/semnet.git
cd semnet
pip install -e ".[dev]"
```

## Configuration Options

### SemanticNetwork Parameters

- **metric**: Distance metric for Annoy index ('angular', 'euclidean', etc.) (default: 'angular')
- **n_trees**: Number of trees for Annoy index (more = better accuracy, slower) (default: 10)
- **thresh**: Similarity threshold (0.0 to 1.0) (default: 0.7)
- **top_k**: Maximum neighbors to check per document (default: 100)
- **verbose**: Show progress bars and logging (default: False)

### Method Parameters

- **fit(embeddings)**:
  - embeddings is a required pre-computed embeddings array with shape (n_docs, embedding_dim)
- **transform(thresh=None, top_k=None, labels=None, node_data=None)**:
  - thresh and top_k are optional overrides of the values set at initialisation
  - labels are optional text labels/documents for the embeddings
  - node_data is an optional dictionary containing additional data to attach to nodes
  - see the example below for re-using a fitted index with different thresholds
- **fit_transform(embeddings, labels=None, node_data=None, thresh=None, top_k=None)**: Combined fit and transform
- **to_pandas(graph)**: Export NetworkX graph to pandas DataFrames

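Because the Annoy index is built once in `fit()` and edges are recomputed in `transform()`, you can sweep several thresholds cheaply on the same fitted instance. A short sketch, reusing the `embeddings` and `docs` from the Quick Start (values are illustrative):

```python
sem = SemanticNetwork(top_k=50)
sem.fit(embeddings)

# Re-use the same index while varying the similarity threshold.
for thresh in (0.3, 0.5, 0.7):
    G = sem.transform(thresh=thresh, labels=docs)
    print(f"thresh={thresh}: {G.number_of_edges()} edges, "
          f"{nx.number_connected_components(G)} components")
```
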
## Performance Tips

- Use `"angular"` metric for cosine similarity (default and recommended)
- Increase `n_trees` for better accuracy (try 50-100 for large datasets)
- Decrease `top_k` if you have memory constraints
- Use smaller embedding models for speed: `"all-MiniLM-L6-v2"`
- Use larger models for accuracy: `"BAAI/bge-large-en-v1.5"`

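Putting a couple of these tips together, a configuration for a larger corpus might trade a slower index build for better recall while capping the neighbour search; the values below are illustrative, not recommendations shipped with the package:

```python
# More trees for recall, fewer candidate neighbours per document
# to bound memory use and runtime on a large corpus.
sem = SemanticNetwork(metric="angular", n_trees=50, top_k=25, thresh=0.6)
G = sem.fit_transform(embeddings, labels=docs)
```
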
## Requirements

- Python 3.8+
- networkx
- annoy
- numpy
- pandas
- tqdm


## Project origin and statement on the use of AI

I love network analysis, and have explored embedding-derived [semantic networks](https://en.wikipedia.org/wiki/Semantic_network) in the past as an alternative approach to representing, clustering and querying news data.

Whilst using semantic networks for graph analysis on some forthcoming research, I decided to package some of my code for others to use.

I kicked off the project by hand-refactoring my initial code into the class-based structure that forms the core functionality of the current module.

I then used GitHub Copilot in VS Code to:
- Bootstrap scaffolding, tests, documentation, examples and typing
- Refactor the core methods in the style of the scikit-learn API
- Add additional functionality for convenient analysis of graph structures and to allow the use of custom embeddings

## Roadmap

Semnet is a relatively simple project focused on core graph construction functionality. Potential future additions:
- Better examples showcasing network analysis on large corpora
- Integration with graph visualization tools
- Performance optimizations for very large datasets

## License

MIT License

## Citation

If you use Semnet in academic work, please cite:

```bibtex
@software{semnet,
  title={Semnet: Semantic Networks from Embeddings},
  author={Ian Goodrich},
  year={2025},
  url={https://github.com/specialprocedures/semnet}
}
```
semnet-0.1.3.dist-info/RECORD ADDED
@@ -0,0 +1,7 @@
semnet/__init__.py,sha256=_zSU-x86hBwF-ZC9a4p8f5DUZpvoJalhTYXXaQKYYM8,336
semnet/semnet.py,sha256=OEqOh9pB17MAiPInK77BUout0EVaKlR4I87EMyQxYNg,16481
semnet-0.1.3.dist-info/licenses/LICENSE,sha256=0TzAvDoYO4STeUC-y5HUHwD__mh19DrcV1u3pFHM1Uc,1068
semnet-0.1.3.dist-info/METADATA,sha256=GNnLnEI68S6eyhR4CXW194h0CwIj3z-5IeTOU69SddU,8364
semnet-0.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
semnet-0.1.3.dist-info/top_level.txt,sha256=L9LrXn-MHrZUilHd5D4NIPeKusKYrKzD3m9KRD-rRBY,7
semnet-0.1.3.dist-info/RECORD,,
semnet-0.1.3.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
Wheel-Version: 1.0
Generator: setuptools (80.9.0)
Root-Is-Purelib: true
Tag: py3-none-any
semnet-0.1.3.dist-info/licenses/LICENSE ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 Ian Goodrich

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
semnet-0.1.3.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
semnet