semnet 0.1.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- semnet-0.1.3/LICENSE +21 -0
- semnet-0.1.3/MANIFEST.in +11 -0
- semnet-0.1.3/PKG-INFO +209 -0
- semnet-0.1.3/README.md +158 -0
- semnet-0.1.3/docs/api.md +8 -0
- semnet-0.1.3/docs/conf.py +59 -0
- semnet-0.1.3/docs/examples.md +3 -0
- semnet-0.1.3/docs/index.md +19 -0
- semnet-0.1.3/docs/installation.md +27 -0
- semnet-0.1.3/docs/quickstart.md +49 -0
- semnet-0.1.3/docs/requirements.txt +3 -0
- semnet-0.1.3/examples/basic_example.py +163 -0
- semnet-0.1.3/examples/data/english_historical_quotes.json +1 -0
- semnet-0.1.3/examples/data/ground_truth.csv +301 -0
- semnet-0.1.3/examples/data/quickstart.ipynb +268 -0
- semnet-0.1.3/examples/deduplication.ipynb +93 -0
- semnet-0.1.3/examples/examples.ipynb +582 -0
- semnet-0.1.3/examples/pandas_export_example.py +221 -0
- semnet-0.1.3/examples/quotes.ipynb +4630 -0
- semnet-0.1.3/pyproject.toml +116 -0
- semnet-0.1.3/setup.cfg +4 -0
- semnet-0.1.3/src/semnet/__init__.py +15 -0
- semnet-0.1.3/src/semnet/semnet.py +461 -0
- semnet-0.1.3/src/semnet.egg-info/PKG-INFO +209 -0
- semnet-0.1.3/src/semnet.egg-info/SOURCES.txt +28 -0
- semnet-0.1.3/src/semnet.egg-info/dependency_links.txt +1 -0
- semnet-0.1.3/src/semnet.egg-info/requires.txt +24 -0
- semnet-0.1.3/src/semnet.egg-info/top_level.txt +1 -0
- semnet-0.1.3/tests/__init__.py +1 -0
- semnet-0.1.3/tests/test_semantic_network.py +323 -0
semnet-0.1.3/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Ian Goodrich
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
semnet-0.1.3/MANIFEST.in
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
include README.md
|
|
2
|
+
include LICENSE
|
|
3
|
+
include pyproject.toml
|
|
4
|
+
recursive-include src *.py
|
|
5
|
+
recursive-include examples *.py *.ipynb *.json *.csv
|
|
6
|
+
recursive-include tests *.py
|
|
7
|
+
recursive-include docs *.md *.rst *.txt *.py
|
|
8
|
+
prune docs/_build
|
|
9
|
+
prune .pytest_cache
|
|
10
|
+
prune **/__pycache__
|
|
11
|
+
global-exclude *.pyc
|
semnet-0.1.3/PKG-INFO
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: semnet
|
|
3
|
+
Version: 0.1.3
|
|
4
|
+
Summary: Semantic Networks from Embeddings
|
|
5
|
+
Author-email: Ian Goodrich <ian@igdr.ch>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/specialprocedures/semnet
|
|
8
|
+
Project-URL: Documentation, https://semnetdocs.readthedocs.io
|
|
9
|
+
Project-URL: Repository, https://github.com/specialprocedures/semnet
|
|
10
|
+
Project-URL: Bug Tracker, https://github.com/specialprocedures/semnet/issues
|
|
11
|
+
Keywords: semantic networks,embeddings,graph analysis,nlp,similarity,networkx
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
23
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
24
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
25
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
26
|
+
Requires-Python: >=3.8
|
|
27
|
+
Description-Content-Type: text/markdown
|
|
28
|
+
License-File: LICENSE
|
|
29
|
+
Requires-Dist: networkx>=2.5
|
|
30
|
+
Requires-Dist: annoy>=1.17.0
|
|
31
|
+
Requires-Dist: numpy>=1.19.0
|
|
32
|
+
Requires-Dist: pandas>=1.2.0
|
|
33
|
+
Requires-Dist: tqdm>=4.60.0
|
|
34
|
+
Provides-Extra: dev
|
|
35
|
+
Requires-Dist: pytest>=6.0; extra == "dev"
|
|
36
|
+
Requires-Dist: pytest-cov>=2.10; extra == "dev"
|
|
37
|
+
Requires-Dist: black>=21.0; extra == "dev"
|
|
38
|
+
Requires-Dist: isort>=5.0; extra == "dev"
|
|
39
|
+
Requires-Dist: flake8>=3.8; extra == "dev"
|
|
40
|
+
Requires-Dist: mypy>=0.800; extra == "dev"
|
|
41
|
+
Requires-Dist: sentence-transformers>=2.0.0; extra == "dev"
|
|
42
|
+
Provides-Extra: docs
|
|
43
|
+
Requires-Dist: sphinx>=4.0; extra == "docs"
|
|
44
|
+
Requires-Dist: sphinx-rtd-theme>=1.0; extra == "docs"
|
|
45
|
+
Requires-Dist: myst-parser>=0.17; extra == "docs"
|
|
46
|
+
Provides-Extra: examples
|
|
47
|
+
Requires-Dist: sentence-transformers>=2.0.0; extra == "examples"
|
|
48
|
+
Requires-Dist: matplotlib>=3.3.0; extra == "examples"
|
|
49
|
+
Requires-Dist: jupyter>=1.0.0; extra == "examples"
|
|
50
|
+
Dynamic: license-file
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
# Semnet: Graph structures from embeddings
|
|
54
|
+
|
|
55
|
+

|
|
56
|
+
_Embeddings of Guardian headlines represented as a network by Semnet and visualised in [Cosmograph](https://cosmograph.app)_
|
|
57
|
+
|
|
58
|
+
Semnet constructs graph structures from embeddings, enabling graph-based analysis and operations over embedded documents, images, and more.
|
|
59
|
+
|
|
60
|
+
Semnet uses [Annoy](https://github.com/spotify/annoy) to perform efficient pair-wise distance calculations across all embeddings in the dataset, then constructs [NetworkX](https://networkx.org) graphs representing relationships between embeddings.
|
|
61
|
+
|
|
62
|
+
## Use cases
|
|
63
|
+
Semnet may be used for:
|
|
64
|
+
- **Deduplication**: remove duplicate records (e.g., "Donald Trump", "Donald J. Trump") from datasets
|
|
65
|
+
- **Clustering**: find groups of similar documents via [community detection](https://networkx.org/documentation/stable/reference/algorithms/community.html) algorithms
|
|
66
|
+
- **Recommendation systems**: Account for relationships, and take advantage of graph structures such as communities and paths in search and RAG
|
|
67
|
+
- **Knowledge graph construction**: Build networks of related concepts or entities, as a regular NetworkX graph it's easy to add additional entities
|
|
68
|
+
- **Exploratory data analysis and visualisation**, [Cosmograph](https://cosmograph.app/) works brilliantly for large corpora
|
|
69
|
+
|
|
70
|
+
Exposing the full NetworkX and Annoy APIs, Semnet offers plenty of opportunity for experimentation depending on your use-case. Check out the examples for inspiration.
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
## Quick Start
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
from semnet import SemanticNetwork
|
|
77
|
+
from sentence_transformers import SentenceTransformer
|
|
78
|
+
import networkx as nx
|
|
79
|
+
|
|
80
|
+
# Your documents
|
|
81
|
+
docs = [
|
|
82
|
+
"The cat sat on the mat",
|
|
83
|
+
"A cat was sitting on a mat",
|
|
84
|
+
"The dog ran in the park",
|
|
85
|
+
"I love Python",
|
|
86
|
+
"Python is a great programming language",
|
|
87
|
+
]
|
|
88
|
+
|
|
89
|
+
# Generate embeddings (use any embedding provider)
|
|
90
|
+
embedding_model = SentenceTransformer("BAAI/bge-base-en-v1.5")
|
|
91
|
+
embeddings = embedding_model.encode(docs)
|
|
92
|
+
|
|
93
|
+
# Create and configure semantic network
|
|
94
|
+
sem = SemanticNetwork(thresh=0.3, verbose=True) # Larger values give sparser networks
|
|
95
|
+
|
|
96
|
+
# Build the semantic graph from your embeddings
|
|
97
|
+
G = sem.fit_transform(embeddings, labels=docs)
|
|
98
|
+
|
|
99
|
+
# Analyze the graph
|
|
100
|
+
print(f"Nodes: {G.number_of_nodes()}")
|
|
101
|
+
print(f"Edges: {G.number_of_edges()}")
|
|
102
|
+
print(f"Connected components: {nx.number_connected_components(G)}")
|
|
103
|
+
|
|
104
|
+
# Find similar document groups
|
|
105
|
+
for component in nx.connected_components(G):
|
|
106
|
+
if len(component) > 1:
|
|
107
|
+
similar_docs = [G.nodes[i]["label"] for i in component]
|
|
108
|
+
print(f"Similar documents: {similar_docs}")
|
|
109
|
+
|
|
110
|
+
# Calculate centrality measures,
|
|
111
|
+
# Degree centrality not that interesting in the example, but shown here for demonstration
|
|
112
|
+
centrality = nx.degree_centrality(G)
|
|
113
|
+
for node, cent_value in centrality.items():
|
|
114
|
+
print(f"Document: {G.nodes[node]['label']}, Degree Centrality: {cent_value:.4f}")
|
|
115
|
+
G.nodes[node]["degree_centrality"] = cent_value
|
|
116
|
+
|
|
117
|
+
# Export to pandas
|
|
118
|
+
nodes_df, edges_df = sem.to_pandas(G)
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
## Installation
|
|
122
|
+
|
|
123
|
+
```bash
|
|
124
|
+
pip install semnet
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
For development:
|
|
128
|
+
|
|
129
|
+
```bash
|
|
130
|
+
git clone https://github.com/specialprocedures/semnet.git
|
|
131
|
+
cd semnet
|
|
132
|
+
pip install -e ".[dev]"
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
## Configuration Options
|
|
136
|
+
|
|
137
|
+
### SemanticNetwork Parameters
|
|
138
|
+
|
|
139
|
+
- **metric**: Distance metric for Annoy index ('angular', 'euclidean', etc.) (default: 'angular')
|
|
140
|
+
- **n_trees**: Number of trees for Annoy index (more = better accuracy, slower) (default: 10)
|
|
141
|
+
- **thresh**: Similarity threshold (0.0 to 1.0) (default: 0.3)
|
|
142
|
+
- **top_k**: Maximum neighbors to check per document (default: 100)
|
|
143
|
+
- **verbose**: Show progress bars and logging (default: False)
|
|
144
|
+
|
|
145
|
+
### Method Parameters
|
|
146
|
+
|
|
147
|
+
- **fit(embeddings, labels=None, ids=None, node_data=None)**:
|
|
148
|
+
- embeddings are required pre-computed embeddings array with shape (n_docs, embedding_dim)
|
|
149
|
+
- labels are optional text labels/documents for the embeddings
|
|
150
|
+
- ids are optional custom IDs for the embeddings
|
|
151
|
+
- node_data is optional dictionary containing additional data to attach to nodes
|
|
152
|
+
- **transform(thresh=None, top_k=None)**: Optional threshold and top_k overrides
|
|
153
|
+
- **fit_transform(embeddings, labels=None, ids=None, node_data=None, thresh=None, top_k=None)**: Combined fit and transform
|
|
154
|
+
- **to_pandas(graph)**: Export NetworkX graph to pandas DataFrames
|
|
155
|
+
|
|
156
|
+
## Performance Tips
|
|
157
|
+
|
|
158
|
+
- Use `"angular"` metric for cosine similarity (default and recommended)
|
|
159
|
+
- Increase `n_trees` for better accuracy (try 50-100 for large datasets)
|
|
160
|
+
- Decrease `top_k` if you have memory constraints
|
|
161
|
+
- Use smaller embedding models for speed: `"all-MiniLM-L6-v2"`
|
|
162
|
+
- Use larger models for accuracy: `"BAAI/bge-large-en-v1.5"`
|
|
163
|
+
|
|
164
|
+
## Requirements
|
|
165
|
+
|
|
166
|
+
- Python 3.8+
|
|
167
|
+
- networkx
|
|
168
|
+
- annoy
|
|
169
|
+
- numpy
|
|
170
|
+
- pandas
|
|
171
|
+
- tqdm
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
## Project origin and statement on the use of AI
|
|
175
|
+
|
|
176
|
+
I love network analysis, and have explored embedding-derived [semantic networks](https://en.wikipedia.org/wiki/Semantic_network) in the past as an alternative approach to representing, clustering and querying news data.
|
|
177
|
+
|
|
178
|
+
Whilst using semantic networks for graph analysis on some forthcoming research, I decided to package some of my code for others to use.
|
|
179
|
+
|
|
180
|
+
I kicked off the project by hand-refactoring my initial code into the class-based structure that forms the core functionality of the current module.
|
|
181
|
+
|
|
182
|
+
I then used GitHub Copilot in VSCode to:
|
|
183
|
+
- Bootstrap scaffolding, tests, documentation, examples and typing
|
|
184
|
+
- Refactor the core methods in the style of the scikit-learn API
|
|
185
|
+
- Add additional functionality for convenient analysis of graph structures and to allow the use of custom embeddings.
|
|
186
|
+
|
|
187
|
+
## Roadmap
|
|
188
|
+
|
|
189
|
+
Semnet is a relatively simple project focused on core graph construction functionality. Potential future additions:
|
|
190
|
+
- Better examples showcasing network analysis on large corpora
|
|
191
|
+
- Integration with graph visualization tools
|
|
192
|
+
- Performance optimizations for very large datasets
|
|
193
|
+
|
|
194
|
+
## License
|
|
195
|
+
|
|
196
|
+
MIT License
|
|
197
|
+
|
|
198
|
+
## Citation
|
|
199
|
+
|
|
200
|
+
If you use Semnet in academic work, please cite:
|
|
201
|
+
|
|
202
|
+
```bibtex
|
|
203
|
+
@software{semnet,
|
|
204
|
+
title={Semnet: Semantic Networks from Embeddings},
|
|
205
|
+
author={Ian Goodrich},
|
|
206
|
+
year={2025},
|
|
207
|
+
url={https://github.com/specialprocedures/semnet}
|
|
208
|
+
}
|
|
209
|
+
```
|
semnet-0.1.3/README.md
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
|
|
2
|
+
# Semnet: Graph structures from embeddings
|
|
3
|
+
|
|
4
|
+

|
|
5
|
+
_Embeddings of Guardian headlines represented as a network by Semnet and visualised in [Cosmograph](https://cosmograph.app)_
|
|
6
|
+
|
|
7
|
+
Semnet constructs graph structures from embeddings, enabling graph-based analysis and operations over embedded documents, images, and more.
|
|
8
|
+
|
|
9
|
+
Semnet uses [Annoy](https://github.com/spotify/annoy) to perform efficient pair-wise distance calculations across all embeddings in the dataset, then constructs [NetworkX](https://networkx.org) graphs representing relationships between embeddings.
|
|
10
|
+
|
|
11
|
+
## Use cases
|
|
12
|
+
Semnet may be used for:
|
|
13
|
+
- **Deduplication**: remove duplicate records (e.g., "Donald Trump", "Donald J. Trump") from datasets
|
|
14
|
+
- **Clustering**: find groups of similar documents via [community detection](https://networkx.org/documentation/stable/reference/algorithms/community.html) algorithms
|
|
15
|
+
- **Recommendation systems**: Account for relationships, and take advantage of graph structures such as communities and paths in search and RAG
|
|
16
|
+
- **Knowledge graph construction**: Build networks of related concepts or entities, as a regular NetworkX graph it's easy to add additional entities
|
|
17
|
+
- **Exploratory data analysis and visualisation**, [Cosmograph](https://cosmograph.app/) works brilliantly for large corpora
|
|
18
|
+
|
|
19
|
+
Exposing the full NetworkX and Annoy APIs, Semnet offers plenty of opportunity for experimentation depending on your use-case. Check out the examples for inspiration.
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
## Quick Start
|
|
23
|
+
|
|
24
|
+
```python
|
|
25
|
+
from semnet import SemanticNetwork
|
|
26
|
+
from sentence_transformers import SentenceTransformer
|
|
27
|
+
import networkx as nx
|
|
28
|
+
|
|
29
|
+
# Your documents
|
|
30
|
+
docs = [
|
|
31
|
+
"The cat sat on the mat",
|
|
32
|
+
"A cat was sitting on a mat",
|
|
33
|
+
"The dog ran in the park",
|
|
34
|
+
"I love Python",
|
|
35
|
+
"Python is a great programming language",
|
|
36
|
+
]
|
|
37
|
+
|
|
38
|
+
# Generate embeddings (use any embedding provider)
|
|
39
|
+
embedding_model = SentenceTransformer("BAAI/bge-base-en-v1.5")
|
|
40
|
+
embeddings = embedding_model.encode(docs)
|
|
41
|
+
|
|
42
|
+
# Create and configure semantic network
|
|
43
|
+
sem = SemanticNetwork(thresh=0.3, verbose=True) # Larger values give sparser networks
|
|
44
|
+
|
|
45
|
+
# Build the semantic graph from your embeddings
|
|
46
|
+
G = sem.fit_transform(embeddings, labels=docs)
|
|
47
|
+
|
|
48
|
+
# Analyze the graph
|
|
49
|
+
print(f"Nodes: {G.number_of_nodes()}")
|
|
50
|
+
print(f"Edges: {G.number_of_edges()}")
|
|
51
|
+
print(f"Connected components: {nx.number_connected_components(G)}")
|
|
52
|
+
|
|
53
|
+
# Find similar document groups
|
|
54
|
+
for component in nx.connected_components(G):
|
|
55
|
+
if len(component) > 1:
|
|
56
|
+
similar_docs = [G.nodes[i]["label"] for i in component]
|
|
57
|
+
print(f"Similar documents: {similar_docs}")
|
|
58
|
+
|
|
59
|
+
# Calculate centrality measures,
|
|
60
|
+
# Degree centrality not that interesting in the example, but shown here for demonstration
|
|
61
|
+
centrality = nx.degree_centrality(G)
|
|
62
|
+
for node, cent_value in centrality.items():
|
|
63
|
+
print(f"Document: {G.nodes[node]['label']}, Degree Centrality: {cent_value:.4f}")
|
|
64
|
+
G.nodes[node]["degree_centrality"] = cent_value
|
|
65
|
+
|
|
66
|
+
# Export to pandas
|
|
67
|
+
nodes_df, edges_df = sem.to_pandas(G)
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
## Installation
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
pip install semnet
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
For development:
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
git clone https://github.com/specialprocedures/semnet.git
|
|
80
|
+
cd semnet
|
|
81
|
+
pip install -e ".[dev]"
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
## Configuration Options
|
|
85
|
+
|
|
86
|
+
### SemanticNetwork Parameters
|
|
87
|
+
|
|
88
|
+
- **metric**: Distance metric for Annoy index ('angular', 'euclidean', etc.) (default: 'angular')
|
|
89
|
+
- **n_trees**: Number of trees for Annoy index (more = better accuracy, slower) (default: 10)
|
|
90
|
+
- **thresh**: Similarity threshold (0.0 to 1.0) (default: 0.3)
|
|
91
|
+
- **top_k**: Maximum neighbors to check per document (default: 100)
|
|
92
|
+
- **verbose**: Show progress bars and logging (default: False)
|
|
93
|
+
|
|
94
|
+
### Method Parameters
|
|
95
|
+
|
|
96
|
+
- **fit(embeddings, labels=None, ids=None, node_data=None)**:
|
|
97
|
+
- embeddings are required pre-computed embeddings array with shape (n_docs, embedding_dim)
|
|
98
|
+
- labels are optional text labels/documents for the embeddings
|
|
99
|
+
- ids are optional custom IDs for the embeddings
|
|
100
|
+
- node_data is optional dictionary containing additional data to attach to nodes
|
|
101
|
+
- **transform(thresh=None, top_k=None)**: Optional threshold and top_k overrides
|
|
102
|
+
- **fit_transform(embeddings, labels=None, ids=None, node_data=None, thresh=None, top_k=None)**: Combined fit and transform
|
|
103
|
+
- **to_pandas(graph)**: Export NetworkX graph to pandas DataFrames
|
|
104
|
+
|
|
105
|
+
## Performance Tips
|
|
106
|
+
|
|
107
|
+
- Use `"angular"` metric for cosine similarity (default and recommended)
|
|
108
|
+
- Increase `n_trees` for better accuracy (try 50-100 for large datasets)
|
|
109
|
+
- Decrease `top_k` if you have memory constraints
|
|
110
|
+
- Use smaller embedding models for speed: `"all-MiniLM-L6-v2"`
|
|
111
|
+
- Use larger models for accuracy: `"BAAI/bge-large-en-v1.5"`
|
|
112
|
+
|
|
113
|
+
## Requirements
|
|
114
|
+
|
|
115
|
+
- Python 3.8+
|
|
116
|
+
- networkx
|
|
117
|
+
- annoy
|
|
118
|
+
- numpy
|
|
119
|
+
- pandas
|
|
120
|
+
- tqdm
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
## Project origin and statement on the use of AI
|
|
124
|
+
|
|
125
|
+
I love network analysis, and have explored embedding-derived [semantic networks](https://en.wikipedia.org/wiki/Semantic_network) in the past as an alternative approach to representing, clustering and querying news data.
|
|
126
|
+
|
|
127
|
+
Whilst using semantic networks for graph analysis on some forthcoming research, I decided to package some of my code for others to use.
|
|
128
|
+
|
|
129
|
+
I kicked off the project by hand-refactoring my initial code into the class-based structure that forms the core functionality of the current module.
|
|
130
|
+
|
|
131
|
+
I then used GitHub Copilot in VSCode to:
|
|
132
|
+
- Bootstrap scaffolding, tests, documentation, examples and typing
|
|
133
|
+
- Refactor the core methods in the style of the scikit-learn API
|
|
134
|
+
- Add additional functionality for convenient analysis of graph structures and to allow the use of custom embeddings.
|
|
135
|
+
|
|
136
|
+
## Roadmap
|
|
137
|
+
|
|
138
|
+
Semnet is a relatively simple project focused on core graph construction functionality. Potential future additions:
|
|
139
|
+
- Better examples showcasing network analysis on large corpora
|
|
140
|
+
- Integration with graph visualization tools
|
|
141
|
+
- Performance optimizations for very large datasets
|
|
142
|
+
|
|
143
|
+
## License
|
|
144
|
+
|
|
145
|
+
MIT License
|
|
146
|
+
|
|
147
|
+
## Citation
|
|
148
|
+
|
|
149
|
+
If you use Semnet in academic work, please cite:
|
|
150
|
+
|
|
151
|
+
```bibtex
|
|
152
|
+
@software{semnet,
|
|
153
|
+
title={Semnet: Semantic Networks from Embeddings},
|
|
154
|
+
author={Ian Goodrich},
|
|
155
|
+
year={2025},
|
|
156
|
+
url={https://github.com/specialprocedures/semnet}
|
|
157
|
+
}
|
|
158
|
+
```
|
semnet-0.1.3/docs/conf.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# Configuration file for the Sphinx documentation builder.
|
|
2
|
+
#
|
|
3
|
+
# For the full list of built-in configuration values, see the documentation:
|
|
4
|
+
# https://www.sphinx-doc.org/en/master/usage/configuration.html
|
|
5
|
+
|
|
6
|
+
# -- Project information -----------------------------------------------------
|
|
7
|
+
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
|
|
8
|
+
|
|
9
|
+
project = "semnet"
|
|
10
|
+
author = "Ian Goodrich"
|
|
11
|
+
release = "0.1.0"
|
|
12
|
+
|
|
13
|
+
# -- General configuration ---------------------------------------------------
|
|
14
|
+
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
|
|
15
|
+
|
|
16
|
+
extensions = [
|
|
17
|
+
"sphinx.ext.autodoc",
|
|
18
|
+
"sphinx.ext.viewcode",
|
|
19
|
+
"sphinx.ext.napoleon",
|
|
20
|
+
"sphinx.ext.githubpages",
|
|
21
|
+
"myst_parser",
|
|
22
|
+
]
|
|
23
|
+
|
|
24
|
+
templates_path = ["_templates"]
|
|
25
|
+
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
|
|
26
|
+
|
|
27
|
+
# -- Options for HTML output -------------------------------------------------
|
|
28
|
+
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
|
|
29
|
+
|
|
30
|
+
html_theme = "sphinx_rtd_theme"
|
|
31
|
+
html_static_path = ["_static"]
|
|
32
|
+
|
|
33
|
+
# -- Extension configuration -------------------------------------------------
|
|
34
|
+
|
|
35
|
+
# Napoleon settings
|
|
36
|
+
napoleon_google_docstring = True
|
|
37
|
+
napoleon_numpy_docstring = True
|
|
38
|
+
napoleon_include_init_with_doc = False
|
|
39
|
+
napoleon_include_private_with_doc = False
|
|
40
|
+
|
|
41
|
+
# MyST settings - let MyST handle .md files automatically
|
|
42
|
+
# source_suffix handled by myst_parser extension
|
|
43
|
+
|
|
44
|
+
# Auto-doc settings
|
|
45
|
+
autodoc_member_order = "bysource"
|
|
46
|
+
# Mock all external dependencies since we only need the API structure
|
|
47
|
+
autodoc_mock_imports = [
|
|
48
|
+
"annoy",
|
|
49
|
+
"networkx",
|
|
50
|
+
"numpy",
|
|
51
|
+
"pandas",
|
|
52
|
+
"tqdm",
|
|
53
|
+
]
|
|
54
|
+
|
|
55
|
+
# Path to the Python modules
|
|
56
|
+
import os
|
|
57
|
+
import sys
|
|
58
|
+
|
|
59
|
+
sys.path.insert(0, os.path.abspath("../src"))
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# Semnet Documentation
|
|
2
|
+
|
|
3
|
+
Welcome to the documentation for **Semnet** - a Python library for constructing semantic networks from embeddings.
|
|
4
|
+
|
|
5
|
+
```{toctree}
|
|
6
|
+
:maxdepth: 2
|
|
7
|
+
:caption: Contents:
|
|
8
|
+
|
|
9
|
+
installation
|
|
10
|
+
quickstart
|
|
11
|
+
api
|
|
12
|
+
examples
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
## Indices and tables
|
|
16
|
+
|
|
17
|
+
* {ref}`genindex`
|
|
18
|
+
* {ref}`modindex`
|
|
19
|
+
* {ref}`search`
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# Installation
|
|
2
|
+
|
|
3
|
+
## Requirements
|
|
4
|
+
|
|
5
|
+
- Python 3.8+
|
|
6
|
+
- networkx
|
|
7
|
+
- annoy
|
|
8
|
+
- numpy
|
|
9
|
+
- pandas
|
|
10
|
+
- tqdm
|
|
11
|
+
|
|
12
|
+
Optional for examples:
|
|
13
|
+
- sentence-transformers
|
|
14
|
+
|
|
15
|
+
## Install from PyPI
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
pip install semnet
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## Install from Source
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
git clone https://github.com/specialprocedures/semnet.git
|
|
25
|
+
cd semnet
|
|
26
|
+
pip install -e ".[dev]"
|
|
27
|
+
```
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# Quick Start
|
|
2
|
+
|
|
3
|
+
## Basic Usage
|
|
4
|
+
|
|
5
|
+
```python
|
|
6
|
+
from semnet import SemanticNetwork
|
|
7
|
+
from sentence_transformers import SentenceTransformer
|
|
8
|
+
import networkx as nx
|
|
9
|
+
|
|
10
|
+
# Your documents
|
|
11
|
+
docs = [
|
|
12
|
+
"The cat sat on the mat",
|
|
13
|
+
"A cat was sitting on a mat",
|
|
14
|
+
"The dog ran in the park",
|
|
15
|
+
"I love Python",
|
|
16
|
+
"Python is a great programming language",
|
|
17
|
+
]
|
|
18
|
+
|
|
19
|
+
# Generate embeddings (use any embedding provider)
|
|
20
|
+
embedding_model = SentenceTransformer("BAAI/bge-base-en-v1.5")
|
|
21
|
+
embeddings = embedding_model.encode(docs)
|
|
22
|
+
|
|
23
|
+
# Create and configure semantic network
|
|
24
|
+
sem = SemanticNetwork(thresh=0.3, verbose=True) # Larger values give sparser networks
|
|
25
|
+
|
|
26
|
+
# Build the semantic graph from your embeddings
|
|
27
|
+
G = sem.fit_transform(embeddings, labels=docs)
|
|
28
|
+
|
|
29
|
+
# Analyze the graph
|
|
30
|
+
print(f"Nodes: {G.number_of_nodes()}")
|
|
31
|
+
print(f"Edges: {G.number_of_edges()}")
|
|
32
|
+
print(f"Connected components: {nx.number_connected_components(G)}")
|
|
33
|
+
|
|
34
|
+
# Find similar document groups
|
|
35
|
+
for component in nx.connected_components(G):
|
|
36
|
+
if len(component) > 1:
|
|
37
|
+
similar_docs = [G.nodes[i]["label"] for i in component]
|
|
38
|
+
print(f"Similar documents: {similar_docs}")
|
|
39
|
+
|
|
40
|
+
# Calculate centrality measures,
|
|
41
|
+
# Degree centrality not that interesting in the example, but shown here for demonstration
|
|
42
|
+
centrality = nx.degree_centrality(G)
|
|
43
|
+
for node, cent_value in centrality.items():
|
|
44
|
+
print(f"Document: {G.nodes[node]['label']}, Degree Centrality: {cent_value:.4f}")
|
|
45
|
+
G.nodes[node]["degree_centrality"] = cent_value
|
|
46
|
+
|
|
47
|
+
# Export to pandas
|
|
48
|
+
nodes_df, edges_df = sem.to_pandas(G)
|
|
49
|
+
```
|