lexograph 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lexograph/__init__.py +65 -0
- lexograph/_types.py +26 -0
- lexograph/analyze/__init__.py +139 -0
- lexograph/analyze/backbone.py +170 -0
- lexograph/analyze/embeddings.py +68 -0
- lexograph/analyze/graph.py +190 -0
- lexograph/datasets/__init__.py +30 -0
- lexograph/datasets/_pride_and_prejudice.py +130 -0
- lexograph/encode/__init__.py +17 -0
- lexograph/encode/channels.py +164 -0
- lexograph/io/__init__.py +1 -0
- lexograph/layout/__init__.py +24 -0
- lexograph/layout/dispersion.py +116 -0
- lexograph/layout/linear.py +65 -0
- lexograph/layout/recurrence.py +117 -0
- lexograph/layout/spiral.py +115 -0
- lexograph/layout/walk.py +121 -0
- lexograph/layout/walk3d.py +64 -0
- lexograph/layout/widths.py +86 -0
- lexograph/presets/__init__.py +14 -0
- lexograph/presets/concordance.py +94 -0
- lexograph/presets/punctuation_spiral.py +139 -0
- lexograph/presets/recurrence.py +104 -0
- lexograph/presets/text_walk.py +200 -0
- lexograph/py.typed +0 -0
- lexograph/render/__init__.py +6 -0
- lexograph/render/mpl.py +233 -0
- lexograph/render/mpl3d.py +135 -0
- lexograph/scalars.py +70 -0
- lexograph/segment/__init__.py +5 -0
- lexograph/segment/units.py +188 -0
- lexograph-0.1.0.dist-info/METADATA +135 -0
- lexograph-0.1.0.dist-info/RECORD +34 -0
- lexograph-0.1.0.dist-info/WHEEL +4 -0
lexograph/__init__.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""lexograph — spatialize linear text into pictures you can read.
|
|
2
|
+
|
|
3
|
+
lexograph turns a linear text into a picture through one four-step spine:
|
|
4
|
+
**segment** the text into ordered units (characters, tokens, or sentences),
|
|
5
|
+
**lay them out** in 2-D or 3-D space, **encode** per-unit attributes onto
|
|
6
|
+
visual channels (size, colour, glyph), and **render** the result as a
|
|
7
|
+
matplotlib :class:`~matplotlib.figure.Figure` that displays inline in Jupyter
|
|
8
|
+
and saves cleanly with ``fig.savefig(...)``.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
__version__ = "0.1.0"
|
|
12
|
+
|
|
13
|
+
from lexograph.datasets import load_demo_text
|
|
14
|
+
from lexograph.encode import (
|
|
15
|
+
Channels,
|
|
16
|
+
categorical_colors,
|
|
17
|
+
continuous_colors,
|
|
18
|
+
normalize_size,
|
|
19
|
+
)
|
|
20
|
+
from lexograph.layout import (
|
|
21
|
+
kwic,
|
|
22
|
+
linear_layout,
|
|
23
|
+
rendered_widths,
|
|
24
|
+
spiral_layout,
|
|
25
|
+
term_offsets,
|
|
26
|
+
walk3d_layout,
|
|
27
|
+
walk_layout,
|
|
28
|
+
)
|
|
29
|
+
from lexograph.presets import (
|
|
30
|
+
concordance,
|
|
31
|
+
punctuation_spiral,
|
|
32
|
+
recurrence_plot,
|
|
33
|
+
text_walk,
|
|
34
|
+
)
|
|
35
|
+
from lexograph.render import frame_axes, render_path, render_path_3d, render_points
|
|
36
|
+
from lexograph.scalars import frequencies, lengths, positions
|
|
37
|
+
from lexograph.segment import segment
|
|
38
|
+
|
|
39
|
+
__all__ = [
|
|
40
|
+
"segment",
|
|
41
|
+
"lengths",
|
|
42
|
+
"positions",
|
|
43
|
+
"frequencies",
|
|
44
|
+
"linear_layout",
|
|
45
|
+
"walk_layout",
|
|
46
|
+
"walk3d_layout",
|
|
47
|
+
"spiral_layout",
|
|
48
|
+
"rendered_widths",
|
|
49
|
+
"term_offsets",
|
|
50
|
+
"kwic",
|
|
51
|
+
"normalize_size",
|
|
52
|
+
"categorical_colors",
|
|
53
|
+
"continuous_colors",
|
|
54
|
+
"Channels",
|
|
55
|
+
"render_points",
|
|
56
|
+
"render_path",
|
|
57
|
+
"render_path_3d",
|
|
58
|
+
"frame_axes",
|
|
59
|
+
"punctuation_spiral",
|
|
60
|
+
"text_walk",
|
|
61
|
+
"recurrence_plot",
|
|
62
|
+
"concordance",
|
|
63
|
+
"load_demo_text",
|
|
64
|
+
"__version__",
|
|
65
|
+
]
|
lexograph/_types.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""Shared type aliases for the lexograph package.
|
|
2
|
+
|
|
3
|
+
The visual-channel arrays named here are the package's public data contract:
|
|
4
|
+
``encode`` accepts plain per-unit arrays, and ``analyze``/``integrations`` only
|
|
5
|
+
ever *produce* arrays that satisfy these aliases. Nothing in the core needs to
|
|
6
|
+
know where the numbers came from.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from typing import Literal, TypeAlias
|
|
12
|
+
|
|
13
|
+
import numpy as np
|
|
14
|
+
import numpy.typing as npt
|
|
15
|
+
|
|
16
|
+
Unit: TypeAlias = str
|
|
17
|
+
"""A single segmented unit of text: a character, token, or sentence."""
|
|
18
|
+
|
|
19
|
+
UnitKind: TypeAlias = Literal["chars", "tokens", "sentences"]
|
|
20
|
+
"""Which kind of unit a segmenter emits."""
|
|
21
|
+
|
|
22
|
+
FloatArray: TypeAlias = npt.NDArray[np.float64]
|
|
23
|
+
"""An array of floats: usually a 1-D per-unit scalar channel (e.g. ``size``), but also used for 2-D embedding and distance matrices."""
|
|
24
|
+
|
|
25
|
+
Coords: TypeAlias = npt.NDArray[np.float64]
|
|
26
|
+
"""An ``(N, 2)`` or ``(N, 3)`` array of layout coordinates, one row per unit."""
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
"""Optional analysis layer: turn a text into per-sentence channel arrays.
|
|
2
|
+
|
|
3
|
+
Behind the ``lexograph[graph]`` extra. It runs the Wittgenstein pipeline —
|
|
4
|
+
sentence embeddings → cosine kNN graph → (optional disparity backbone) → PageRank
|
|
5
|
+
and community detection — and hands back plain arrays that satisfy the
|
|
6
|
+
``encode`` data contract: a ``size`` array (PageRank centrality), a ``community``
|
|
7
|
+
array (colour labels), and a cosine ``distances`` matrix (for a semantic
|
|
8
|
+
recurrence dotplot). The core never imports this; the arrow points inward.
|
|
9
|
+
|
|
10
|
+
The core dependency-free scalars (length, position, frequency) live in
|
|
11
|
+
:mod:`lexograph.scalars`.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from dataclasses import dataclass
|
|
17
|
+
from typing import TYPE_CHECKING
|
|
18
|
+
|
|
19
|
+
from lexograph.analyze.backbone import extract_backbone
|
|
20
|
+
from lexograph.analyze.embeddings import embed_sentences
|
|
21
|
+
from lexograph.analyze.graph import (
|
|
22
|
+
community_labels,
|
|
23
|
+
embedding_distances,
|
|
24
|
+
knn_graph,
|
|
25
|
+
pagerank_scores,
|
|
26
|
+
)
|
|
27
|
+
from lexograph.segment.units import sentences as split_sentences
|
|
28
|
+
|
|
29
|
+
if TYPE_CHECKING:
|
|
30
|
+
import numpy as np
|
|
31
|
+
|
|
32
|
+
from lexograph._types import FloatArray
|
|
33
|
+
from lexograph.analyze.graph import CommunityMethod
|
|
34
|
+
|
|
35
|
+
__all__ = [
|
|
36
|
+
"Analysis",
|
|
37
|
+
"analyze_text",
|
|
38
|
+
"embed_sentences",
|
|
39
|
+
"knn_graph",
|
|
40
|
+
"embedding_distances",
|
|
41
|
+
"pagerank_scores",
|
|
42
|
+
"community_labels",
|
|
43
|
+
"extract_backbone",
|
|
44
|
+
]
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@dataclass(frozen=True, slots=True)
|
|
48
|
+
class Analysis:
|
|
49
|
+
"""The per-sentence channel arrays derived from a text.
|
|
50
|
+
|
|
51
|
+
Attributes:
|
|
52
|
+
sentences: The segmented sentences (length ``N``).
|
|
53
|
+
embeddings: The ``(N, D)`` sentence embeddings.
|
|
54
|
+
size: PageRank centrality per sentence — the size channel.
|
|
55
|
+
community: Community id per sentence — the colour channel.
|
|
56
|
+
distances: The ``(N, N)`` cosine distance matrix — pass it to
|
|
57
|
+
``recurrence_plot(distances=...)`` for a semantic dotplot.
|
|
58
|
+
"""
|
|
59
|
+
|
|
60
|
+
sentences: list[str]
|
|
61
|
+
embeddings: FloatArray
|
|
62
|
+
size: FloatArray
|
|
63
|
+
community: np.ndarray
|
|
64
|
+
distances: FloatArray
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def analyze_text(
    text: str,
    *,
    embeddings: FloatArray | None = None,
    k: int = 5,
    community: CommunityMethod = "louvain",
    n_clusters: int = 10,
    backbone: bool = False,
    min_alpha_ptile: float = 0.5,
    seed: int = 42,
) -> Analysis:
    """Split a text into sentences and derive its per-sentence channels.

    The steps are: sentence split, embeddings (computed unless supplied),
    cosine kNN graph, optional disparity backbone, PageRank, community
    labels, and the pairwise cosine distance matrix.

    Args:
        text: The source text.
        embeddings: Precomputed ``(N, D)`` embeddings, one row per sentence.
            When ``None`` the sentences are embedded with the default model.
        k: Neighbours per node in the kNN graph.
        community: Community method, ``"louvain"`` (default) or ``"kmeans"``.
        n_clusters: Number of clusters for the ``"kmeans"`` method.
        backbone: When ``True``, the kNN graph is passed through the
            disparity filter before PageRank and community detection.
        min_alpha_ptile: Disparity-filter threshold (only used with
            ``backbone``).
        seed: Random seed forwarded to community detection.

    Returns:
        An :class:`Analysis` whose arrays align to the segmented sentences.

    Raises:
        ValueError: If ``embeddings`` is given but its length differs from
            the sentence count.

    Example:
        Drive a walk and a semantic dotplot from the analysis (not run as a
        doctest, since embedding needs the model)::

            from lexograph import text_walk, recurrence_plot, load_demo_text
            from lexograph.analyze import analyze_text

            a = analyze_text(load_demo_text())
            walk = text_walk(load_demo_text(), colour=a.community,
                             colour_kind="categorical", size=a.size)
            dots = recurrence_plot(load_demo_text(), distances=a.distances,
                                   threshold=0.4)
    """
    units = split_sentences(text)
    n = len(units)

    # Reject caller-supplied embeddings that do not line up with the sentences.
    if embeddings is not None and len(embeddings) != n:
        msg = f"embeddings must have one row per sentence ({n}), got {len(embeddings)}"
        raise ValueError(msg)
    vectors = embed_sentences(units) if embeddings is None else embeddings

    knn = knn_graph(vectors, k=k)
    graph = extract_backbone(knn, min_alpha_ptile=min_alpha_ptile) if backbone else knn

    # ``n`` is passed along so the channels cover every sentence even when the
    # backbone has pruned nodes from the graph.
    return Analysis(
        sentences=units,
        embeddings=vectors,
        size=pagerank_scores(graph, n),
        community=community_labels(
            graph,
            n,
            method=community,
            embeddings=vectors,
            n_clusters=n_clusters,
            seed=seed,
        ),
        distances=embedding_distances(vectors),
    )
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
"""Disparity-filter backbone extraction for weighted graphs.
|
|
2
|
+
|
|
3
|
+
The multiscale backbone of Serrano, Boguñá & Vespignani (2009),
|
|
4
|
+
https://arxiv.org/pdf/0904.2389.pdf — it keeps the statistically significant
|
|
5
|
+
edges of a weighted graph and discards the rest. Ported from the sibling
|
|
6
|
+
``kenon`` package's ``backbone`` module (which in turn follows DerwenAI's
|
|
7
|
+
``disparity_filter``); used here to sparsify the kNN sentence graph before
|
|
8
|
+
PageRank and community detection.
|
|
9
|
+
|
|
10
|
+
Part of the optional ``lexograph[graph]`` extra.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import copy
|
|
16
|
+
|
|
17
|
+
import networkx as nx
|
|
18
|
+
import numpy as np
|
|
19
|
+
|
|
20
|
+
__all__ = [
|
|
21
|
+
"disparity_integral",
|
|
22
|
+
"edge_alpha",
|
|
23
|
+
"apply_disparity_filter",
|
|
24
|
+
"extract_backbone",
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def disparity_integral(x: float, degree: float) -> float:
    """Evaluate ``(1 - x)^k / ((k - 1) * (x - 1))`` for ``k = degree``.

    This is the antiderivative used by the disparity filter.

    Args:
        x: A normalised edge weight; ``1.0`` makes the denominator zero.
        degree: A node degree ``k``; ``1.0`` makes the denominator zero.

    Returns:
        The value of the expression at ``x`` for degree ``degree``.

    Examples:
        >>> disparity_integral(0.5, 3.0) != disparity_integral(0.0, 3.0)
        True
    """
    numerator = (1.0 - x) ** degree
    denominator = (degree - 1.0) * (x - 1.0)
    return numerator / denominator
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def edge_alpha(norm_weight: float, degree: float) -> float:
    """Return the disparity significance ``alpha`` of one edge endpoint.

    ``alpha`` is ``1 - (k - 1) * integral_0^p (1 - x)^(k - 2) dx`` with
    ``p = norm_weight`` and ``k = degree``, which simplifies to
    ``(1 - p)^(k - 1)``. The closed form is evaluated directly so that
    ``norm_weight == 1.0`` (an edge carrying all of a node's strength, e.g.
    when the node's other edges weigh zero) gives ``0.0`` instead of
    dividing by zero.

    Args:
        norm_weight: The edge weight divided by the node's strength.
        degree: The node's degree.

    Returns:
        ``alpha``, which lies in ``[0, 1]`` for ``norm_weight`` in ``[0, 1]``;
        lower means more significant. A node of degree ``<= 1`` yields
        ``0.0``.

    Examples:
        >>> 0.0 <= edge_alpha(0.5, 3.0) <= 1.0
        True
        >>> edge_alpha(0.9, 1.0)
        0.0
        >>> edge_alpha(1.0, 3.0)
        0.0
    """
    # A node with a single edge (or none) has no weight distribution to test.
    if degree <= 1.0:
        return 0.0
    return (1.0 - norm_weight) ** (degree - 1.0)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def apply_disparity_filter(graph: nx.Graph) -> list[float]:
    """Annotate a weighted graph with disparity statistics, in place.

    Three passes are made over the graph:

    1. every node receives ``strength``, the sum of its incident edge weights
       (an edge without a ``weight`` counts as ``1.0``);
    2. every edge receives ``alpha``, the smaller of the alphas computed from
       its two endpoints;
    3. every edge receives ``alpha_ptile``, the fraction of edges whose alpha
       is strictly smaller than its own.

    A graph without edges is left untouched.

    Args:
        graph: A weighted networkx graph (edges carry a ``weight``).

    Returns:
        The alpha of every edge, in edge-iteration order (empty when the
        graph has no edges).

    Examples:
        >>> import networkx as nx
        >>> g = nx.Graph()
        >>> g.add_edge("a", "b", weight=0.8)
        >>> g.add_edge("b", "c", weight=0.3)
        >>> g.add_edge("a", "c", weight=0.5)
        >>> len(apply_disparity_filter(g)) == g.number_of_edges()
        True
    """
    if not graph.number_of_edges():
        return []

    # Pass 1: node strengths.
    for node, node_attrs in graph.nodes(data=True):
        node_attrs["strength"] = sum(
            edge_attrs.get("weight", 1.0)
            for _, _, edge_attrs in graph.edges(node, data=True)
        )

    # Pass 2: per-edge alpha, taking the smaller of the two endpoint alphas.
    alphas: list[float] = []
    for u, v, edge_attrs in graph.edges(data=True):
        w = edge_attrs.get("weight", 1.0)
        alpha = min(_endpoint_alpha(graph, u, w), _endpoint_alpha(graph, v, w))
        edge_attrs["alpha"] = alpha
        alphas.append(alpha)

    # Pass 3: rank of each alpha among all alphas, scaled to [0, 1).
    ordered = np.array(sorted(alphas))
    total = len(ordered)
    for _, _, edge_attrs in graph.edges(data=True):
        rank = np.searchsorted(ordered, edge_attrs["alpha"])
        edge_attrs["alpha_ptile"] = float(rank) / total
    return alphas
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def _endpoint_alpha(graph: nx.Graph, node: object, weight: float) -> float:
    """Disparity alpha of an edge of weight ``weight`` as seen from ``node``.

    Reads the ``strength`` attribute stored on ``node``; a non-positive
    strength is treated as a normalised weight of ``0.0``.
    """
    strength = graph.nodes[node]["strength"]
    if strength > 0:
        share = weight / strength
    else:
        share = 0.0
    return edge_alpha(share, float(graph.degree(node)))
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def extract_backbone(
    graph: nx.Graph,
    *,
    min_alpha_ptile: float = 0.5,
    min_degree: int = 1,
) -> nx.Graph:
    """Return the disparity-filter backbone of a weighted graph.

    Works on a deep copy, so the input graph is never mutated. Edges whose
    ``alpha_ptile`` is below ``min_alpha_ptile`` are removed first; nodes whose
    degree is below ``min_degree`` are then removed repeatedly until none
    remain. A graph with no edges yields a new, empty ``nx.Graph``.

    Args:
        graph: A weighted networkx graph.
        min_alpha_ptile: Edges with an alpha percentile below this are removed.
        min_degree: Nodes left with a degree below this are pruned (``1`` keeps
            any node that still has an edge).

    Returns:
        A new graph containing only the backbone.

    Examples:
        >>> import networkx as nx
        >>> g = nx.path_graph(5)
        >>> for u, v in g.edges():
        ...     g[u][v]["weight"] = float(v + 1)
        >>> bb = extract_backbone(g, min_alpha_ptile=0.3)
        >>> bb.number_of_nodes() <= g.number_of_nodes()
        True
    """
    if not graph.number_of_edges():
        return nx.Graph()

    backbone = copy.deepcopy(graph)
    apply_disparity_filter(backbone)

    # NOTE(review): edges with the *lowest* alpha percentiles are the ones
    # dropped here, while ``edge_alpha`` documents lower alpha as more
    # significant. Confirm this direction is intended.
    dropped = [
        (u, v)
        for u, v, attrs in backbone.edges(data=True)
        if attrs.get("alpha_ptile", 0.0) < min_alpha_ptile
    ]
    backbone.remove_edges_from(dropped)

    # Removing a node lowers its neighbours' degrees, so repeat to a fixpoint.
    while True:
        stragglers = [node for node, deg in backbone.degree() if deg < min_degree]
        if not stragglers:
            break
        backbone.remove_nodes_from(stragglers)
    return backbone
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
"""Embed sentences with a sentence-transformers model.
|
|
2
|
+
|
|
3
|
+
This is the only step that pulls in the heavy neural dependency, so the import is
|
|
4
|
+
deferred to call time: importing this module (and the rest of ``analyze``) is
|
|
5
|
+
cheap, and the model is loaded only when you actually embed. Every other
|
|
6
|
+
``analyze`` function takes the embedding array, so the whole graph pipeline can
|
|
7
|
+
be exercised with your own vectors and no model at all.
|
|
8
|
+
|
|
9
|
+
Part of the optional ``lexograph[graph]`` extra.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import importlib
|
|
15
|
+
from collections.abc import Sequence
|
|
16
|
+
from typing import Any
|
|
17
|
+
|
|
18
|
+
import numpy as np
|
|
19
|
+
|
|
20
|
+
from lexograph._types import FloatArray
|
|
21
|
+
|
|
22
|
+
__all__ = ["embed_sentences", "DEFAULT_MODEL"]
|
|
23
|
+
|
|
24
|
+
DEFAULT_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
|
|
25
|
+
"""The default embedding model: small, CPU-fast, 384-dimensional."""
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def embed_sentences(
    sentences: Sequence[str],
    *,
    model_name: str = DEFAULT_MODEL,
    batch_size: int = 64,
) -> FloatArray:
    """Encode sentences as L2-normalised embedding vectors.

    A ``SentenceTransformer`` is instantiated for ``model_name`` on every call
    (fetching and caching the model is left to ``sentence-transformers``).
    Normalisation is requested from the encoder, so a dot product of two rows
    equals their cosine similarity.

    Args:
        sentences: The sentences to embed.
        model_name: A sentence-transformers model id.
        batch_size: Encoding batch size.

    Returns:
        An ``(N, D)`` float array of unit-norm sentence embeddings.

    Example:
        Not run as a doctest (it would download the model)::

            from lexograph import segment, load_demo_text
            from lexograph.analyze.embeddings import embed_sentences

            sentences = segment(load_demo_text())
            embeddings = embed_sentences(sentences)
    """
    # Resolved by name at call time: importing this module must not require the
    # heavy optional dependency, and type checkers must not need it either.
    sentence_transformers: Any = importlib.import_module("sentence_transformers")
    encoder = sentence_transformers.SentenceTransformer(model_name)

    encoded = encoder.encode(
        list(sentences),
        batch_size=batch_size,
        convert_to_numpy=True,
        normalize_embeddings=True,
    )
    return np.asarray(encoded, dtype=float)
|
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
"""Build a sentence graph from embeddings and read channels off it.
|
|
2
|
+
|
|
3
|
+
The pipeline ported from the Wittgenstein piece: sentence embeddings → a cosine
|
|
4
|
+
k-nearest-neighbour graph → PageRank centrality (the **size** channel) and
|
|
5
|
+
community detection (the **colour** channel). All functions take and return plain
|
|
6
|
+
arrays / a networkx graph, so the results drop straight into ``encode``.
|
|
7
|
+
|
|
8
|
+
Part of the optional ``lexograph[graph]`` extra.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from typing import Literal, cast
|
|
14
|
+
|
|
15
|
+
import networkx as nx
|
|
16
|
+
import numpy as np
|
|
17
|
+
from sklearn.metrics.pairwise import cosine_distances
|
|
18
|
+
from sklearn.neighbors import kneighbors_graph
|
|
19
|
+
|
|
20
|
+
from lexograph._types import FloatArray
|
|
21
|
+
|
|
22
|
+
__all__ = [
|
|
23
|
+
"knn_graph",
|
|
24
|
+
"embedding_distances",
|
|
25
|
+
"pagerank_scores",
|
|
26
|
+
"community_labels",
|
|
27
|
+
]
|
|
28
|
+
|
|
29
|
+
CommunityMethod = Literal["louvain", "kmeans"]
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def knn_graph(embeddings: FloatArray, *, k: int = 5) -> nx.Graph:
    """Build an undirected, weighted cosine kNN graph over the embeddings.

    Node ``i`` stands for sentence ``i``. Each edge's ``weight`` is the cosine
    **similarity** (``1 - cosine distance``) of its endpoints; when both
    directions of a pair appear among the neighbours, the larger similarity is
    kept. Neighbour pairs whose similarity is not positive get no edge.

    Args:
        embeddings: An ``(N, D)`` array of sentence embeddings.
        k: Neighbours per node (capped at ``N - 1``).

    Returns:
        A networkx graph with nodes ``0 .. N-1`` and weighted edges. With
        fewer than two embeddings the graph has no edges.

    Examples:
        >>> import numpy as np
        >>> emb = np.eye(4)
        >>> g = knn_graph(emb, k=1)
        >>> g.number_of_nodes()
        4
    """
    n = len(embeddings)
    graph = nx.Graph()
    graph.add_nodes_from(range(n))
    if n < 2:
        return graph

    neighbours = kneighbors_graph(
        np.asarray(embeddings, dtype=float),
        n_neighbors=min(k, n - 1),
        mode="distance",
        metric="cosine",
        include_self=False,
    ).tocoo()

    triples = zip(neighbours.row, neighbours.col, neighbours.data, strict=True)
    for row, col, distance in triples:
        similarity = 1.0 - float(distance)
        if similarity <= 0.0:
            continue
        a, b = int(row), int(col)
        if not graph.has_edge(a, b):
            graph.add_edge(a, b, weight=similarity)
            continue
        # Reverse direction already present: keep the stronger similarity.
        attrs = graph[a][b]
        attrs["weight"] = max(attrs["weight"], similarity)
    return graph
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def embedding_distances(embeddings: FloatArray) -> FloatArray:
    """Compute the pairwise cosine distance matrix of the embeddings.

    The result can be passed as ``distances`` to
    :func:`lexograph.presets.recurrence.recurrence_plot` to draw a semantic
    recurrence dotplot.

    Args:
        embeddings: An ``(N, D)`` array of sentence embeddings.

    Returns:
        An ``(N, N)`` float array of cosine distances.

    Examples:
        >>> import numpy as np
        >>> d = embedding_distances(np.eye(3))
        >>> d.shape
        (3, 3)
    """
    vectors = np.asarray(embeddings, dtype=float)
    pairwise = cosine_distances(vectors)
    return np.asarray(pairwise, dtype=float)
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def pagerank_scores(graph: nx.Graph, n: int) -> FloatArray:
    """Compute weighted PageRank per sentence (the size channel).

    Args:
        graph: A weighted sentence graph (e.g. from :func:`knn_graph`).
        n: The total sentence count; the result always has this length, even
            when some nodes are missing from ``graph``.

    Returns:
        A length-``n`` float array whose entry ``i`` is the PageRank of node
        ``i``, or ``0.0`` when that node is absent. A graph with no edges
        gives all zeros.

    Examples:
        >>> import networkx as nx
        >>> g = nx.path_graph(3)
        >>> for u, v in g.edges():
        ...     g[u][v]["weight"] = 1.0
        >>> pagerank_scores(g, 3).shape
        (3,)
    """
    if not graph.number_of_edges():
        return np.zeros(n, dtype=float)

    rank_of = nx.pagerank(graph, weight="weight")
    per_sentence = (rank_of.get(index, 0.0) for index in range(n))
    return np.fromiter(per_sentence, dtype=float, count=n)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def community_labels(
    graph: nx.Graph,
    n: int,
    *,
    method: CommunityMethod = "louvain",
    embeddings: FloatArray | None = None,
    n_clusters: int = 10,
    seed: int = 42,
) -> np.ndarray:
    """Assign every sentence a community id (the colour channel).

    Args:
        graph: A weighted sentence graph (only read by the Louvain method).
        n: The total sentence count, so labels align to every sentence.
        method: ``"louvain"`` (graph communities, the default) or ``"kmeans"``
            (clusters the embeddings directly).
        embeddings: The ``(N, D)`` embedding array; required for ``"kmeans"``.
        n_clusters: Number of clusters for ``"kmeans"`` (capped at ``n``).
        seed: Random seed passed to the chosen method.

    Returns:
        An integer array of community ids. For Louvain it has length ``n``,
        ids are numbered by decreasing community size (``0`` is the largest),
        and a node absent from the graph gets ``-1``. For k-means it holds the
        cluster index predicted for each embedding row.

    Raises:
        ValueError: If ``method`` is ``"kmeans"`` and ``embeddings`` is ``None``,
            or ``method`` is unrecognised.

    Examples:
        >>> import networkx as nx
        >>> g = nx.path_graph(4)
        >>> for u, v in g.edges():
        ...     g[u][v]["weight"] = 1.0
        >>> labels = community_labels(g, 4)
        >>> labels.shape
        (4,)
    """
    if method not in ("louvain", "kmeans"):
        msg = f"method must be 'louvain' or 'kmeans', got {method!r}"
        raise ValueError(msg)

    if method == "louvain":
        found = cast(
            "list[set[int]]",
            nx.community.louvain_communities(graph, weight="weight", seed=seed),
        )
        # Largest community first, so it receives id 0.
        ranked = sorted(found, key=len, reverse=True)
        membership: dict[int, int] = {}
        for cid, members in enumerate(ranked):
            membership.update(dict.fromkeys(members, cid))
        return np.asarray([membership.get(i, -1) for i in range(n)], dtype=int)

    # method == "kmeans"
    if embeddings is None:
        msg = "kmeans community detection requires embeddings"
        raise ValueError(msg)
    from sklearn.cluster import KMeans

    clusterer = KMeans(n_clusters=min(n_clusters, n), random_state=seed, n_init=10)
    assigned = clusterer.fit_predict(np.asarray(embeddings, dtype=float))
    return assigned.astype(int)
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""Tiny bundled text for docs, tests, and every preset's worked example.
|
|
2
|
+
|
|
3
|
+
The demo text is the first chapter of Jane Austen's *Pride and Prejudice*
|
|
4
|
+
(public domain; see :mod:`lexograph.datasets._pride_and_prejudice`). It is large
|
|
5
|
+
enough to make a walk, spiral, dotplot, or concordance legible, and small enough
|
|
6
|
+
to keep the wheel tiny and the doctests fast.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from lexograph.datasets._pride_and_prejudice import PRIDE_AND_PREJUDICE_CH1
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def load_demo_text() -> str:
    """Return the demo text bundled with the package.

    The text is Chapter 1 of *Pride and Prejudice*, stored in the
    ``PRIDE_AND_PREJUDICE_CH1`` constant.

    Returns:
        The chapter as one string.

    Examples:
        >>> text = load_demo_text()
        >>> text.startswith("It is a truth universally acknowledged")
        True
        >>> "Bingley" in text
        True
    """
    return PRIDE_AND_PREJUDICE_CH1
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
__all__ = ["load_demo_text"]
|