lexograph 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lexograph/__init__.py ADDED
@@ -0,0 +1,65 @@
1
+ """lexograph — spatialize linear text into pictures you can read.
2
+
3
+ lexograph turns a linear text into a picture through one four-step spine:
4
+ **segment** the text into ordered units (characters, tokens, or sentences),
5
+ **lay them out** in 2-D or 3-D space, **encode** per-unit attributes onto
6
+ visual channels (size, colour, glyph), and **render** the result as a
7
+ matplotlib :class:`~matplotlib.figure.Figure` that displays inline in Jupyter
8
+ and saves cleanly with ``fig.savefig(...)``.
9
+ """
10
+
11
+ __version__ = "0.1.0"
12
+
13
+ from lexograph.datasets import load_demo_text
14
+ from lexograph.encode import (
15
+ Channels,
16
+ categorical_colors,
17
+ continuous_colors,
18
+ normalize_size,
19
+ )
20
+ from lexograph.layout import (
21
+ kwic,
22
+ linear_layout,
23
+ rendered_widths,
24
+ spiral_layout,
25
+ term_offsets,
26
+ walk3d_layout,
27
+ walk_layout,
28
+ )
29
+ from lexograph.presets import (
30
+ concordance,
31
+ punctuation_spiral,
32
+ recurrence_plot,
33
+ text_walk,
34
+ )
35
+ from lexograph.render import frame_axes, render_path, render_path_3d, render_points
36
+ from lexograph.scalars import frequencies, lengths, positions
37
+ from lexograph.segment import segment
38
+
39
+ __all__ = [
40
+ "segment",
41
+ "lengths",
42
+ "positions",
43
+ "frequencies",
44
+ "linear_layout",
45
+ "walk_layout",
46
+ "walk3d_layout",
47
+ "spiral_layout",
48
+ "rendered_widths",
49
+ "term_offsets",
50
+ "kwic",
51
+ "normalize_size",
52
+ "categorical_colors",
53
+ "continuous_colors",
54
+ "Channels",
55
+ "render_points",
56
+ "render_path",
57
+ "render_path_3d",
58
+ "frame_axes",
59
+ "punctuation_spiral",
60
+ "text_walk",
61
+ "recurrence_plot",
62
+ "concordance",
63
+ "load_demo_text",
64
+ "__version__",
65
+ ]
lexograph/_types.py ADDED
@@ -0,0 +1,26 @@
1
+ """Shared type aliases for the lexograph package.
2
+
3
+ The visual-channel arrays named here are the package's public data contract:
4
+ ``encode`` accepts plain per-unit arrays, and ``analyze``/``integrations`` only
5
+ ever *produce* arrays that satisfy these aliases. Nothing in the core needs to
6
+ know where the numbers came from.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from typing import Literal, TypeAlias
12
+
13
+ import numpy as np
14
+ import numpy.typing as npt
15
+
16
+ Unit: TypeAlias = str
17
+ """A single segmented unit of text: a character, token, or sentence."""
18
+
19
+ UnitKind: TypeAlias = Literal["chars", "tokens", "sentences"]
20
+ """Which kind of unit a segmenter emits."""
21
+
22
+ FloatArray: TypeAlias = npt.NDArray[np.float64]
23
+ """A 1-D array of floats: a per-unit scalar channel (e.g. ``size``)."""
24
+
25
+ Coords: TypeAlias = npt.NDArray[np.float64]
26
+ """An ``(N, 2)`` or ``(N, 3)`` array of layout coordinates, one row per unit."""
@@ -0,0 +1,139 @@
1
+ """Optional analysis layer: turn a text into per-sentence channel arrays.
2
+
3
+ Behind the ``lexograph[graph]`` extra. It runs the Wittgenstein pipeline —
4
+ sentence embeddings → cosine kNN graph → (optional disparity backbone) → PageRank
5
+ and community detection — and hands back plain arrays that satisfy the
6
+ ``encode`` data contract: a ``size`` array (PageRank centrality), a ``community``
7
+ array (colour labels), and a cosine ``distances`` matrix (for a semantic
8
+ recurrence dotplot). The core never imports this; the arrow points inward.
9
+
10
+ The core dependency-free scalars (length, position, frequency) live in
11
+ :mod:`lexograph.scalars`.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ from dataclasses import dataclass
17
+ from typing import TYPE_CHECKING
18
+
19
+ from lexograph.analyze.backbone import extract_backbone
20
+ from lexograph.analyze.embeddings import embed_sentences
21
+ from lexograph.analyze.graph import (
22
+ community_labels,
23
+ embedding_distances,
24
+ knn_graph,
25
+ pagerank_scores,
26
+ )
27
+ from lexograph.segment.units import sentences as split_sentences
28
+
29
+ if TYPE_CHECKING:
30
+ import numpy as np
31
+
32
+ from lexograph._types import FloatArray
33
+ from lexograph.analyze.graph import CommunityMethod
34
+
35
+ __all__ = [
36
+ "Analysis",
37
+ "analyze_text",
38
+ "embed_sentences",
39
+ "knn_graph",
40
+ "embedding_distances",
41
+ "pagerank_scores",
42
+ "community_labels",
43
+ "extract_backbone",
44
+ ]
45
+
46
+
47
+ @dataclass(frozen=True, slots=True)
48
+ class Analysis:
49
+ """The per-sentence channel arrays derived from a text.
50
+
51
+ Attributes:
52
+ sentences: The segmented sentences (length ``N``).
53
+ embeddings: The ``(N, D)`` sentence embeddings.
54
+ size: PageRank centrality per sentence — the size channel.
55
+ community: Community id per sentence — the colour channel.
56
+ distances: The ``(N, N)`` cosine distance matrix — pass it to
57
+ ``recurrence_plot(distances=...)`` for a semantic dotplot.
58
+ """
59
+
60
+ sentences: list[str]
61
+ embeddings: FloatArray
62
+ size: FloatArray
63
+ community: np.ndarray
64
+ distances: FloatArray
65
+
66
+
67
def analyze_text(
    text: str,
    *,
    embeddings: FloatArray | None = None,
    k: int = 5,
    community: CommunityMethod = "louvain",
    n_clusters: int = 10,
    backbone: bool = False,
    min_alpha_ptile: float = 0.5,
    seed: int = 42,
) -> Analysis:
    """Split a text into sentences and derive the per-sentence channels.

    The steps are: sentence segmentation, embedding (skipped when
    ``embeddings`` is supplied), a cosine kNN graph, an optional
    disparity-filter backbone, then PageRank, community labels and the
    pairwise cosine distance matrix.

    Args:
        text: The source text.
        embeddings: Precomputed ``(N, D)`` embeddings, one row per sentence.
            When ``None`` the sentences are embedded with
            :func:`embed_sentences` and its default model.
        k: Neighbours per node in the kNN graph.
        community: Community method, ``"louvain"`` (default) or ``"kmeans"``.
        n_clusters: Cluster count for the ``"kmeans"`` method.
        backbone: When ``True``, the kNN graph is sparsified with
            :func:`extract_backbone` before PageRank and community detection.
        min_alpha_ptile: Threshold forwarded to the backbone extraction; only
            used when ``backbone`` is ``True``.
        seed: Random seed forwarded to community detection.

    Returns:
        An :class:`Analysis` whose arrays line up with the segmented
        sentences.

    Raises:
        ValueError: If ``embeddings`` is supplied and its length differs from
            the number of sentences.

    Example:
        Feeding the result into two presets (not a doctest, because embedding
        loads a model)::

            from lexograph import text_walk, recurrence_plot, load_demo_text
            from lexograph.analyze import analyze_text

            a = analyze_text(load_demo_text())
            walk = text_walk(load_demo_text(), colour=a.community,
                             colour_kind="categorical", size=a.size)
            dots = recurrence_plot(load_demo_text(), distances=a.distances,
                                   threshold=0.4)
    """
    sentence_list = split_sentences(text)
    n = len(sentence_list)

    # Validate caller-supplied vectors before doing any graph work.
    if embeddings is not None:
        if len(embeddings) != n:
            msg = f"embeddings must have one row per sentence ({n}), got {len(embeddings)}"
            raise ValueError(msg)
        vectors = embeddings
    else:
        vectors = embed_sentences(sentence_list)

    sentence_graph = knn_graph(vectors, k=k)
    if backbone:
        sentence_graph = extract_backbone(
            sentence_graph, min_alpha_ptile=min_alpha_ptile
        )

    # Keyword arguments evaluate left to right, so the channels are computed
    # in the order PageRank -> communities -> distances.
    return Analysis(
        sentences=sentence_list,
        embeddings=vectors,
        size=pagerank_scores(sentence_graph, n),
        community=community_labels(
            sentence_graph,
            n,
            method=community,
            embeddings=vectors,
            n_clusters=n_clusters,
            seed=seed,
        ),
        distances=embedding_distances(vectors),
    )
@@ -0,0 +1,170 @@
1
+ """Disparity-filter backbone extraction for weighted graphs.
2
+
3
+ The multiscale backbone of Serrano, Boguñá & Vespignani (2009),
4
+ https://arxiv.org/pdf/0904.2389.pdf — it keeps the statistically significant
5
+ edges of a weighted graph and discards the rest. Ported from the sibling
6
+ ``kenon`` package's ``backbone`` module (which in turn follows DerwenAI's
7
+ ``disparity_filter``); used here to sparsify the kNN sentence graph before
8
+ PageRank and community detection.
9
+
10
+ Part of the optional ``lexograph[graph]`` extra.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import copy
16
+
17
+ import networkx as nx
18
+ import numpy as np
19
+
20
+ __all__ = [
21
+ "disparity_integral",
22
+ "edge_alpha",
23
+ "apply_disparity_filter",
24
+ "extract_backbone",
25
+ ]
26
+
27
+
28
def disparity_integral(x: float, degree: float) -> float:
    """Return ``(1 - x)^k / ((k - 1) * (x - 1))`` for ``k = degree``.

    This is the integral term used by the disparity filter. The expression
    divides by ``(degree - 1) * (x - 1)``, so neither argument may be exactly
    ``1.0``.

    Args:
        x: A normalised edge weight; must differ from ``1.0``.
        degree: A node degree ``k``; must differ from ``1.0``.

    Returns:
        The value of the expression at ``x`` for the given ``degree``.

    Examples:
        >>> disparity_integral(0.5, 3.0) != disparity_integral(0.0, 3.0)
        True
    """
    numerator = (1.0 - x) ** degree
    denominator = (degree - 1.0) * (x - 1.0)
    return numerator / denominator
43
+
44
+
45
def edge_alpha(norm_weight: float, degree: float) -> float:
    """Return the disparity significance ``alpha`` of one edge endpoint.

    The disparity filter defines ``alpha`` as
    ``1 - (k - 1) * (I(p) - I(0))`` with
    ``I(x) = (1 - x)^k / ((k - 1) * (x - 1))``. That expression simplifies
    algebraically to ``(1 - p)^(k - 1)``, which is what is evaluated here.
    The closed form is defined at ``p == 1.0`` (where the integral form
    divides by zero) and avoids the cancellation error of subtracting two
    nearly equal terms.

    Args:
        norm_weight: The edge weight divided by the node's strength (``p``).
        degree: The node's degree (``k``).

    Returns:
        ``alpha`` in ``[0, 1]`` for ``norm_weight`` in ``[0, 1]``; lower means
        more significant. A node of degree ``<= 1`` yields ``0.0``.

    Examples:
        >>> 0.0 <= edge_alpha(0.5, 3.0) <= 1.0
        True
        >>> edge_alpha(0.9, 1.0)
        0.0
        >>> edge_alpha(1.0, 3.0)
        0.0
    """
    # A node with at most one edge has no weight distribution to test against.
    if degree <= 1.0:
        return 0.0
    return (1.0 - norm_weight) ** (degree - 1.0)
67
+
68
+
69
def apply_disparity_filter(graph: nx.Graph) -> list[float]:
    """Attach disparity statistics to every edge and node, in place.

    Each node gains a ``strength`` (the sum of its incident edge weights).
    Each edge gains ``alpha`` (the smaller, i.e. more significant, of the
    alphas computed from its two endpoints) and ``alpha_ptile`` (the fraction
    of all edges whose alpha is strictly smaller). An edge without a
    ``weight`` attribute is treated as having weight ``1.0``.

    A graph with no edges is left untouched and yields an empty list.

    Args:
        graph: A weighted networkx graph (edges carry a ``weight``).

    Returns:
        The list of all edge alpha values, in ``graph.edges()`` order.

    Examples:
        >>> import networkx as nx
        >>> g = nx.Graph()
        >>> g.add_edge("a", "b", weight=0.8)
        >>> g.add_edge("b", "c", weight=0.3)
        >>> g.add_edge("a", "c", weight=0.5)
        >>> len(apply_disparity_filter(g)) == g.number_of_edges()
        True
    """
    if graph.number_of_edges() == 0:
        return []

    # Strengths must all be in place before any edge alpha is computed,
    # because each edge reads the strength of both of its endpoints.
    for node in graph.nodes():
        graph.nodes[node]["strength"] = sum(
            data.get("weight", 1.0) for _, _, data in graph.edges(node, data=True)
        )

    alphas: list[float] = []
    for u, v, data in graph.edges(data=True):
        weight = data.get("weight", 1.0)
        data["alpha"] = min(
            _endpoint_alpha(graph, u, weight),
            _endpoint_alpha(graph, v, weight),
        )
        alphas.append(data["alpha"])

    # Rank every alpha in one vectorised call. ``alphas`` was collected in
    # edge-iteration order, so ``ranks`` lines up with a second pass over the
    # (structurally unchanged) edge view.
    total = len(alphas)
    ranks = np.searchsorted(np.array(sorted(alphas)), alphas)
    for (_u, _v, data), rank in zip(graph.edges(data=True), ranks, strict=True):
        data["alpha_ptile"] = float(rank) / total
    return alphas
113
+
114
+
115
def _endpoint_alpha(graph: nx.Graph, node: object, weight: float) -> float:
    """Return the disparity alpha of an edge of ``weight`` seen from ``node``.

    Reads the ``strength`` attribute previously stored on ``node``. A node
    whose strength is not positive contributes a normalised weight of ``0.0``.
    """
    node_strength = graph.nodes[node]["strength"]
    node_degree = float(graph.degree(node))
    if node_strength > 0:
        share = weight / node_strength
    else:
        share = 0.0
    return edge_alpha(share, node_degree)
121
+
122
+
123
def extract_backbone(
    graph: nx.Graph,
    *,
    min_alpha_ptile: float = 0.5,
    min_degree: int = 1,
) -> nx.Graph:
    """Return the disparity-filter backbone of a weighted graph.

    Works on a deep copy, so the input graph is never modified. Disparity
    statistics are attached to the copy, every edge whose ``alpha_ptile`` is
    below ``min_alpha_ptile`` is removed, and then nodes whose degree is below
    ``min_degree`` are removed repeatedly until none remain.

    A graph without edges short-circuits to a fresh, empty ``nx.Graph``.

    NOTE(review): the cut removes the *lowest*-alpha edges, while
    ``edge_alpha`` documents lower alpha as more significant. This appears to
    mirror the upstream implementation this module was ported from; confirm
    the direction is intended.

    Args:
        graph: A weighted networkx graph.
        min_alpha_ptile: Edges with an alpha percentile below this are removed.
        min_degree: Nodes left with a degree below this are pruned (``1`` keeps
            any node that still has an edge).

    Returns:
        A new graph containing only the backbone.

    Examples:
        >>> import networkx as nx
        >>> g = nx.path_graph(5)
        >>> for u, v in g.edges():
        ...     g[u][v]["weight"] = float(v + 1)
        >>> bb = extract_backbone(g, min_alpha_ptile=0.3)
        >>> bb.number_of_nodes() <= g.number_of_nodes()
        True
    """
    if graph.number_of_edges() == 0:
        return nx.Graph()

    pruned = copy.deepcopy(graph)
    apply_disparity_filter(pruned)

    # Collect first, remove afterwards: the edge view must not change while
    # it is being iterated.
    dropped_edges = []
    for u, v, attrs in pruned.edges(data=True):
        if attrs.get("alpha_ptile", 0.0) < min_alpha_ptile:
            dropped_edges.append((u, v))
    pruned.remove_edges_from(dropped_edges)

    # Removing a node lowers its neighbours' degrees, so repeat until a pass
    # finds nothing left to remove.
    while True:
        dropped_nodes = [
            node for node in list(pruned.nodes()) if pruned.degree(node) < min_degree
        ]
        if not dropped_nodes:
            break
        pruned.remove_nodes_from(dropped_nodes)
    return pruned
@@ -0,0 +1,68 @@
1
+ """Embed sentences with a sentence-transformers model.
2
+
3
+ This is the only step that pulls in the heavy neural dependency, so the import is
4
+ deferred to call time: importing this module (and the rest of ``analyze``) is
5
+ cheap, and the model is loaded only when you actually embed. Every other
6
+ ``analyze`` function takes the embedding array, so the whole graph pipeline can
7
+ be exercised with your own vectors and no model at all.
8
+
9
+ Part of the optional ``lexograph[graph]`` extra.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import importlib
15
+ from collections.abc import Sequence
16
+ from typing import Any
17
+
18
+ import numpy as np
19
+
20
+ from lexograph._types import FloatArray
21
+
22
+ __all__ = ["embed_sentences", "DEFAULT_MODEL"]
23
+
24
+ DEFAULT_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
25
+ """The default embedding model: small, CPU-fast, 384-dimensional."""
26
+
27
+
28
def embed_sentences(
    sentences: Sequence[str],
    *,
    model_name: str = DEFAULT_MODEL,
    batch_size: int = 64,
) -> FloatArray:
    """Encode sentences as L2-normalised embedding vectors.

    ``sentence_transformers`` is imported only when this function is called,
    so the module itself can be imported without the optional dependency.
    The encoder is asked to normalise its output, which makes a dot product
    between two rows equal to their cosine similarity.

    Args:
        sentences: The sentences to embed.
        model_name: A sentence-transformers model id.
        batch_size: Encoding batch size.

    Returns:
        An ``(N, D)`` float array of unit-norm sentence embeddings.

    Example:
        Not run as a doctest (it would load the model)::

            from lexograph import segment, load_demo_text
            from lexograph.analyze.embeddings import embed_sentences

            sentences = segment(load_demo_text())
            embeddings = embed_sentences(sentences)
    """
    # Resolved by module name at call time: the heavy optional dependency is
    # then neither needed at import time nor statically type-checked.
    backend: Any = importlib.import_module("sentence_transformers")

    encoder = backend.SentenceTransformer(model_name)
    encoded = encoder.encode(
        list(sentences),
        batch_size=batch_size,
        convert_to_numpy=True,
        normalize_embeddings=True,
    )
    return np.asarray(encoded, dtype=float)
@@ -0,0 +1,190 @@
1
+ """Build a sentence graph from embeddings and read channels off it.
2
+
3
+ The pipeline ported from the Wittgenstein piece: sentence embeddings → a cosine
4
+ k-nearest-neighbour graph → PageRank centrality (the **size** channel) and
5
+ community detection (the **colour** channel). All functions take and return plain
6
+ arrays / a networkx graph, so the results drop straight into ``encode``.
7
+
8
+ Part of the optional ``lexograph[graph]`` extra.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from typing import Literal, cast
14
+
15
+ import networkx as nx
16
+ import numpy as np
17
+ from sklearn.metrics.pairwise import cosine_distances
18
+ from sklearn.neighbors import kneighbors_graph
19
+
20
+ from lexograph._types import FloatArray
21
+
22
+ __all__ = [
23
+ "knn_graph",
24
+ "embedding_distances",
25
+ "pagerank_scores",
26
+ "community_labels",
27
+ ]
28
+
29
+ CommunityMethod = Literal["louvain", "kmeans"]
30
+
31
+
32
def knn_graph(embeddings: FloatArray, *, k: int = 5) -> nx.Graph:
    """Build an undirected, weighted cosine kNN graph over the embeddings.

    Sentence ``i`` becomes node ``i``. Each neighbour relation contributes an
    edge whose ``weight`` is the cosine **similarity** (``1 - cosine
    distance``). Relations with a similarity of zero or less are skipped, and
    when both directions of a pair are present the larger similarity is kept.

    Args:
        embeddings: An ``(N, D)`` array of sentence embeddings.
        k: Neighbours per node (capped at ``N - 1``).

    Returns:
        A networkx graph with nodes ``0 .. N-1`` and weighted edges. With
        fewer than two embeddings the graph has nodes but no edges.

    Examples:
        >>> import numpy as np
        >>> emb = np.eye(4)
        >>> g = knn_graph(emb, k=1)
        >>> g.number_of_nodes()
        4
    """
    count = len(embeddings)
    result = nx.Graph()
    result.add_nodes_from(range(count))
    if count < 2:
        return result

    neighbours = kneighbors_graph(
        np.asarray(embeddings, dtype=float),
        n_neighbors=min(k, count - 1),
        mode="distance",
        metric="cosine",
        include_self=False,
    ).tocoo()

    for row, col, dist in zip(
        neighbours.row, neighbours.col, neighbours.data, strict=True
    ):
        similarity = 1.0 - float(dist)
        if similarity <= 0.0:
            continue
        src, dst = int(row), int(col)
        if not result.has_edge(src, dst):
            result.add_edge(src, dst, weight=similarity)
            continue
        # The reverse direction was already added: keep the stronger weight.
        attrs = result[src][dst]
        attrs["weight"] = max(attrs["weight"], similarity)
    return result
79
+
80
+
81
def embedding_distances(embeddings: FloatArray) -> FloatArray:
    """Compute the pairwise cosine distance matrix of the embeddings.

    The result can be passed as ``distances`` to
    :func:`lexograph.presets.recurrence.recurrence_plot` to draw a semantic
    recurrence dotplot.

    Args:
        embeddings: An ``(N, D)`` array of sentence embeddings.

    Returns:
        An ``(N, N)`` cosine distance matrix with a zero diagonal.

    Examples:
        >>> import numpy as np
        >>> d = embedding_distances(np.eye(3))
        >>> d.shape
        (3, 3)
    """
    vectors = np.asarray(embeddings, dtype=float)
    pairwise = cosine_distances(vectors)
    return np.asarray(pairwise, dtype=float)
103
+
104
+
105
def pagerank_scores(graph: nx.Graph, n: int) -> FloatArray:
    """Compute weighted PageRank as a per-sentence array (the size channel).

    Args:
        graph: A weighted sentence graph (e.g. from :func:`knn_graph`).
        n: The total sentence count. The result always has this length, so it
            stays aligned to the sentences even when the graph lacks some
            nodes.

    Returns:
        A length-``n`` float array whose entry ``i`` is the PageRank of node
        ``i``. Nodes missing from the graph score ``0.0``, and an edgeless
        graph yields all zeros.

    Examples:
        >>> import networkx as nx
        >>> g = nx.path_graph(3)
        >>> for u, v in g.edges():
        ...     g[u][v]["weight"] = 1.0
        >>> pagerank_scores(g, 3).shape
        (3,)
    """
    scores = np.zeros(n, dtype=float)
    if graph.number_of_edges() == 0:
        return scores

    rank_of = nx.pagerank(graph, weight="weight")
    for index in range(n):
        scores[index] = rank_of.get(index, 0.0)
    return scores
129
+
130
+
131
def community_labels(
    graph: nx.Graph,
    n: int,
    *,
    method: CommunityMethod = "louvain",
    embeddings: FloatArray | None = None,
    n_clusters: int = 10,
    seed: int = 42,
) -> np.ndarray:
    """Assign a community label to every sentence (the colour channel).

    Args:
        graph: A weighted sentence graph; only the Louvain method reads it.
        n: The total sentence count, so labels align to every sentence.
        method: ``"louvain"`` (graph communities, the default) or ``"kmeans"``
            (clusters the embeddings directly).
        embeddings: The ``(N, D)`` embedding array; required for ``"kmeans"``.
        n_clusters: Number of clusters for ``"kmeans"`` (capped at ``n``).
        seed: Random seed passed to the chosen method.

    Returns:
        An integer array of community ids. For Louvain it has length ``n``,
        ids are ordered by community size (``0`` is the largest), and a node
        absent from the graph gets ``-1``. For k-means it holds the cluster
        assigned to each embedding row.

    Raises:
        ValueError: If ``method`` is ``"kmeans"`` and ``embeddings`` is
            ``None``, or if ``method`` is not recognised.

    Examples:
        >>> import networkx as nx
        >>> g = nx.path_graph(4)
        >>> for u, v in g.edges():
        ...     g[u][v]["weight"] = 1.0
        >>> labels = community_labels(g, 4)
        >>> labels.shape
        (4,)
    """
    if method == "louvain":
        found = cast(
            "list[set[int]]",
            nx.community.louvain_communities(graph, weight="weight", seed=seed),
        )
        # Largest community first, so that it receives id 0.
        by_size = sorted(found, key=len, reverse=True)
        label_of: dict[int, int] = {}
        for community_id, members in enumerate(by_size):
            for node in members:
                label_of[node] = community_id
        return np.asarray([label_of.get(i, -1) for i in range(n)], dtype=int)

    if method != "kmeans":
        msg = f"method must be 'louvain' or 'kmeans', got {method!r}"
        raise ValueError(msg)

    if embeddings is None:
        msg = "kmeans community detection requires embeddings"
        raise ValueError(msg)

    # Imported lazily so the Louvain path does not pay for sklearn.cluster.
    from sklearn.cluster import KMeans

    clusterer = KMeans(n_clusters=min(n_clusters, n), random_state=seed, n_init=10)
    assigned = clusterer.fit_predict(np.asarray(embeddings, dtype=float))
    return assigned.astype(int)
@@ -0,0 +1,30 @@
1
+ """Tiny bundled text for docs, tests, and every preset's worked example.
2
+
3
+ The demo text is the first chapter of Jane Austen's *Pride and Prejudice*
4
+ (public domain; see :mod:`lexograph.datasets._pride_and_prejudice`). It is large
5
+ enough to make a walk, spiral, dotplot, or concordance legible, and small enough
6
+ to keep the wheel tiny and the doctests fast.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from lexograph.datasets._pride_and_prejudice import PRIDE_AND_PREJUDICE_CH1
12
+
13
+
14
def load_demo_text() -> str:
    """Load the bundled demo text, Chapter 1 of *Pride and Prejudice*.

    Returns:
        The whole chapter as one string, with paragraphs separated by blank
        lines.

    Examples:
        >>> text = load_demo_text()
        >>> text.startswith("It is a truth universally acknowledged")
        True
        >>> "Bingley" in text
        True
    """
    demo_text: str = PRIDE_AND_PREJUDICE_CH1
    return demo_text
28
+
29
+
30
+ __all__ = ["load_demo_text"]