lexograph 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. lexograph-0.1.0/PKG-INFO +135 -0
  2. lexograph-0.1.0/README.md +94 -0
  3. lexograph-0.1.0/pyproject.toml +112 -0
  4. lexograph-0.1.0/src/lexograph/__init__.py +65 -0
  5. lexograph-0.1.0/src/lexograph/_types.py +26 -0
  6. lexograph-0.1.0/src/lexograph/analyze/__init__.py +139 -0
  7. lexograph-0.1.0/src/lexograph/analyze/backbone.py +170 -0
  8. lexograph-0.1.0/src/lexograph/analyze/embeddings.py +68 -0
  9. lexograph-0.1.0/src/lexograph/analyze/graph.py +190 -0
  10. lexograph-0.1.0/src/lexograph/datasets/__init__.py +30 -0
  11. lexograph-0.1.0/src/lexograph/datasets/_pride_and_prejudice.py +130 -0
  12. lexograph-0.1.0/src/lexograph/encode/__init__.py +17 -0
  13. lexograph-0.1.0/src/lexograph/encode/channels.py +164 -0
  14. lexograph-0.1.0/src/lexograph/io/__init__.py +1 -0
  15. lexograph-0.1.0/src/lexograph/layout/__init__.py +24 -0
  16. lexograph-0.1.0/src/lexograph/layout/dispersion.py +116 -0
  17. lexograph-0.1.0/src/lexograph/layout/linear.py +65 -0
  18. lexograph-0.1.0/src/lexograph/layout/recurrence.py +117 -0
  19. lexograph-0.1.0/src/lexograph/layout/spiral.py +115 -0
  20. lexograph-0.1.0/src/lexograph/layout/walk.py +121 -0
  21. lexograph-0.1.0/src/lexograph/layout/walk3d.py +64 -0
  22. lexograph-0.1.0/src/lexograph/layout/widths.py +86 -0
  23. lexograph-0.1.0/src/lexograph/presets/__init__.py +14 -0
  24. lexograph-0.1.0/src/lexograph/presets/concordance.py +94 -0
  25. lexograph-0.1.0/src/lexograph/presets/punctuation_spiral.py +139 -0
  26. lexograph-0.1.0/src/lexograph/presets/recurrence.py +104 -0
  27. lexograph-0.1.0/src/lexograph/presets/text_walk.py +200 -0
  28. lexograph-0.1.0/src/lexograph/py.typed +0 -0
  29. lexograph-0.1.0/src/lexograph/render/__init__.py +6 -0
  30. lexograph-0.1.0/src/lexograph/render/mpl.py +233 -0
  31. lexograph-0.1.0/src/lexograph/render/mpl3d.py +135 -0
  32. lexograph-0.1.0/src/lexograph/scalars.py +70 -0
  33. lexograph-0.1.0/src/lexograph/segment/__init__.py +5 -0
  34. lexograph-0.1.0/src/lexograph/segment/units.py +188 -0
@@ -0,0 +1,135 @@
1
+ Metadata-Version: 2.3
2
+ Name: lexograph
3
+ Version: 0.1.0
4
+ Summary: Spatialize linear text into pictures you can read: walks, spirals, dotplots, and concordances
5
+ Keywords: nlp,text-visualization,information-design,concordance,recurrence-plot,dotplot,text-analysis,computational-humanities
6
+ Author: Zoltan Varju, Orsolya Putz
7
+ Author-email: Zoltan Varju <zoltan.varju@crowintelligence.org>, Orsolya Putz <orsolya.putz@crowintelligence.org>
8
+ License: MIT
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Science/Research
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Programming Language :: Python :: 3.13
16
+ Classifier: Topic :: Scientific/Engineering :: Visualization
17
+ Classifier: Topic :: Text Processing :: Linguistic
18
+ Requires-Dist: numpy>=1.26
19
+ Requires-Dist: matplotlib>=3.8
20
+ Requires-Dist: nltk>=3.8
21
+ Requires-Dist: pytest>=8 ; extra == 'dev'
22
+ Requires-Dist: pytest-cov ; extra == 'dev'
23
+ Requires-Dist: hypothesis>=6.100 ; extra == 'dev'
24
+ Requires-Dist: ruff>=0.4 ; extra == 'dev'
25
+ Requires-Dist: ty ; extra == 'dev'
26
+ Requires-Dist: mutmut>=3 ; extra == 'dev'
27
+ Requires-Dist: mkdocs-material ; extra == 'docs'
28
+ Requires-Dist: mkdocstrings[python] ; extra == 'docs'
29
+ Requires-Dist: sentence-transformers>=2.2 ; extra == 'graph'
30
+ Requires-Dist: scikit-learn>=1.3 ; extra == 'graph'
31
+ Requires-Dist: networkx>=3.0 ; extra == 'graph'
32
+ Requires-Dist: scipy>=1.10 ; extra == 'graph'
33
+ Requires-Python: >=3.11
34
+ Project-URL: Homepage, https://crowintelligence.org/
35
+ Project-URL: Repository, https://github.com/crow-intelligence/lexograph
36
+ Project-URL: Documentation, https://lexograph.readthedocs.io
37
+ Provides-Extra: dev
38
+ Provides-Extra: docs
39
+ Provides-Extra: graph
40
+ Description-Content-Type: text/markdown
41
+
42
+ <p align="center">
43
+ <img src="https://raw.githubusercontent.com/crow-intelligence/lexograph/main/docs/assets/logo.png" alt="lexograph" width="480">
44
+ </p>
45
+
46
+ # lexograph
47
+
48
+ Spatialize linear text into pictures you can read — in pure Python, rendered with
49
+ matplotlib.
50
+
51
+ lexograph is the visualization member of the corpus-lx family (alongside
52
+ [chronowords](https://github.com/crow-intelligence/chronowords),
53
+ [kenon](https://github.com/crow-intelligence/kenon), and
54
+ [keyflux](https://github.com/crow-intelligence/keyflux)). It turns a text into a
55
+ picture through one four-step spine — **segment → layout → encode → render** — and
56
+ ships several presets that are each just a point on that spine:
57
+
58
+ - **Punctuation spiral** — every non-alphanumeric mark, in order, along an Archimedean
59
+ spiral, coloured by symbol class.
60
+ - **Text walk (2-D / 3-D)** — each sentence steps forward and turns 90°, space-filling;
61
+ size, colour, and glyph encode per-unit attributes. The 3-D variant lifts the walk
62
+ into a corkscrew.
63
+ - **Recurrence dotplot** — the only preset that plots a text against *itself*: a
64
+ sentence × sentence self-similarity grid that exposes internal echo structure.
65
+ - **Concordance** — a term's dispersion across the text (and across texts/time), with
66
+ optional KWIC.
67
+
68
+ Every preset returns a matplotlib `Figure` and never calls `show()`, so it renders
69
+ inline in Jupyter and saves cleanly with `fig.savefig(...)`. The core is headless and
70
+ dependency-light; heavy analysis (sentence embeddings, graph centrality, communities)
71
+ lives behind an optional `[graph]` extra.
72
+
73
+ ## Installation
74
+
75
+ ```bash
76
+ uv add lexograph
77
+ ```
78
+
79
+ The core depends only on `numpy`, `matplotlib`, and `nltk`.
80
+
81
+ ## Quickstart
82
+
83
+ ```python
84
+ from lexograph import (
85
+ load_demo_text, punctuation_spiral, text_walk, recurrence_plot, concordance,
86
+ )
87
+
88
+ text = load_demo_text() # Chapter 1 of Pride and Prejudice (public domain)
89
+
90
+ punctuation_spiral(text) # marks on an Archimedean spiral
91
+ text_walk(text) # the 2-D space-filling walk
92
+ text_walk(text, helix=True, z_step=4.0) # the 3-D corkscrew
93
+ recurrence_plot(text) # the text against itself
94
+ concordance(text, ["Bennet", "Bingley", "wife"]) # term dispersion
95
+ ```
96
+
97
+ Every call returns a matplotlib `Figure`. For PageRank-sized, community-coloured, or
98
+ semantically-recurrent figures, install the `[graph]` extra and feed
99
+ `lexograph.analyze.analyze_text`'s arrays into any preset.
100
+
101
+ ## The data contract
102
+
103
+ Every visual channel is fed by a **plain per-unit array** — a scalar array for `size`,
104
+ an array of labels or values for `colour`, an optional `glyph`/font. Nothing in the
105
+ core knows where those numbers came from, so you can drive a spiral or a walk from
106
+ `length`, `frequency`, or your own column with no analysis stack at all. The optional
107
+ `analyze` layer and the planned `[kenon]` / `[chronowords]` integrations only *produce* arrays
108
+ that satisfy this contract.
109
+
110
+ ## Documentation
111
+
112
+ Full documentation — quickstart, a tutorial per preset, troubleshooting, and the API
113
+ reference — is at [lexograph.readthedocs.io](https://lexograph.readthedocs.io). The
114
+ sources live in `docs/`.
115
+
116
+ ## Roadmap
117
+
118
+ Open modelling and packaging decisions are analysed in
119
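+ [`CHANGES_SUMMARY.md`](https://github.com/crow-intelligence/lexograph/blob/main/CHANGES_SUMMARY.md), and the failure modes in
120
+ [`PRE-MORTEM.md`](https://github.com/crow-intelligence/lexograph/blob/main/PRE-MORTEM.md). The main not-yet-built pieces: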
121
+
122
+ - [ ] Vendor the four OFL/Apache handwriting fonts so the handwriting walk works out of
123
+ the box (the width-step already runs on any TTF).
124
+ - [ ] `integrations/` — thin `[kenon]` and `[chronowords]` adapters over the data contract.
125
+ - [ ] Optional interactive HTML/WebGL export (`render/html.py`), ported from the source
126
+ viewers and gated behind a stretch extra.
127
+ - [ ] A space-filling / non-self-overlapping turn rule for very uniform texts.
128
+
129
+ ## Made by
130
+
131
+ lexograph is made by [Crow Intelligence](https://crowintelligence.org/).
132
+
133
+ ## License
134
+
135
+ MIT
@@ -0,0 +1,94 @@
1
+ <p align="center">
2
+ <img src="https://raw.githubusercontent.com/crow-intelligence/lexograph/main/docs/assets/logo.png" alt="lexograph" width="480">
3
+ </p>
4
+
5
+ # lexograph
6
+
7
+ Spatialize linear text into pictures you can read — in pure Python, rendered with
8
+ matplotlib.
9
+
10
+ lexograph is the visualization member of the corpus-lx family (alongside
11
+ [chronowords](https://github.com/crow-intelligence/chronowords),
12
+ [kenon](https://github.com/crow-intelligence/kenon), and
13
+ [keyflux](https://github.com/crow-intelligence/keyflux)). It turns a text into a
14
+ picture through one four-step spine — **segment → layout → encode → render** — and
15
+ ships several presets that are each just a point on that spine:
16
+
17
+ - **Punctuation spiral** — every non-alphanumeric mark, in order, along an Archimedean
18
+ spiral, coloured by symbol class.
19
+ - **Text walk (2-D / 3-D)** — each sentence steps forward and turns 90°, space-filling;
20
+ size, colour, and glyph encode per-unit attributes. The 3-D variant lifts the walk
21
+ into a corkscrew.
22
+ - **Recurrence dotplot** — the only preset that plots a text against *itself*: a
23
+ sentence × sentence self-similarity grid that exposes internal echo structure.
24
+ - **Concordance** — a term's dispersion across the text (and across texts/time), with
25
+ optional KWIC.
26
+
27
+ Every preset returns a matplotlib `Figure` and never calls `show()`, so it renders
28
+ inline in Jupyter and saves cleanly with `fig.savefig(...)`. The core is headless and
29
+ dependency-light; heavy analysis (sentence embeddings, graph centrality, communities)
30
+ lives behind an optional `[graph]` extra.
31
+
32
+ ## Installation
33
+
34
+ ```bash
35
+ uv add lexograph
36
+ ```
37
+
38
+ The core depends only on `numpy`, `matplotlib`, and `nltk`.
39
+
40
+ ## Quickstart
41
+
42
+ ```python
43
+ from lexograph import (
44
+ load_demo_text, punctuation_spiral, text_walk, recurrence_plot, concordance,
45
+ )
46
+
47
+ text = load_demo_text() # Chapter 1 of Pride and Prejudice (public domain)
48
+
49
+ punctuation_spiral(text) # marks on an Archimedean spiral
50
+ text_walk(text) # the 2-D space-filling walk
51
+ text_walk(text, helix=True, z_step=4.0) # the 3-D corkscrew
52
+ recurrence_plot(text) # the text against itself
53
+ concordance(text, ["Bennet", "Bingley", "wife"]) # term dispersion
54
+ ```
55
+
56
+ Every call returns a matplotlib `Figure`. For PageRank-sized, community-coloured, or
57
+ semantically-recurrent figures, install the `[graph]` extra and feed
58
+ `lexograph.analyze.analyze_text`'s arrays into any preset.
59
+
60
+ ## The data contract
61
+
62
+ Every visual channel is fed by a **plain per-unit array** — a scalar array for `size`,
63
+ an array of labels or values for `colour`, an optional `glyph`/font. Nothing in the
64
+ core knows where those numbers came from, so you can drive a spiral or a walk from
65
+ `length`, `frequency`, or your own column with no analysis stack at all. The optional
66
+ `analyze` layer and the planned `[kenon]` / `[chronowords]` integrations only *produce* arrays
67
+ that satisfy this contract.
68
+
69
+ ## Documentation
70
+
71
+ Full documentation — quickstart, a tutorial per preset, troubleshooting, and the API
72
+ reference — is at [lexograph.readthedocs.io](https://lexograph.readthedocs.io). The
73
+ sources live in `docs/`.
74
+
75
+ ## Roadmap
76
+
77
+ Open modelling and packaging decisions are analysed in
78
+ [`CHANGES_SUMMARY.md`](CHANGES_SUMMARY.md), and the failure modes in
79
+ [`PRE-MORTEM.md`](PRE-MORTEM.md). The main not-yet-built pieces:
80
+
81
+ - [ ] Vendor the four OFL/Apache handwriting fonts so the handwriting walk works out of
82
+ the box (the width-step already runs on any TTF).
83
+ - [ ] `integrations/` — thin `[kenon]` and `[chronowords]` adapters over the data contract.
84
+ - [ ] Optional interactive HTML/WebGL export (`render/html.py`), ported from the source
85
+ viewers and gated behind a stretch extra.
86
+ - [ ] A space-filling / non-self-overlapping turn rule for very uniform texts.
87
+
88
+ ## Made by
89
+
90
+ lexograph is made by [Crow Intelligence](https://crowintelligence.org/).
91
+
92
+ ## License
93
+
94
+ MIT
@@ -0,0 +1,112 @@
1
+ [project]
2
+ name = "lexograph"
3
+ version = "0.1.0"
4
+ description = "Spatialize linear text into pictures you can read: walks, spirals, dotplots, and concordances"
5
+ readme = "README.md"
6
+ authors = [
7
+ { name = "Zoltan Varju", email = "zoltan.varju@crowintelligence.org" },
8
+ { name = "Orsolya Putz", email = "orsolya.putz@crowintelligence.org" },
9
+ ]
10
+ requires-python = ">=3.11"
11
+ license = { text = "MIT" }
12
+ keywords = [
13
+ "nlp",
14
+ "text-visualization",
15
+ "information-design",
16
+ "concordance",
17
+ "recurrence-plot",
18
+ "dotplot",
19
+ "text-analysis",
20
+ "computational-humanities",
21
+ ]
22
+ classifiers = [
23
+ "Development Status :: 3 - Alpha",
24
+ "Intended Audience :: Science/Research",
25
+ "License :: OSI Approved :: MIT License",
26
+ "Programming Language :: Python :: 3",
27
+ "Programming Language :: Python :: 3.11",
28
+ "Programming Language :: Python :: 3.12",
29
+ "Programming Language :: Python :: 3.13",
30
+ "Topic :: Scientific/Engineering :: Visualization",
31
+ "Topic :: Text Processing :: Linguistic",
32
+ ]
33
+ dependencies = [
34
+ "numpy>=1.26",
35
+ "matplotlib>=3.8",
36
+ "nltk>=3.8",
37
+ ]
38
+
39
+ [project.urls]
40
+ Homepage = "https://crowintelligence.org/"
41
+ Repository = "https://github.com/crow-intelligence/lexograph"
42
+ Documentation = "https://lexograph.readthedocs.io"
43
+
44
+ [project.optional-dependencies]
45
+ # Heavy analysis layer (lexograph.analyze): sentence embeddings, kNN graph,
46
+ # disparity-filter backbone, PageRank, and community detection. The core never
47
+ # imports these.
48
+ graph = [
49
+ "sentence-transformers>=2.2",
50
+ "scikit-learn>=1.3",
51
+ "networkx>=3.0",
52
+ "scipy>=1.10",
53
+ ]
54
+ dev = [
55
+ "pytest>=8",
56
+ "pytest-cov",
57
+ "hypothesis>=6.100",
58
+ "ruff>=0.4",
59
+ "ty",
60
+ "mutmut>=3",
61
+ ]
62
+ docs = [
63
+ "mkdocs-material",
64
+ "mkdocstrings[python]",
65
+ ]
66
+
67
+ [build-system]
68
+ requires = ["uv_build>=0.9.22,<0.10.0"]
69
+ build-backend = "uv_build"
70
+
71
+ [tool.ruff]
72
+ line-length = 88
73
+ target-version = "py311"
74
+
75
+ [tool.ruff.lint]
76
+ select = ["E", "F", "I", "N", "UP", "ANN", "D"]
77
+ ignore = ["D105", "D107"]
78
+
79
+ [tool.ruff.lint.per-file-ignores]
80
+ "tests/**" = ["D100", "D102", "D103", "D104", "ANN"]
81
+ "examples/**" = ["D100", "ANN"]
82
+
83
+ [tool.ruff.lint.pydocstyle]
84
+ convention = "google"
85
+
86
+ [tool.pytest.ini_options]
87
+ addopts = "--doctest-modules --tb=short"
88
+ testpaths = ["src", "tests"]
89
+
90
+ # Mutation testing (dev-only, not run in CI). Scoped to the layout geometry — the
91
+ # math-heavy core — and its fast, plotting-free test files. Run with
92
+ # `uv run mutmut run`.
93
+ [tool.mutmut]
94
+ source_paths = ["src/lexograph"]
95
+ only_mutate = [
96
+ "src/lexograph/layout/walk.py",
97
+ "src/lexograph/layout/walk3d.py",
98
+ "src/lexograph/layout/spiral.py",
99
+ "src/lexograph/layout/recurrence.py",
100
+ "src/lexograph/layout/dispersion.py",
101
+ ]
102
+ pytest_add_cli_args_test_selection = [
103
+ "tests/test_walk.py",
104
+ "tests/test_walk3d.py",
105
+ "tests/test_spiral.py",
106
+ "tests/test_recurrence.py",
107
+ "tests/test_dispersion.py",
108
+ "tests/test_geometry.py",
109
+ ]
110
+ pytest_add_cli_args = ["-p", "no:cacheprovider"]
111
+
112
+ [tool.ty]
@@ -0,0 +1,65 @@
1
+ """lexograph — spatialize linear text into pictures you can read.
2
+
3
+ lexograph turns a linear text into a picture through one four-step spine:
4
+ **segment** the text into ordered units (characters, tokens, or sentences),
5
+ **lay them out** in 2-D or 3-D space, **encode** per-unit attributes onto
6
+ visual channels (size, colour, glyph), and **render** the result as a
7
+ matplotlib :class:`~matplotlib.figure.Figure` that displays inline in Jupyter
8
+ and saves cleanly with ``fig.savefig(...)``.
9
+ """
10
+
11
+ __version__ = "0.1.0"
12
+
13
+ from lexograph.datasets import load_demo_text
14
+ from lexograph.encode import (
15
+ Channels,
16
+ categorical_colors,
17
+ continuous_colors,
18
+ normalize_size,
19
+ )
20
+ from lexograph.layout import (
21
+ kwic,
22
+ linear_layout,
23
+ rendered_widths,
24
+ spiral_layout,
25
+ term_offsets,
26
+ walk3d_layout,
27
+ walk_layout,
28
+ )
29
+ from lexograph.presets import (
30
+ concordance,
31
+ punctuation_spiral,
32
+ recurrence_plot,
33
+ text_walk,
34
+ )
35
+ from lexograph.render import frame_axes, render_path, render_path_3d, render_points
36
+ from lexograph.scalars import frequencies, lengths, positions
37
+ from lexograph.segment import segment
38
+
39
+ __all__ = [
40
+ "segment",
41
+ "lengths",
42
+ "positions",
43
+ "frequencies",
44
+ "linear_layout",
45
+ "walk_layout",
46
+ "walk3d_layout",
47
+ "spiral_layout",
48
+ "rendered_widths",
49
+ "term_offsets",
50
+ "kwic",
51
+ "normalize_size",
52
+ "categorical_colors",
53
+ "continuous_colors",
54
+ "Channels",
55
+ "render_points",
56
+ "render_path",
57
+ "render_path_3d",
58
+ "frame_axes",
59
+ "punctuation_spiral",
60
+ "text_walk",
61
+ "recurrence_plot",
62
+ "concordance",
63
+ "load_demo_text",
64
+ "__version__",
65
+ ]
@@ -0,0 +1,26 @@
1
+ """Shared type aliases for the lexograph package.
2
+
3
+ The visual-channel arrays named here are the package's public data contract:
4
+ ``encode`` accepts plain per-unit arrays, and ``analyze``/``integrations`` only
5
+ ever *produce* arrays that satisfy these aliases. Nothing in the core needs to
6
+ know where the numbers came from.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from typing import Literal, TypeAlias
12
+
13
+ import numpy as np
14
+ import numpy.typing as npt
15
+
16
+ Unit: TypeAlias = str
17
+ """A single segmented unit of text: a character, token, or sentence."""
18
+
19
+ UnitKind: TypeAlias = Literal["chars", "tokens", "sentences"]
20
+ """Which kind of unit a segmenter emits."""
21
+
22
+ FloatArray: TypeAlias = npt.NDArray[np.float64]
23
+ """A 1-D array of floats: a per-unit scalar channel (e.g. ``size``)."""
24
+
25
+ Coords: TypeAlias = npt.NDArray[np.float64]
26
+ """An ``(N, 2)`` or ``(N, 3)`` array of layout coordinates, one row per unit."""
@@ -0,0 +1,139 @@
1
+ """Optional analysis layer: turn a text into per-sentence channel arrays.
2
+
3
+ Behind the ``lexograph[graph]`` extra. It runs the Wittgenstein pipeline —
4
+ sentence embeddings → cosine kNN graph → (optional disparity backbone) → PageRank
5
+ and community detection — and hands back plain arrays that satisfy the
6
+ ``encode`` data contract: a ``size`` array (PageRank centrality), a ``community``
7
+ array (colour labels), and a cosine ``distances`` matrix (for a semantic
8
+ recurrence dotplot). The core never imports this; the arrow points inward.
9
+
10
+ The core dependency-free scalars (length, position, frequency) live in
11
+ :mod:`lexograph.scalars`.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ from dataclasses import dataclass
17
+ from typing import TYPE_CHECKING
18
+
19
+ from lexograph.analyze.backbone import extract_backbone
20
+ from lexograph.analyze.embeddings import embed_sentences
21
+ from lexograph.analyze.graph import (
22
+ community_labels,
23
+ embedding_distances,
24
+ knn_graph,
25
+ pagerank_scores,
26
+ )
27
+ from lexograph.segment.units import sentences as split_sentences
28
+
29
+ if TYPE_CHECKING:
30
+ import numpy as np
31
+
32
+ from lexograph._types import FloatArray
33
+ from lexograph.analyze.graph import CommunityMethod
34
+
35
+ __all__ = [
36
+ "Analysis",
37
+ "analyze_text",
38
+ "embed_sentences",
39
+ "knn_graph",
40
+ "embedding_distances",
41
+ "pagerank_scores",
42
+ "community_labels",
43
+ "extract_backbone",
44
+ ]
45
+
46
+
47
+ @dataclass(frozen=True, slots=True)
48
+ class Analysis:
49
+ """The per-sentence channel arrays derived from a text.
50
+
51
+ Attributes:
52
+ sentences: The segmented sentences (length ``N``).
53
+ embeddings: The ``(N, D)`` sentence embeddings.
54
+ size: PageRank centrality per sentence — the size channel.
55
+ community: Community id per sentence — the colour channel.
56
+ distances: The ``(N, N)`` cosine distance matrix — pass it to
57
+ ``recurrence_plot(distances=...)`` for a semantic dotplot.
58
+ """
59
+
60
+ sentences: list[str]
61
+ embeddings: FloatArray
62
+ size: FloatArray
63
+ community: np.ndarray
64
+ distances: FloatArray
65
+
66
+
67
def analyze_text(
    text: str,
    *,
    embeddings: FloatArray | None = None,
    k: int = 5,
    community: CommunityMethod = "louvain",
    n_clusters: int = 10,
    backbone: bool = False,
    min_alpha_ptile: float = 0.5,
    seed: int = 42,
) -> Analysis:
    """Segment a text into sentences and derive its per-sentence channels.

    The sentences are embedded (unless embeddings are supplied), linked into a
    kNN graph, optionally reduced to a disparity backbone, and then scored with
    PageRank and labelled with communities. A pairwise embedding distance
    matrix is computed alongside.

    Args:
        text: The source text.
        embeddings: Precomputed ``(N, D)`` embeddings to use. If ``None``, the
            sentences are embedded with the default model (a model download).
        k: Neighbours per node in the kNN graph.
        community: Community method — ``"louvain"`` (default) or ``"kmeans"``.
        n_clusters: Number of clusters for the ``"kmeans"`` method.
        backbone: If ``True``, sparsify the kNN graph with the disparity filter
            before PageRank and community detection.
        min_alpha_ptile: Disparity-filter threshold (used when ``backbone``).
        seed: Random seed for reproducibility.

    Returns:
        An :class:`Analysis` whose arrays align to the segmented sentences.

    Raises:
        ValueError: If ``embeddings`` is given but its length does not match the
            sentence count.

    Example:
        Drive a walk and a semantic dotplot from the analysis (not run as a
        doctest — embedding downloads a model)::

            from lexograph import text_walk, recurrence_plot, load_demo_text
            from lexograph.analyze import analyze_text

            text = load_demo_text()
            a = analyze_text(text)
            walk = text_walk(text, colour=a.community,
                             colour_kind="categorical", size=a.size)
            dots = recurrence_plot(text, distances=a.distances,
                                   threshold=0.4)
    """
    sents = split_sentences(text)
    count = len(sents)

    # Caller-supplied embeddings must align row-for-row with the sentences.
    if embeddings is not None:
        if len(embeddings) != count:
            msg = (
                f"embeddings must have one row per sentence ({count}), "
                f"got {len(embeddings)}"
            )
            raise ValueError(msg)
        vectors = embeddings
    else:
        vectors = embed_sentences(sents)

    knn = knn_graph(vectors, k=k)
    # Both the centrality and the communities are computed on the same graph:
    # the backbone when requested, otherwise the full kNN graph.
    net = extract_backbone(knn, min_alpha_ptile=min_alpha_ptile) if backbone else knn

    centrality = pagerank_scores(net, count)
    groups = community_labels(
        net,
        count,
        method=community,
        embeddings=vectors,
        n_clusters=n_clusters,
        seed=seed,
    )
    pairwise = embedding_distances(vectors)

    return Analysis(
        sentences=sents,
        embeddings=vectors,
        size=centrality,
        community=groups,
        distances=pairwise,
    )