lexograph 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lexograph-0.1.0/PKG-INFO +135 -0
- lexograph-0.1.0/README.md +94 -0
- lexograph-0.1.0/pyproject.toml +112 -0
- lexograph-0.1.0/src/lexograph/__init__.py +65 -0
- lexograph-0.1.0/src/lexograph/_types.py +26 -0
- lexograph-0.1.0/src/lexograph/analyze/__init__.py +139 -0
- lexograph-0.1.0/src/lexograph/analyze/backbone.py +170 -0
- lexograph-0.1.0/src/lexograph/analyze/embeddings.py +68 -0
- lexograph-0.1.0/src/lexograph/analyze/graph.py +190 -0
- lexograph-0.1.0/src/lexograph/datasets/__init__.py +30 -0
- lexograph-0.1.0/src/lexograph/datasets/_pride_and_prejudice.py +130 -0
- lexograph-0.1.0/src/lexograph/encode/__init__.py +17 -0
- lexograph-0.1.0/src/lexograph/encode/channels.py +164 -0
- lexograph-0.1.0/src/lexograph/io/__init__.py +1 -0
- lexograph-0.1.0/src/lexograph/layout/__init__.py +24 -0
- lexograph-0.1.0/src/lexograph/layout/dispersion.py +116 -0
- lexograph-0.1.0/src/lexograph/layout/linear.py +65 -0
- lexograph-0.1.0/src/lexograph/layout/recurrence.py +117 -0
- lexograph-0.1.0/src/lexograph/layout/spiral.py +115 -0
- lexograph-0.1.0/src/lexograph/layout/walk.py +121 -0
- lexograph-0.1.0/src/lexograph/layout/walk3d.py +64 -0
- lexograph-0.1.0/src/lexograph/layout/widths.py +86 -0
- lexograph-0.1.0/src/lexograph/presets/__init__.py +14 -0
- lexograph-0.1.0/src/lexograph/presets/concordance.py +94 -0
- lexograph-0.1.0/src/lexograph/presets/punctuation_spiral.py +139 -0
- lexograph-0.1.0/src/lexograph/presets/recurrence.py +104 -0
- lexograph-0.1.0/src/lexograph/presets/text_walk.py +200 -0
- lexograph-0.1.0/src/lexograph/py.typed +0 -0
- lexograph-0.1.0/src/lexograph/render/__init__.py +6 -0
- lexograph-0.1.0/src/lexograph/render/mpl.py +233 -0
- lexograph-0.1.0/src/lexograph/render/mpl3d.py +135 -0
- lexograph-0.1.0/src/lexograph/scalars.py +70 -0
- lexograph-0.1.0/src/lexograph/segment/__init__.py +5 -0
- lexograph-0.1.0/src/lexograph/segment/units.py +188 -0
lexograph-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: lexograph
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Spatialize linear text into pictures you can read: walks, spirals, dotplots, and concordances
|
|
5
|
+
Keywords: nlp,text-visualization,information-design,concordance,recurrence-plot,dotplot,text-analysis,computational-humanities
|
|
6
|
+
Author: Zoltan Varju, Orsolya Putz
|
|
7
|
+
Author-email: Zoltan Varju <zoltan.varju@crowintelligence.org>, Orsolya Putz <orsolya.putz@crowintelligence.org>
|
|
8
|
+
License: MIT
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: Intended Audience :: Science/Research
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
+
Classifier: Topic :: Scientific/Engineering :: Visualization
|
|
17
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
18
|
+
Requires-Dist: numpy>=1.26
|
|
19
|
+
Requires-Dist: matplotlib>=3.8
|
|
20
|
+
Requires-Dist: nltk>=3.8
|
|
21
|
+
Requires-Dist: pytest>=8 ; extra == 'dev'
|
|
22
|
+
Requires-Dist: pytest-cov ; extra == 'dev'
|
|
23
|
+
Requires-Dist: hypothesis>=6.100 ; extra == 'dev'
|
|
24
|
+
Requires-Dist: ruff>=0.4 ; extra == 'dev'
|
|
25
|
+
Requires-Dist: ty ; extra == 'dev'
|
|
26
|
+
Requires-Dist: mutmut>=3 ; extra == 'dev'
|
|
27
|
+
Requires-Dist: mkdocs-material ; extra == 'docs'
|
|
28
|
+
Requires-Dist: mkdocstrings[python] ; extra == 'docs'
|
|
29
|
+
Requires-Dist: sentence-transformers>=2.2 ; extra == 'graph'
|
|
30
|
+
Requires-Dist: scikit-learn>=1.3 ; extra == 'graph'
|
|
31
|
+
Requires-Dist: networkx>=3.0 ; extra == 'graph'
|
|
32
|
+
Requires-Dist: scipy>=1.10 ; extra == 'graph'
|
|
33
|
+
Requires-Python: >=3.11
|
|
34
|
+
Project-URL: Homepage, https://crowintelligence.org/
|
|
35
|
+
Project-URL: Repository, https://github.com/crow-intelligence/lexograph
|
|
36
|
+
Project-URL: Documentation, https://lexograph.readthedocs.io
|
|
37
|
+
Provides-Extra: dev
|
|
38
|
+
Provides-Extra: docs
|
|
39
|
+
Provides-Extra: graph
|
|
40
|
+
Description-Content-Type: text/markdown
|
|
41
|
+
|
|
42
|
+
<p align="center">
|
|
43
|
+
<img src="https://raw.githubusercontent.com/crow-intelligence/lexograph/main/docs/assets/logo.png" alt="lexograph" width="480">
|
|
44
|
+
</p>
|
|
45
|
+
|
|
46
|
+
# lexograph
|
|
47
|
+
|
|
48
|
+
Spatialize linear text into pictures you can read — in pure Python, rendered with
|
|
49
|
+
matplotlib.
|
|
50
|
+
|
|
51
|
+
lexograph is the visualization member of the corpus-lx family (alongside
|
|
52
|
+
[chronowords](https://github.com/crow-intelligence/chronowords),
|
|
53
|
+
[kenon](https://github.com/crow-intelligence/kenon), and
|
|
54
|
+
[keyflux](https://github.com/crow-intelligence/keyflux)). It turns a text into a
|
|
55
|
+
picture through one four-step spine — **segment → layout → encode → render** — and
|
|
56
|
+
ships several presets that are each just a point on that spine:
|
|
57
|
+
|
|
58
|
+
- **Punctuation spiral** — every non-alphanumeric mark, in order, along an Archimedean
|
|
59
|
+
spiral, coloured by symbol class.
|
|
60
|
+
- **Text walk (2-D / 3-D)** — each sentence steps forward and turns 90°, space-filling;
|
|
61
|
+
size, colour, and glyph encode per-unit attributes. The 3-D variant lifts the walk
|
|
62
|
+
into a corkscrew.
|
|
63
|
+
- **Recurrence dotplot** — the only preset that plots a text against *itself*: a
|
|
64
|
+
sentence × sentence self-similarity grid that exposes internal echo structure.
|
|
65
|
+
- **Concordance** — a term's dispersion across the text (and across texts/time), with
|
|
66
|
+
optional KWIC (keyword in context).
|
|
67
|
+
|
|
68
|
+
Every preset returns a matplotlib `Figure` and never calls `show()`, so it renders
|
|
69
|
+
inline in Jupyter and saves cleanly with `fig.savefig(...)`. The core is headless and
|
|
70
|
+
dependency-light; heavy analysis (sentence embeddings, graph centrality, communities)
|
|
71
|
+
lives behind an optional `[graph]` extra.
|
|
72
|
+
|
|
73
|
+
## Installation
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
uv add lexograph
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
The core depends only on `numpy`, `matplotlib`, and `nltk`.
|
|
80
|
+
|
|
81
|
+
## Quickstart
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
from lexograph import (
|
|
85
|
+
load_demo_text, punctuation_spiral, text_walk, recurrence_plot, concordance,
|
|
86
|
+
)
|
|
87
|
+
|
|
88
|
+
text = load_demo_text() # Chapter 1 of Pride and Prejudice (public domain)
|
|
89
|
+
|
|
90
|
+
punctuation_spiral(text) # marks on an Archimedean spiral
|
|
91
|
+
text_walk(text) # the 2-D space-filling walk
|
|
92
|
+
text_walk(text, helix=True, z_step=4.0) # the 3-D corkscrew
|
|
93
|
+
recurrence_plot(text) # the text against itself
|
|
94
|
+
concordance(text, ["Bennet", "Bingley", "wife"]) # term dispersion
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
Every call returns a matplotlib `Figure`. For PageRank-sized, community-coloured, or
|
|
98
|
+
semantically-recurrent figures, install the `[graph]` extra and feed
|
|
99
|
+
`lexograph.analyze.analyze_text`'s arrays into any preset.
|
|
100
|
+
|
|
101
|
+
## The data contract
|
|
102
|
+
|
|
103
|
+
Every visual channel is fed by a **plain per-unit array** — a scalar array for `size`,
|
|
104
|
+
an array of labels or values for `colour`, an optional `glyph`/font. Nothing in the
|
|
105
|
+
core knows where those numbers came from, so you can drive a spiral or a walk from
|
|
106
|
+
`length`, `frequency`, or your own column with no analysis stack at all. The optional
|
|
107
|
+
`analyze` layer and the planned `[kenon]` / `[chronowords]` integrations only *produce* arrays
|
|
108
|
+
that satisfy this contract.
|
|
109
|
+
|
|
110
|
+
## Documentation
|
|
111
|
+
|
|
112
|
+
Full documentation — quickstart, a tutorial per preset, troubleshooting, and the API
|
|
113
|
+
reference — is at [lexograph.readthedocs.io](https://lexograph.readthedocs.io). The
|
|
114
|
+
sources live in `docs/`.
|
|
115
|
+
|
|
116
|
+
## Roadmap
|
|
117
|
+
|
|
118
|
+
Open modelling and packaging decisions are analysed in
|
|
119
|
+
[`CHANGES_SUMMARY.md`](CHANGES_SUMMARY.md), and the failure modes in
|
|
120
|
+
[`PRE-MORTEM.md`](PRE-MORTEM.md). The main not-yet-built pieces:
|
|
121
|
+
|
|
122
|
+
- [ ] Vendor the four OFL/Apache handwriting fonts so the handwriting walk works out of
|
|
123
|
+
the box (the width-step already runs on any TTF).
|
|
124
|
+
- [ ] `integrations/` — thin `[kenon]` and `[chronowords]` adapters over the data contract.
|
|
125
|
+
- [ ] Optional interactive HTML/WebGL export (`render/html.py`), ported from the source
|
|
126
|
+
viewers and gated behind a stretch extra.
|
|
127
|
+
- [ ] A space-filling / non-self-overlapping turn rule for very uniform texts.
|
|
128
|
+
|
|
129
|
+
## Made by
|
|
130
|
+
|
|
131
|
+
lexograph is made by [Crow Intelligence](https://crowintelligence.org/).
|
|
132
|
+
|
|
133
|
+
## License
|
|
134
|
+
|
|
135
|
+
MIT
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<img src="https://raw.githubusercontent.com/crow-intelligence/lexograph/main/docs/assets/logo.png" alt="lexograph" width="480">
|
|
3
|
+
</p>
|
|
4
|
+
|
|
5
|
+
# lexograph
|
|
6
|
+
|
|
7
|
+
Spatialize linear text into pictures you can read — in pure Python, rendered with
|
|
8
|
+
matplotlib.
|
|
9
|
+
|
|
10
|
+
lexograph is the visualization member of the corpus-lx family (alongside
|
|
11
|
+
[chronowords](https://github.com/crow-intelligence/chronowords),
|
|
12
|
+
[kenon](https://github.com/crow-intelligence/kenon), and
|
|
13
|
+
[keyflux](https://github.com/crow-intelligence/keyflux)). It turns a text into a
|
|
14
|
+
picture through one four-step spine — **segment → layout → encode → render** — and
|
|
15
|
+
ships several presets that are each just a point on that spine:
|
|
16
|
+
|
|
17
|
+
- **Punctuation spiral** — every non-alphanumeric mark, in order, along an Archimedean
|
|
18
|
+
spiral, coloured by symbol class.
|
|
19
|
+
- **Text walk (2-D / 3-D)** — each sentence steps forward and turns 90°, space-filling;
|
|
20
|
+
size, colour, and glyph encode per-unit attributes. The 3-D variant lifts the walk
|
|
21
|
+
into a corkscrew.
|
|
22
|
+
- **Recurrence dotplot** — the only preset that plots a text against *itself*: a
|
|
23
|
+
sentence × sentence self-similarity grid that exposes internal echo structure.
|
|
24
|
+
- **Concordance** — a term's dispersion across the text (and across texts/time), with
|
|
25
|
+
optional KWIC (keyword in context).
|
|
26
|
+
|
|
27
|
+
Every preset returns a matplotlib `Figure` and never calls `show()`, so it renders
|
|
28
|
+
inline in Jupyter and saves cleanly with `fig.savefig(...)`. The core is headless and
|
|
29
|
+
dependency-light; heavy analysis (sentence embeddings, graph centrality, communities)
|
|
30
|
+
lives behind an optional `[graph]` extra.
|
|
31
|
+
|
|
32
|
+
## Installation
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
uv add lexograph
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
The core depends only on `numpy`, `matplotlib`, and `nltk`.
|
|
39
|
+
|
|
40
|
+
## Quickstart
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
from lexograph import (
|
|
44
|
+
load_demo_text, punctuation_spiral, text_walk, recurrence_plot, concordance,
|
|
45
|
+
)
|
|
46
|
+
|
|
47
|
+
text = load_demo_text() # Chapter 1 of Pride and Prejudice (public domain)
|
|
48
|
+
|
|
49
|
+
punctuation_spiral(text) # marks on an Archimedean spiral
|
|
50
|
+
text_walk(text) # the 2-D space-filling walk
|
|
51
|
+
text_walk(text, helix=True, z_step=4.0) # the 3-D corkscrew
|
|
52
|
+
recurrence_plot(text) # the text against itself
|
|
53
|
+
concordance(text, ["Bennet", "Bingley", "wife"]) # term dispersion
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
Every call returns a matplotlib `Figure`. For PageRank-sized, community-coloured, or
|
|
57
|
+
semantically-recurrent figures, install the `[graph]` extra and feed
|
|
58
|
+
`lexograph.analyze.analyze_text`'s arrays into any preset.
|
|
59
|
+
|
|
60
|
+
## The data contract
|
|
61
|
+
|
|
62
|
+
Every visual channel is fed by a **plain per-unit array** — a scalar array for `size`,
|
|
63
|
+
an array of labels or values for `colour`, an optional `glyph`/font. Nothing in the
|
|
64
|
+
core knows where those numbers came from, so you can drive a spiral or a walk from
|
|
65
|
+
`length`, `frequency`, or your own column with no analysis stack at all. The optional
|
|
66
|
+
`analyze` layer and the planned `[kenon]` / `[chronowords]` integrations only *produce* arrays
|
|
67
|
+
that satisfy this contract.
|
|
68
|
+
|
|
69
|
+
## Documentation
|
|
70
|
+
|
|
71
|
+
Full documentation — quickstart, a tutorial per preset, troubleshooting, and the API
|
|
72
|
+
reference — is at [lexograph.readthedocs.io](https://lexograph.readthedocs.io). The
|
|
73
|
+
sources live in `docs/`.
|
|
74
|
+
|
|
75
|
+
## Roadmap
|
|
76
|
+
|
|
77
|
+
Open modelling and packaging decisions are analysed in
|
|
78
|
+
[`CHANGES_SUMMARY.md`](CHANGES_SUMMARY.md), and the failure modes in
|
|
79
|
+
[`PRE-MORTEM.md`](PRE-MORTEM.md). The main not-yet-built pieces:
|
|
80
|
+
|
|
81
|
+
- [ ] Vendor the four OFL/Apache handwriting fonts so the handwriting walk works out of
|
|
82
|
+
the box (the width-step already runs on any TTF).
|
|
83
|
+
- [ ] `integrations/` — thin `[kenon]` and `[chronowords]` adapters over the data contract.
|
|
84
|
+
- [ ] Optional interactive HTML/WebGL export (`render/html.py`), ported from the source
|
|
85
|
+
viewers and gated behind a stretch extra.
|
|
86
|
+
- [ ] A space-filling / non-self-overlapping turn rule for very uniform texts.
|
|
87
|
+
|
|
88
|
+
## Made by
|
|
89
|
+
|
|
90
|
+
lexograph is made by [Crow Intelligence](https://crowintelligence.org/).
|
|
91
|
+
|
|
92
|
+
## License
|
|
93
|
+
|
|
94
|
+
MIT
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "lexograph"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Spatialize linear text into pictures you can read: walks, spirals, dotplots, and concordances"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
authors = [
|
|
7
|
+
{ name = "Zoltan Varju", email = "zoltan.varju@crowintelligence.org" },
|
|
8
|
+
{ name = "Orsolya Putz", email = "orsolya.putz@crowintelligence.org" },
|
|
9
|
+
]
|
|
10
|
+
requires-python = ">=3.11"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
keywords = [
|
|
13
|
+
"nlp",
|
|
14
|
+
"text-visualization",
|
|
15
|
+
"information-design",
|
|
16
|
+
"concordance",
|
|
17
|
+
"recurrence-plot",
|
|
18
|
+
"dotplot",
|
|
19
|
+
"text-analysis",
|
|
20
|
+
"computational-humanities",
|
|
21
|
+
]
|
|
22
|
+
classifiers = [
|
|
23
|
+
"Development Status :: 3 - Alpha",
|
|
24
|
+
"Intended Audience :: Science/Research",
|
|
25
|
+
"License :: OSI Approved :: MIT License",
|
|
26
|
+
"Programming Language :: Python :: 3",
|
|
27
|
+
"Programming Language :: Python :: 3.11",
|
|
28
|
+
"Programming Language :: Python :: 3.12",
|
|
29
|
+
"Programming Language :: Python :: 3.13",
|
|
30
|
+
"Topic :: Scientific/Engineering :: Visualization",
|
|
31
|
+
"Topic :: Text Processing :: Linguistic",
|
|
32
|
+
]
|
|
33
|
+
dependencies = [
|
|
34
|
+
"numpy>=1.26",
|
|
35
|
+
"matplotlib>=3.8",
|
|
36
|
+
"nltk>=3.8",
|
|
37
|
+
]
|
|
38
|
+
|
|
39
|
+
[project.urls]
|
|
40
|
+
Homepage = "https://crowintelligence.org/"
|
|
41
|
+
Repository = "https://github.com/crow-intelligence/lexograph"
|
|
42
|
+
Documentation = "https://lexograph.readthedocs.io"
|
|
43
|
+
|
|
44
|
+
[project.optional-dependencies]
|
|
45
|
+
# Heavy analysis layer (lexograph.analyze): sentence embeddings, kNN graph,
|
|
46
|
+
# disparity-filter backbone, PageRank, and community detection. The core never
|
|
47
|
+
# imports these.
|
|
48
|
+
graph = [
|
|
49
|
+
"sentence-transformers>=2.2",
|
|
50
|
+
"scikit-learn>=1.3",
|
|
51
|
+
"networkx>=3.0",
|
|
52
|
+
"scipy>=1.10",
|
|
53
|
+
]
|
|
54
|
+
dev = [
|
|
55
|
+
"pytest>=8",
|
|
56
|
+
"pytest-cov",
|
|
57
|
+
"hypothesis>=6.100",
|
|
58
|
+
"ruff>=0.4",
|
|
59
|
+
"ty",
|
|
60
|
+
"mutmut>=3",
|
|
61
|
+
]
|
|
62
|
+
docs = [
|
|
63
|
+
"mkdocs-material",
|
|
64
|
+
"mkdocstrings[python]",
|
|
65
|
+
]
|
|
66
|
+
|
|
67
|
+
[build-system]
|
|
68
|
+
requires = ["uv_build>=0.9.22,<0.10.0"]
|
|
69
|
+
build-backend = "uv_build"
|
|
70
|
+
|
|
71
|
+
[tool.ruff]
|
|
72
|
+
line-length = 88
|
|
73
|
+
target-version = "py311"
|
|
74
|
+
|
|
75
|
+
[tool.ruff.lint]
|
|
76
|
+
select = ["E", "F", "I", "N", "UP", "ANN", "D"]
|
|
77
|
+
ignore = ["D105", "D107"]
|
|
78
|
+
|
|
79
|
+
[tool.ruff.lint.per-file-ignores]
|
|
80
|
+
"tests/**" = ["D100", "D102", "D103", "D104", "ANN"]
|
|
81
|
+
"examples/**" = ["D100", "ANN"]
|
|
82
|
+
|
|
83
|
+
[tool.ruff.lint.pydocstyle]
|
|
84
|
+
convention = "google"
|
|
85
|
+
|
|
86
|
+
[tool.pytest.ini_options]
|
|
87
|
+
addopts = "--doctest-modules --tb=short"
|
|
88
|
+
testpaths = ["src", "tests"]
|
|
89
|
+
|
|
90
|
+
# Mutation testing (dev-only, not run in CI). Scoped to the layout geometry — the
|
|
91
|
+
# math-heavy core — and its fast, plotting-free test files. Run with
|
|
92
|
+
# `uv run mutmut run`.
|
|
93
|
+
[tool.mutmut]
|
|
94
|
+
source_paths = ["src/lexograph"]
|
|
95
|
+
only_mutate = [
|
|
96
|
+
"src/lexograph/layout/walk.py",
|
|
97
|
+
"src/lexograph/layout/walk3d.py",
|
|
98
|
+
"src/lexograph/layout/spiral.py",
|
|
99
|
+
"src/lexograph/layout/recurrence.py",
|
|
100
|
+
"src/lexograph/layout/dispersion.py",
|
|
101
|
+
]
|
|
102
|
+
pytest_add_cli_args_test_selection = [
|
|
103
|
+
"tests/test_walk.py",
|
|
104
|
+
"tests/test_walk3d.py",
|
|
105
|
+
"tests/test_spiral.py",
|
|
106
|
+
"tests/test_recurrence.py",
|
|
107
|
+
"tests/test_dispersion.py",
|
|
108
|
+
"tests/test_geometry.py",
|
|
109
|
+
]
|
|
110
|
+
pytest_add_cli_args = ["-p", "no:cacheprovider"]
|
|
111
|
+
|
|
112
|
+
[tool.ty]
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""lexograph — spatialize linear text into pictures you can read.
|
|
2
|
+
|
|
3
|
+
lexograph turns a linear text into a picture through one four-step spine:
|
|
4
|
+
**segment** the text into ordered units (characters, tokens, or sentences),
|
|
5
|
+
**lay them out** in 2-D or 3-D space, **encode** per-unit attributes onto
|
|
6
|
+
visual channels (size, colour, glyph), and **render** the result as a
|
|
7
|
+
matplotlib :class:`~matplotlib.figure.Figure` that displays inline in Jupyter
|
|
8
|
+
and saves cleanly with ``fig.savefig(...)``.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
__version__ = "0.1.0"
|
|
12
|
+
|
|
13
|
+
from lexograph.datasets import load_demo_text
|
|
14
|
+
from lexograph.encode import (
|
|
15
|
+
Channels,
|
|
16
|
+
categorical_colors,
|
|
17
|
+
continuous_colors,
|
|
18
|
+
normalize_size,
|
|
19
|
+
)
|
|
20
|
+
from lexograph.layout import (
|
|
21
|
+
kwic,
|
|
22
|
+
linear_layout,
|
|
23
|
+
rendered_widths,
|
|
24
|
+
spiral_layout,
|
|
25
|
+
term_offsets,
|
|
26
|
+
walk3d_layout,
|
|
27
|
+
walk_layout,
|
|
28
|
+
)
|
|
29
|
+
from lexograph.presets import (
|
|
30
|
+
concordance,
|
|
31
|
+
punctuation_spiral,
|
|
32
|
+
recurrence_plot,
|
|
33
|
+
text_walk,
|
|
34
|
+
)
|
|
35
|
+
from lexograph.render import frame_axes, render_path, render_path_3d, render_points
|
|
36
|
+
from lexograph.scalars import frequencies, lengths, positions
|
|
37
|
+
from lexograph.segment import segment
|
|
38
|
+
|
|
39
|
+
__all__ = [
|
|
40
|
+
"segment",
|
|
41
|
+
"lengths",
|
|
42
|
+
"positions",
|
|
43
|
+
"frequencies",
|
|
44
|
+
"linear_layout",
|
|
45
|
+
"walk_layout",
|
|
46
|
+
"walk3d_layout",
|
|
47
|
+
"spiral_layout",
|
|
48
|
+
"rendered_widths",
|
|
49
|
+
"term_offsets",
|
|
50
|
+
"kwic",
|
|
51
|
+
"normalize_size",
|
|
52
|
+
"categorical_colors",
|
|
53
|
+
"continuous_colors",
|
|
54
|
+
"Channels",
|
|
55
|
+
"render_points",
|
|
56
|
+
"render_path",
|
|
57
|
+
"render_path_3d",
|
|
58
|
+
"frame_axes",
|
|
59
|
+
"punctuation_spiral",
|
|
60
|
+
"text_walk",
|
|
61
|
+
"recurrence_plot",
|
|
62
|
+
"concordance",
|
|
63
|
+
"load_demo_text",
|
|
64
|
+
"__version__",
|
|
65
|
+
]
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""Shared type aliases for the lexograph package.
|
|
2
|
+
|
|
3
|
+
The visual-channel arrays named here are the package's public data contract:
|
|
4
|
+
``encode`` accepts plain per-unit arrays, and ``analyze``/``integrations`` only
|
|
5
|
+
ever *produce* arrays that satisfy these aliases. Nothing in the core needs to
|
|
6
|
+
know where the numbers came from.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from typing import Literal, TypeAlias
|
|
12
|
+
|
|
13
|
+
import numpy as np
|
|
14
|
+
import numpy.typing as npt
|
|
15
|
+
|
|
16
|
+
Unit: TypeAlias = str
|
|
17
|
+
"""A single segmented unit of text: a character, token, or sentence."""
|
|
18
|
+
|
|
19
|
+
UnitKind: TypeAlias = Literal["chars", "tokens", "sentences"]
|
|
20
|
+
"""Which kind of unit a segmenter emits."""
|
|
21
|
+
|
|
22
|
+
FloatArray: TypeAlias = npt.NDArray[np.float64]
|
|
23
|
+
"""A 1-D array of floats: a per-unit scalar channel (e.g. ``size``)."""
|
|
24
|
+
|
|
25
|
+
Coords: TypeAlias = npt.NDArray[np.float64]
|
|
26
|
+
"""An ``(N, 2)`` or ``(N, 3)`` array of layout coordinates, one row per unit."""
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
"""Optional analysis layer: turn a text into per-sentence channel arrays.
|
|
2
|
+
|
|
3
|
+
Behind the ``lexograph[graph]`` extra. It runs the Wittgenstein pipeline —
|
|
4
|
+
sentence embeddings → cosine kNN graph → (optional disparity backbone) → PageRank
|
|
5
|
+
and community detection — and hands back plain arrays that satisfy the
|
|
6
|
+
``encode`` data contract: a ``size`` array (PageRank centrality), a ``community``
|
|
7
|
+
array (colour labels), and a cosine ``distances`` matrix (for a semantic
|
|
8
|
+
recurrence dotplot). The core never imports this layer; the dependency arrow points inward.
|
|
9
|
+
|
|
10
|
+
The core's dependency-free scalars (length, position, frequency) live in
|
|
11
|
+
:mod:`lexograph.scalars`.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from dataclasses import dataclass
|
|
17
|
+
from typing import TYPE_CHECKING
|
|
18
|
+
|
|
19
|
+
from lexograph.analyze.backbone import extract_backbone
|
|
20
|
+
from lexograph.analyze.embeddings import embed_sentences
|
|
21
|
+
from lexograph.analyze.graph import (
|
|
22
|
+
community_labels,
|
|
23
|
+
embedding_distances,
|
|
24
|
+
knn_graph,
|
|
25
|
+
pagerank_scores,
|
|
26
|
+
)
|
|
27
|
+
from lexograph.segment.units import sentences as split_sentences
|
|
28
|
+
|
|
29
|
+
if TYPE_CHECKING:
|
|
30
|
+
import numpy as np
|
|
31
|
+
|
|
32
|
+
from lexograph._types import FloatArray
|
|
33
|
+
from lexograph.analyze.graph import CommunityMethod
|
|
34
|
+
|
|
35
|
+
__all__ = [
|
|
36
|
+
"Analysis",
|
|
37
|
+
"analyze_text",
|
|
38
|
+
"embed_sentences",
|
|
39
|
+
"knn_graph",
|
|
40
|
+
"embedding_distances",
|
|
41
|
+
"pagerank_scores",
|
|
42
|
+
"community_labels",
|
|
43
|
+
"extract_backbone",
|
|
44
|
+
]
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@dataclass(frozen=True, slots=True)
|
|
48
|
+
class Analysis:
|
|
49
|
+
"""The per-sentence channel arrays derived from a text.
|
|
50
|
+
|
|
51
|
+
Attributes:
|
|
52
|
+
sentences: The segmented sentences (length ``N``).
|
|
53
|
+
embeddings: The ``(N, D)`` sentence embeddings.
|
|
54
|
+
size: PageRank centrality per sentence — the size channel.
|
|
55
|
+
community: Community id per sentence — the colour channel.
|
|
56
|
+
distances: The ``(N, N)`` cosine distance matrix — pass it to
|
|
57
|
+
``recurrence_plot(distances=...)`` for a semantic dotplot.
|
|
58
|
+
"""
|
|
59
|
+
|
|
60
|
+
sentences: list[str]
|
|
61
|
+
embeddings: FloatArray
|
|
62
|
+
size: FloatArray
|
|
63
|
+
community: np.ndarray
|
|
64
|
+
distances: FloatArray
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def analyze_text(
    text: str,
    *,
    embeddings: FloatArray | None = None,
    k: int = 5,
    community: CommunityMethod = "louvain",
    n_clusters: int = 10,
    backbone: bool = False,
    min_alpha_ptile: float = 0.5,
    seed: int = 42,
) -> Analysis:
    """Derive per-sentence channel arrays from a text.

    The text is split into sentences, each sentence gets an embedding (supplied
    by the caller or computed here), a kNN graph is built over the embeddings
    and optionally reduced to its disparity-filter backbone, and the graph is
    then scored with PageRank and partitioned into communities. A pairwise
    distance matrix over the embeddings is returned alongside.

    Args:
        text: The source text.
        embeddings: Precomputed ``(N, D)`` embeddings, one row per sentence.
            If ``None``, the sentences are embedded with the default model,
            which involves a model download.
        k: Neighbours per node in the kNN graph.
        community: Community method — ``"louvain"`` (default) or ``"kmeans"``.
        n_clusters: Number of clusters for the ``"kmeans"`` method.
        backbone: If ``True``, sparsify the kNN graph with the disparity filter
            before PageRank and community detection.
        min_alpha_ptile: Disparity-filter threshold; only used when
            ``backbone`` is ``True``.
        seed: Random seed for reproducibility.

    Returns:
        An :class:`Analysis` whose arrays align to the segmented sentences.

    Raises:
        ValueError: If ``embeddings`` is given but its length does not match the
            sentence count.

    Example:
        Drive a walk and a semantic dotplot from the analysis (not run as a
        doctest — embedding downloads a model)::

            from lexograph import text_walk, recurrence_plot, load_demo_text
            from lexograph.analyze import analyze_text

            a = analyze_text(load_demo_text())
            walk = text_walk(load_demo_text(), colour=a.community,
                             colour_kind="categorical", size=a.size)
            dots = recurrence_plot(load_demo_text(), distances=a.distances,
                                   threshold=0.4)
    """
    sentence_units = split_sentences(text)
    n_sentences = len(sentence_units)

    if embeddings is not None:
        # Caller-supplied vectors must line up row-for-row with the sentences.
        n_rows = len(embeddings)
        if n_rows != n_sentences:
            msg = (
                f"embeddings must have one row per sentence ({n_sentences}), "
                f"got {n_rows}"
            )
            raise ValueError(msg)
        vectors = embeddings
    else:
        vectors = embed_sentences(sentence_units)

    net = knn_graph(vectors, k=k)
    if backbone:
        net = extract_backbone(net, min_alpha_ptile=min_alpha_ptile)

    # Keyword arguments are evaluated left to right, so the helper calls run in
    # the order PageRank -> communities -> distances.
    return Analysis(
        sentences=sentence_units,
        embeddings=vectors,
        size=pagerank_scores(net, n_sentences),
        community=community_labels(
            net,
            n_sentences,
            method=community,
            embeddings=vectors,
            n_clusters=n_clusters,
            seed=seed,
        ),
        distances=embedding_distances(vectors),
    )
|