keyflux 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- keyflux-0.1.0/PKG-INFO +131 -0
- keyflux-0.1.0/README.md +95 -0
- keyflux-0.1.0/pyproject.toml +99 -0
- keyflux-0.1.0/src/keyflux/__init__.py +30 -0
- keyflux-0.1.0/src/keyflux/_types.py +31 -0
- keyflux-0.1.0/src/keyflux/datasets/__init__.py +104 -0
- keyflux-0.1.0/src/keyflux/divergence/__init__.py +5 -0
- keyflux-0.1.0/src/keyflux/divergence/rtd.py +213 -0
- keyflux-0.1.0/src/keyflux/io/__init__.py +5 -0
- keyflux-0.1.0/src/keyflux/io/corpus.py +99 -0
- keyflux-0.1.0/src/keyflux/keyness/__init__.py +43 -0
- keyflux-0.1.0/src/keyflux/keyness/classify.py +176 -0
- keyflux-0.1.0/src/keyflux/keyness/keyness.py +374 -0
- keyflux-0.1.0/src/keyflux/keyness/measures.py +342 -0
- keyflux-0.1.0/src/keyflux/py.typed +0 -0
- keyflux-0.1.0/src/keyflux/ranking/__init__.py +5 -0
- keyflux-0.1.0/src/keyflux/ranking/rankedlist.py +214 -0
- keyflux-0.1.0/src/keyflux/viz/__init__.py +5 -0
- keyflux-0.1.0/src/keyflux/viz/allotaxonograph.py +159 -0
keyflux-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: keyflux
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Corpus keyness, rank-turbulence divergence, and allotaxonographs
|
|
5
|
+
Keywords: nlp,keyness,keywords,corpus-linguistics,rank-turbulence-divergence,allotaxonometry,text-analysis,computational-humanities
|
|
6
|
+
Author: Zoltan Varju, Orsolya Putz
|
|
7
|
+
Author-email: Zoltan Varju <zoltan.varju@crowintelligence.org>, Orsolya Putz <orsolya.putz@crowintelligence.org>
|
|
8
|
+
License: MIT
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: Intended Audience :: Science/Research
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Visualization
|
|
18
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
19
|
+
Requires-Dist: numpy>=1.26
|
|
20
|
+
Requires-Dist: matplotlib>=3.8
|
|
21
|
+
Requires-Dist: pytest>=8 ; extra == 'dev'
|
|
22
|
+
Requires-Dist: pytest-cov ; extra == 'dev'
|
|
23
|
+
Requires-Dist: hypothesis>=6.100 ; extra == 'dev'
|
|
24
|
+
Requires-Dist: ruff>=0.4 ; extra == 'dev'
|
|
25
|
+
Requires-Dist: ty ; extra == 'dev'
|
|
26
|
+
Requires-Dist: mutmut>=3 ; extra == 'dev'
|
|
27
|
+
Requires-Dist: mkdocs-material ; extra == 'docs'
|
|
28
|
+
Requires-Dist: mkdocstrings[python] ; extra == 'docs'
|
|
29
|
+
Requires-Python: >=3.11
|
|
30
|
+
Project-URL: Homepage, https://crowintelligence.org/
|
|
31
|
+
Project-URL: Repository, https://github.com/crow-intelligence/keyflux
|
|
32
|
+
Project-URL: Documentation, https://keyflux.readthedocs.io
|
|
33
|
+
Provides-Extra: dev
|
|
34
|
+
Provides-Extra: docs
|
|
35
|
+
Description-Content-Type: text/markdown
|
|
36
|
+
|
|
37
|
+
<p align="center">
|
|
38
|
+
<img src="https://raw.githubusercontent.com/crow-intelligence/keyflux/main/imgs/logo.svg" alt="keyflux logo" width="440">
|
|
39
|
+
</p>
|
|
40
|
+
|
|
41
|
+
# keyflux
|
|
42
|
+
|
|
43
|
+
Corpus keyness, rank-turbulence divergence, and allotaxonographs — in pure Python.
|
|
44
|
+
|
|
45
|
+
keyflux owns the whole comparison arc that diachronic and comparative discourse
|
|
46
|
+
analysis usually splits across tools and languages. It derives **keywords** and
|
|
47
|
+
**lockwords** from a focus-versus-reference comparison using proper corpus-linguistic
|
|
48
|
+
measures (log-likelihood for significance, log ratio for effect size — not just
|
|
49
|
+
chi-square), compares the resulting ranked lists with **rank-turbulence divergence
|
|
50
|
+
(RTD)**, and renders the **allotaxonograph**: the rank-rank map plus the ranked list
|
|
51
|
+
of which exact words drove the shift. No JavaScript runtime — figures are matplotlib.
|
|
52
|
+
|
|
53
|
+
It replaces the usual "Jaccard overlap on the top-N keywords" summary — one opaque
|
|
54
|
+
number that throws away rank, everything below the cutoff, and any account of *which*
|
|
55
|
+
words moved — with a transparent, pip-installable pipeline.
|
|
56
|
+
|
|
57
|
+
## Installation
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
uv add keyflux        # or: pip install keyflux
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## Quickstart
|
|
64
|
+
|
|
65
|
+
```python
|
|
66
|
+
from collections import Counter
|
|
67
|
+
|
|
68
|
+
from keyflux import Keyness, RankedList, rtd, allotaxonograph
|
|
69
|
+
|
|
70
|
+
# 1. Keyness: focus vs reference
|
|
71
|
+
focus = Counter({"climate": 30, "carbon": 12, "the": 80, "policy": 9})
|
|
72
|
+
reference = Counter({"climate": 3, "carbon": 1, "the": 78, "market": 15})
|
|
73
|
+
k = Keyness(focus, reference, measure="log_likelihood")
|
|
74
|
+
keywords = k.keywords(top=20)
|
|
75
|
+
lockwords = k.lockwords()
|
|
76
|
+
|
|
77
|
+
# 2. Rank-turbulence divergence between two ranked lists
|
|
78
|
+
r1 = RankedList.from_counts(focus, label="2019")
|
|
79
|
+
r2 = RankedList.from_counts(reference, label="2024")
|
|
80
|
+
result = rtd(r1, r2, alpha=1 / 3)
|
|
81
|
+
print(result.divergence)
|
|
82
|
+
|
|
83
|
+
# 3. Allotaxonograph (returns a matplotlib Figure)
|
|
84
|
+
fig = allotaxonograph(r1, r2, alpha=1 / 3, labels=("2019", "2024"))
|
|
85
|
+
fig.savefig("allotaxonograph.png")
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## Features
|
|
89
|
+
|
|
90
|
+
- **Keyness measures**: log-likelihood (Dunning), log ratio, Simple Maths, %DIFF, and
|
|
91
|
+
chi-square (for contrast) — significance flagged against the chi-square thresholds
|
|
92
|
+
- **Keywords and lockwords**: positive / negative keywords plus the stable lockword zone
|
|
93
|
+
- **Rank-turbulence divergence**: tunable, rank-sensitive corpus comparison with
|
|
94
|
+
per-type contributions and an explicit alpha-to-zero log limit
|
|
95
|
+
- **Allotaxonograph**: publication-quality two-panel matplotlib figure, no JS runtime
|
|
96
|
+
- **Reproducibility records**: every keyness result emits its reference, cutoffs, and measure
|
|
97
|
+
|
|
98
|
+
## Documentation
|
|
99
|
+
|
|
100
|
+
Full documentation — quickstart, the keyness and allotaxonograph tutorials,
|
|
101
|
+
troubleshooting, and the complete API reference — is at
|
|
102
|
+
[keyflux.readthedocs.io](https://keyflux.readthedocs.io). The sources live in `docs/`.
|
|
103
|
+
|
|
104
|
+
## Roadmap
|
|
105
|
+
|
|
106
|
+
Planned for the next iteration. The robustness items are analysed in detail in
|
|
107
|
+
[`PRE-MORTEM.md`](https://github.com/crow-intelligence/keyflux/blob/main/PRE-MORTEM.md), and the open modelling choices are listed in
|
|
108
|
+
[`CHANGES_SUMMARY.md`](https://github.com/crow-intelligence/keyflux/blob/main/CHANGES_SUMMARY.md).
|
|
109
|
+
|
|
110
|
+
**Robustness / API decisions**
|
|
111
|
+
|
|
112
|
+
- [ ] Revisit the zero-cell floor default (0.5): it sets the effect size of every exclusive keyword and reorders the top of the list.
|
|
113
|
+
- [ ] Decide whether `min_focus_freq` / `min_reference_freq` should default asymmetrically (keep focus-exclusive keywords while demanding more reference evidence).
|
|
114
|
+
- [ ] Add Cohen's *d* (dispersion-aware effect size) once the corpus input can carry sub-corpus structure.
|
|
115
|
+
|
|
116
|
+
**Proposed features**
|
|
117
|
+
|
|
118
|
+
- [ ] `RankedList.from_keyness(..., by="score")` — rank by keyness score, not just frequency, so "compare the distinctive-word lists over time" is a one-liner.
|
|
119
|
+
- [ ] Optional self-contained interactive HTML+JS allotaxonograph export (an alpha slider), gated behind an extra so the core stays pure Python.
|
|
120
|
+
|
|
121
|
+
**Maintenance**
|
|
122
|
+
|
|
123
|
+
- [ ] Publish to PyPI and wire up ReadTheDocs.
|
|
124
|
+
|
|
125
|
+
## Made by
|
|
126
|
+
|
|
127
|
+
keyflux is made by [Crow Intelligence](https://crowintelligence.org/).
|
|
128
|
+
|
|
129
|
+
## License
|
|
130
|
+
|
|
131
|
+
MIT
|
keyflux-0.1.0/README.md
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<img src="https://raw.githubusercontent.com/crow-intelligence/keyflux/main/imgs/logo.svg" alt="keyflux logo" width="440">
|
|
3
|
+
</p>
|
|
4
|
+
|
|
5
|
+
# keyflux
|
|
6
|
+
|
|
7
|
+
Corpus keyness, rank-turbulence divergence, and allotaxonographs — in pure Python.
|
|
8
|
+
|
|
9
|
+
keyflux owns the whole comparison arc that diachronic and comparative discourse
|
|
10
|
+
analysis usually splits across tools and languages. It derives **keywords** and
|
|
11
|
+
**lockwords** from a focus-versus-reference comparison using proper corpus-linguistic
|
|
12
|
+
measures (log-likelihood for significance, log ratio for effect size — not just
|
|
13
|
+
chi-square), compares the resulting ranked lists with **rank-turbulence divergence
|
|
14
|
+
(RTD)**, and renders the **allotaxonograph**: the rank-rank map plus the ranked list
|
|
15
|
+
of which exact words drove the shift. No JavaScript runtime — figures are matplotlib.
|
|
16
|
+
|
|
17
|
+
It replaces the usual "Jaccard overlap on the top-N keywords" summary — one opaque
|
|
18
|
+
number that throws away rank, everything below the cutoff, and any account of *which*
|
|
19
|
+
words moved — with a transparent, pip-installable pipeline.
|
|
20
|
+
|
|
21
|
+
## Installation
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
uv add keyflux        # or: pip install keyflux
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## Quickstart
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
from collections import Counter
|
|
31
|
+
|
|
32
|
+
from keyflux import Keyness, RankedList, rtd, allotaxonograph
|
|
33
|
+
|
|
34
|
+
# 1. Keyness: focus vs reference
|
|
35
|
+
focus = Counter({"climate": 30, "carbon": 12, "the": 80, "policy": 9})
|
|
36
|
+
reference = Counter({"climate": 3, "carbon": 1, "the": 78, "market": 15})
|
|
37
|
+
k = Keyness(focus, reference, measure="log_likelihood")
|
|
38
|
+
keywords = k.keywords(top=20)
|
|
39
|
+
lockwords = k.lockwords()
|
|
40
|
+
|
|
41
|
+
# 2. Rank-turbulence divergence between two ranked lists
|
|
42
|
+
r1 = RankedList.from_counts(focus, label="2019")
|
|
43
|
+
r2 = RankedList.from_counts(reference, label="2024")
|
|
44
|
+
result = rtd(r1, r2, alpha=1 / 3)
|
|
45
|
+
print(result.divergence)
|
|
46
|
+
|
|
47
|
+
# 3. Allotaxonograph (returns a matplotlib Figure)
|
|
48
|
+
fig = allotaxonograph(r1, r2, alpha=1 / 3, labels=("2019", "2024"))
|
|
49
|
+
fig.savefig("allotaxonograph.png")
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Features
|
|
53
|
+
|
|
54
|
+
- **Keyness measures**: log-likelihood (Dunning), log ratio, Simple Maths, %DIFF, and
|
|
55
|
+
chi-square (for contrast) — significance flagged against the chi-square thresholds
|
|
56
|
+
- **Keywords and lockwords**: positive / negative keywords plus the stable lockword zone
|
|
57
|
+
- **Rank-turbulence divergence**: tunable, rank-sensitive corpus comparison with
|
|
58
|
+
per-type contributions and an explicit alpha-to-zero log limit
|
|
59
|
+
- **Allotaxonograph**: publication-quality two-panel matplotlib figure, no JS runtime
|
|
60
|
+
- **Reproducibility records**: every keyness result emits its reference, cutoffs, and measure
|
|
61
|
+
|
|
62
|
+
## Documentation
|
|
63
|
+
|
|
64
|
+
Full documentation — quickstart, the keyness and allotaxonograph tutorials,
|
|
65
|
+
troubleshooting, and the complete API reference — is at
|
|
66
|
+
[keyflux.readthedocs.io](https://keyflux.readthedocs.io). The sources live in `docs/`.
|
|
67
|
+
|
|
68
|
+
## Roadmap
|
|
69
|
+
|
|
70
|
+
Planned for the next iteration. The robustness items are analysed in detail in
|
|
71
|
+
[`PRE-MORTEM.md`](https://github.com/crow-intelligence/keyflux/blob/main/PRE-MORTEM.md), and the open modelling choices are listed in
|
|
72
|
+
[`CHANGES_SUMMARY.md`](https://github.com/crow-intelligence/keyflux/blob/main/CHANGES_SUMMARY.md).
|
|
73
|
+
|
|
74
|
+
**Robustness / API decisions**
|
|
75
|
+
|
|
76
|
+
- [ ] Revisit the zero-cell floor default (0.5): it sets the effect size of every exclusive keyword and reorders the top of the list.
|
|
77
|
+
- [ ] Decide whether `min_focus_freq` / `min_reference_freq` should default asymmetrically (keep focus-exclusive keywords while demanding more reference evidence).
|
|
78
|
+
- [ ] Add Cohen's *d* (dispersion-aware effect size) once the corpus input can carry sub-corpus structure.
|
|
79
|
+
|
|
80
|
+
**Proposed features**
|
|
81
|
+
|
|
82
|
+
- [ ] `RankedList.from_keyness(..., by="score")` — rank by keyness score, not just frequency, so "compare the distinctive-word lists over time" is a one-liner.
|
|
83
|
+
- [ ] Optional self-contained interactive HTML+JS allotaxonograph export (an alpha slider), gated behind an extra so the core stays pure Python.
|
|
84
|
+
|
|
85
|
+
**Maintenance**
|
|
86
|
+
|
|
87
|
+
- [ ] Publish to PyPI and wire up ReadTheDocs.
|
|
88
|
+
|
|
89
|
+
## Made by
|
|
90
|
+
|
|
91
|
+
keyflux is made by [Crow Intelligence](https://crowintelligence.org/).
|
|
92
|
+
|
|
93
|
+
## License
|
|
94
|
+
|
|
95
|
+
MIT
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "keyflux"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Corpus keyness, rank-turbulence divergence, and allotaxonographs"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
authors = [
|
|
7
|
+
{ name = "Zoltan Varju", email = "zoltan.varju@crowintelligence.org" },
|
|
8
|
+
{ name = "Orsolya Putz", email = "orsolya.putz@crowintelligence.org" },
|
|
9
|
+
]
|
|
10
|
+
requires-python = ">=3.11"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
keywords = [
|
|
13
|
+
"nlp",
|
|
14
|
+
"keyness",
|
|
15
|
+
"keywords",
|
|
16
|
+
"corpus-linguistics",
|
|
17
|
+
"rank-turbulence-divergence",
|
|
18
|
+
"allotaxonometry",
|
|
19
|
+
"text-analysis",
|
|
20
|
+
"computational-humanities",
|
|
21
|
+
]
|
|
22
|
+
classifiers = [
|
|
23
|
+
"Development Status :: 3 - Alpha",
|
|
24
|
+
"Intended Audience :: Science/Research",
|
|
25
|
+
"License :: OSI Approved :: MIT License",
|
|
26
|
+
"Programming Language :: Python :: 3",
|
|
27
|
+
"Programming Language :: Python :: 3.11",
|
|
28
|
+
"Programming Language :: Python :: 3.12",
|
|
29
|
+
"Programming Language :: Python :: 3.13",
|
|
30
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
31
|
+
"Topic :: Scientific/Engineering :: Visualization",
|
|
32
|
+
"Topic :: Text Processing :: Linguistic",
|
|
33
|
+
]
|
|
34
|
+
dependencies = [
|
|
35
|
+
"numpy>=1.26",
|
|
36
|
+
"matplotlib>=3.8",
|
|
37
|
+
]
|
|
38
|
+
|
|
39
|
+
[project.urls]
|
|
40
|
+
Homepage = "https://crowintelligence.org/"
|
|
41
|
+
Repository = "https://github.com/crow-intelligence/keyflux"
|
|
42
|
+
Documentation = "https://keyflux.readthedocs.io"
|
|
43
|
+
|
|
44
|
+
[project.optional-dependencies]
|
|
45
|
+
dev = [
|
|
46
|
+
"pytest>=8",
|
|
47
|
+
"pytest-cov",
|
|
48
|
+
"hypothesis>=6.100",
|
|
49
|
+
"ruff>=0.4",
|
|
50
|
+
"ty",
|
|
51
|
+
"mutmut>=3",
|
|
52
|
+
]
|
|
53
|
+
docs = [
|
|
54
|
+
"mkdocs-material",
|
|
55
|
+
"mkdocstrings[python]",
|
|
56
|
+
]
|
|
57
|
+
|
|
58
|
+
[build-system]
|
|
59
|
+
requires = ["uv_build>=0.9.22,<0.10.0"]
|
|
60
|
+
build-backend = "uv_build"
|
|
61
|
+
|
|
62
|
+
[tool.ruff]
|
|
63
|
+
line-length = 88
|
|
64
|
+
target-version = "py311"
|
|
65
|
+
|
|
66
|
+
[tool.ruff.lint]
|
|
67
|
+
select = ["E", "F", "I", "N", "UP", "ANN", "D"]
|
|
68
|
+
ignore = ["D105", "D107"]
|
|
69
|
+
|
|
70
|
+
[tool.ruff.lint.per-file-ignores]
|
|
71
|
+
"tests/**" = ["D100", "D102", "D103", "D104", "ANN"]
|
|
72
|
+
"examples/**" = ["D100", "ANN"]
|
|
73
|
+
|
|
74
|
+
[tool.ruff.lint.pydocstyle]
|
|
75
|
+
convention = "google"
|
|
76
|
+
|
|
77
|
+
[tool.pytest.ini_options]
|
|
78
|
+
addopts = "--doctest-modules --tb=short"
|
|
79
|
+
testpaths = ["src", "tests"]
|
|
80
|
+
|
|
81
|
+
# Mutation testing (dev-only, not run in CI). Scoped to the math-heavy core and
|
|
82
|
+
# its fast, plotting-free test files. Run with `uv run mutmut run`.
|
|
83
|
+
[tool.mutmut]
|
|
84
|
+
source_paths = ["src/keyflux"]
|
|
85
|
+
only_mutate = [
|
|
86
|
+
"src/keyflux/keyness/measures.py",
|
|
87
|
+
"src/keyflux/keyness/keyness.py",
|
|
88
|
+
"src/keyflux/ranking/rankedlist.py",
|
|
89
|
+
"src/keyflux/divergence/rtd.py",
|
|
90
|
+
]
|
|
91
|
+
pytest_add_cli_args_test_selection = [
|
|
92
|
+
"tests/test_measures.py",
|
|
93
|
+
"tests/test_keyness.py",
|
|
94
|
+
"tests/test_rankedlist.py",
|
|
95
|
+
"tests/test_rtd.py",
|
|
96
|
+
]
|
|
97
|
+
pytest_add_cli_args = ["-p", "no:cacheprovider"]
|
|
98
|
+
|
|
99
|
+
[tool.ty]
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""Keyflux — keyness, rank-turbulence divergence, and allotaxonographs."""
|
|
2
|
+
|
|
3
|
+
__version__ = "0.1.0"
|
|
4
|
+
|
|
5
|
+
from keyflux.divergence import Contribution, RTDResult, rtd
|
|
6
|
+
from keyflux.io.corpus import counts_from_text, counts_from_tokens, load_counts
|
|
7
|
+
from keyflux.keyness import (
|
|
8
|
+
Keyness,
|
|
9
|
+
KeynessRow,
|
|
10
|
+
KeywordTable,
|
|
11
|
+
ReproRecord,
|
|
12
|
+
)
|
|
13
|
+
from keyflux.ranking import RankedList
|
|
14
|
+
from keyflux.viz import allotaxonograph
|
|
15
|
+
|
|
16
|
+
# Public API of the top-level package. Every name below is bound by one of the
# imports at the top of this module (or is ``__version__`` itself); the entries
# are grouped by the subpackage they are re-exported from.
__all__ = [
    # keyflux.keyness
    "Keyness",
    "KeynessRow",
    "KeywordTable",
    "ReproRecord",
    # keyflux.ranking
    "RankedList",
    # keyflux.divergence
    "rtd",
    "RTDResult",
    "Contribution",
    # keyflux.viz
    "allotaxonograph",
    # keyflux.io.corpus
    "counts_from_tokens",
    "counts_from_text",
    "load_counts",
    # package metadata
    "__version__",
]
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""Shared type aliases for the keyflux package."""
|
|
2
|
+
|
|
3
|
+
from collections import Counter
|
|
4
|
+
from typing import Literal, TypeAlias
|
|
5
|
+
|
|
6
|
+
Token: TypeAlias = str
|
|
7
|
+
"""A surface or lemmatised word type."""
|
|
8
|
+
|
|
9
|
+
Count: TypeAlias = int
|
|
10
|
+
"""A raw frequency count for a single type."""
|
|
11
|
+
|
|
12
|
+
FreqTable: TypeAlias = Counter[str]
|
|
13
|
+
"""Type-to-count mapping for one corpus (focus or reference)."""
|
|
14
|
+
|
|
15
|
+
Rank: TypeAlias = float
|
|
16
|
+
"""A 1-based rank; float because tied and tied-last ranks are averaged."""
|
|
17
|
+
|
|
18
|
+
MeasureName: TypeAlias = Literal[
|
|
19
|
+
"log_likelihood",
|
|
20
|
+
"log_ratio",
|
|
21
|
+
"simple_maths",
|
|
22
|
+
"percent_diff",
|
|
23
|
+
"chi_square",
|
|
24
|
+
]
|
|
25
|
+
"""Identifier selecting a keyness scoring function."""
|
|
26
|
+
|
|
27
|
+
Significance: TypeAlias = Literal["ns", "p05", "p01", "p001", "p0001"]
|
|
28
|
+
"""Significance band from a log-likelihood / chi-square statistic (1 d.f.)."""
|
|
29
|
+
|
|
30
|
+
Direction: TypeAlias = Literal["positive", "negative", "neutral"]
|
|
31
|
+
"""Keyness polarity: over-represented, under-represented, or neither in focus."""
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
"""Tiny bundled corpora and fixtures for docs and tests.
|
|
2
|
+
|
|
3
|
+
The data lives inline as Python dicts so it is always importable in doctests
|
|
4
|
+
with no package-data or ``importlib.resources`` machinery.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from collections import Counter
|
|
10
|
+
from typing import TYPE_CHECKING
|
|
11
|
+
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
from keyflux.ranking.rankedlist import RankedList
|
|
14
|
+
|
|
15
|
+
# --- jkbren/rank-turbulence-divergence reference example ---------------------
# Two frequency tables over the same seven keys ("a".."g"). The expected
# rank-turbulence divergence between them at alpha=1.0 is 0.45924793111057804.
# Keys are listed in descending count order within each table.
_JKBREN_FOCUS: dict[str, int] = {
    "a": 20,
    "e": 14,
    "c": 8,
    "b": 7,
    "f": 4,
    "g": 2,
    "d": 1,
}
_JKBREN_REFERENCE: dict[str, int] = {
    "b": 24,
    "a": 16,
    "e": 5,
    "d": 4,
    "c": 3,
    "f": 2,
    "g": 1,
}

# --- demo corpus pair ---------------------------------------------------------
# Focus table: climate-themed content words plus common function words.
_DEMO_FOCUS: dict[str, int] = {
    "climate": 42,
    "carbon": 28,
    "emissions": 19,
    "warming": 14,
    "policy": 11,
    "the": 320,
    "of": 180,
    "and": 165,
    "to": 150,
    "energy": 22,
    "renewable": 9,
    "global": 17,
}
# Reference table: finance-themed content words; the function words and a few
# content words ("policy", "energy", "global") also occur in the focus table.
_DEMO_REFERENCE: dict[str, int] = {
    "market": 40,
    "stock": 26,
    "trade": 21,
    "profit": 13,
    "policy": 12,
    "the": 318,
    "of": 176,
    "and": 170,
    "to": 148,
    "energy": 8,
    "shares": 15,
    "global": 16,
}
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def load_demo_pair() -> tuple[Counter[str], Counter[str]]:
    """Build the bundled demo corpora as a ``(focus, reference)`` pair.

    The focus table is a tiny climate-discourse sample and the reference table
    a tiny finance-discourse sample. Both carry the same function words at
    near-identical counts, plus a few shared content words.

    Each call wraps the module-level fixture dicts in new ``Counter`` objects,
    so mutating the returned tables does not alter the fixtures.

    Returns:
        ``(focus, reference)`` frequency Counters.

    Examples:
        >>> focus, reference = load_demo_pair()
        >>> focus["climate"], reference["market"]
        (42, 40)
    """
    focus_counts: Counter[str] = Counter(_DEMO_FOCUS)
    reference_counts: Counter[str] = Counter(_DEMO_REFERENCE)
    return focus_counts, reference_counts
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def load_jkbren_example() -> tuple[RankedList, RankedList]:
    """Build the two ranked lists of the jkbren RTD regression example.

    Both lists are constructed from frequency tables over the same seven
    elements. Their rank-turbulence divergence at ``alpha=1.0`` is
    ``0.45924793111057804``, the regression anchor taken from the reference
    implementation.

    Returns:
        ``(list1, list2)`` as :class:`keyflux.ranking.rankedlist.RankedList`,
        labelled ``"system 1"`` and ``"system 2"`` respectively.

    Examples:
        >>> from keyflux.divergence import rtd
        >>> r1, r2 = load_jkbren_example()
        >>> round(rtd(r1, r2, alpha=1.0).divergence, 6)
        0.459248
    """
    # Imported at call time: at module level RankedList is only imported under
    # TYPE_CHECKING (presumably to avoid an import cycle — confirm).
    from keyflux.ranking.rankedlist import RankedList

    system_one = RankedList.from_counts(_JKBREN_FOCUS, label="system 1")
    system_two = RankedList.from_counts(_JKBREN_REFERENCE, label="system 2")
    return system_one, system_two
|