keyflux 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
keyflux/__init__.py ADDED
@@ -0,0 +1,30 @@
1
+ """Keyflux — keyness, rank-turbulence divergence, and allotaxonographs."""
2
+
3
+ __version__ = "0.1.0"
4
+
5
+ from keyflux.divergence import Contribution, RTDResult, rtd
6
+ from keyflux.io.corpus import counts_from_text, counts_from_tokens, load_counts
7
+ from keyflux.keyness import (
8
+ Keyness,
9
+ KeynessRow,
10
+ KeywordTable,
11
+ ReproRecord,
12
+ )
13
+ from keyflux.ranking import RankedList
14
+ from keyflux.viz import allotaxonograph
15
+
16
+ __all__ = [
17
+ "Keyness",
18
+ "KeynessRow",
19
+ "KeywordTable",
20
+ "ReproRecord",
21
+ "RankedList",
22
+ "rtd",
23
+ "RTDResult",
24
+ "Contribution",
25
+ "allotaxonograph",
26
+ "counts_from_tokens",
27
+ "counts_from_text",
28
+ "load_counts",
29
+ "__version__",
30
+ ]
keyflux/_types.py ADDED
@@ -0,0 +1,31 @@
1
+ """Shared type aliases for the keyflux package."""
2
+
3
+ from collections import Counter
4
+ from typing import Literal, TypeAlias
5
+
6
+ Token: TypeAlias = str
7
+ """A surface or lemmatised word type."""
8
+
9
+ Count: TypeAlias = int
10
+ """A raw frequency count for a single type."""
11
+
12
+ FreqTable: TypeAlias = Counter[str]
13
+ """Type-to-count mapping for one corpus (focus or reference)."""
14
+
15
+ Rank: TypeAlias = float
16
+ """A 1-based rank; float because tied and tied-last ranks are averaged."""
17
+
18
+ MeasureName: TypeAlias = Literal[
19
+ "log_likelihood",
20
+ "log_ratio",
21
+ "simple_maths",
22
+ "percent_diff",
23
+ "chi_square",
24
+ ]
25
+ """Identifier selecting a keyness scoring function."""
26
+
27
+ Significance: TypeAlias = Literal["ns", "p05", "p01", "p001", "p0001"]
28
+ """Significance band from a log-likelihood / chi-square statistic (1 d.f.)."""
29
+
30
+ Direction: TypeAlias = Literal["positive", "negative", "neutral"]
31
+ """Keyness polarity: over-represented, under-represented, or neither in focus."""
@@ -0,0 +1,104 @@
1
+ """Tiny bundled corpora and fixtures for docs and tests.
2
+
3
+ The data lives inline as Python dicts so it is always importable in doctests
4
+ with no package-data or ``importlib.resources`` machinery.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from collections import Counter
10
+ from typing import TYPE_CHECKING
11
+
12
+ if TYPE_CHECKING:
13
+ from keyflux.ranking.rankedlist import RankedList
14
+
15
# The jkbren/rank-turbulence-divergence reference example. Both systems rank the
# same seven elements; rtd at alpha=1.0 is 0.45924793111057804.
# These two dicts back load_jkbren_example(), which wraps them as ranked lists
# labelled "system 1" and "system 2".
_JKBREN_FOCUS: dict[str, int] = {
    "a": 20,
    "e": 14,
    "c": 8,
    "b": 7,
    "f": 4,
    "g": 2,
    "d": 1,
}
_JKBREN_REFERENCE: dict[str, int] = {
    "b": 24,
    "a": 16,
    "e": 5,
    "d": 4,
    "c": 3,
    "f": 2,
    "g": 1,
}

# Focus side of the pair returned by load_demo_pair(): climate-themed content
# words plus the function words "the", "of", "and", "to".
_DEMO_FOCUS: dict[str, int] = {
    "climate": 42,
    "carbon": 28,
    "emissions": 19,
    "warming": 14,
    "policy": 11,
    "the": 320,
    "of": 180,
    "and": 165,
    "to": 150,
    "energy": 22,
    "renewable": 9,
    "global": 17,
}
# Reference side of the same pair: finance-themed content words. It shares the
# four function words and "policy", "energy", "global" with the focus dict.
_DEMO_REFERENCE: dict[str, int] = {
    "market": 40,
    "stock": 26,
    "trade": 21,
    "profit": 13,
    "policy": 12,
    "the": 318,
    "of": 176,
    "and": 170,
    "to": 148,
    "energy": 8,
    "shares": 15,
    "global": 16,
}
64
+
65
+
66
def load_demo_pair() -> tuple[Counter[str], Counter[str]]:
    """Build fresh Counters for the bundled demo corpora.

    The focus side is a tiny climate-discourse sample and the reference side a
    tiny finance-discourse sample; they share their function words and a few
    content words. Every call copies the module-level dicts into new Counters.

    Returns:
        A ``(focus, reference)`` pair of frequency Counters.

    Examples:
        >>> focus, reference = load_demo_pair()
        >>> focus["climate"], reference["market"]
        (42, 40)
    """
    focus_counts = Counter(_DEMO_FOCUS)
    reference_counts = Counter(_DEMO_REFERENCE)
    return focus_counts, reference_counts
81
+
82
+
83
def load_jkbren_example() -> tuple[RankedList, RankedList]:
    """Build the two ranked lists of the jkbren regression example.

    Both lists cover the same seven elements. At ``alpha=1.0`` their
    rank-turbulence divergence is ``0.45924793111057804``, the value used as
    the regression anchor against the reference implementation.

    Returns:
        ``(list1, list2)`` as :class:`keyflux.ranking.rankedlist.RankedList`,
        labelled ``"system 1"`` and ``"system 2"``.

    Examples:
        >>> from keyflux.divergence import rtd
        >>> r1, r2 = load_jkbren_example()
        >>> round(rtd(r1, r2, alpha=1.0).divergence, 6)
        0.459248
    """
    # Imported at call time; the module-level import is TYPE_CHECKING-only.
    from keyflux.ranking.rankedlist import RankedList

    system1 = RankedList.from_counts(_JKBREN_FOCUS, label="system 1")
    system2 = RankedList.from_counts(_JKBREN_REFERENCE, label="system 2")
    return system1, system2
@@ -0,0 +1,5 @@
1
+ """Divergence: rank-turbulence divergence between two ranked lists."""
2
+
3
+ from keyflux.divergence.rtd import Contribution, RTDResult, rtd
4
+
5
+ __all__ = ["rtd", "RTDResult", "Contribution"]
@@ -0,0 +1,213 @@
1
+ r"""Rank-turbulence divergence (Dodds et al. 2020, arXiv:2002.09770).
2
+
3
+ Rank-turbulence divergence (RTD) compares two ranked lists by how far each type
4
+ moves between them, with a tunable parameter ``alpha``. Small ``alpha`` surfaces
5
+ churn among rare, low-rank types; large ``alpha`` surfaces shifts among common,
6
+ high-rank types. The per-type divergence is
7
+
8
+ delta(tau; alpha) = | r1 ** -alpha - r2 ** -alpha | ** (1 / (alpha + 1))
9
+
10
+ and the total is normalised to [0, 1] by the divergence of the maximally
11
+ disjoint pair, so RTD(x, x) = 0 and disjoint lists approach 1. As ``alpha -> 0``
12
+ the per-type term degenerates to the logarithmic form ``| ln r1 - ln r2 |``,
13
+ which is used directly to avoid dividing by zero.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import math
19
+ from dataclasses import dataclass
20
+ from typing import TYPE_CHECKING, Literal
21
+
22
+ if TYPE_CHECKING:
23
+ from keyflux.ranking.rankedlist import RankedList
24
+
25
ShiftDirection = Literal["system1", "system2", "shared"]
"""Which system a type is more characteristic of (lower rank = more typical)."""

# rtd() treats any alpha below this tolerance as alpha == 0 and switches to the
# logarithmic limit; the general form multiplies by (alpha + 1) / alpha, which
# is undefined at zero.
_ALPHA_ZERO_TOL = 1e-10
29
+
30
+
31
+ @dataclass(frozen=True, slots=True)
32
+ class Contribution:
33
+ """One type's contribution to the total divergence.
34
+
35
+ Attributes:
36
+ type: The type.
37
+ delta: The raw per-type divergence term (before normalisation).
38
+ contribution: This type's additive share of ``divergence`` (the shares
39
+ sum to the total divergence).
40
+ rank1: The type's rank in the first list (tied-last if absent).
41
+ rank2: The type's rank in the second list (tied-last if absent).
42
+ direction: Which list the type leans toward (lower rank wins).
43
+ """
44
+
45
+ type: str
46
+ delta: float
47
+ contribution: float
48
+ rank1: float
49
+ rank2: float
50
+ direction: ShiftDirection
51
+
52
+
53
+ @dataclass(frozen=True, slots=True)
54
+ class RTDResult:
55
+ """The result of a rank-turbulence divergence computation.
56
+
57
+ Attributes:
58
+ divergence: The normalised divergence in [0, 1].
59
+ raw: The un-normalised weighted sum (matches the jkbren reference at
60
+ ``alpha=1.0`` when ``normalize=False``).
61
+ alpha: The tuning parameter used.
62
+ contributions: Per-type contributions, sorted by contribution descending.
63
+ labels: The two list labels.
64
+ """
65
+
66
+ divergence: float
67
+ raw: float
68
+ alpha: float
69
+ contributions: tuple[Contribution, ...]
70
+ labels: tuple[str, str]
71
+
72
+
73
+ def _direction(rank1: float, rank2: float) -> ShiftDirection:
74
+ if rank1 < rank2:
75
+ return "system1"
76
+ if rank2 < rank1:
77
+ return "system2"
78
+ return "shared"
79
+
80
+
81
+ def _elements_general(
82
+ ranks1: list[float],
83
+ ranks2: list[float],
84
+ n1: int,
85
+ n2: int,
86
+ alpha: float,
87
+ ) -> tuple[list[float], float]:
88
+ """Per-type terms and normalisation for alpha > 0."""
89
+ exp = 1.0 / (alpha + 1.0)
90
+ ref1 = (n1 + 0.5 * n2) ** (-alpha)
91
+ ref2 = (n2 + 0.5 * n1) ** (-alpha)
92
+ deltas: list[float] = []
93
+ norm = 0.0
94
+ for r1, r2 in zip(ranks1, ranks2, strict=True):
95
+ a1 = r1**-alpha
96
+ a2 = r2**-alpha
97
+ deltas.append(abs(a1 - a2) ** exp)
98
+ norm += abs(a1 - ref1) ** exp + abs(ref2 - a2) ** exp
99
+ return deltas, norm
100
+
101
+
102
+ def _elements_log_limit(
103
+ ranks1: list[float],
104
+ ranks2: list[float],
105
+ n1: int,
106
+ n2: int,
107
+ ) -> tuple[list[float], float]:
108
+ """Per-type terms and normalisation for the alpha -> 0 logarithmic limit."""
109
+ ref1 = math.log(n1 + 0.5 * n2)
110
+ ref2 = math.log(n2 + 0.5 * n1)
111
+ deltas: list[float] = []
112
+ norm = 0.0
113
+ for r1, r2 in zip(ranks1, ranks2, strict=True):
114
+ l1 = math.log(r1)
115
+ l2 = math.log(r2)
116
+ deltas.append(abs(l1 - l2))
117
+ norm += abs(l1 - ref1) + abs(ref2 - l2)
118
+ return deltas, norm
119
+
120
+
121
def rtd(
    list1: RankedList,
    list2: RankedList,
    *,
    alpha: float = 1.0 / 3.0,
    normalize: bool = True,
) -> RTDResult:
    """Compute the rank-turbulence divergence between two ranked lists.

    Args:
        list1: The first ranked list.
        list2: The second ranked list.
        alpha: Tuning parameter (finite, ``>= 0``). Small values surface rare,
            low-rank churn; large values surface common-word shifts. Values
            below ``_ALPHA_ZERO_TOL`` use the logarithmic limit. The default
            ``1/3`` is the Dodds et al. recommendation for text.
        normalize: If True, return ``divergence`` scaled to [0, 1]; the ``raw``
            field always holds the un-normalised sum.

    Returns:
        An :class:`RTDResult` with the scalar divergence and the per-type
        contributions sorted by contribution, largest first, each tagged with
        its shift direction.

    Raises:
        ValueError: If ``alpha`` is negative, NaN or infinite, or either list
            is empty.

    Contract:
        - ``rtd(x, x).divergence == 0`` (a list never diverges from itself).
        - ``0 <= divergence <= 1`` when ``normalize`` is True.
        - Symmetric: ``rtd(a, b).divergence == rtd(b, a).divergence``.
        - The per-type contributions sum to ``divergence``.
        - Exclusives (present in one list only) are placed at a tied-last rank.

    Examples:
        >>> from keyflux.ranking.rankedlist import RankedList
        >>> r1 = RankedList.from_counts({"a": 20, "e": 14, "c": 8, "b": 7,
        ...                              "f": 4, "g": 2, "d": 1})
        >>> r2 = RankedList.from_counts({"b": 24, "a": 16, "e": 5, "d": 4,
        ...                              "c": 3, "f": 2, "g": 1})
        >>> round(rtd(r1, r2, alpha=1.0).divergence, 6)
        0.459248
        >>> rtd(r1, r1, alpha=1.0).divergence
        0.0
    """
    # NaN slips past a plain ``alpha < 0`` comparison, and an infinite alpha
    # turns the (alpha + 1) / alpha prefactor into NaN. Either would silently
    # yield a NaN divergence, so both are rejected up front.
    if not math.isfinite(alpha):
        msg = f"alpha must be a finite number, got {alpha}."
        raise ValueError(msg)
    if alpha < 0:
        msg = f"alpha must be non-negative, got {alpha}."
        raise ValueError(msg)
    if not len(list1) or not len(list2):
        msg = "Both ranked lists must be non-empty."
        raise ValueError(msg)

    types, ranks1, ranks2 = list1.aligned(list2)
    n1 = len(list1)
    n2 = len(list2)

    if alpha < _ALPHA_ZERO_TOL:
        # Logarithmic limit: avoids dividing by a (near-)zero alpha.
        deltas, norm = _elements_log_limit(ranks1, ranks2, n1, n2)
        prefactor = 1.0
    else:
        deltas, norm = _elements_general(ranks1, ranks2, n1, n2, alpha)
        prefactor = (alpha + 1.0) / alpha

    raw = prefactor * math.fsum(deltas)
    normalizer = prefactor * norm

    # A normaliser that is not strictly positive cannot scale anything; the
    # normalised divergence is then defined as 0, and the per-type shares are
    # zeroed too so they still sum to the reported divergence.
    degenerate = normalize and not normalizer > 0
    if not normalize:
        divergence = raw
        share_denom = 1.0
    elif degenerate:
        divergence = 0.0
        share_denom = 1.0
    else:
        divergence = raw / normalizer
        share_denom = normalizer

    contributions = tuple(
        sorted(
            (
                Contribution(
                    type=t,
                    delta=d,
                    contribution=0.0 if degenerate else prefactor * d / share_denom,
                    rank1=r1,
                    rank2=r2,
                    direction=_direction(r1, r2),
                )
                for t, d, r1, r2 in zip(types, deltas, ranks1, ranks2, strict=True)
            ),
            key=lambda c: c.contribution,
            reverse=True,
        )
    )
    return RTDResult(
        divergence=divergence,
        raw=raw,
        alpha=alpha,
        contributions=contributions,
        labels=(list1.label, list2.label),
    )
keyflux/io/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ """I/O: build frequency Counters from tokens, text, or count files."""
2
+
3
+ from keyflux.io.corpus import counts_from_text, counts_from_tokens, load_counts
4
+
5
+ __all__ = ["counts_from_tokens", "counts_from_text", "load_counts"]
keyflux/io/corpus.py ADDED
@@ -0,0 +1,99 @@
1
+ """Build frequency Counters from tokens, text, or count files.
2
+
3
+ keyflux is about keyness and rank comparison, not tokenisation. These helpers
4
+ cover the simple cases; for real linguistic tokenisation, pre-tokenise (for
5
+ example with ``kenon.Tokenizer``) and pass the resulting Counter directly.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import re
11
+ from collections import Counter
12
+ from collections.abc import Iterable
13
+ from pathlib import Path
14
+
15
_WORD_RE = re.compile(r"\w+", re.UNICODE)


def counts_from_tokens(
    tokens: Iterable[str], *, lowercase: bool = True
) -> Counter[str]:
    """Build a frequency Counter from a token iterable.

    Args:
        tokens: Already-tokenised word types.
        lowercase: If True, lowercase each token before counting.

    Returns:
        A Counter mapping each type to its frequency.

    Examples:
        >>> counts_from_tokens(["The", "cat", "the", "CAT"])
        Counter({'the': 2, 'cat': 2})
    """
    if lowercase:
        return Counter(t.lower() for t in tokens)
    return Counter(tokens)


def counts_from_text(text: str, *, lowercase: bool = True) -> Counter[str]:
    """Tokenise a string on word characters, then count.

    Uses a simple word-character regular expression — adequate for demos and
    tests, not a substitute for linguistic tokenisation.

    Args:
        text: Raw text to tokenise and count.
        lowercase: If True, lowercase each token before counting. The flag
            only affects case, never where tokens begin and end.

    Returns:
        A Counter mapping each type to its frequency.

    Examples:
        >>> counts_from_text("The cat sat. The dog ran.")["the"]
        2
    """
    # Tokenise first, lowercase second. Lowercasing the whole text up front
    # can change token boundaries: "İ".lower() is "i" plus a combining dot
    # (U+0307), which ``\w`` does not match, so the word would be cut in two.
    # Delegating also keeps the casing rule identical to counts_from_tokens.
    return counts_from_tokens(_WORD_RE.findall(text), lowercase=lowercase)
58
+
59
+
60
def load_counts(path: str | Path) -> Counter[str]:
    """Read a count file into a Counter.

    Each non-empty line is either ``type<TAB>count`` or a bare ``type`` (counted
    as one occurrence per line). Repeated types accumulate. Whitespace around
    the type is ignored, columns after the count are ignored, and a leading
    UTF-8 byte-order mark is dropped.

    Args:
        path: Path to the count file (UTF-8, with or without a BOM).

    Returns:
        A Counter built from the file.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If a count field is present but not an integer.

    Examples:
        >>> import tempfile, pathlib
        >>> p = pathlib.Path(tempfile.mkdtemp()) / "counts.tsv"
        >>> rows = ["climate" + chr(9) + "30", "carbon" + chr(9) + "12"]
        >>> _ = p.write_text(chr(10).join(rows))
        >>> load_counts(p)
        Counter({'climate': 30, 'carbon': 12})
    """
    counts: Counter[str] = Counter()
    # "utf-8-sig" decodes plain UTF-8 unchanged but strips a leading BOM, which
    # would otherwise be glued onto the first type in the file.
    text = Path(path).read_text(encoding="utf-8-sig")
    for line in text.splitlines():
        line = line.strip()
        if not line:
            continue
        fields = line.split("\t")
        # Strip the type field so "carbon <TAB>12" and a bare "carbon" line
        # accumulate under the same key.
        word = fields[0].strip()
        if len(fields) == 1:
            counts[word] += 1
            continue
        raw = fields[1]
        try:
            amount = int(raw)
        except ValueError as exc:
            msg = f"Non-integer count {raw!r} for type {word!r}."
            raise ValueError(msg) from exc
        counts[word] += amount
    return counts
@@ -0,0 +1,43 @@
1
+ """Keyness: focus-vs-reference keywords, lockwords, and association measures."""
2
+
3
+ from keyflux.keyness.classify import (
4
+ Category,
5
+ classify_direction,
6
+ classify_row,
7
+ is_significant,
8
+ partition,
9
+ )
10
+ from keyflux.keyness.keyness import (
11
+ Keyness,
12
+ KeynessRow,
13
+ KeywordTable,
14
+ ReproRecord,
15
+ )
16
+ from keyflux.keyness.measures import (
17
+ chi_square,
18
+ expected_counts,
19
+ log_likelihood,
20
+ log_ratio,
21
+ percent_diff,
22
+ significance_band,
23
+ simple_maths,
24
+ )
25
+
26
+ __all__ = [
27
+ "Keyness",
28
+ "KeynessRow",
29
+ "KeywordTable",
30
+ "ReproRecord",
31
+ "Category",
32
+ "classify_direction",
33
+ "classify_row",
34
+ "is_significant",
35
+ "partition",
36
+ "chi_square",
37
+ "expected_counts",
38
+ "log_likelihood",
39
+ "log_ratio",
40
+ "percent_diff",
41
+ "significance_band",
42
+ "simple_maths",
43
+ ]