keyflux 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- keyflux/__init__.py +30 -0
- keyflux/_types.py +31 -0
- keyflux/datasets/__init__.py +104 -0
- keyflux/divergence/__init__.py +5 -0
- keyflux/divergence/rtd.py +213 -0
- keyflux/io/__init__.py +5 -0
- keyflux/io/corpus.py +99 -0
- keyflux/keyness/__init__.py +43 -0
- keyflux/keyness/classify.py +176 -0
- keyflux/keyness/keyness.py +374 -0
- keyflux/keyness/measures.py +342 -0
- keyflux/py.typed +0 -0
- keyflux/ranking/__init__.py +5 -0
- keyflux/ranking/rankedlist.py +214 -0
- keyflux/viz/__init__.py +5 -0
- keyflux/viz/allotaxonograph.py +159 -0
- keyflux-0.1.0.dist-info/METADATA +131 -0
- keyflux-0.1.0.dist-info/RECORD +19 -0
- keyflux-0.1.0.dist-info/WHEEL +4 -0
keyflux/__init__.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""Keyflux — keyness, rank-turbulence divergence, and allotaxonographs."""
|
|
2
|
+
|
|
3
|
+
__version__ = "0.1.0"
|
|
4
|
+
|
|
5
|
+
from keyflux.divergence import Contribution, RTDResult, rtd
|
|
6
|
+
from keyflux.io.corpus import counts_from_text, counts_from_tokens, load_counts
|
|
7
|
+
from keyflux.keyness import (
|
|
8
|
+
Keyness,
|
|
9
|
+
KeynessRow,
|
|
10
|
+
KeywordTable,
|
|
11
|
+
ReproRecord,
|
|
12
|
+
)
|
|
13
|
+
from keyflux.ranking import RankedList
|
|
14
|
+
from keyflux.viz import allotaxonograph
|
|
15
|
+
|
|
16
|
+
__all__ = [
|
|
17
|
+
"Keyness",
|
|
18
|
+
"KeynessRow",
|
|
19
|
+
"KeywordTable",
|
|
20
|
+
"ReproRecord",
|
|
21
|
+
"RankedList",
|
|
22
|
+
"rtd",
|
|
23
|
+
"RTDResult",
|
|
24
|
+
"Contribution",
|
|
25
|
+
"allotaxonograph",
|
|
26
|
+
"counts_from_tokens",
|
|
27
|
+
"counts_from_text",
|
|
28
|
+
"load_counts",
|
|
29
|
+
"__version__",
|
|
30
|
+
]
|
keyflux/_types.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""Shared type aliases for the keyflux package."""
|
|
2
|
+
|
|
3
|
+
from collections import Counter
|
|
4
|
+
from typing import Literal, TypeAlias
|
|
5
|
+
|
|
6
|
+
Token: TypeAlias = str
|
|
7
|
+
"""A surface or lemmatised word type."""
|
|
8
|
+
|
|
9
|
+
Count: TypeAlias = int
|
|
10
|
+
"""A raw frequency count for a single type."""
|
|
11
|
+
|
|
12
|
+
FreqTable: TypeAlias = Counter[str]
|
|
13
|
+
"""Type-to-count mapping for one corpus (focus or reference)."""
|
|
14
|
+
|
|
15
|
+
Rank: TypeAlias = float
|
|
16
|
+
"""A 1-based rank; float because tied and tied-last ranks are averaged."""
|
|
17
|
+
|
|
18
|
+
MeasureName: TypeAlias = Literal[
|
|
19
|
+
"log_likelihood",
|
|
20
|
+
"log_ratio",
|
|
21
|
+
"simple_maths",
|
|
22
|
+
"percent_diff",
|
|
23
|
+
"chi_square",
|
|
24
|
+
]
|
|
25
|
+
"""Identifier selecting a keyness scoring function."""
|
|
26
|
+
|
|
27
|
+
Significance: TypeAlias = Literal["ns", "p05", "p01", "p001", "p0001"]
|
|
28
|
+
"""Significance band from a log-likelihood / chi-square statistic (1 d.f.)."""
|
|
29
|
+
|
|
30
|
+
Direction: TypeAlias = Literal["positive", "negative", "neutral"]
|
|
31
|
+
"""Keyness polarity: over-represented, under-represented, or neither in focus."""
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
"""Tiny bundled corpora and fixtures for docs and tests.
|
|
2
|
+
|
|
3
|
+
The data lives inline as Python dicts so it is always importable in doctests
|
|
4
|
+
with no package-data or ``importlib.resources`` machinery.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from collections import Counter
|
|
10
|
+
from typing import TYPE_CHECKING
|
|
11
|
+
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
from keyflux.ranking.rankedlist import RankedList
|
|
14
|
+
|
|
15
|
+
# The jkbren/rank-turbulence-divergence reference example. Both systems rank the
|
|
16
|
+
# same seven elements; rtd at alpha=1.0 is 0.45924793111057804.
|
|
17
|
+
_JKBREN_FOCUS: dict[str, int] = {
|
|
18
|
+
"a": 20,
|
|
19
|
+
"e": 14,
|
|
20
|
+
"c": 8,
|
|
21
|
+
"b": 7,
|
|
22
|
+
"f": 4,
|
|
23
|
+
"g": 2,
|
|
24
|
+
"d": 1,
|
|
25
|
+
}
|
|
26
|
+
_JKBREN_REFERENCE: dict[str, int] = {
|
|
27
|
+
"b": 24,
|
|
28
|
+
"a": 16,
|
|
29
|
+
"e": 5,
|
|
30
|
+
"d": 4,
|
|
31
|
+
"c": 3,
|
|
32
|
+
"f": 2,
|
|
33
|
+
"g": 1,
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
_DEMO_FOCUS: dict[str, int] = {
|
|
37
|
+
"climate": 42,
|
|
38
|
+
"carbon": 28,
|
|
39
|
+
"emissions": 19,
|
|
40
|
+
"warming": 14,
|
|
41
|
+
"policy": 11,
|
|
42
|
+
"the": 320,
|
|
43
|
+
"of": 180,
|
|
44
|
+
"and": 165,
|
|
45
|
+
"to": 150,
|
|
46
|
+
"energy": 22,
|
|
47
|
+
"renewable": 9,
|
|
48
|
+
"global": 17,
|
|
49
|
+
}
|
|
50
|
+
_DEMO_REFERENCE: dict[str, int] = {
|
|
51
|
+
"market": 40,
|
|
52
|
+
"stock": 26,
|
|
53
|
+
"trade": 21,
|
|
54
|
+
"profit": 13,
|
|
55
|
+
"policy": 12,
|
|
56
|
+
"the": 318,
|
|
57
|
+
"of": 176,
|
|
58
|
+
"and": 170,
|
|
59
|
+
"to": 148,
|
|
60
|
+
"energy": 8,
|
|
61
|
+
"shares": 15,
|
|
62
|
+
"global": 16,
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def load_demo_pair() -> tuple[Counter[str], Counter[str]]:
    """Build the bundled demo corpora as a ``(focus, reference)`` pair.

    The focus corpus is a small sample of climate discourse and the reference
    corpus a small sample of finance discourse. They share their function
    words and a couple of lockword-like overlaps. Each call returns freshly
    built Counters, so callers may mutate them freely.

    Returns:
        ``(focus, reference)`` frequency Counters.

    Examples:
        >>> focus, reference = load_demo_pair()
        >>> focus["climate"], reference["market"]
        (42, 40)
    """
    focus = Counter(_DEMO_FOCUS)
    reference = Counter(_DEMO_REFERENCE)
    return focus, reference
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def load_jkbren_example() -> tuple[RankedList, RankedList]:
    """Build the two ranked lists of the jkbren regression example.

    Both lists cover the same seven elements. At ``alpha=1.0`` their
    rank-turbulence divergence is ``0.45924793111057804``, the anchor value
    taken from the reference implementation.

    Returns:
        ``(list1, list2)`` as :class:`keyflux.ranking.rankedlist.RankedList`,
        labelled ``"system 1"`` and ``"system 2"``.

    Examples:
        >>> from keyflux.divergence import rtd
        >>> r1, r2 = load_jkbren_example()
        >>> round(rtd(r1, r2, alpha=1.0).divergence, 6)
        0.459248
    """
    # Imported at call time: at module level the name is only imported for
    # type checking.
    from keyflux.ranking.rankedlist import RankedList

    system1 = RankedList.from_counts(_JKBREN_FOCUS, label="system 1")
    system2 = RankedList.from_counts(_JKBREN_REFERENCE, label="system 2")
    return system1, system2
|
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
r"""Rank-turbulence divergence (Dodds et al. 2020, arXiv:2002.09770).
|
|
2
|
+
|
|
3
|
+
Rank-turbulence divergence (RTD) compares two ranked lists by how far each type
|
|
4
|
+
moves between them, with a tunable parameter ``alpha``. Small ``alpha`` surfaces
|
|
5
|
+
churn among rare, low-rank types; large ``alpha`` surfaces shifts among common,
|
|
6
|
+
high-rank types. The per-type divergence is
|
|
7
|
+
|
|
8
|
+
delta(tau; alpha) = | r1 ** -alpha - r2 ** -alpha | ** (1 / (alpha + 1))
|
|
9
|
+
|
|
10
|
+
and the total is normalised to [0, 1] by the divergence of the maximally
|
|
11
|
+
disjoint pair, so RTD(x, x) = 0 and disjoint lists approach 1. As ``alpha -> 0``
|
|
12
|
+
the per-type term degenerates to the logarithmic form ``| ln r1 - ln r2 |``,
|
|
13
|
+
which is used directly to avoid dividing by zero.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import math
|
|
19
|
+
from dataclasses import dataclass
|
|
20
|
+
from typing import TYPE_CHECKING, Literal
|
|
21
|
+
|
|
22
|
+
if TYPE_CHECKING:
|
|
23
|
+
from keyflux.ranking.rankedlist import RankedList
|
|
24
|
+
|
|
25
|
+
ShiftDirection = Literal["system1", "system2", "shared"]
|
|
26
|
+
"""Which system a type is more characteristic of (lower rank = more typical)."""
|
|
27
|
+
|
|
28
|
+
_ALPHA_ZERO_TOL = 1e-10
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@dataclass(frozen=True, slots=True)
|
|
32
|
+
class Contribution:
|
|
33
|
+
"""One type's contribution to the total divergence.
|
|
34
|
+
|
|
35
|
+
Attributes:
|
|
36
|
+
type: The type.
|
|
37
|
+
delta: The raw per-type divergence term (before normalisation).
|
|
38
|
+
contribution: This type's additive share of ``divergence`` (the shares
|
|
39
|
+
sum to the total divergence).
|
|
40
|
+
rank1: The type's rank in the first list (tied-last if absent).
|
|
41
|
+
rank2: The type's rank in the second list (tied-last if absent).
|
|
42
|
+
direction: Which list the type leans toward (lower rank wins).
|
|
43
|
+
"""
|
|
44
|
+
|
|
45
|
+
type: str
|
|
46
|
+
delta: float
|
|
47
|
+
contribution: float
|
|
48
|
+
rank1: float
|
|
49
|
+
rank2: float
|
|
50
|
+
direction: ShiftDirection
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@dataclass(frozen=True, slots=True)
|
|
54
|
+
class RTDResult:
|
|
55
|
+
"""The result of a rank-turbulence divergence computation.
|
|
56
|
+
|
|
57
|
+
Attributes:
|
|
58
|
+
divergence: The normalised divergence in [0, 1].
|
|
59
|
+
raw: The un-normalised weighted sum (matches the jkbren reference at
|
|
60
|
+
``alpha=1.0`` when ``normalize=False``).
|
|
61
|
+
alpha: The tuning parameter used.
|
|
62
|
+
contributions: Per-type contributions, sorted by contribution descending.
|
|
63
|
+
labels: The two list labels.
|
|
64
|
+
"""
|
|
65
|
+
|
|
66
|
+
divergence: float
|
|
67
|
+
raw: float
|
|
68
|
+
alpha: float
|
|
69
|
+
contributions: tuple[Contribution, ...]
|
|
70
|
+
labels: tuple[str, str]
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _direction(rank1: float, rank2: float) -> ShiftDirection:
|
|
74
|
+
if rank1 < rank2:
|
|
75
|
+
return "system1"
|
|
76
|
+
if rank2 < rank1:
|
|
77
|
+
return "system2"
|
|
78
|
+
return "shared"
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _elements_general(
|
|
82
|
+
ranks1: list[float],
|
|
83
|
+
ranks2: list[float],
|
|
84
|
+
n1: int,
|
|
85
|
+
n2: int,
|
|
86
|
+
alpha: float,
|
|
87
|
+
) -> tuple[list[float], float]:
|
|
88
|
+
"""Per-type terms and normalisation for alpha > 0."""
|
|
89
|
+
exp = 1.0 / (alpha + 1.0)
|
|
90
|
+
ref1 = (n1 + 0.5 * n2) ** (-alpha)
|
|
91
|
+
ref2 = (n2 + 0.5 * n1) ** (-alpha)
|
|
92
|
+
deltas: list[float] = []
|
|
93
|
+
norm = 0.0
|
|
94
|
+
for r1, r2 in zip(ranks1, ranks2, strict=True):
|
|
95
|
+
a1 = r1**-alpha
|
|
96
|
+
a2 = r2**-alpha
|
|
97
|
+
deltas.append(abs(a1 - a2) ** exp)
|
|
98
|
+
norm += abs(a1 - ref1) ** exp + abs(ref2 - a2) ** exp
|
|
99
|
+
return deltas, norm
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def _elements_log_limit(
|
|
103
|
+
ranks1: list[float],
|
|
104
|
+
ranks2: list[float],
|
|
105
|
+
n1: int,
|
|
106
|
+
n2: int,
|
|
107
|
+
) -> tuple[list[float], float]:
|
|
108
|
+
"""Per-type terms and normalisation for the alpha -> 0 logarithmic limit."""
|
|
109
|
+
ref1 = math.log(n1 + 0.5 * n2)
|
|
110
|
+
ref2 = math.log(n2 + 0.5 * n1)
|
|
111
|
+
deltas: list[float] = []
|
|
112
|
+
norm = 0.0
|
|
113
|
+
for r1, r2 in zip(ranks1, ranks2, strict=True):
|
|
114
|
+
l1 = math.log(r1)
|
|
115
|
+
l2 = math.log(r2)
|
|
116
|
+
deltas.append(abs(l1 - l2))
|
|
117
|
+
norm += abs(l1 - ref1) + abs(ref2 - l2)
|
|
118
|
+
return deltas, norm
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def rtd(
    list1: RankedList,
    list2: RankedList,
    *,
    alpha: float = 1.0 / 3.0,
    normalize: bool = True,
) -> RTDResult:
    """Compute the rank-turbulence divergence between two ranked lists.

    Args:
        list1: The first ranked list.
        list2: The second ranked list.
        alpha: Tuning parameter (finite and ``>= 0``). Small values surface
            rare, low-rank churn; large values surface common-word shifts.
            Values below ``1e-10`` (including ``0``) use the logarithmic
            limit. The default ``1/3`` is the Dodds et al. recommendation for
            text.
        normalize: If True, return ``divergence`` scaled to [0, 1]; if False,
            ``divergence`` equals ``raw``. The ``raw`` field always holds the
            un-normalised sum.

    Returns:
        An :class:`RTDResult` with the scalar divergence and the sorted per-type
        contributions, each tagged with its shift direction.

    Raises:
        ValueError: If ``alpha`` is negative or not finite (NaN or infinity),
            or either list is empty.

    Contract:
        - ``rtd(x, x).divergence == 0`` (a list never diverges from itself).
        - ``0 <= divergence <= 1`` for every input and every valid ``alpha``
          when ``normalize=True``.
        - Symmetric: ``rtd(a, b).divergence == rtd(b, a).divergence``.
        - The per-type contributions sum to ``divergence``.
        - Exclusives (present in one list only) are placed at a tied-last rank.

    Examples:
        >>> from keyflux.ranking.rankedlist import RankedList
        >>> r1 = RankedList.from_counts({"a": 20, "e": 14, "c": 8, "b": 7,
        ...                              "f": 4, "g": 2, "d": 1})
        >>> r2 = RankedList.from_counts({"b": 24, "a": 16, "e": 5, "d": 4,
        ...                              "c": 3, "f": 2, "g": 1})
        >>> round(rtd(r1, r2, alpha=1.0).divergence, 6)
        0.459248
        >>> rtd(r1, r1, alpha=1.0).divergence
        0.0
    """
    if alpha < 0:
        msg = f"alpha must be non-negative, got {alpha}."
        raise ValueError(msg)
    # NaN compares False against 0 and +inf makes the prefactor inf/inf, so
    # without this guard both would slip through and yield a NaN divergence.
    if not math.isfinite(alpha):
        msg = f"alpha must be finite, got {alpha}."
        raise ValueError(msg)
    if not len(list1) or not len(list2):
        msg = "Both ranked lists must be non-empty."
        raise ValueError(msg)

    types, ranks1, ranks2 = list1.aligned(list2)
    n1 = len(list1)
    n2 = len(list2)

    if alpha < _ALPHA_ZERO_TOL:
        # (alpha + 1) / alpha blows up as alpha -> 0; the logarithmic limit
        # carries no prefactor.
        deltas, norm = _elements_log_limit(ranks1, ranks2, n1, n2)
        prefactor = 1.0
    else:
        deltas, norm = _elements_general(ranks1, ranks2, n1, n2, alpha)
        prefactor = (alpha + 1.0) / alpha

    raw = prefactor * math.fsum(deltas)
    normalizer = prefactor * norm

    if normalize and normalizer > 0:
        divergence = raw / normalizer
        share_denom = normalizer
    elif normalize and normalizer <= 0:
        # Degenerate normaliser: report zero divergence, and divide the shares
        # by infinity so they are all 0.0 and still sum to ``divergence``.
        divergence = 0.0
        share_denom = math.inf
    else:
        divergence = raw
        share_denom = 1.0

    rows = (
        Contribution(
            type=t,
            delta=d,
            contribution=prefactor * d / share_denom,
            rank1=r1,
            rank2=r2,
            direction=_direction(r1, r2),
        )
        for t, d, r1, r2 in zip(types, deltas, ranks1, ranks2, strict=True)
    )
    contributions = tuple(sorted(rows, key=lambda c: c.contribution, reverse=True))

    return RTDResult(
        divergence=divergence,
        raw=raw,
        alpha=alpha,
        contributions=contributions,
        labels=(list1.label, list2.label),
    )
|
keyflux/io/__init__.py
ADDED
keyflux/io/corpus.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
"""Build frequency Counters from tokens, text, or count files.
|
|
2
|
+
|
|
3
|
+
keyflux is about keyness and rank comparison, not tokenisation. These helpers
|
|
4
|
+
cover the simple cases; for real linguistic tokenisation, pre-tokenise (for
|
|
5
|
+
example with ``kenon.Tokenizer``) and pass the resulting Counter directly.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import re
|
|
11
|
+
from collections import Counter
|
|
12
|
+
from collections.abc import Iterable
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
|
|
15
|
+
_WORD_RE = re.compile(r"\w+", re.UNICODE)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def counts_from_tokens(
    tokens: Iterable[str], *, lowercase: bool = True
) -> Counter[str]:
    """Count how often each type occurs in an iterable of tokens.

    Args:
        tokens: Already-tokenised word types; consumed once.
        lowercase: If True, each token is lowercased before it is counted.

    Returns:
        A Counter mapping each type to its frequency.

    Examples:
        >>> counts_from_tokens(["The", "cat", "the", "CAT"])
        Counter({'the': 2, 'cat': 2})
    """
    stream = (token.lower() for token in tokens) if lowercase else tokens
    return Counter(stream)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def counts_from_text(text: str, *, lowercase: bool = True) -> Counter[str]:
    """Tokenise a string on word characters, then count.

    Uses a simple word-character regular expression — adequate for demos and
    tests, not a substitute for linguistic tokenisation.

    Args:
        text: Raw text to tokenise and count.
        lowercase: If True, lowercase each token before counting.

    Returns:
        A Counter mapping each type to its frequency.

    Examples:
        >>> counts_from_text("The cat sat. The dog ran.")["the"]
        2
    """
    # Tokenise first and lowercase afterwards. Lowercasing the raw text can
    # introduce non-word characters (``"İ".lower()`` is ``"i"`` followed by
    # the combining mark U+0307), which would make the regex cut one word in
    # two.
    tokens = _WORD_RE.findall(text)
    if lowercase:
        return Counter(token.lower() for token in tokens)
    return Counter(tokens)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def load_counts(path: str | Path) -> Counter[str]:
    """Load a count file from disk into a Counter.

    The file is read as UTF-8. Lines that are empty after stripping
    surrounding whitespace are ignored. Every other line is either
    ``type<TAB>count``, which adds ``count`` to that type, or a bare ``type``,
    which adds one occurrence per line. Fields after the count are ignored.

    Args:
        path: Path to the count file.

    Returns:
        A Counter built from the file.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If a count field is present but not an integer.

    Examples:
        >>> import tempfile, pathlib
        >>> p = pathlib.Path(tempfile.mkdtemp()) / "counts.tsv"
        >>> rows = ["climate" + chr(9) + "30", "carbon" + chr(9) + "12"]
        >>> _ = p.write_text(chr(10).join(rows))
        >>> load_counts(p)
        Counter({'climate': 30, 'carbon': 12})
    """
    tally: Counter[str] = Counter()
    content = Path(path).read_text(encoding="utf-8")
    for raw_line in content.splitlines():
        row = raw_line.strip()
        if not row:
            continue
        word, tab, rest = row.partition("\t")
        if not tab:
            # No tab at all: a bare type, worth one occurrence.
            tally[word] += 1
            continue
        # Only the field right after the type is the count.
        raw = rest.partition("\t")[0]
        try:
            amount = int(raw)
        except ValueError as exc:
            msg = f"Non-integer count {raw!r} for type {word!r}."
            raise ValueError(msg) from exc
        tally[word] += amount
    return tally
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""Keyness: focus-vs-reference keywords, lockwords, and association measures."""
|
|
2
|
+
|
|
3
|
+
from keyflux.keyness.classify import (
|
|
4
|
+
Category,
|
|
5
|
+
classify_direction,
|
|
6
|
+
classify_row,
|
|
7
|
+
is_significant,
|
|
8
|
+
partition,
|
|
9
|
+
)
|
|
10
|
+
from keyflux.keyness.keyness import (
|
|
11
|
+
Keyness,
|
|
12
|
+
KeynessRow,
|
|
13
|
+
KeywordTable,
|
|
14
|
+
ReproRecord,
|
|
15
|
+
)
|
|
16
|
+
from keyflux.keyness.measures import (
|
|
17
|
+
chi_square,
|
|
18
|
+
expected_counts,
|
|
19
|
+
log_likelihood,
|
|
20
|
+
log_ratio,
|
|
21
|
+
percent_diff,
|
|
22
|
+
significance_band,
|
|
23
|
+
simple_maths,
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
__all__ = [
|
|
27
|
+
"Keyness",
|
|
28
|
+
"KeynessRow",
|
|
29
|
+
"KeywordTable",
|
|
30
|
+
"ReproRecord",
|
|
31
|
+
"Category",
|
|
32
|
+
"classify_direction",
|
|
33
|
+
"classify_row",
|
|
34
|
+
"is_significant",
|
|
35
|
+
"partition",
|
|
36
|
+
"chi_square",
|
|
37
|
+
"expected_counts",
|
|
38
|
+
"log_likelihood",
|
|
39
|
+
"log_ratio",
|
|
40
|
+
"percent_diff",
|
|
41
|
+
"significance_band",
|
|
42
|
+
"simple_maths",
|
|
43
|
+
]
|