pycorpdiff 0.1.0a0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pycorpdiff/__init__.py +126 -0
- pycorpdiff/_backends/__init__.py +3 -0
- pycorpdiff/_backends/pandas.py +3 -0
- pycorpdiff/_backends/polars.py +3 -0
- pycorpdiff/collocation/__init__.py +19 -0
- pycorpdiff/collocation/cooccurrence.py +65 -0
- pycorpdiff/collocation/measures.py +102 -0
- pycorpdiff/collocation/network.py +233 -0
- pycorpdiff/collocation/shift.py +146 -0
- pycorpdiff/compare.py +345 -0
- pycorpdiff/corpus.py +411 -0
- pycorpdiff/datasets/__init__.py +27 -0
- pycorpdiff/datasets/_data/hansard_sample.parquet +0 -0
- pycorpdiff/datasets/_generate_hansard.py +221 -0
- pycorpdiff/datasets/hansard.py +235 -0
- pycorpdiff/datasets/histwords.py +221 -0
- pycorpdiff/explain.py +177 -0
- pycorpdiff/io/__init__.py +16 -0
- pycorpdiff/io/duckdb.py +92 -0
- pycorpdiff/io/huggingface.py +142 -0
- pycorpdiff/io/readers.py +138 -0
- pycorpdiff/keyness/__init__.py +26 -0
- pycorpdiff/keyness/bayes.py +50 -0
- pycorpdiff/keyness/chi_squared.py +94 -0
- pycorpdiff/keyness/correction.py +34 -0
- pycorpdiff/keyness/dispersion.py +89 -0
- pycorpdiff/keyness/effect_sizes.py +65 -0
- pycorpdiff/keyness/loglikelihood.py +92 -0
- pycorpdiff/keyness/multicorpus.py +143 -0
- pycorpdiff/keyness/permutation.py +154 -0
- pycorpdiff/py.typed +0 -0
- pycorpdiff/results.py +635 -0
- pycorpdiff/semantic/__init__.py +18 -0
- pycorpdiff/semantic/alignment.py +53 -0
- pycorpdiff/semantic/embed.py +84 -0
- pycorpdiff/semantic/shift.py +224 -0
- pycorpdiff/semantic/trajectory.py +166 -0
- pycorpdiff/stats.py +69 -0
- pycorpdiff/temporal/__init__.py +15 -0
- pycorpdiff/temporal/bocpd.py +233 -0
- pycorpdiff/temporal/causal_impact.py +293 -0
- pycorpdiff/temporal/changepoint.py +92 -0
- pycorpdiff/temporal/forecast.py +405 -0
- pycorpdiff/temporal/its.py +123 -0
- pycorpdiff/temporal/slicing.py +174 -0
- pycorpdiff/tokenize.py +110 -0
- pycorpdiff/viz/__init__.py +37 -0
- pycorpdiff/viz/bocpd.py +173 -0
- pycorpdiff/viz/causal_impact.py +142 -0
- pycorpdiff/viz/collocation.py +48 -0
- pycorpdiff/viz/dispersion.py +117 -0
- pycorpdiff/viz/forecast.py +129 -0
- pycorpdiff/viz/keyness.py +96 -0
- pycorpdiff/viz/network.py +186 -0
- pycorpdiff/viz/scattertext.py +160 -0
- pycorpdiff/viz/semantic_forecast.py +114 -0
- pycorpdiff/viz/trajectory.py +48 -0
- pycorpdiff-0.1.0a0.dist-info/METADATA +230 -0
- pycorpdiff-0.1.0a0.dist-info/RECORD +61 -0
- pycorpdiff-0.1.0a0.dist-info/WHEEL +4 -0
- pycorpdiff-0.1.0a0.dist-info/licenses/LICENSE +21 -0
pycorpdiff/__init__.py
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
"""pycorpdiff — comparative corpus analysis for modern Python workflows.
|
|
2
|
+
|
|
3
|
+
The package exposes three public verbs (:func:`compare`, :func:`track`,
|
|
4
|
+
plus the :class:`Corpus` constructor and the I/O ``read_*`` helpers) and
|
|
5
|
+
four families of result objects (:class:`KeynessResult`,
|
|
6
|
+
:class:`CollocationShiftResult`, :class:`SemanticShiftResult`,
|
|
7
|
+
:class:`TemporalTrajectory`).
|
|
8
|
+
|
|
9
|
+
Layer-1 ingestion utilities are functional in this scaffolding release;
|
|
10
|
+
Layer-2 analytical methods raise :class:`NotImplementedError` until Phase 1
|
|
11
|
+
of the roadmap lands.
|
|
12
|
+
|
|
13
|
+
Example
|
|
14
|
+
-------
|
|
15
|
+
|
|
16
|
+
>>> import pycorpdiff as pcd
|
|
17
|
+
>>> pcd.__version__
|
|
18
|
+
'0.1.0a0'
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
__version__ = "0.1.0a0"
|
|
24
|
+
|
|
25
|
+
from .collocation.network import NetworkResult, cooccurrence_network
|
|
26
|
+
from .compare import Comparison, compare
|
|
27
|
+
from .corpus import Corpus, CorpusSlice
|
|
28
|
+
from .datasets import (
|
|
29
|
+
fetch_hansard,
|
|
30
|
+
fetch_histwords_decade,
|
|
31
|
+
histwords_cosine_shift,
|
|
32
|
+
load_hansard_sample,
|
|
33
|
+
)
|
|
34
|
+
from .explain import kwic, representative_docs
|
|
35
|
+
from .io.duckdb import read_duckdb
|
|
36
|
+
from .io.huggingface import from_huggingface
|
|
37
|
+
from .io.readers import from_dataframe, read_csv, read_parquet, read_txt
|
|
38
|
+
from .keyness.multicorpus import keyness_multi
|
|
39
|
+
from .results import (
|
|
40
|
+
CollocationShiftResult,
|
|
41
|
+
ConcordanceResult,
|
|
42
|
+
KeynessResult,
|
|
43
|
+
SemanticShiftResult,
|
|
44
|
+
TemporalTrajectory,
|
|
45
|
+
)
|
|
46
|
+
from .semantic.embed import Embedder, HashEmbedder, SBERTEmbedder
|
|
47
|
+
from .semantic.shift import neighborhood_drift
|
|
48
|
+
from .semantic.trajectory import semantic_trajectory
|
|
49
|
+
from .temporal.bocpd import BocpdResult, bocpd
|
|
50
|
+
from .temporal.causal_impact import CausalImpactResult, causal_impact
|
|
51
|
+
from .temporal.forecast import (
|
|
52
|
+
ForecastResult,
|
|
53
|
+
forecast_semantic_drift,
|
|
54
|
+
forecast_trajectory,
|
|
55
|
+
)
|
|
56
|
+
from .temporal.slicing import TemporalCorpus, track
|
|
57
|
+
from .tokenize import NgramTokenizer, RegexTokenizer, Tokenizer
|
|
58
|
+
|
|
59
|
+
# Convenience aliases for the standalone (non-`.plot()`-delegated)
|
|
60
|
+
# visualisation functions. The full plot family also lives at
|
|
61
|
+
# ``pycorpdiff.viz.*`` — these are surfaced at the root so common
|
|
62
|
+
# patterns like ``pcd.dispersion_plot(corpus, term)`` work without
|
|
63
|
+
# requiring a separate import path. Plots that are only meaningful
|
|
64
|
+
# as ``Result.plot()`` (keyness_volcano, keyness_top_n_bar,
|
|
65
|
+
# collocation_diverging_bar, trajectory_with_ci) stay in ``viz`` only.
|
|
66
|
+
from .viz import (
|
|
67
|
+
bocpd_plot,
|
|
68
|
+
causal_impact_plot,
|
|
69
|
+
dispersion_plot,
|
|
70
|
+
forecast_plot,
|
|
71
|
+
network_plot,
|
|
72
|
+
scattertext_plot,
|
|
73
|
+
semantic_forecast_plot,
|
|
74
|
+
)
|
|
75
|
+
|
|
76
|
+
__all__ = [
|
|
77
|
+
"BocpdResult",
|
|
78
|
+
"CausalImpactResult",
|
|
79
|
+
"CollocationShiftResult",
|
|
80
|
+
"Comparison",
|
|
81
|
+
"ConcordanceResult",
|
|
82
|
+
"Corpus",
|
|
83
|
+
"CorpusSlice",
|
|
84
|
+
"Embedder",
|
|
85
|
+
"ForecastResult",
|
|
86
|
+
"HashEmbedder",
|
|
87
|
+
"KeynessResult",
|
|
88
|
+
"NetworkResult",
|
|
89
|
+
"NgramTokenizer",
|
|
90
|
+
"RegexTokenizer",
|
|
91
|
+
"SBERTEmbedder",
|
|
92
|
+
"SemanticShiftResult",
|
|
93
|
+
"TemporalCorpus",
|
|
94
|
+
"TemporalTrajectory",
|
|
95
|
+
"Tokenizer",
|
|
96
|
+
"__version__",
|
|
97
|
+
"bocpd",
|
|
98
|
+
"bocpd_plot",
|
|
99
|
+
"causal_impact",
|
|
100
|
+
"causal_impact_plot",
|
|
101
|
+
"compare",
|
|
102
|
+
"cooccurrence_network",
|
|
103
|
+
"dispersion_plot",
|
|
104
|
+
"fetch_hansard",
|
|
105
|
+
"fetch_histwords_decade",
|
|
106
|
+
"forecast_plot",
|
|
107
|
+
"forecast_semantic_drift",
|
|
108
|
+
"forecast_trajectory",
|
|
109
|
+
"from_dataframe",
|
|
110
|
+
"from_huggingface",
|
|
111
|
+
"histwords_cosine_shift",
|
|
112
|
+
"keyness_multi",
|
|
113
|
+
"kwic",
|
|
114
|
+
"load_hansard_sample",
|
|
115
|
+
"neighborhood_drift",
|
|
116
|
+
"network_plot",
|
|
117
|
+
"read_csv",
|
|
118
|
+
"read_duckdb",
|
|
119
|
+
"read_parquet",
|
|
120
|
+
"read_txt",
|
|
121
|
+
"representative_docs",
|
|
122
|
+
"scattertext_plot",
|
|
123
|
+
"semantic_forecast_plot",
|
|
124
|
+
"semantic_trajectory",
|
|
125
|
+
"track",
|
|
126
|
+
]
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""Collocation measures and collocation-shift analysis."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from .cooccurrence import collocate_counts
|
|
6
|
+
from .measures import logdice, mi_three, pmi, t_score
|
|
7
|
+
from .network import NetworkResult, cooccurrence_network
|
|
8
|
+
from .shift import collocation_shift
|
|
9
|
+
|
|
10
|
+
__all__ = [
|
|
11
|
+
"NetworkResult",
|
|
12
|
+
"collocate_counts",
|
|
13
|
+
"collocation_shift",
|
|
14
|
+
"cooccurrence_network",
|
|
15
|
+
"logdice",
|
|
16
|
+
"mi_three",
|
|
17
|
+
"pmi",
|
|
18
|
+
"t_score",
|
|
19
|
+
]
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""Window-based co-occurrence extraction.
|
|
2
|
+
|
|
3
|
+
Given a tokenized corpus and a target term, walks each document and
|
|
4
|
+
accumulates the count of every other token that appears within ``window``
|
|
5
|
+
positions of the target. Windows never cross document boundaries — each
|
|
6
|
+
document's contexts are isolated, which matches the SketchEngine / NLTK
|
|
7
|
+
convention and avoids spurious cross-doc associations.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from collections import Counter
|
|
13
|
+
from collections.abc import Sequence
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def collocate_counts(
    docs_tokens: Sequence[Sequence[str]],
    target: str,
    window: int = 5,
) -> tuple[Counter[str], int]:
    """Return ``(collocate_counter, target_occurrences)`` for ``target``.

    Each occurrence of ``target`` contributes to up to ``2 * window``
    collocate counts (fewer near document boundaries). Only the pivot
    position itself is skipped: if the target appears again inside its
    own window, that other occurrence is counted like any other token.

    Parameters
    ----------
    docs_tokens
        One sequence of tokens per document, in original order. Each
        document must support slicing (lists and tuples do).
    target
        The pivot term whose context we are accumulating.
    window
        Tokens on each side of the target. ``window=5`` gives a
        ten-token context (five left, five right).

    Returns
    -------
    counter
        Mapping ``collocate -> joint count``.
    target_occurrences
        Number of times the target appears across all documents.

    Raises
    ------
    ValueError
        If ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError(f"window must be >= 1; got {window}")

    counter: Counter[str] = Counter()
    target_n = 0
    # Documents are processed one at a time, so a context window can
    # never reach into a neighbouring document.
    for tokens in docs_tokens:
        for i, token in enumerate(tokens):
            if token != target:
                continue
            target_n += 1
            # Left context: clamp at 0 explicitly, because a negative
            # slice start would wrap around to the end of the document.
            counter.update(tokens[max(0, i - window) : i])
            # Right context: slicing past the end truncates by itself.
            counter.update(tokens[i + 1 : i + window + 1])
    return counter, target_n
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
"""Collocation association measures.
|
|
2
|
+
|
|
3
|
+
All four functions share the same argument shape (``n`` is omitted by logDice):
|
|
4
|
+
|
|
5
|
+
``f_xy``
|
|
6
|
+
Joint count of (target, collocate) in a window.
|
|
7
|
+
``f_x``
|
|
8
|
+
Total occurrences of the target in the corpus.
|
|
9
|
+
``f_y``
|
|
10
|
+
Total occurrences of the collocate in the corpus.
|
|
11
|
+
``n``
|
|
12
|
+
Total tokens in the corpus (required by every measure except logDice).
|
|
13
|
+
|
|
14
|
+
Counts may be passed as scalars or pandas Series; broadcasting follows
|
|
15
|
+
NumPy / pandas conventions.
|
|
16
|
+
|
|
17
|
+
References
|
|
18
|
+
----------
|
|
19
|
+
Rychlý, P. (2008). A lexicographer-friendly association score. In
|
|
20
|
+
*Proceedings of RASLAN 2008*.
|
|
21
|
+
|
|
22
|
+
Church, K., Gale, W., Hanks, P., & Hindle, D. (1991). Using statistics in
|
|
23
|
+
lexical analysis. In *Lexical Acquisition*, 115-164.
|
|
24
|
+
|
|
25
|
+
Daille, B. (1994). *Approche mixte pour l'extraction automatique de
|
|
26
|
+
terminologie*. PhD thesis, Université Paris 7.
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
from __future__ import annotations
|
|
30
|
+
|
|
31
|
+
import numpy as np
|
|
32
|
+
import pandas as pd
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def logdice(
    f_xy: pd.Series,
    f_x: float,
    f_y: pd.Series,
) -> pd.Series:
    """Rychlý's logDice: ``14 + log2(2 · f_xy / (f_x + f_y))``.

    Range-bounded above at 14 (perfect co-occurrence). Robust to corpus
    size because it never references the total.

    Zero joint counts yield ``-inf`` without emitting a NumPy warning;
    pre-smooth ``f_xy`` upstream if you need finite scores.

    Parameters
    ----------
    f_xy
        Joint count(s) of (target, collocate). A plain scalar is also
        accepted.
    f_x
        Total occurrences of the target.
    f_y
        Total occurrences of the collocate(s).

    Returns
    -------
    pd.Series
        Scores indexed like ``f_xy`` when it is a Series. For a scalar
        ``f_xy`` the index comes from the computed result (a one-element
        Series when every input is a scalar).
    """
    if np.isscalar(f_xy):
        # NumPy scalars honour ``np.errstate``; a plain Python number
        # would raise ZeroDivisionError on a zero denominator instead.
        f_xy = np.float64(f_xy)
    index = getattr(f_xy, "index", None)
    # ``log2`` must run inside the guard as well: it is the call that
    # turns a zero ratio into the documented ``-inf``.
    with np.errstate(divide="ignore", invalid="ignore"):
        score = 14.0 + np.log2((2.0 * f_xy) / (f_x + f_y))
    return pd.Series(score, index=index)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def pmi(
    f_xy: pd.Series,
    f_x: float,
    f_y: pd.Series,
    n: int,
) -> pd.Series:
    """Pointwise mutual information: ``log2(f_xy · N / (f_x · f_y))``.

    The "association ratio" of Church & Hanks (1990). PMI rewards rare
    pairs disproportionately — always pair with a frequency floor or
    use MI³ if rare-pair inflation is a concern.

    Zero joint counts yield ``-inf``.

    Parameters
    ----------
    f_xy
        Joint count(s) of (target, collocate). A plain scalar is also
        accepted.
    f_x
        Total occurrences of the target.
    f_y
        Total occurrences of the collocate(s).
    n
        Total tokens in the corpus.

    Returns
    -------
    pd.Series
        Scores indexed like ``f_xy`` when it is a Series. For a scalar
        ``f_xy`` the index comes from the computed result (a one-element
        Series when every input is a scalar).
    """
    if np.isscalar(f_xy):
        # NumPy scalars honour ``np.errstate``; a plain Python number
        # would raise ZeroDivisionError on a zero denominator instead.
        f_xy = np.float64(f_xy)
    index = getattr(f_xy, "index", None)
    with np.errstate(divide="ignore", invalid="ignore"):
        score = np.log2((f_xy * n) / (f_x * f_y))
    return pd.Series(score, index=index)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def t_score(
    f_xy: pd.Series,
    f_x: float,
    f_y: pd.Series,
    n: int,
) -> pd.Series:
    """Collocation t-score: ``(f_xy - E[f_xy]) / sqrt(f_xy)``.

    Where ``E[f_xy] = f_x · f_y / N`` is the count expected under
    independence (see Church et al., 1991, in the module references).
    Favours frequent collocates — the inverse of PMI's sparsity bias.

    A zero joint count divides by zero: the result is ``-inf`` when the
    expected count is positive and ``NaN`` when it is zero as well.

    Parameters
    ----------
    f_xy
        Joint count(s) of (target, collocate). A plain scalar is also
        accepted.
    f_x
        Total occurrences of the target.
    f_y
        Total occurrences of the collocate(s).
    n
        Total tokens in the corpus.

    Returns
    -------
    pd.Series
        Scores indexed like ``f_xy`` when it is a Series. For a scalar
        ``f_xy`` the index comes from the computed result (a one-element
        Series when every input is a scalar).
    """
    if np.isscalar(f_xy):
        # NumPy scalars honour ``np.errstate``; a plain Python number
        # would raise ZeroDivisionError on a zero denominator instead.
        f_xy = np.float64(f_xy)
    index = getattr(f_xy, "index", None)
    with np.errstate(divide="ignore", invalid="ignore"):
        expected = (f_x * f_y) / n
        score = (f_xy - expected) / np.sqrt(f_xy)
    return pd.Series(score, index=index)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def mi_three(
    f_xy: pd.Series,
    f_x: float,
    f_y: pd.Series,
    n: int,
) -> pd.Series:
    """Daille's MI³: ``log2(f_xy³ · N / (f_x · f_y))``.

    Cubes the joint count in the numerator, which empirically downweights
    PMI's rare-pair bias without t-score's frequency dominance.

    Zero joint counts yield ``-inf``.

    Parameters
    ----------
    f_xy
        Joint count(s) of (target, collocate). A plain scalar is also
        accepted.
    f_x
        Total occurrences of the target.
    f_y
        Total occurrences of the collocate(s).
    n
        Total tokens in the corpus.

    Returns
    -------
    pd.Series
        Scores indexed like ``f_xy`` when it is a Series. For a scalar
        ``f_xy`` the index comes from the computed result (a one-element
        Series when every input is a scalar).
    """
    if np.isscalar(f_xy):
        # NumPy scalars honour ``np.errstate``; a plain Python number
        # would raise ZeroDivisionError on a zero denominator instead.
        f_xy = np.float64(f_xy)
    index = getattr(f_xy, "index", None)
    with np.errstate(divide="ignore", invalid="ignore"):
        # A float exponent forces the cube into float64. Cubing an int64
        # count wraps around silently once the cube exceeds 2**63 - 1,
        # i.e. for joint counts of a few million.
        cubed = np.power(f_xy, 3.0)
        score = np.log2((cubed * n) / (f_x * f_y))
    return pd.Series(score, index=index)
|
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
"""Term co-occurrence networks.
|
|
2
|
+
|
|
3
|
+
For exploratory work — "what does the discourse *graph* look like?" —
|
|
4
|
+
the natural artefact is a network: nodes are the corpus's most frequent
|
|
5
|
+
terms, edges connect terms that co-occur within a window, and edge
|
|
6
|
+
weights come from a standard association measure (PMI, t-score, MI³).
|
|
7
|
+
|
|
8
|
+
This is the term-as-vertex visualisation that gephi-style network tools
|
|
9
|
+
have made common in digital humanities; here it lands as a first-class
|
|
10
|
+
:class:`pycorpdiff.collocation.NetworkResult` with the same
|
|
11
|
+
:meth:`to_df` / :meth:`plot` shape as every other Result.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from collections import Counter
|
|
17
|
+
from collections.abc import Sequence
|
|
18
|
+
from dataclasses import dataclass, field
|
|
19
|
+
from typing import TYPE_CHECKING, Literal
|
|
20
|
+
|
|
21
|
+
import numpy as np
|
|
22
|
+
import pandas as pd
|
|
23
|
+
|
|
24
|
+
from ..corpus import Corpus, CorpusSlice
|
|
25
|
+
from .measures import logdice, mi_three, pmi, t_score
|
|
26
|
+
|
|
27
|
+
if TYPE_CHECKING:
|
|
28
|
+
import altair as alt
|
|
29
|
+
|
|
30
|
+
NetworkMeasure = Literal["PMI", "t_score", "MI3", "logDice"]
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@dataclass(frozen=True, eq=False)
class NetworkResult:
    """A term co-occurrence network with nodes, edges, and a plot method.

    The two DataFrames are the canonical "long-format" shape every
    network analytics tool consumes:

    - ``nodes``: index = term, columns = ``count``, ``degree``
    - ``edges``: columns = ``source``, ``target``, ``cooccur_count``,
      ``weight`` (the association score), and ``rank`` (0-based, by
      ``|weight|`` descending).

    Instances compare and hash by identity (``eq=False``). The field-wise
    ``__eq__`` / ``__hash__`` a frozen dataclass generates by default
    cannot work with these fields: comparing two DataFrames produces a
    DataFrame whose truth value is ambiguous (``ValueError``), and
    DataFrames and dicts are unhashable (``TypeError``). Compare contents
    explicitly, e.g. ``a.edges.equals(b.edges)``.
    """

    # Per-term table: index = term, columns ``count`` and ``degree``.
    nodes: pd.DataFrame
    # Per-pair table; see the class docstring for the column list.
    edges: pd.DataFrame
    # Name of the association measure stored in ``edges["weight"]``.
    measure: NetworkMeasure
    # Context-window size the co-occurrence counts were collected with.
    window: int
    # Free-form label carried along with the result.
    label: str = ""
    # Remaining build parameters, kept alongside the result.
    params: dict[str, object] = field(default_factory=dict)

    def to_df(self) -> pd.DataFrame:
        """Return a copy of the edges as a flat tidy DataFrame."""
        return self.edges.copy()

    def summary(self) -> str:
        """Return a one-line description: measure, window, node and edge counts."""
        return (
            f"NetworkResult(measure={self.measure}, window={self.window}, "
            f"nodes={len(self.nodes):,}, edges={len(self.edges):,})"
        )

    def plot(self, **kw: object) -> alt.Chart:
        """Render the network by delegating to ``viz.network.network_plot``.

        Keyword arguments are forwarded unchanged.
        """
        # Imported lazily, so the viz module is only loaded when a plot
        # is actually requested.
        from ..viz.network import network_plot

        return network_plot(self, **kw)  # type: ignore[arg-type]
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def cooccurrence_network(
    corpus: Corpus | CorpusSlice,
    *,
    top_n: int = 50,
    window: int = 5,
    measure: NetworkMeasure = "PMI",
    min_count: int = 3,
    min_cooccur: int = 2,
    smoothing: float = 0.5,
) -> NetworkResult:
    """Build a term co-occurrence network for the ``top_n`` terms.

    Each pair of distinct terms among the ``top_n`` vocabulary is
    weighted by the chosen association measure on their joint counts
    within ``window`` tokens of each other inside a document.

    Parameters
    ----------
    corpus
        A :class:`Corpus` or :class:`CorpusSlice`. The code calls its
        ``vocab(min_count=...)``, ``tokens()`` and ``total_tokens()``
        methods.
    top_n
        Vocabulary cap — the first ``top_n`` entries of
        ``corpus.vocab(min_count=...)`` become network nodes.
    window
        Symmetric context window for the co-occurrence count.
    measure
        Edge-weight association measure.
    min_count
        Passed to ``corpus.vocab`` as the frequency floor applied before
        picking the top-N.
    min_cooccur
        Drop edges with joint count below this. Acts as the network's
        noise floor.
    smoothing
        Constant added to the joint and both marginal counts of every
        kept edge before scoring.

    Returns
    -------
    NetworkResult
        When no pair survives the ``min_cooccur`` filter, ``edges`` is an
        empty frame with the usual columns and every node has degree 0.

    Raises
    ------
    ValueError
        If ``top_n < 2``, ``window < 1``, ``smoothing <= 0``, ``measure``
        is not a known measure, or fewer than two terms survive the
        ``min_count`` filter.
    """
    if top_n < 2:
        raise ValueError(f"top_n must be >= 2; got {top_n}")
    if window < 1:
        raise ValueError(f"window must be >= 1; got {window}")
    if smoothing <= 0:
        raise ValueError(f"smoothing must be > 0; got {smoothing}")
    # Validate up front so a bad name fails before any counting and can
    # never end up recorded on an empty result. Mirrors ``NetworkMeasure``.
    if measure not in ("PMI", "t_score", "MI3", "logDice"):
        raise ValueError(f"unknown measure={measure!r}")

    # NOTE(review): assumes ``vocab`` returns a term -> count Series sorted
    # by descending frequency, so ``head`` keeps the most frequent terms —
    # confirm against ``Corpus.vocab``.
    vocab = corpus.vocab(min_count=min_count).head(top_n)
    if len(vocab) < 2:
        raise ValueError(
            f"need at least 2 terms after min_count={min_count} filter; "
            f"got {len(vocab)}"
        )

    keep_set = set(vocab.index)
    pair_counts: Counter[tuple[str, str]] = Counter()

    for tokens in corpus.tokens():
        # Pre-filter to in-vocab tokens with original positions.
        positions = [(i, t) for i, t in enumerate(tokens) if t in keep_set]
        for k, (i, t_i) in enumerate(positions):
            # Positions are strictly increasing integers, so at most
            # ``window`` later entries can lie within ``window`` tokens.
            # Bounding the slice avoids copying the whole tail per token.
            for j, t_j in positions[k + 1 : k + 1 + window]:
                if j - i > window:
                    break  # positions are sorted; rest are further away
                if t_i == t_j:
                    continue
                # Undirected edge: store each pair under one sorted key.
                pair = (t_i, t_j) if t_i < t_j else (t_j, t_i)
                pair_counts[pair] += 1

    label = _corpus_label(corpus)
    # The same parameter record is attached to empty and non-empty results.
    params: dict[str, object] = {
        "top_n": top_n,
        "min_count": min_count,
        "min_cooccur": min_cooccur,
        "smoothing": smoothing,
    }

    rows = [
        {
            "source": src,
            "target": tgt,
            "cooccur_count": joint,
            "f_a": int(vocab[src]),
            "f_b": int(vocab[tgt]),
        }
        for (src, tgt), joint in pair_counts.items()
        if joint >= min_cooccur
    ]
    if not rows:
        # Covers both "no pair co-occurred" and "every pair fell below
        # min_cooccur".
        return NetworkResult(
            nodes=vocab.rename("count").to_frame().assign(degree=0),
            edges=pd.DataFrame(
                columns=["source", "target", "cooccur_count", "weight", "rank"]
            ),
            measure=measure,
            window=window,
            label=label,
            params=params,
        )

    n_total = corpus.total_tokens()
    edges = pd.DataFrame(rows)
    f_xy_arr = edges["cooccur_count"].to_numpy(dtype=float) + smoothing
    f_a_arr = edges["f_a"].to_numpy(dtype=float) + smoothing
    f_b_arr = edges["f_b"].to_numpy(dtype=float) + smoothing

    if measure == "PMI":
        weight_arr = np.log2((f_xy_arr * n_total) / (f_a_arr * f_b_arr))
    elif measure == "t_score":
        expected_arr = (f_a_arr * f_b_arr) / n_total
        weight_arr = (f_xy_arr - expected_arr) / np.sqrt(f_xy_arr)
    elif measure == "MI3":
        weight_arr = np.log2((np.power(f_xy_arr, 3) * n_total) / (f_a_arr * f_b_arr))
    else:  # "logDice" — the only remaining value after validation above
        weight_arr = 14.0 + np.log2((2.0 * f_xy_arr) / (f_a_arr + f_b_arr))

    # The formulas above are computed inline on per-edge arrays; this
    # reference keeps linters from flagging the module-level measure
    # imports as unused.
    _ = (pmi, t_score, mi_three, logdice)

    edges = edges.drop(columns=["f_a", "f_b"]).assign(weight=weight_arr)
    # Strongest associations first, judged by absolute weight.
    edges = edges.sort_values("weight", ascending=False, key=lambda s: s.abs())
    edges = edges.reset_index(drop=True).assign(rank=lambda d: d.index.astype(int))

    # Degrees (undirected): how many edges touch each node, on either end?
    degrees = pd.concat([edges["source"], edges["target"]]).value_counts()

    nodes = vocab.rename("count").to_frame()
    nodes["degree"] = degrees.reindex(nodes.index, fill_value=0).astype(int)

    return NetworkResult(
        nodes=nodes,
        edges=edges,
        measure=measure,
        window=window,
        label=label,
        params=params,
    )
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
def _corpus_label(c: Corpus | CorpusSlice) -> str:
    """Return a :class:`CorpusSlice`'s own ``label``, else the literal ``"corpus"``."""
    return c.label if isinstance(c, CorpusSlice) else "corpus"
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
# Public type alias for users to depend on if they want.
|
|
233
|
+
__all__: Sequence[str] = ["NetworkResult", "cooccurrence_network", "NetworkMeasure"]
|