pycorpdiff 0.1.0a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. pycorpdiff/__init__.py +126 -0
  2. pycorpdiff/_backends/__init__.py +3 -0
  3. pycorpdiff/_backends/pandas.py +3 -0
  4. pycorpdiff/_backends/polars.py +3 -0
  5. pycorpdiff/collocation/__init__.py +19 -0
  6. pycorpdiff/collocation/cooccurrence.py +65 -0
  7. pycorpdiff/collocation/measures.py +102 -0
  8. pycorpdiff/collocation/network.py +233 -0
  9. pycorpdiff/collocation/shift.py +146 -0
  10. pycorpdiff/compare.py +345 -0
  11. pycorpdiff/corpus.py +411 -0
  12. pycorpdiff/datasets/__init__.py +27 -0
  13. pycorpdiff/datasets/_data/hansard_sample.parquet +0 -0
  14. pycorpdiff/datasets/_generate_hansard.py +221 -0
  15. pycorpdiff/datasets/hansard.py +235 -0
  16. pycorpdiff/datasets/histwords.py +221 -0
  17. pycorpdiff/explain.py +177 -0
  18. pycorpdiff/io/__init__.py +16 -0
  19. pycorpdiff/io/duckdb.py +92 -0
  20. pycorpdiff/io/huggingface.py +142 -0
  21. pycorpdiff/io/readers.py +138 -0
  22. pycorpdiff/keyness/__init__.py +26 -0
  23. pycorpdiff/keyness/bayes.py +50 -0
  24. pycorpdiff/keyness/chi_squared.py +94 -0
  25. pycorpdiff/keyness/correction.py +34 -0
  26. pycorpdiff/keyness/dispersion.py +89 -0
  27. pycorpdiff/keyness/effect_sizes.py +65 -0
  28. pycorpdiff/keyness/loglikelihood.py +92 -0
  29. pycorpdiff/keyness/multicorpus.py +143 -0
  30. pycorpdiff/keyness/permutation.py +154 -0
  31. pycorpdiff/py.typed +0 -0
  32. pycorpdiff/results.py +635 -0
  33. pycorpdiff/semantic/__init__.py +18 -0
  34. pycorpdiff/semantic/alignment.py +53 -0
  35. pycorpdiff/semantic/embed.py +84 -0
  36. pycorpdiff/semantic/shift.py +224 -0
  37. pycorpdiff/semantic/trajectory.py +166 -0
  38. pycorpdiff/stats.py +69 -0
  39. pycorpdiff/temporal/__init__.py +15 -0
  40. pycorpdiff/temporal/bocpd.py +233 -0
  41. pycorpdiff/temporal/causal_impact.py +293 -0
  42. pycorpdiff/temporal/changepoint.py +92 -0
  43. pycorpdiff/temporal/forecast.py +405 -0
  44. pycorpdiff/temporal/its.py +123 -0
  45. pycorpdiff/temporal/slicing.py +174 -0
  46. pycorpdiff/tokenize.py +110 -0
  47. pycorpdiff/viz/__init__.py +37 -0
  48. pycorpdiff/viz/bocpd.py +173 -0
  49. pycorpdiff/viz/causal_impact.py +142 -0
  50. pycorpdiff/viz/collocation.py +48 -0
  51. pycorpdiff/viz/dispersion.py +117 -0
  52. pycorpdiff/viz/forecast.py +129 -0
  53. pycorpdiff/viz/keyness.py +96 -0
  54. pycorpdiff/viz/network.py +186 -0
  55. pycorpdiff/viz/scattertext.py +160 -0
  56. pycorpdiff/viz/semantic_forecast.py +114 -0
  57. pycorpdiff/viz/trajectory.py +48 -0
  58. pycorpdiff-0.1.0a0.dist-info/METADATA +230 -0
  59. pycorpdiff-0.1.0a0.dist-info/RECORD +61 -0
  60. pycorpdiff-0.1.0a0.dist-info/WHEEL +4 -0
  61. pycorpdiff-0.1.0a0.dist-info/licenses/LICENSE +21 -0
pycorpdiff/__init__.py ADDED
@@ -0,0 +1,126 @@
1
+ """pycorpdiff — comparative corpus analysis for modern Python workflows.
2
+
3
+ The package exposes two public verbs (:func:`compare` and :func:`track`),
4
+ plus the :class:`Corpus` constructor and the I/O ``read_*`` helpers, and
5
+ four families of result objects (:class:`KeynessResult`,
6
+ :class:`CollocationShiftResult`, :class:`SemanticShiftResult`,
7
+ :class:`TemporalTrajectory`).
8
+
9
+ Layer-1 ingestion utilities are functional in this scaffolding release;
10
+ Layer-2 analytical methods raise :class:`NotImplementedError` until Phase 1
11
+ of the roadmap lands.
12
+
13
+ Example
14
+ -------
15
+
16
+ >>> import pycorpdiff as pcd
17
+ >>> pcd.__version__
18
+ '0.1.0a0'
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ __version__ = "0.1.0a0"
24
+
25
+ from .collocation.network import NetworkResult, cooccurrence_network
26
+ from .compare import Comparison, compare
27
+ from .corpus import Corpus, CorpusSlice
28
+ from .datasets import (
29
+ fetch_hansard,
30
+ fetch_histwords_decade,
31
+ histwords_cosine_shift,
32
+ load_hansard_sample,
33
+ )
34
+ from .explain import kwic, representative_docs
35
+ from .io.duckdb import read_duckdb
36
+ from .io.huggingface import from_huggingface
37
+ from .io.readers import from_dataframe, read_csv, read_parquet, read_txt
38
+ from .keyness.multicorpus import keyness_multi
39
+ from .results import (
40
+ CollocationShiftResult,
41
+ ConcordanceResult,
42
+ KeynessResult,
43
+ SemanticShiftResult,
44
+ TemporalTrajectory,
45
+ )
46
+ from .semantic.embed import Embedder, HashEmbedder, SBERTEmbedder
47
+ from .semantic.shift import neighborhood_drift
48
+ from .semantic.trajectory import semantic_trajectory
49
+ from .temporal.bocpd import BocpdResult, bocpd
50
+ from .temporal.causal_impact import CausalImpactResult, causal_impact
51
+ from .temporal.forecast import (
52
+ ForecastResult,
53
+ forecast_semantic_drift,
54
+ forecast_trajectory,
55
+ )
56
+ from .temporal.slicing import TemporalCorpus, track
57
+ from .tokenize import NgramTokenizer, RegexTokenizer, Tokenizer
58
+
59
+ # Convenience aliases for the standalone (non-`.plot()`-delegated)
60
+ # visualisation functions. The full plot family also lives at
61
+ # ``pycorpdiff.viz.*`` — these are surfaced at the root so common
62
+ # patterns like ``pcd.dispersion_plot(corpus, term)`` work without
63
+ # requiring a separate import path. Plots that are only meaningful
64
+ # as ``Result.plot()`` (keyness_volcano, keyness_top_n_bar,
65
+ # collocation_diverging_bar, trajectory_with_ci) stay in ``viz`` only.
66
+ from .viz import (
67
+ bocpd_plot,
68
+ causal_impact_plot,
69
+ dispersion_plot,
70
+ forecast_plot,
71
+ network_plot,
72
+ scattertext_plot,
73
+ semantic_forecast_plot,
74
+ )
75
+
76
+ __all__ = [
77
+ "BocpdResult",
78
+ "CausalImpactResult",
79
+ "CollocationShiftResult",
80
+ "Comparison",
81
+ "ConcordanceResult",
82
+ "Corpus",
83
+ "CorpusSlice",
84
+ "Embedder",
85
+ "ForecastResult",
86
+ "HashEmbedder",
87
+ "KeynessResult",
88
+ "NetworkResult",
89
+ "NgramTokenizer",
90
+ "RegexTokenizer",
91
+ "SBERTEmbedder",
92
+ "SemanticShiftResult",
93
+ "TemporalCorpus",
94
+ "TemporalTrajectory",
95
+ "Tokenizer",
96
+ "__version__",
97
+ "bocpd",
98
+ "bocpd_plot",
99
+ "causal_impact",
100
+ "causal_impact_plot",
101
+ "compare",
102
+ "cooccurrence_network",
103
+ "dispersion_plot",
104
+ "fetch_hansard",
105
+ "fetch_histwords_decade",
106
+ "forecast_plot",
107
+ "forecast_semantic_drift",
108
+ "forecast_trajectory",
109
+ "from_dataframe",
110
+ "from_huggingface",
111
+ "histwords_cosine_shift",
112
+ "keyness_multi",
113
+ "kwic",
114
+ "load_hansard_sample",
115
+ "neighborhood_drift",
116
+ "network_plot",
117
+ "read_csv",
118
+ "read_duckdb",
119
+ "read_parquet",
120
+ "read_txt",
121
+ "representative_docs",
122
+ "scattertext_plot",
123
+ "semantic_forecast_plot",
124
+ "semantic_trajectory",
125
+ "track",
126
+ ]
@@ -0,0 +1,3 @@
1
+ """Backend shims. Pandas is the default; polars is reserved for a later phase."""
2
+
3
+ from __future__ import annotations
@@ -0,0 +1,3 @@
1
+ """Pandas backend shim — placeholder until backend abstractions are needed."""
2
+
3
+ from __future__ import annotations
@@ -0,0 +1,3 @@
1
+ """Polars backend shim — populated when the polars extra is wired up."""
2
+
3
+ from __future__ import annotations
@@ -0,0 +1,19 @@
1
+ """Collocation measures and collocation-shift analysis."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from .cooccurrence import collocate_counts
6
+ from .measures import logdice, mi_three, pmi, t_score
7
+ from .network import NetworkResult, cooccurrence_network
8
+ from .shift import collocation_shift
9
+
10
+ __all__ = [
11
+ "NetworkResult",
12
+ "collocate_counts",
13
+ "collocation_shift",
14
+ "cooccurrence_network",
15
+ "logdice",
16
+ "mi_three",
17
+ "pmi",
18
+ "t_score",
19
+ ]
@@ -0,0 +1,65 @@
1
+ """Window-based co-occurrence extraction.
2
+
3
+ Given a tokenized corpus and a target term, walks each document and
4
+ accumulates the count of every other token that appears within ``window``
5
+ positions of the target. Windows never cross document boundaries — each
6
+ document's contexts are isolated, which matches the SketchEngine / NLTK
7
+ convention and avoids spurious cross-doc associations.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from collections import Counter
13
+ from collections.abc import Sequence
14
+
15
+
16
def collocate_counts(
    docs_tokens: Sequence[Sequence[str]],
    target: str,
    window: int = 5,
) -> tuple[Counter[str], int]:
    """Return ``(collocate_counter, target_occurrences)`` for ``target``.

    Each occurrence of ``target`` contributes to up to ``2 * window``
    collocate counts (fewer near document boundaries). Only the pivot
    position itself is skipped: if the target appears again inside its own
    window, that second occurrence is counted as a collocate like any other
    token.

    Parameters
    ----------
    docs_tokens
        One sequence of tokens per document, in original order. Windows
        never cross document boundaries.
    target
        The pivot term whose context we are accumulating.
    window
        Tokens on each side of the target. ``window=5`` gives a
        ten-token context (five left, five right).

    Returns
    -------
    counter
        Mapping ``collocate -> joint count``.
    target_occurrences
        Number of times the target appears across all documents.

    Raises
    ------
    ValueError
        If ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError(f"window must be >= 1; got {window}")

    counter: Counter[str] = Counter()
    target_n = 0
    for tokens in docs_tokens:
        n = len(tokens)
        for i, token in enumerate(tokens):
            if token != target:
                continue
            target_n += 1
            # Clamp the window to the document so contexts stay isolated.
            lo = max(0, i - window)
            hi = min(n, i + window + 1)
            # Left context first, then right context; position ``i`` (the
            # pivot occurrence itself) is the only slot left out.
            counter.update(tokens[j] for j in range(lo, i))
            counter.update(tokens[j] for j in range(i + 1, hi))
    return counter, target_n
@@ -0,0 +1,102 @@
1
+ """Collocation association measures.
2
+
3
+ All four functions share the same argument shape (logDice omits ``n``):
4
+
5
+ ``f_xy``
6
+ Joint count of (target, collocate) in a window.
7
+ ``f_x``
8
+ Total occurrences of the target in the corpus.
9
+ ``f_y``
10
+ Total occurrences of the collocate in the corpus.
11
+ ``n``
12
+ Total tokens in the corpus (required by every measure except logDice).
13
+
14
+ Counts may be passed as scalars or pandas Series; broadcasting follows
15
+ NumPy / pandas conventions.
16
+
17
+ References
18
+ ----------
19
+ Rychlý, P. (2008). A lexicographer-friendly association score. In
20
+ *Proceedings of RASLAN 2008*.
21
+
22
+ Church, K., Gale, W., Hanks, P., & Hindle, D. (1991). Using statistics in
23
+ lexical analysis. In *Lexical Acquisition*, 115-164.
24
+
25
+ Daille, B. (1994). *Approche mixte pour l'extraction automatique de
26
+ terminologie*. PhD thesis, Université Paris 7.
27
+ """
28
+
29
+ from __future__ import annotations
30
+
31
+ import numpy as np
32
+ import pandas as pd
33
+
34
+
35
def logdice(
    f_xy: pd.Series,
    f_x: float,
    f_y: pd.Series,
) -> pd.Series:
    """Rychlý's logDice: ``14 + log2(2 · f_xy / (f_x + f_y))``.

    The score never references the corpus total, so it takes no ``n``
    argument. Zero joint counts yield ``-inf``; pre-smooth ``f_xy`` upstream
    if finite scores are needed.

    Parameters
    ----------
    f_xy
        Joint count of (target, collocate). Normally a Series; a scalar or
        array is also accepted.
    f_x
        Count of the target.
    f_y
        Count of each collocate.

    Returns
    -------
    pd.Series
        Scores indexed like ``f_xy`` when ``f_xy`` is a Series; otherwise
        pandas assigns the index (a length-1 Series for all-scalar input).
    """
    # Keep the log inside the guard so zero joint counts map to -inf
    # without emitting a NumPy RuntimeWarning.
    with np.errstate(divide="ignore", invalid="ignore"):
        score = 14.0 + np.log2((2.0 * f_xy) / (f_x + f_y))
    # Only a Series carries a meaningful ``.index``; scalars and arrays
    # would otherwise fail on the attribute lookup.
    index = f_xy.index if isinstance(f_xy, pd.Series) else None
    return pd.Series(score, index=index)
53
+
54
+
55
def pmi(
    f_xy: pd.Series,
    f_x: float,
    f_y: pd.Series,
    n: int,
) -> pd.Series:
    """Pointwise mutual information: ``log2(f_xy · N / (f_x · f_y))``.

    PMI rewards rare pairs disproportionately — pair it with a frequency
    floor, or use MI³ if rare-pair inflation is a concern. Zero joint
    counts yield ``-inf``.

    Parameters
    ----------
    f_xy
        Joint count of (target, collocate). Normally a Series; a scalar or
        array is also accepted.
    f_x
        Count of the target.
    f_y
        Count of each collocate.
    n
        Total tokens in the corpus.

    Returns
    -------
    pd.Series
        Scores indexed like ``f_xy`` when ``f_xy`` is a Series; otherwise
        pandas assigns the index (a length-1 Series for all-scalar input).
    """
    with np.errstate(divide="ignore", invalid="ignore"):
        score = np.log2((f_xy * n) / (f_x * f_y))
    # Only a Series carries a meaningful ``.index``; scalars and arrays
    # would otherwise fail on the attribute lookup.
    index = f_xy.index if isinstance(f_xy, pd.Series) else None
    return pd.Series(score, index=index)
69
+
70
+
71
def t_score(
    f_xy: pd.Series,
    f_x: float,
    f_y: pd.Series,
    n: int,
) -> pd.Series:
    """Collocation t-score: ``(f_xy - E[f_xy]) / sqrt(f_xy)``.

    ``E[f_xy] = f_x · f_y / N`` is the joint count expected under
    independence. This is the lexicographic t-score of Church et al.
    (1991), which uses the observed count as its variance estimate; it
    favours frequent collocates — the inverse of PMI's sparsity bias.

    Parameters
    ----------
    f_xy
        Joint count of (target, collocate). Normally a Series; a scalar or
        array is also accepted.
    f_x
        Count of the target.
    f_y
        Count of each collocate.
    n
        Total tokens in the corpus.

    Returns
    -------
    pd.Series
        Scores indexed like ``f_xy`` when ``f_xy`` is a Series; otherwise
        pandas assigns the index (a length-1 Series for all-scalar input).
    """
    expected = (f_x * f_y) / n
    with np.errstate(divide="ignore", invalid="ignore"):
        score = (f_xy - expected) / np.sqrt(f_xy)
    # Only a Series carries a meaningful ``.index``; scalars and arrays
    # would otherwise fail on the attribute lookup.
    index = f_xy.index if isinstance(f_xy, pd.Series) else None
    return pd.Series(score, index=index)
86
+
87
+
88
def mi_three(
    f_xy: pd.Series,
    f_x: float,
    f_y: pd.Series,
    n: int,
) -> pd.Series:
    """Daille's MI³: ``log2(f_xy³ · N / (f_x · f_y))``.

    Cubing the joint count in the numerator downweights PMI's rare-pair
    bias without t-score's frequency dominance. Zero joint counts yield
    ``-inf``.

    Parameters
    ----------
    f_xy
        Joint count of (target, collocate). Normally a Series; a scalar or
        array is also accepted.
    f_x
        Count of the target.
    f_y
        Count of each collocate.
    n
        Total tokens in the corpus.

    Returns
    -------
    pd.Series
        Scores indexed like ``f_xy`` when ``f_xy`` is a Series; otherwise
        pandas assigns the index (a length-1 Series for all-scalar input).
    """
    with np.errstate(divide="ignore", invalid="ignore"):
        score = np.log2((np.power(f_xy, 3) * n) / (f_x * f_y))
    # Only a Series carries a meaningful ``.index``; scalars and arrays
    # would otherwise fail on the attribute lookup.
    index = f_xy.index if isinstance(f_xy, pd.Series) else None
    return pd.Series(score, index=index)
@@ -0,0 +1,233 @@
1
+ """Term co-occurrence networks.
2
+
3
+ For exploratory work — "what does the discourse *graph* look like?" —
4
+ the natural artefact is a network: nodes are the corpus's most frequent
5
+ terms, edges connect terms that co-occur within a window, and edge
6
+ weights come from a standard association measure (PMI, t-score, MI³).
7
+
8
+ This is the term-as-vertex visualisation that gephi-style network tools
9
+ have made common in digital humanities; here it lands as a first-class
10
+ :class:`pycorpdiff.collocation.NetworkResult` with the same
11
+ :meth:`to_df` / :meth:`plot` shape as every other Result.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ from collections import Counter
17
+ from collections.abc import Sequence
18
+ from dataclasses import dataclass, field
19
+ from typing import TYPE_CHECKING, Literal
20
+
21
+ import numpy as np
22
+ import pandas as pd
23
+
24
+ from ..corpus import Corpus, CorpusSlice
25
+ from .measures import logdice, mi_three, pmi, t_score
26
+
27
+ if TYPE_CHECKING:
28
+ import altair as alt
29
+
30
+ NetworkMeasure = Literal["PMI", "t_score", "MI3", "logDice"]
31
+
32
+
33
@dataclass(frozen=True)
class NetworkResult:
    """Immutable container for a term co-occurrence network.

    Holds the two long-format tables network tooling expects:

    - ``nodes``: indexed by term, with ``count`` and ``degree`` columns.
    - ``edges``: one row per term pair with ``source``, ``target``,
      ``cooccur_count``, ``weight`` (the association score) and ``rank``
      (0-based, ordered by ``|weight|`` descending).

    ``measure`` and ``window`` record how the edges were scored; ``label``
    and ``params`` carry free-form provenance.
    """

    nodes: pd.DataFrame
    edges: pd.DataFrame
    measure: NetworkMeasure
    window: int
    label: str = ""
    params: dict[str, object] = field(default_factory=dict)

    def to_df(self) -> pd.DataFrame:
        """Return a copy of the edge table as a flat, tidy DataFrame."""
        edge_table = self.edges
        return edge_table.copy()

    def summary(self) -> str:
        """Return a one-line description: measure, window, node and edge counts."""
        n_nodes = len(self.nodes)
        n_edges = len(self.edges)
        head = f"NetworkResult(measure={self.measure}, window={self.window}, "
        tail = f"nodes={n_nodes:,}, edges={n_edges:,})"
        return head + tail

    def plot(self, **kw: object) -> alt.Chart:
        """Render the network by delegating to ``viz.network.network_plot``."""
        # Imported lazily so the plotting stack is only loaded on demand.
        from ..viz.network import network_plot as _render

        return _render(self, **kw)  # type: ignore[arg-type]
68
+
69
+
70
def cooccurrence_network(
    corpus: Corpus | CorpusSlice,
    *,
    top_n: int = 50,
    window: int = 5,
    measure: NetworkMeasure = "PMI",
    min_count: int = 3,
    min_cooccur: int = 2,
    smoothing: float = 0.5,
) -> NetworkResult:
    """Build a term co-occurrence network for the ``top_n`` terms.

    Each pair of distinct terms among the ``top_n`` vocabulary is
    weighted by the chosen association measure on their joint counts
    within ``window`` tokens of each other inside a document.

    Parameters
    ----------
    corpus
        A :class:`Corpus` or :class:`CorpusSlice`.
    top_n
        Vocabulary cap — the first ``top_n`` rows of
        ``corpus.vocab(min_count=...)`` become network nodes.
    window
        Symmetric context window for the co-occurrence count.
    measure
        Edge-weight association measure: ``"PMI"``, ``"t_score"``,
        ``"MI3"`` or ``"logDice"``.
    min_count
        Passed to ``corpus.vocab`` to drop infrequent terms before the
        top-N cut.
    min_cooccur
        Drop edges with joint count below this. Acts as the network's
        noise floor.
    smoothing
        Constant added to the joint and marginal counts before scoring.

    Returns
    -------
    NetworkResult
        When no pair reaches ``min_cooccur`` the result has an empty edge
        table and every node has degree 0.

    Raises
    ------
    ValueError
        If ``top_n < 2``, ``window < 1``, ``smoothing <= 0``, ``measure``
        is not one of the supported names, or fewer than two terms survive
        the ``min_count`` filter.
    """
    if top_n < 2:
        raise ValueError(f"top_n must be >= 2; got {top_n}")
    if window < 1:
        raise ValueError(f"window must be >= 1; got {window}")
    if smoothing <= 0:
        raise ValueError(f"smoothing must be > 0; got {smoothing}")
    # Fail fast: reject a bad measure before the pair-counting pass, and so
    # that it can never be returned inside an (empty) result.
    if measure not in ("PMI", "t_score", "MI3", "logDice"):
        raise ValueError(f"unknown measure={measure!r}")

    vocab = corpus.vocab(min_count=min_count).head(top_n)
    if len(vocab) < 2:
        raise ValueError(
            f"need at least 2 terms after min_count={min_count} filter; "
            f"got {len(vocab)}"
        )

    label = _corpus_label(corpus)
    # One params record for every return path, so empty and non-empty
    # results describe the call identically.
    params: dict[str, object] = {
        "top_n": top_n,
        "min_count": min_count,
        "min_cooccur": min_cooccur,
        "smoothing": smoothing,
    }

    keep_set = set(vocab.index)
    pair_counts: Counter[tuple[str, str]] = Counter()

    for tokens in corpus.tokens():
        # Pre-filter to in-vocab tokens, remembering original positions so
        # the window is measured in raw token distance.
        positions = [(i, t) for i, t in enumerate(tokens) if t in keep_set]
        for k, (i, t_i) in enumerate(positions):
            for j, t_j in positions[k + 1 :]:
                if j - i > window:
                    break  # positions are sorted; rest are further away
                if t_i == t_j:
                    continue
                # Undirected edge: store each pair under a canonical order.
                pair = (t_i, t_j) if t_i < t_j else (t_j, t_i)
                pair_counts[pair] += 1

    rows = [
        {
            "source": src,
            "target": tgt,
            "cooccur_count": joint,
            "f_a": int(vocab[src]),
            "f_b": int(vocab[tgt]),
        }
        for (src, tgt), joint in pair_counts.items()
        if joint >= min_cooccur
    ]
    if not rows:
        # Covers both "no pairs observed" and "no pair cleared min_cooccur".
        return NetworkResult(
            nodes=vocab.rename("count").to_frame().assign(degree=0),
            edges=pd.DataFrame(
                columns=["source", "target", "cooccur_count", "weight", "rank"]
            ),
            measure=measure,
            window=window,
            label=label,
            params=params,
        )

    n_total = corpus.total_tokens()
    edges = pd.DataFrame(rows)
    f_xy_arr = edges["cooccur_count"].to_numpy(dtype=float) + smoothing
    f_a_arr = edges["f_a"].to_numpy(dtype=float) + smoothing
    f_b_arr = edges["f_b"].to_numpy(dtype=float) + smoothing

    if measure == "PMI":
        weight_arr = np.log2((f_xy_arr * n_total) / (f_a_arr * f_b_arr))
    elif measure == "t_score":
        expected_arr = (f_a_arr * f_b_arr) / n_total
        weight_arr = (f_xy_arr - expected_arr) / np.sqrt(f_xy_arr)
    elif measure == "MI3":
        weight_arr = np.log2((np.power(f_xy_arr, 3) * n_total) / (f_a_arr * f_b_arr))
    else:  # "logDice" — the only remaining value after the up-front check
        weight_arr = 14.0 + np.log2((2.0 * f_xy_arr) / (f_a_arr + f_b_arr))

    # Silence the lint warnings on unused-but-validated helper imports.
    _ = (pmi, t_score, mi_three, logdice)

    edges = edges.drop(columns=["f_a", "f_b"]).assign(weight=weight_arr)
    # Strongest association first, regardless of sign.
    edges = edges.sort_values("weight", ascending=False, key=lambda s: s.abs())
    edges = edges.reset_index(drop=True).assign(rank=lambda d: d.index.astype(int))

    # Degrees (undirected): a term may appear on either side of an edge, so
    # add the per-side tallies together.
    degree_a = edges.groupby("source").size()
    degree_b = edges.groupby("target").size()
    degrees = degree_a.add(degree_b, fill_value=0).astype(int)

    nodes = vocab.rename("count").to_frame()
    nodes["degree"] = degrees.reindex(nodes.index, fill_value=0).astype(int)

    return NetworkResult(
        nodes=nodes,
        edges=edges,
        measure=measure,
        window=window,
        label=label,
        params=params,
    )
224
+
225
+
226
def _corpus_label(c: Corpus | CorpusSlice) -> str:
    """Return a slice's own ``label``; anything else is labelled ``"corpus"``."""
    return c.label if isinstance(c, CorpusSlice) else "corpus"
230
+
231
+
232
+ # Public type alias for users to depend on if they want.
233
+ __all__: Sequence[str] = ["NetworkResult", "cooccurrence_network", "NetworkMeasure"]