pycorpdiff 0.1.0a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. pycorpdiff/__init__.py +126 -0
  2. pycorpdiff/_backends/__init__.py +3 -0
  3. pycorpdiff/_backends/pandas.py +3 -0
  4. pycorpdiff/_backends/polars.py +3 -0
  5. pycorpdiff/collocation/__init__.py +19 -0
  6. pycorpdiff/collocation/cooccurrence.py +65 -0
  7. pycorpdiff/collocation/measures.py +102 -0
  8. pycorpdiff/collocation/network.py +233 -0
  9. pycorpdiff/collocation/shift.py +146 -0
  10. pycorpdiff/compare.py +345 -0
  11. pycorpdiff/corpus.py +411 -0
  12. pycorpdiff/datasets/__init__.py +27 -0
  13. pycorpdiff/datasets/_data/hansard_sample.parquet +0 -0
  14. pycorpdiff/datasets/_generate_hansard.py +221 -0
  15. pycorpdiff/datasets/hansard.py +235 -0
  16. pycorpdiff/datasets/histwords.py +221 -0
  17. pycorpdiff/explain.py +177 -0
  18. pycorpdiff/io/__init__.py +16 -0
  19. pycorpdiff/io/duckdb.py +92 -0
  20. pycorpdiff/io/huggingface.py +142 -0
  21. pycorpdiff/io/readers.py +138 -0
  22. pycorpdiff/keyness/__init__.py +26 -0
  23. pycorpdiff/keyness/bayes.py +50 -0
  24. pycorpdiff/keyness/chi_squared.py +94 -0
  25. pycorpdiff/keyness/correction.py +34 -0
  26. pycorpdiff/keyness/dispersion.py +89 -0
  27. pycorpdiff/keyness/effect_sizes.py +65 -0
  28. pycorpdiff/keyness/loglikelihood.py +92 -0
  29. pycorpdiff/keyness/multicorpus.py +143 -0
  30. pycorpdiff/keyness/permutation.py +154 -0
  31. pycorpdiff/py.typed +0 -0
  32. pycorpdiff/results.py +635 -0
  33. pycorpdiff/semantic/__init__.py +18 -0
  34. pycorpdiff/semantic/alignment.py +53 -0
  35. pycorpdiff/semantic/embed.py +84 -0
  36. pycorpdiff/semantic/shift.py +224 -0
  37. pycorpdiff/semantic/trajectory.py +166 -0
  38. pycorpdiff/stats.py +69 -0
  39. pycorpdiff/temporal/__init__.py +15 -0
  40. pycorpdiff/temporal/bocpd.py +233 -0
  41. pycorpdiff/temporal/causal_impact.py +293 -0
  42. pycorpdiff/temporal/changepoint.py +92 -0
  43. pycorpdiff/temporal/forecast.py +405 -0
  44. pycorpdiff/temporal/its.py +123 -0
  45. pycorpdiff/temporal/slicing.py +174 -0
  46. pycorpdiff/tokenize.py +110 -0
  47. pycorpdiff/viz/__init__.py +37 -0
  48. pycorpdiff/viz/bocpd.py +173 -0
  49. pycorpdiff/viz/causal_impact.py +142 -0
  50. pycorpdiff/viz/collocation.py +48 -0
  51. pycorpdiff/viz/dispersion.py +117 -0
  52. pycorpdiff/viz/forecast.py +129 -0
  53. pycorpdiff/viz/keyness.py +96 -0
  54. pycorpdiff/viz/network.py +186 -0
  55. pycorpdiff/viz/scattertext.py +160 -0
  56. pycorpdiff/viz/semantic_forecast.py +114 -0
  57. pycorpdiff/viz/trajectory.py +48 -0
  58. pycorpdiff-0.1.0a0.dist-info/METADATA +230 -0
  59. pycorpdiff-0.1.0a0.dist-info/RECORD +61 -0
  60. pycorpdiff-0.1.0a0.dist-info/WHEEL +4 -0
  61. pycorpdiff-0.1.0a0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,92 @@
1
+ """Out-of-core corpus querying via DuckDB.
2
+
3
+ DuckDB is in the optional ``duckdb`` extra. The reader is a thin
4
+ shim that runs a SQL query and projects the result into a pandas
5
+ DataFrame — DuckDB handles the heavy lifting (out-of-core scans of
6
+ parquet, CSV, Arrow tables, SQLite, S3-hosted files) before the data
7
+ ever touches pandas.
8
+
9
+ Use this when your corpus is too large to fit in pandas comfortably
10
+ but small enough that the rows you actually need fit after filtering.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ from typing import Any
16
+
17
+ from ..corpus import Corpus
18
+ from ..tokenize import Tokenizer
19
+
20
+
21
def read_duckdb(
    connection: Any,
    query: str,
    text_col: str = "text",
    id_col: str | None = None,
    meta_cols: tuple[str, ...] = (),
    tokenizer: Tokenizer | None = None,
    params: list[Any] | dict[str, Any] | None = None,
) -> Corpus:
    """Execute ``query`` on a DuckDB connection and return the rows as a :class:`Corpus`.

    The query result is materialised as a pandas DataFrame via the cursor's
    ``.df()`` method and then handed to :func:`from_dataframe`.

    Parameters
    ----------
    connection
        A :class:`duckdb.DuckDBPyConnection`, i.e. whatever
        ``duckdb.connect(...)`` returned (in-memory or on-disk). Only its
        ``execute`` method is used here.
    query
        SQL whose result set contains the column named by ``text_col``.
        The final SELECT is expected to yield one row per document.
    text_col
        Name of the column holding the document text. Default: ``"text"``.
    id_col
        Optional unique-document-id column; forwarded unchanged.
    meta_cols
        Metadata column names to surface for slicing; forwarded unchanged.
        The upstream documentation states that an empty tuple means
        "every non-text column becomes metadata".
    tokenizer
        Optional :class:`Tokenizer`; forwarded unchanged to
        :func:`from_dataframe`, which picks the default when ``None``.
    params
        Optional positional (list) or named (dict) SQL parameters. When
        ``None`` the query is executed without a parameter argument at all.

    Returns
    -------
    Corpus
        Built from the DataFrame produced by the query.

    Raises
    ------
    ImportError
        If the ``duckdb`` package cannot be imported.
    ValueError
        If ``text_col`` is not among the result columns.

    Examples
    --------
    >>> import duckdb, pycorpdiff as pcd
    >>> con = duckdb.connect()
    >>> corpus = pcd.read_duckdb(  # doctest: +SKIP
    ...     con,
    ...     "SELECT body AS text, outlet, year FROM read_parquet('news/*.parquet') "
    ...     "WHERE year >= 2020",
    ... )
    """
    # Availability probe only: the module object itself is never used, but
    # failing here gives the user an actionable install hint.
    try:
        import duckdb  # noqa: F401
    except ImportError as exc:  # pragma: no cover
        raise ImportError(
            "read_duckdb requires duckdb. Install with: pip install 'pycorpdiff[duckdb]'"
        ) from exc

    # Only pass a parameter argument when the caller supplied one.
    if params is None:
        result = connection.execute(query)
    else:
        result = connection.execute(query, params)
    frame = result.df()

    available = frame.columns
    if text_col not in available:
        raise ValueError(
            f"text_col={text_col!r} not found in query result columns "
            f"{list(available)!r}"
        )

    # Imported lazily, after the query has run, exactly as before.
    from .readers import from_dataframe

    return from_dataframe(
        frame,
        text_col=text_col,
        id_col=id_col,
        meta_cols=meta_cols,
        tokenizer=tokenizer,
    )
@@ -0,0 +1,142 @@
1
+ """HuggingFace Datasets loader.
2
+
3
+ The dominant modern source of public text corpora — Hansard mirrors,
4
+ news datasets, social-media archives, academic corpora — is the
5
+ HuggingFace `datasets` hub. This module wraps `datasets.load_dataset`
6
+ in a thin shim that converts the result to a :class:`pycorpdiff.Corpus`.
7
+
8
+ ``datasets`` is heavy (pulls ``pyarrow``, ``fsspec``, ``requests``,
9
+ ``aiohttp``), so it lives in the optional ``huggingface`` extra:
10
+
11
+ pip install 'pycorpdiff[huggingface]'
12
+
13
+ Then::
14
+
15
+ corpus = pcd.from_huggingface(
16
+ "stanfordnlp/imdb", split="train",
17
+ text_col="text", meta_cols=("label",),
18
+ )
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ from collections.abc import Callable
24
+ from typing import Any
25
+
26
+ import pandas as pd
27
+
28
+ from ..corpus import Corpus
29
+ from ..io.readers import from_dataframe
30
+ from ..tokenize import Tokenizer
31
+
32
+
33
+ def from_huggingface(
34
+ dataset_id: str,
35
+ *,
36
+ split: str = "train",
37
+ text_col: str = "text",
38
+ id_col: str | None = None,
39
+ meta_cols: tuple[str, ...] = (),
40
+ tokenizer: Tokenizer | None = None,
41
+ config_name: str | None = None,
42
+ columns: list[str] | None = None,
43
+ n_rows: int | None = None,
44
+ _loader: Callable[..., Any] | None = None,
45
+ **load_dataset_kwargs: Any,
46
+ ) -> Corpus:
47
+ """Load a HuggingFace dataset and wrap it as a :class:`Corpus`.
48
+
49
+ Parameters
50
+ ----------
51
+ dataset_id
52
+ The hub identifier — e.g. ``"stanfordnlp/imdb"``,
53
+ ``"openwebtext"``, ``"wikitext"``, or any private repo path
54
+ you have access to.
55
+ split
56
+ Which split to materialise — ``"train"`` (default),
57
+ ``"test"``, ``"validation"``, or a slice expression like
58
+ ``"train[:1000]"``.
59
+ text_col
60
+ Name of the column carrying document text in the dataset.
61
+ id_col
62
+ Optional unique-document-id column.
63
+ meta_cols
64
+ Tuple of metadata column names to surface for slicing. If
65
+ empty, every non-text column becomes metadata (matching the
66
+ :func:`from_dataframe` default).
67
+ tokenizer
68
+ Optional :class:`Tokenizer`.
69
+ config_name
70
+ HuggingFace's "name" parameter for multi-config datasets
71
+ (e.g. ``"wikitext-103-v1"`` for the ``wikitext`` dataset).
72
+ columns
73
+ Restrict materialisation to a subset of columns — useful when
74
+ the dataset has many fields you don't need.
75
+ n_rows
76
+ Materialise only the first ``n_rows`` documents. Equivalent
77
+ to passing ``split=f"{split}[:{n_rows}]"``; the explicit
78
+ parameter is just more discoverable.
79
+ _loader
80
+ Internal hook for unit tests; substitutes
81
+ :func:`datasets.load_dataset`.
82
+ **load_dataset_kwargs
83
+ Anything else gets forwarded to ``datasets.load_dataset``.
84
+
85
+ Examples
86
+ --------
87
+ >>> import pycorpdiff as pcd
88
+ >>> corpus = pcd.from_huggingface( # doctest: +SKIP
89
+ ... "stanfordnlp/imdb", split="train[:1000]",
90
+ ... text_col="text", meta_cols=("label",),
91
+ ... )
92
+ >>> pos = corpus.slice(label=1); neg = corpus.slice(label=0) # doctest: +SKIP
93
+ >>> pcd.compare(pos, neg).keyness().plot() # doctest: +SKIP
94
+ """
95
+ loader = _loader
96
+ if loader is None:
97
+ try:
98
+ from datasets import load_dataset as _hf_load # type: ignore[import-not-found]
99
+ except ImportError as exc: # pragma: no cover
100
+ raise ImportError(
101
+ "from_huggingface requires the `datasets` library. "
102
+ "Install with: pip install 'pycorpdiff[huggingface]'"
103
+ ) from exc
104
+ loader = _hf_load
105
+
106
+ effective_split = split if n_rows is None else f"{split}[:{int(n_rows)}]"
107
+
108
+ ds = loader(
109
+ dataset_id,
110
+ name=config_name,
111
+ split=effective_split,
112
+ **load_dataset_kwargs,
113
+ )
114
+
115
+ # The datasets library exposes Arrow Tables; the canonical conversion
116
+ # to pandas is .to_pandas(), which most dataset objects implement.
117
+ if hasattr(ds, "to_pandas"):
118
+ df = ds.to_pandas()
119
+ elif isinstance(ds, pd.DataFrame):
120
+ df = ds
121
+ else:
122
+ # Last-resort: iterate as dicts.
123
+ df = pd.DataFrame(list(ds))
124
+
125
+ if columns is not None:
126
+ # Keep just the requested columns (plus text_col if not listed).
127
+ keep = list(dict.fromkeys([text_col, *columns]))
128
+ df = df[[c for c in keep if c in df.columns]]
129
+
130
+ if text_col not in df.columns:
131
+ raise ValueError(
132
+ f"text_col={text_col!r} not found in dataset columns "
133
+ f"{list(df.columns)!r}"
134
+ )
135
+
136
+ return from_dataframe(
137
+ df,
138
+ text_col=text_col,
139
+ id_col=id_col,
140
+ meta_cols=meta_cols,
141
+ tokenizer=tokenizer,
142
+ )
@@ -0,0 +1,138 @@
1
+ """Corpus readers — txt, csv, parquet, in-memory DataFrame."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ import pandas as pd
9
+
10
+ from ..corpus import Corpus
11
+ from ..tokenize import RegexTokenizer, Tokenizer
12
+
13
+
14
+ def from_dataframe(
15
+ df: Any,
16
+ text_col: str = "text",
17
+ id_col: str | None = None,
18
+ meta_cols: tuple[str, ...] = (),
19
+ tokenizer: Tokenizer | None = None,
20
+ ) -> Corpus:
21
+ """Construct a :class:`Corpus` from an in-memory DataFrame.
22
+
23
+ Accepts either a :class:`pandas.DataFrame` or a
24
+ :class:`polars.DataFrame`. Polars input is converted to pandas
25
+ internally — the analytical layer is pandas-based, but the
26
+ constructor is symmetric so polars-native pipelines slot in
27
+ without explicit conversion.
28
+ """
29
+ if isinstance(df, pd.DataFrame):
30
+ df = df.reset_index(drop=True)
31
+ # else: Corpus.__post_init__ handles polars → pandas coercion.
32
+ return Corpus(
33
+ docs=df,
34
+ text_col=text_col,
35
+ id_col=id_col,
36
+ meta_cols=meta_cols,
37
+ tokenizer=tokenizer if tokenizer is not None else RegexTokenizer(),
38
+ )
39
+
40
+
41
+ def read_csv(
42
+ path: str | Path,
43
+ text_col: str = "text",
44
+ id_col: str | None = None,
45
+ meta_cols: tuple[str, ...] = (),
46
+ tokenizer: Tokenizer | None = None,
47
+ **read_csv_kwargs: Any,
48
+ ) -> Corpus:
49
+ """Read a CSV file into a :class:`Corpus`.
50
+
51
+ Extra keyword arguments are forwarded to :func:`pandas.read_csv`.
52
+ """
53
+ df = pd.read_csv(path, **read_csv_kwargs)
54
+ return from_dataframe(
55
+ df, text_col=text_col, id_col=id_col, meta_cols=meta_cols, tokenizer=tokenizer
56
+ )
57
+
58
+
59
+ def read_parquet(
60
+ path: str | Path,
61
+ text_col: str = "text",
62
+ id_col: str | None = None,
63
+ meta_cols: tuple[str, ...] = (),
64
+ tokenizer: Tokenizer | None = None,
65
+ use_polars: bool = False,
66
+ **read_parquet_kwargs: Any,
67
+ ) -> Corpus:
68
+ """Read a parquet file (or directory of parquet files) into a :class:`Corpus`.
69
+
70
+ Set ``use_polars=True`` to read via ``polars.read_parquet`` instead
71
+ of ``pandas.read_parquet`` — polars's parquet reader is often
72
+ several × faster on large files, particularly when only a subset of
73
+ columns is materialised. The result is converted to pandas
74
+ internally; the user-visible Corpus is identical either way.
75
+ Requires the ``polars`` extra.
76
+ """
77
+ if use_polars:
78
+ try:
79
+ import polars as pl
80
+ except ImportError as exc: # pragma: no cover
81
+ raise ImportError(
82
+ "use_polars=True requires polars. Install with: "
83
+ "pip install 'pycorpdiff[polars]'"
84
+ ) from exc
85
+ df_pl = pl.read_parquet(path, **read_parquet_kwargs)
86
+ return from_dataframe(
87
+ df_pl, text_col=text_col, id_col=id_col, meta_cols=meta_cols, tokenizer=tokenizer
88
+ )
89
+ df = pd.read_parquet(path, **read_parquet_kwargs)
90
+ return from_dataframe(
91
+ df, text_col=text_col, id_col=id_col, meta_cols=meta_cols, tokenizer=tokenizer
92
+ )
93
+
94
+
95
+ def read_txt(
96
+ path: str | Path,
97
+ encoding: str = "utf-8",
98
+ one_doc_per: str = "file",
99
+ tokenizer: Tokenizer | None = None,
100
+ ) -> Corpus:
101
+ """Read a single text file into a :class:`Corpus`.
102
+
103
+ Parameters
104
+ ----------
105
+ path
106
+ Path to a UTF-8 text file (override via ``encoding``).
107
+ one_doc_per
108
+ ``"file"`` treats the whole file as one document. ``"line"``
109
+ treats each non-empty line as its own document — useful for
110
+ per-line corpora like JSONL exports already projected to text,
111
+ or one-utterance-per-line transcripts.
112
+ tokenizer
113
+ Optional :class:`Tokenizer`. Defaults to :class:`RegexTokenizer`.
114
+
115
+ Returns
116
+ -------
117
+ Corpus
118
+ Has columns ``text``, ``source`` (the path), and — when
119
+ ``one_doc_per="line"`` — an integer ``line`` column with the
120
+ 1-based line number so KWIC results can point back at the
121
+ original file.
122
+ """
123
+ if one_doc_per not in ("file", "line"):
124
+ raise ValueError(
125
+ f"one_doc_per must be 'file' or 'line'; got {one_doc_per!r}"
126
+ )
127
+ text = Path(path).read_text(encoding=encoding)
128
+ if one_doc_per == "file":
129
+ df = pd.DataFrame({"text": [text], "source": [str(path)]})
130
+ else:
131
+ lines = text.splitlines()
132
+ rows = [
133
+ {"text": line, "source": str(path), "line": i + 1}
134
+ for i, line in enumerate(lines)
135
+ if line.strip()
136
+ ]
137
+ df = pd.DataFrame(rows, columns=["text", "source", "line"])
138
+ return from_dataframe(df, text_col="text", tokenizer=tokenizer)
@@ -0,0 +1,26 @@
1
+ """Keyness measures — Dunning log-likelihood, LogRatio, Bayes factor."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from .bayes import bayes_factor
6
+ from .chi_squared import chi_squared
7
+ from .correction import benjamini_hochberg, bonferroni
8
+ from .dispersion import dispersion_dp, juilland_d
9
+ from .effect_sizes import log_ratio, percent_diff
10
+ from .loglikelihood import log_likelihood
11
+ from .multicorpus import keyness_multi
12
+ from .permutation import permutation_pvalues
13
+
14
+ __all__ = [
15
+ "bayes_factor",
16
+ "benjamini_hochberg",
17
+ "bonferroni",
18
+ "chi_squared",
19
+ "dispersion_dp",
20
+ "juilland_d",
21
+ "keyness_multi",
22
+ "log_likelihood",
23
+ "log_ratio",
24
+ "percent_diff",
25
+ "permutation_pvalues",
26
+ ]
@@ -0,0 +1,50 @@
1
+ """Bayes factor keyness, BIC-based approximation.
2
+
3
+ References
4
+ ----------
5
+ Wilson, A. (2013). Embracing Bayes factors for key item analysis in
6
+ corpus linguistics. In *New Approaches to the Study of Linguistic
7
+ Variability* (pp. 3-11).
8
+
9
+ Kass, R. E., & Raftery, A. E. (1995). Bayes factors. *Journal of the
10
+ American Statistical Association*, 90(430), 773-795.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import numpy as np
16
+ import pandas as pd
17
+
18
+ from .loglikelihood import log_likelihood
19
+
20
+
21
def bayes_factor(
    counts_a: pd.Series,
    counts_b: pd.Series,
    total_a: int,
    total_b: int,
) -> pd.Series:
    """BIC-approximated Bayes factor for each term's frequency difference.

    Uses Wilson's BIC approximation: ``BIC = |G²| - ln(N)``, where ``N`` is
    ``total_a + total_b`` and ``|G²|`` is the absolute value of the ``g2``
    column returned by :func:`log_likelihood`. The returned Bayes factor is
    ``exp(BIC / 2)``, i.e. ``BIC = 2·ln(BF)``.

    Interpretation (Kass & Raftery 1995). Their well-known 2 / 6 / 10
    cut-offs are stated on the ``2·ln(BF)`` scale, which is ``BIC`` here,
    not on the returned value. On the scale this function returns:

    - ``BF`` 1 to 3 (``BIC`` 0 to 2): not worth more than a bare mention
    - ``BF`` 3 to 20 (``BIC`` 2 to 6): positive evidence
    - ``BF`` 20 to 150 (``BIC`` 6 to 10): strong evidence
    - ``BF > 150`` (``BIC > 10``): very strong evidence

    ``BF < 1`` (negative ``BIC``) favours the null of no difference.

    Very large values overflow float64 and surface as ``inf``; the overflow
    warning is suppressed deliberately.

    Parameters
    ----------
    counts_a, counts_b
        Term-frequency series for the two corpora.
    total_a, total_b
        Token totals of the two corpora.

    Returns
    -------
    pandas.Series
        Named ``"bayes_factor"``, indexed by the union of the two input
        indexes.
    """
    terms = counts_a.index.union(counts_b.index)
    ll_table = log_likelihood(counts_a, counts_b, total_a, total_b)
    g2_abs = ll_table["g2"].abs()
    bic = g2_abs - np.log(total_a + total_b)
    with np.errstate(over="ignore"):
        bf = np.exp(bic / 2.0)
    # ``bf`` is a Series; passing ``index=terms`` aligns it to the term union.
    return pd.Series(bf, index=terms, name="bayes_factor")
@@ -0,0 +1,94 @@
1
+ """Pearson's χ² keyness statistic.
2
+
3
+ The historical alternative to Dunning's G² for 2×2 corpus-comparison
4
+ contingency tables. Both are asymptotically χ²(1)-distributed under
5
+ the null of identical relative frequencies; G² is more robust to
6
+ small expected counts (Dunning 1993), which is why pycorpdiff defaults
7
+ to it. χ² is exposed here for the *test* of the equivalence and for
8
+ researchers replicating older keyness-via-chi-squared studies.
9
+
10
+ Reference
11
+ ---------
12
+ Pearson, K. (1900). On the criterion that a given system of deviations
13
+ from the probable in the case of a correlated system of variables is
14
+ such that it can be reasonably supposed to have arisen from random
15
+ sampling. *Philosophical Magazine*, 50(302), 157–175.
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import numpy as np
21
+ import pandas as pd
22
+ from scipy.stats import chi2
23
+
24
+
25
def chi_squared(
    counts_a: pd.Series,
    counts_b: pd.Series,
    total_a: int,
    total_b: int,
) -> pd.DataFrame:
    """Pearson χ² keyness for every term in ``counts_a ∪ counts_b``.

    Each term is scored on the 2×2 table (term / non-term) × (A / B) using
    the closed form ``χ² = (ad − bc)² · N / ((a+b)(c+d)(a+c)(b+d))``. The
    reported statistic carries a sign: positive when the term's rate in A
    is at least its rate in B, negative otherwise. The *p*-value is the
    χ²(1) survival function of the unsigned statistic.

    No min-count filtering is applied here.

    Parameters
    ----------
    counts_a, counts_b
        Term-frequency series; a term absent from one series counts as 0.
    total_a, total_b
        Corpus token totals; both must be positive.

    Returns
    -------
    pandas.DataFrame
        Indexed by term, with columns ``count_a``, ``count_b``,
        ``expected_a``, ``expected_b``, ``chi_squared`` (signed) and
        ``p_value``.

    Raises
    ------
    ValueError
        If either total is zero or negative.
    """
    if total_a <= 0 or total_b <= 0:
        raise ValueError(f"total_a and total_b must be positive; got {total_a}, {total_b}")

    vocab = counts_a.index.union(counts_b.index)
    size_a = float(total_a)
    size_b = float(total_b)
    grand_total = float(total_a + total_b)

    # Work in float64 from the start: (ad − bc)² · N exceeds the int64 range
    # for quite ordinary corpus sizes.
    hits_a = counts_a.reindex(vocab, fill_value=0).astype(np.float64).to_numpy()
    hits_b = counts_b.reindex(vocab, fill_value=0).astype(np.float64).to_numpy()
    hits_both = hits_a + hits_b

    # Tokens in each corpus that are NOT the term (cells c and d).
    misses_a = size_a - hits_a
    misses_b = size_b - hits_b

    cross = hits_a * misses_b - hits_b * misses_a
    top = cross ** 2 * grand_total
    bottom = hits_both * (misses_a + misses_b) * size_a * size_b
    with np.errstate(divide="ignore", invalid="ignore"):
        # A zero margin makes the statistic undefined; report 0 instead.
        magnitude = np.where(bottom > 0, top / bottom, 0.0)
    magnitude = np.maximum(magnitude, 0.0)

    overused_in_a = (hits_a / total_a) >= (hits_b / total_b)
    direction = np.where(overused_in_a, 1.0, -1.0)

    columns = {
        "count_a": hits_a,
        "count_b": hits_b,
        "expected_a": size_a * hits_both / grand_total,
        "expected_b": size_b * hits_both / grand_total,
        "chi_squared": direction * magnitude,
        "p_value": chi2.sf(magnitude, df=1),
    }
    return pd.DataFrame(columns, index=vocab)
@@ -0,0 +1,34 @@
1
+ """Multiple-comparison correction for keyness *p*-value vectors."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import numpy as np
6
+ import numpy.typing as npt
7
+
8
+
9
def benjamini_hochberg(pvals: npt.NDArray[np.float64]) -> npt.NDArray[np.float64]:
    """Return Benjamini–Hochberg–adjusted *p*-values.

    The non-NaN inputs are ranked in ascending order; the adjusted value at
    rank ``k`` is the minimum of ``p_(j) * n / j`` over all ranks ``j >= k``,
    clipped to ``[0, 1]``. Input order (and shape) is preserved.

    NaN handling: NaN entries stay NaN in the output and are excluded from
    the ranking, so one undefined *p*-value no longer turns the entire
    result into NaN. ``n`` remains the full input size, so NaN entries still
    count as comparisons (the conservative choice, and the same ``n`` that
    :func:`bonferroni` uses).

    Parameters
    ----------
    pvals
        Array-like of *p*-values; coerced to float64.

    Returns
    -------
    numpy.ndarray
        Adjusted *p*-values, same shape as the coerced input.
    """
    pvals = np.asarray(pvals, dtype=np.float64)
    n = pvals.size
    if n == 0:
        return pvals

    observed = ~np.isnan(pvals)
    out = np.full(pvals.shape, np.nan, dtype=np.float64)
    finite = pvals[observed]
    if finite.size == 0:
        return out

    order = np.argsort(finite)
    ranks = np.arange(1, finite.size + 1)
    raw = finite[order] * n / ranks
    # Cumulative minimum from the right enforces monotonicity.
    monotone = np.minimum.accumulate(raw[::-1])[::-1]
    monotone = np.clip(monotone, 0.0, 1.0)

    adjusted = np.empty(finite.size, dtype=np.float64)
    adjusted[order] = monotone
    out[observed] = adjusted
    return out
29
+
30
+
31
def bonferroni(pvals: npt.NDArray[np.float64]) -> npt.NDArray[np.float64]:
    """Bonferroni correction: each *p* times the number of inputs, capped to ``[0, 1]``.

    The input is coerced to a float64 array; the multiplier is that array's
    total element count.
    """
    values = np.asarray(pvals, dtype=np.float64)
    n_comparisons = values.size
    scaled = values * n_comparisons
    return np.clip(scaled, 0.0, 1.0)
@@ -0,0 +1,89 @@
1
+ """Dispersion measures for corpus-comparison sanity checks.
2
+
3
+ A term can be "key" (significant + large effect) simply because one
4
+ document overuses it. Reporting dispersion alongside keyness lets the
5
+ caller filter out these spurious findings.
6
+
7
+ References
8
+ ----------
9
+ Juilland, A., & Chang-Rodríguez, E. (1964). *Frequency Dictionary of
10
+ Spanish Words*. Mouton.
11
+
12
+ Gries, S. Th. (2008). Dispersions and adjusted frequencies in corpora.
13
+ *International Journal of Corpus Linguistics*, 13(4), 403-437.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import numpy as np
19
+ import pandas as pd
20
+
21
+
22
def juilland_d(doc_term_matrix: pd.DataFrame) -> pd.Series:
    """Juilland's D per term: a dispersion score where higher means more even.

    Every row of ``doc_term_matrix`` is treated as one equally weighted
    part. For each term, the per-row relative frequencies (count divided by
    the row total) are summarised by their coefficient of variation ``CV``
    (population standard deviation over mean), and
    ``D = 1 - CV / sqrt(k - 1)`` with ``k`` the number of rows.

    Edge cases:

    - ``k <= 1``: the formula is undefined; every term gets NaN.
    - rows whose total is zero contribute a rate of 0 for every term.
    - a term whose mean rate is 0 gets ``D = 0``.

    Returns
    -------
    pandas.Series
        Named ``"juilland_d"``, indexed by the matrix columns.
    """
    terms = doc_term_matrix.columns
    n_parts = len(doc_term_matrix)
    if n_parts <= 1:
        return pd.Series(np.nan, index=terms, name="juilland_d")

    freq = doc_term_matrix.to_numpy(dtype=float)  # (k, V)
    part_sizes = freq.sum(axis=1)  # (k,)
    has_tokens = part_sizes > 0

    # Divide by 1 instead of 0 for empty rows, then force their rates to 0.
    divisors = np.where(has_tokens, part_sizes, 1.0)
    rel = freq / divisors[:, None]
    rel = np.where(has_tokens[:, None], rel, 0.0)

    mu = rel.mean(axis=0)
    sigma = rel.std(axis=0, ddof=0)
    occurs = mu > 0
    with np.errstate(divide="ignore", invalid="ignore"):
        variation = np.where(occurs, sigma / mu, 0.0)
    scores = np.where(occurs, 1.0 - variation / np.sqrt(n_parts - 1), 0.0)
    return pd.Series(scores, index=terms, name="juilland_d")
56
+
57
+
58
def dispersion_dp(doc_term_matrix: pd.DataFrame) -> pd.Series:
    """Gries's DP (Deviation of Proportions) per term; lower is more even.

    For each document ``i`` with size ``s_i`` (its row total) and target-term
    count ``c_i``, let ``expected_i = s_i / S`` (the document's share of the
    corpus) and ``observed_i = c_i / C`` (the document's share of the
    target's occurrences). Then ``DP = 0.5 * Σ |observed_i - expected_i|``.

    DP = 0 means perfectly even spread; DP near 1 means total
    concentration. This is the unnormalised form; the normalised variant
    ``DPnorm = DP / (1 - min(expected_i))`` is a one-line transformation.

    Edge cases:

    - no rows, or all rows empty (``S == 0``): DP is undefined, so every
      term gets NaN. Both cases return a Series indexed by the matrix
      columns so the result aligns with other per-term outputs.
    - a term that never occurs (``C == 0``): its observed shares are set to
      the expected shares, giving ``DP = 0``.

    Returns
    -------
    pandas.Series
        Named ``"dispersion_dp"``, indexed by the matrix columns.
    """
    terms = doc_term_matrix.columns
    if len(doc_term_matrix) == 0:
        # Keep the term index (previously an index-less empty Series).
        return pd.Series(np.nan, index=terms, dtype=float, name="dispersion_dp")

    doc_sizes = doc_term_matrix.sum(axis=1).astype(float).to_numpy()
    s_total = doc_sizes.sum()
    if s_total == 0:
        return pd.Series(np.nan, index=terms, name="dispersion_dp")
    expected = doc_sizes / s_total  # shape (k,)

    term_totals = doc_term_matrix.sum(axis=0).astype(float).to_numpy()  # shape (V,)
    counts = doc_term_matrix.astype(float).to_numpy()  # shape (k, V)

    with np.errstate(divide="ignore", invalid="ignore"):
        observed = counts / term_totals  # shape (k, V), broadcasts over rows
    # Zero-total terms give NaN/Inf above; substitute the expected shares so
    # that |observed - expected| is 0 for them.
    observed = np.where(term_totals > 0, observed, expected[:, None])
    dp = 0.5 * np.abs(observed - expected[:, None]).sum(axis=0)
    return pd.Series(dp, index=terms, name="dispersion_dp")