pycorpdiff 0.1.0a0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pycorpdiff/__init__.py +126 -0
- pycorpdiff/_backends/__init__.py +3 -0
- pycorpdiff/_backends/pandas.py +3 -0
- pycorpdiff/_backends/polars.py +3 -0
- pycorpdiff/collocation/__init__.py +19 -0
- pycorpdiff/collocation/cooccurrence.py +65 -0
- pycorpdiff/collocation/measures.py +102 -0
- pycorpdiff/collocation/network.py +233 -0
- pycorpdiff/collocation/shift.py +146 -0
- pycorpdiff/compare.py +345 -0
- pycorpdiff/corpus.py +411 -0
- pycorpdiff/datasets/__init__.py +27 -0
- pycorpdiff/datasets/_data/hansard_sample.parquet +0 -0
- pycorpdiff/datasets/_generate_hansard.py +221 -0
- pycorpdiff/datasets/hansard.py +235 -0
- pycorpdiff/datasets/histwords.py +221 -0
- pycorpdiff/explain.py +177 -0
- pycorpdiff/io/__init__.py +16 -0
- pycorpdiff/io/duckdb.py +92 -0
- pycorpdiff/io/huggingface.py +142 -0
- pycorpdiff/io/readers.py +138 -0
- pycorpdiff/keyness/__init__.py +26 -0
- pycorpdiff/keyness/bayes.py +50 -0
- pycorpdiff/keyness/chi_squared.py +94 -0
- pycorpdiff/keyness/correction.py +34 -0
- pycorpdiff/keyness/dispersion.py +89 -0
- pycorpdiff/keyness/effect_sizes.py +65 -0
- pycorpdiff/keyness/loglikelihood.py +92 -0
- pycorpdiff/keyness/multicorpus.py +143 -0
- pycorpdiff/keyness/permutation.py +154 -0
- pycorpdiff/py.typed +0 -0
- pycorpdiff/results.py +635 -0
- pycorpdiff/semantic/__init__.py +18 -0
- pycorpdiff/semantic/alignment.py +53 -0
- pycorpdiff/semantic/embed.py +84 -0
- pycorpdiff/semantic/shift.py +224 -0
- pycorpdiff/semantic/trajectory.py +166 -0
- pycorpdiff/stats.py +69 -0
- pycorpdiff/temporal/__init__.py +15 -0
- pycorpdiff/temporal/bocpd.py +233 -0
- pycorpdiff/temporal/causal_impact.py +293 -0
- pycorpdiff/temporal/changepoint.py +92 -0
- pycorpdiff/temporal/forecast.py +405 -0
- pycorpdiff/temporal/its.py +123 -0
- pycorpdiff/temporal/slicing.py +174 -0
- pycorpdiff/tokenize.py +110 -0
- pycorpdiff/viz/__init__.py +37 -0
- pycorpdiff/viz/bocpd.py +173 -0
- pycorpdiff/viz/causal_impact.py +142 -0
- pycorpdiff/viz/collocation.py +48 -0
- pycorpdiff/viz/dispersion.py +117 -0
- pycorpdiff/viz/forecast.py +129 -0
- pycorpdiff/viz/keyness.py +96 -0
- pycorpdiff/viz/network.py +186 -0
- pycorpdiff/viz/scattertext.py +160 -0
- pycorpdiff/viz/semantic_forecast.py +114 -0
- pycorpdiff/viz/trajectory.py +48 -0
- pycorpdiff-0.1.0a0.dist-info/METADATA +230 -0
- pycorpdiff-0.1.0a0.dist-info/RECORD +61 -0
- pycorpdiff-0.1.0a0.dist-info/WHEEL +4 -0
- pycorpdiff-0.1.0a0.dist-info/licenses/LICENSE +21 -0
pycorpdiff/io/duckdb.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
"""Out-of-core corpus querying via DuckDB.
|
|
2
|
+
|
|
3
|
+
DuckDB is in the optional ``duckdb`` extra. The reader is a thin
|
|
4
|
+
shim that runs a SQL query and projects the result into a pandas
|
|
5
|
+
DataFrame — DuckDB handles the heavy lifting (out-of-core scans of
|
|
6
|
+
parquet, CSV, Arrow tables, SQLite, S3-hosted files) before the data
|
|
7
|
+
ever touches pandas.
|
|
8
|
+
|
|
9
|
+
Use this when your corpus is too large to fit in pandas comfortably
|
|
10
|
+
but small enough that the rows you actually need fit after filtering.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
from typing import Any
|
|
16
|
+
|
|
17
|
+
from ..corpus import Corpus
|
|
18
|
+
from ..tokenize import Tokenizer
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def read_duckdb(
    connection: Any,
    query: str,
    text_col: str = "text",
    id_col: str | None = None,
    meta_cols: tuple[str, ...] = (),
    tokenizer: Tokenizer | None = None,
    params: list[Any] | dict[str, Any] | None = None,
) -> Corpus:
    """Execute ``query`` on a DuckDB connection and build a :class:`Corpus`.

    The query result is materialised as a pandas DataFrame via the
    cursor's ``.df()`` method and handed to :func:`from_dataframe`.

    Parameters
    ----------
    connection
        An object exposing ``execute(query[, params])`` whose return
        value has a ``.df()`` method — in practice the
        :class:`duckdb.DuckDBPyConnection` returned by
        ``duckdb.connect(...)``.
    query
        SQL text. The result set must contain a column named
        ``text_col``; the intent is one result row per document.
    text_col
        Column holding the document text. Default: ``"text"``.
    id_col
        Optional document-id column, forwarded to :func:`from_dataframe`.
    meta_cols
        Metadata column names, forwarded to :func:`from_dataframe`.
    tokenizer
        Optional :class:`Tokenizer`, forwarded to :func:`from_dataframe`.
    params
        Optional positional or named SQL parameters. When ``None`` the
        connection's ``execute`` is called with the query alone.

    Returns
    -------
    Corpus
        Built from the DataFrame produced by the query.

    Raises
    ------
    ImportError
        If the ``duckdb`` package cannot be imported.
    ValueError
        If ``text_col`` is not among the result columns.

    Examples
    --------
    >>> import duckdb, pycorpdiff as pcd
    >>> con = duckdb.connect()
    >>> corpus = pcd.read_duckdb(  # doctest: +SKIP
    ...     con,
    ...     "SELECT body AS text, outlet, year FROM read_parquet('news/*.parquet') "
    ...     "WHERE year >= 2020",
    ... )
    """
    # Import purely as an availability probe so a missing optional extra
    # produces an actionable message instead of a late AttributeError.
    try:
        import duckdb  # noqa: F401
    except ImportError as exc:  # pragma: no cover
        raise ImportError(
            "read_duckdb requires duckdb. Install with: pip install 'pycorpdiff[duckdb]'"
        ) from exc

    # Only pass ``params`` through when the caller supplied them.
    execute_args = (query,) if params is None else (query, params)
    frame = connection.execute(*execute_args).df()

    available = frame.columns
    if text_col not in available:
        raise ValueError(
            f"text_col={text_col!r} not found in query result columns "
            f"{list(available)!r}"
        )

    from .readers import from_dataframe

    return from_dataframe(
        frame,
        text_col=text_col,
        id_col=id_col,
        meta_cols=meta_cols,
        tokenizer=tokenizer,
    )
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
"""HuggingFace Datasets loader.
|
|
2
|
+
|
|
3
|
+
The dominant modern source of public text corpora — Hansard mirrors,
|
|
4
|
+
news datasets, social-media archives, academic corpora — is the
|
|
5
|
+
HuggingFace `datasets` hub. This module wraps `datasets.load_dataset`
|
|
6
|
+
in a thin shim that converts the result to a :class:`pycorpdiff.Corpus`.
|
|
7
|
+
|
|
8
|
+
``datasets`` is heavy (pulls ``pyarrow``, ``fsspec``, ``requests``,
|
|
9
|
+
``aiohttp``), so it lives in the optional ``huggingface`` extra:
|
|
10
|
+
|
|
11
|
+
pip install 'pycorpdiff[huggingface]'
|
|
12
|
+
|
|
13
|
+
Then::
|
|
14
|
+
|
|
15
|
+
corpus = pcd.from_huggingface(
|
|
16
|
+
"stanfordnlp/imdb", split="train",
|
|
17
|
+
text_col="text", meta_cols=("label",),
|
|
18
|
+
)
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
from collections.abc import Callable
|
|
24
|
+
from typing import Any
|
|
25
|
+
|
|
26
|
+
import pandas as pd
|
|
27
|
+
|
|
28
|
+
from ..corpus import Corpus
|
|
29
|
+
from ..io.readers import from_dataframe
|
|
30
|
+
from ..tokenize import Tokenizer
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def from_huggingface(
    dataset_id: str,
    *,
    split: str = "train",
    text_col: str = "text",
    id_col: str | None = None,
    meta_cols: tuple[str, ...] = (),
    tokenizer: Tokenizer | None = None,
    config_name: str | None = None,
    columns: list[str] | None = None,
    n_rows: int | None = None,
    _loader: Callable[..., Any] | None = None,
    **load_dataset_kwargs: Any,
) -> Corpus:
    """Load a HuggingFace dataset and wrap it as a :class:`Corpus`.

    Parameters
    ----------
    dataset_id
        The hub identifier — e.g. ``"stanfordnlp/imdb"``,
        ``"openwebtext"``, ``"wikitext"``, or any private repo path
        you have access to.
    split
        Which split to materialise — ``"train"`` (default),
        ``"test"``, ``"validation"``, or a slice expression like
        ``"train[:1000]"``.
    text_col
        Name of the column carrying document text in the dataset.
    id_col
        Optional unique-document-id column.
    meta_cols
        Tuple of metadata column names, forwarded to
        :func:`from_dataframe`.
    tokenizer
        Optional :class:`Tokenizer`.
    config_name
        Passed to the loader as ``name`` (HuggingFace's parameter for
        multi-config datasets, e.g. ``"wikitext-103-v1"``).
    columns
        Keep only these columns of the loaded table. The projection is
        applied *after* the dataset has been converted to pandas.
        ``text_col``, ``id_col`` and every entry of ``meta_cols`` are
        always retained so that an explicit column request is never
        dropped by the projection; names absent from the dataset are
        ignored.
    n_rows
        Materialise only the first ``n_rows`` documents. Implemented by
        appending ``[:n_rows]`` to ``split``, so ``split`` should not
        already carry a slice expression when this is set.
    _loader
        Internal hook for unit tests; substitutes
        :func:`datasets.load_dataset`.
    **load_dataset_kwargs
        Anything else gets forwarded to the loader.

    Returns
    -------
    Corpus
        Built by :func:`from_dataframe` from the loaded table.

    Raises
    ------
    ImportError
        If ``_loader`` is not given and ``datasets`` is not installed.
    ValueError
        If ``text_col`` is not a column of the loaded table.

    Examples
    --------
    >>> import pycorpdiff as pcd
    >>> corpus = pcd.from_huggingface(  # doctest: +SKIP
    ...     "stanfordnlp/imdb", split="train[:1000]",
    ...     text_col="text", meta_cols=("label",),
    ... )
    >>> pos = corpus.slice(label=1); neg = corpus.slice(label=0)  # doctest: +SKIP
    >>> pcd.compare(pos, neg).keyness().plot()  # doctest: +SKIP
    """
    loader = _loader
    if loader is None:
        try:
            from datasets import load_dataset as _hf_load  # type: ignore[import-not-found]
        except ImportError as exc:  # pragma: no cover
            raise ImportError(
                "from_huggingface requires the `datasets` library. "
                "Install with: pip install 'pycorpdiff[huggingface]'"
            ) from exc
        loader = _hf_load

    effective_split = split if n_rows is None else f"{split}[:{int(n_rows)}]"

    ds = loader(
        dataset_id,
        name=config_name,
        split=effective_split,
        **load_dataset_kwargs,
    )

    # Prefer the object's own ``to_pandas`` conversion; accept a ready-made
    # DataFrame (as a test loader may return); otherwise iterate as dicts.
    if hasattr(ds, "to_pandas"):
        df = ds.to_pandas()
    elif isinstance(ds, pd.DataFrame):
        df = ds
    else:
        df = pd.DataFrame(list(ds))

    if columns is not None:
        # The projection must never discard columns the caller explicitly
        # asked for through text_col / id_col / meta_cols.
        requested = [text_col]
        if id_col is not None:
            requested.append(id_col)
        requested.extend(meta_cols)
        requested.extend(columns)
        keep = list(dict.fromkeys(requested))  # de-duplicate, keep order
        df = df[[c for c in keep if c in df.columns]]

    if text_col not in df.columns:
        raise ValueError(
            f"text_col={text_col!r} not found in dataset columns "
            f"{list(df.columns)!r}"
        )

    return from_dataframe(
        df,
        text_col=text_col,
        id_col=id_col,
        meta_cols=meta_cols,
        tokenizer=tokenizer,
    )
|
pycorpdiff/io/readers.py
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
"""Corpus readers — txt, csv, parquet, in-memory DataFrame."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
import pandas as pd
|
|
9
|
+
|
|
10
|
+
from ..corpus import Corpus
|
|
11
|
+
from ..tokenize import RegexTokenizer, Tokenizer
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def from_dataframe(
    df: Any,
    text_col: str = "text",
    id_col: str | None = None,
    meta_cols: tuple[str, ...] = (),
    tokenizer: Tokenizer | None = None,
) -> Corpus:
    """Build a :class:`Corpus` from an in-memory table.

    A :class:`pandas.DataFrame` gets a fresh 0..n-1 index (the old index
    is dropped). Any other object — e.g. a :class:`polars.DataFrame` —
    is passed to :class:`Corpus` untouched, which is expected to coerce
    it to pandas itself.

    When ``tokenizer`` is ``None`` a new :class:`RegexTokenizer` is used.
    """
    # Non-pandas input is left for Corpus.__post_init__ to coerce.
    docs = df.reset_index(drop=True) if isinstance(df, pd.DataFrame) else df
    chosen_tokenizer = RegexTokenizer() if tokenizer is None else tokenizer
    return Corpus(
        docs=docs,
        text_col=text_col,
        id_col=id_col,
        meta_cols=meta_cols,
        tokenizer=chosen_tokenizer,
    )
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def read_csv(
    path: str | Path,
    text_col: str = "text",
    id_col: str | None = None,
    meta_cols: tuple[str, ...] = (),
    tokenizer: Tokenizer | None = None,
    **read_csv_kwargs: Any,
) -> Corpus:
    """Load a CSV file with pandas and wrap it as a :class:`Corpus`.

    ``path`` and any extra keyword arguments go straight to
    :func:`pandas.read_csv`; the remaining parameters are forwarded to
    :func:`from_dataframe`.
    """
    frame = pd.read_csv(path, **read_csv_kwargs)
    corpus_options = {
        "text_col": text_col,
        "id_col": id_col,
        "meta_cols": meta_cols,
        "tokenizer": tokenizer,
    }
    return from_dataframe(frame, **corpus_options)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def read_parquet(
    path: str | Path,
    text_col: str = "text",
    id_col: str | None = None,
    meta_cols: tuple[str, ...] = (),
    tokenizer: Tokenizer | None = None,
    use_polars: bool = False,
    **read_parquet_kwargs: Any,
) -> Corpus:
    """Load parquet data and wrap it as a :class:`Corpus`.

    By default the file is read with :func:`pandas.read_parquet`. With
    ``use_polars=True`` it is read with ``polars.read_parquet`` instead
    (requires the ``polars`` extra) and the polars frame is handed to
    :func:`from_dataframe` as-is. Extra keyword arguments are forwarded
    to whichever reader is used.

    Raises
    ------
    ImportError
        If ``use_polars=True`` and polars is not installed.
    """
    if not use_polars:
        frame = pd.read_parquet(path, **read_parquet_kwargs)
    else:
        try:
            import polars as pl
        except ImportError as exc:  # pragma: no cover
            raise ImportError(
                "use_polars=True requires polars. Install with: "
                "pip install 'pycorpdiff[polars]'"
            ) from exc
        frame = pl.read_parquet(path, **read_parquet_kwargs)

    # Single exit: both readers feed the same constructor.
    return from_dataframe(
        frame,
        text_col=text_col,
        id_col=id_col,
        meta_cols=meta_cols,
        tokenizer=tokenizer,
    )
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def read_txt(
    path: str | Path,
    encoding: str = "utf-8",
    one_doc_per: str = "file",
    tokenizer: Tokenizer | None = None,
) -> Corpus:
    """Turn one text file into a :class:`Corpus`.

    Parameters
    ----------
    path
        File to read; decoded with ``encoding`` (UTF-8 by default).
    one_doc_per
        ``"file"`` yields a single document holding the whole file.
        ``"line"`` yields one document per line, skipping lines that
        are empty or whitespace-only.
    tokenizer
        Optional :class:`Tokenizer`, forwarded to :func:`from_dataframe`.

    Returns
    -------
    Corpus
        Backed by a frame with columns ``text`` and ``source``
        (``str(path)``). In ``"line"`` mode there is also a ``line``
        column holding the 1-based line number in the original file
        (skipped blank lines still advance the counter).

    Raises
    ------
    ValueError
        If ``one_doc_per`` is neither ``"file"`` nor ``"line"``.
    """
    if one_doc_per not in ("file", "line"):
        raise ValueError(
            f"one_doc_per must be 'file' or 'line'; got {one_doc_per!r}"
        )

    contents = Path(path).read_text(encoding=encoding)
    source = str(path)

    if one_doc_per == "line":
        records = []
        # start=1 so the stored number matches what an editor would show.
        for line_no, line in enumerate(contents.splitlines(), start=1):
            if not line.strip():
                continue
            records.append({"text": line, "source": source, "line": line_no})
        frame = pd.DataFrame(records, columns=["text", "source", "line"])
    else:
        frame = pd.DataFrame({"text": [contents], "source": [source]})

    return from_dataframe(frame, text_col="text", tokenizer=tokenizer)
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""Keyness measures — Dunning log-likelihood, LogRatio, Bayes factor."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from .bayes import bayes_factor
|
|
6
|
+
from .chi_squared import chi_squared
|
|
7
|
+
from .correction import benjamini_hochberg, bonferroni
|
|
8
|
+
from .dispersion import dispersion_dp, juilland_d
|
|
9
|
+
from .effect_sizes import log_ratio, percent_diff
|
|
10
|
+
from .loglikelihood import log_likelihood
|
|
11
|
+
from .multicorpus import keyness_multi
|
|
12
|
+
from .permutation import permutation_pvalues
|
|
13
|
+
|
|
14
|
+
__all__ = [
|
|
15
|
+
"bayes_factor",
|
|
16
|
+
"benjamini_hochberg",
|
|
17
|
+
"bonferroni",
|
|
18
|
+
"chi_squared",
|
|
19
|
+
"dispersion_dp",
|
|
20
|
+
"juilland_d",
|
|
21
|
+
"keyness_multi",
|
|
22
|
+
"log_likelihood",
|
|
23
|
+
"log_ratio",
|
|
24
|
+
"percent_diff",
|
|
25
|
+
"permutation_pvalues",
|
|
26
|
+
]
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""Bayes factor keyness, BIC-based approximation.
|
|
2
|
+
|
|
3
|
+
References
|
|
4
|
+
----------
|
|
5
|
+
Wilson, A. (2013). Embracing Bayes factors for key item analysis in
|
|
6
|
+
corpus linguistics. In *New Approaches to the Study of Linguistic
|
|
7
|
+
Variability* (pp. 3-11).
|
|
8
|
+
|
|
9
|
+
Kass, R. E., & Raftery, A. E. (1995). Bayes factors. *Journal of the
|
|
10
|
+
American Statistical Association*, 90(430), 773-795.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import numpy as np
|
|
16
|
+
import pandas as pd
|
|
17
|
+
|
|
18
|
+
from .loglikelihood import log_likelihood
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def bayes_factor(
    counts_a: pd.Series,
    counts_b: pd.Series,
    total_a: int,
    total_b: int,
) -> pd.Series:
    """BIC-approximated Bayes factor for each term's frequency difference.

    Uses Wilson's BIC approximation: ``BIC = |G²| - ln(N)`` where ``N``
    is the total tokens across both corpora (``total_a + total_b``) and
    ``G²`` is the ``"g2"`` column returned by :func:`log_likelihood`,
    taken in absolute value. The returned Bayes factor is
    ``exp(BIC / 2)``, i.e. ``BIC = 2 * ln(BF)``.

    Interpretation (Kass & Raftery 1995). Their evidence categories are
    stated on the ``2 * ln(BF)`` scale — which is the BIC value here —
    so the cut-offs for the *returned* BF are the exponentiated ones:

    ==============  ===============  ==================================
    ``BIC``         ``BF``           Evidence against the null
    ==============  ===============  ==================================
    0 to 2          1 to 3           not worth more than a bare mention
    2 to 6          3 to 20          positive
    6 to 10         20 to 150        strong
    > 10            > 150            very strong
    ==============  ===============  ==================================

    ``BF < 1`` (negative BIC) means the data favour the null of equal
    relative frequency.

    Very large BF values overflow float64 and surface as ``inf``; the
    overflow warning is deliberately suppressed because ``inf`` is the
    intended "essentially conclusive" result.

    Parameters
    ----------
    counts_a, counts_b
        Term-frequency series indexed by term.
    total_a, total_b
        Corpus token totals.

    Returns
    -------
    pandas.Series
        Named ``"bayes_factor"`` and indexed by the union of the two
        input indexes.
    """
    terms = counts_a.index.union(counts_b.index)
    ll_table = log_likelihood(counts_a, counts_b, total_a, total_b)
    g2_abs = ll_table["g2"].abs()
    bic = g2_abs - np.log(total_a + total_b)
    # exp() of a large BIC overflows to inf by design; silence the warning.
    with np.errstate(over="ignore"):
        bf = np.exp(bic / 2.0)
    # Re-align onto the union index so the result always covers every term.
    return pd.Series(bf, index=terms, name="bayes_factor")
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
"""Pearson's χ² keyness statistic.
|
|
2
|
+
|
|
3
|
+
The historical alternative to Dunning's G² for 2×2 corpus-comparison
|
|
4
|
+
contingency tables. Both are asymptotically χ²(1)-distributed under
|
|
5
|
+
the null of identical relative frequencies; G² is more robust to
|
|
6
|
+
small expected counts (Dunning 1993), which is why pycorpdiff defaults
|
|
7
|
+
to it. χ² is exposed here for the *test* of the equivalence and for
|
|
8
|
+
researchers replicating older keyness-via-chi-squared studies.
|
|
9
|
+
|
|
10
|
+
Reference
|
|
11
|
+
---------
|
|
12
|
+
Pearson, K. (1900). On the criterion that a given system of deviations
|
|
13
|
+
from the probable in the case of a correlated system of variables is
|
|
14
|
+
such that it can be reasonably supposed to have arisen from random
|
|
15
|
+
sampling. *Philosophical Magazine*, 50(302), 157–175.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import numpy as np
|
|
21
|
+
import pandas as pd
|
|
22
|
+
from scipy.stats import chi2
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def chi_squared(
    counts_a: pd.Series,
    counts_b: pd.Series,
    total_a: int,
    total_b: int,
) -> pd.DataFrame:
    """Pearson χ² keyness for every term appearing in either corpus.

    Each term is scored on the 2×2 table (term / not-term) × (A / B).
    The ``chi_squared`` column carries a sign: positive when the term's
    relative frequency in A is at least its relative frequency in B,
    negative otherwise. The *p*-value is the χ²(1) survival function of
    the unsigned statistic. No minimum-count filtering is applied here.

    Parameters
    ----------
    counts_a, counts_b
        Term-frequency series indexed by term; a term missing from one
        series is treated as having count zero there.
    total_a, total_b
        Token totals of corpus A and corpus B. Both must be positive.

    Returns
    -------
    pandas.DataFrame
        Indexed by the union of terms, with columns ``count_a``,
        ``count_b``, ``expected_a``, ``expected_b``, ``chi_squared``
        (signed) and ``p_value``.

    Raises
    ------
    ValueError
        If either total is zero or negative.
    """
    if total_a <= 0 or total_b <= 0:
        raise ValueError(f"total_a and total_b must be positive; got {total_a}, {total_b}")

    vocab = counts_a.index.union(counts_b.index)

    # Work in float64 from the start: (ad − bc)² · N exceeds int64 for
    # quite ordinary corpus sizes.
    size_a = float(total_a)
    size_b = float(total_b)
    size_all = float(total_a + total_b)
    in_a = counts_a.reindex(vocab, fill_value=0).astype(np.float64).to_numpy()
    in_b = counts_b.reindex(vocab, fill_value=0).astype(np.float64).to_numpy()

    # Expected counts under the null of equal relative frequency.
    term_total = in_a + in_b
    expect_a = size_a * term_total / size_all
    expect_b = size_b * term_total / size_all

    # Closed form: χ² = N (ad − bc)² / ((a+b)(c+d)(a+c)(b+d)), with
    # c / d the non-term tokens of A / B, so a+c = N_a and b+d = N_b.
    rest_a = size_a - in_a
    rest_b = size_b - in_b
    top = (in_a * rest_b - in_b * rest_a) ** 2 * size_all
    bottom = term_total * (rest_a + rest_b) * size_a * size_b
    with np.errstate(divide="ignore", invalid="ignore"):
        statistic = np.where(bottom > 0, top / bottom, 0.0)
    statistic = np.maximum(statistic, 0.0)

    # Direction of overuse decides the sign; ties count as positive.
    direction = np.where(in_a / total_a >= in_b / total_b, 1.0, -1.0)

    columns = {
        "count_a": in_a,
        "count_b": in_b,
        "expected_a": expect_a,
        "expected_b": expect_b,
        "chi_squared": direction * statistic,
        "p_value": chi2.sf(statistic, df=1),
    }
    return pd.DataFrame(columns, index=vocab)
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""Multiple-comparison correction for keyness *p*-value vectors."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import numpy.typing as npt
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def benjamini_hochberg(pvals: npt.NDArray[np.float64]) -> npt.NDArray[np.float64]:
    """Return Benjamini–Hochberg–adjusted *p*-values.

    With the inputs sorted ascending as ``p_(1) <= ... <= p_(m)``, the
    adjusted value at rank ``k`` is the minimum of ``p_(j) * n / j`` over
    ``j >= k``, clipped to ``[0, 1]``. The output is in the same order
    as the input.

    NaN handling: NaN entries stay NaN in the output and are left out of
    the ranking, so one undefined *p*-value no longer turns every
    adjusted value into NaN. The multiplier ``n`` remains the full
    length of the input (NaNs included), which keeps results for
    NaN-free input unchanged and errs on the conservative side otherwise.

    Parameters
    ----------
    pvals
        One-dimensional array-like of *p*-values.

    Returns
    -------
    numpy.ndarray
        Float64 array of adjusted *p*-values, same length as ``pvals``.
    """
    pvals = np.asarray(pvals, dtype=np.float64)
    n = pvals.size
    if n == 0:
        return pvals

    out = np.full(n, np.nan, dtype=np.float64)
    observed_at = np.flatnonzero(~np.isnan(pvals))
    if observed_at.size == 0:
        return out

    observed = pvals[observed_at]
    order = np.argsort(observed)
    ranks = np.arange(1, observed.size + 1)
    raw = observed[order] * n / ranks
    # Cumulative minimum from the right enforces monotonicity.
    monotone = np.minimum.accumulate(raw[::-1])[::-1]
    # Scatter back to the original positions of the non-NaN entries.
    out[observed_at[order]] = np.clip(monotone, 0.0, 1.0)
    return out
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def bonferroni(pvals: npt.NDArray[np.float64]) -> npt.NDArray[np.float64]:
    """Bonferroni adjustment: multiply each *p* by the number of tests.

    The number of tests is the size of ``pvals``; results are clipped
    to the interval ``[0, 1]``.
    """
    values = np.asarray(pvals, dtype=np.float64)
    n_tests = values.size
    scaled = values * n_tests
    return np.clip(scaled, 0.0, 1.0)
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
"""Dispersion measures for corpus-comparison sanity checks.
|
|
2
|
+
|
|
3
|
+
A term can be "key" (significant + large effect) simply because one
|
|
4
|
+
document overuses it. Reporting dispersion alongside keyness lets the
|
|
5
|
+
caller filter out these spurious findings.
|
|
6
|
+
|
|
7
|
+
References
|
|
8
|
+
----------
|
|
9
|
+
Juilland, A., & Chang-Rodríguez, E. (1964). *Frequency Dictionary of
|
|
10
|
+
Spanish Words*. Mouton.
|
|
11
|
+
|
|
12
|
+
Gries, S. Th. (2008). Dispersions and adjusted frequencies in corpora.
|
|
13
|
+
*International Journal of Corpus Linguistics*, 13(4), 403-437.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import numpy as np
|
|
19
|
+
import pandas as pd
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def juilland_d(doc_term_matrix: pd.DataFrame) -> pd.Series:
    """Juilland's D dispersion score per term; higher means more even.

    Every row of ``doc_term_matrix`` is treated as one equally weighted
    part. For each term the per-row relative frequencies (count divided
    by the row total) are summarised as
    ``D = 1 - CV / sqrt(k - 1)``, where ``CV`` is their population
    coefficient of variation (``ddof=0``) and ``k`` is the number of
    rows.

    Special cases handled explicitly:

    - ``k <= 1``: the formula is undefined, so every term gets NaN.
    - a row whose total is zero contributes a rate of zero for all terms.
    - a term whose mean rate is zero gets ``D = 0``.

    Returns
    -------
    pandas.Series
        Named ``"juilland_d"``, indexed by the matrix columns.
    """
    terms = doc_term_matrix.columns
    n_parts = len(doc_term_matrix)
    if n_parts <= 1:
        return pd.Series(np.nan, index=terms, name="juilland_d")

    freq = doc_term_matrix.to_numpy(dtype=float)  # (k, V)
    part_sizes = freq.sum(axis=1)  # (k,)
    has_tokens = part_sizes > 0

    # Substitute 1 for empty parts so the division is safe, then force
    # their rates to zero.
    divisor = np.where(has_tokens, part_sizes, 1.0)
    rel = np.where(has_tokens[:, None], freq / divisor[:, None], 0.0)

    mu = rel.mean(axis=0)
    sigma = rel.std(axis=0, ddof=0)
    occurs = mu > 0
    with np.errstate(divide="ignore", invalid="ignore"):
        variation = np.where(occurs, sigma / mu, 0.0)
    score = np.where(occurs, 1.0 - variation / np.sqrt(n_parts - 1), 0.0)
    return pd.Series(score, index=terms, name="juilland_d")
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def dispersion_dp(doc_term_matrix: pd.DataFrame) -> pd.Series:
    """Gries's DP (Deviation of Proportions) per term; lower is more even.

    For each row (document) ``i`` with size ``s_i`` (its row sum) and
    target-term count ``c_i``, let ``expected_i = s_i / S`` (the row's
    share of all tokens) and ``observed_i = c_i / C`` (the row's share
    of the term's occurrences). Then
    ``DP = 0.5 * Σ |observed_i - expected_i|``.

    DP = 0 means the term is spread exactly in proportion to document
    size; values near 1 mean concentration. This is the unnormalised
    form; ``DPnorm = DP / (1 - min(expected_i))`` can be derived from it.

    Special cases:

    - no rows, or all counts zero (``S == 0``): DP is undefined, so a
      NaN is returned for every column. The result keeps the term index
      in both cases so it can be aligned with other per-term tables.
    - a term that never occurs (``C == 0``) gets ``DP = 0``.

    Returns
    -------
    pandas.Series
        Named ``"dispersion_dp"``, indexed by the matrix columns.
    """
    terms = doc_term_matrix.columns

    def _undefined() -> pd.Series:
        # One NaN per term, so callers always get a term-aligned result.
        return pd.Series(np.nan, index=terms, dtype=float, name="dispersion_dp")

    if len(doc_term_matrix) == 0:
        return _undefined()

    doc_sizes = doc_term_matrix.sum(axis=1).astype(float).to_numpy()
    s_total = doc_sizes.sum()
    if s_total == 0:
        return _undefined()
    expected = doc_sizes / s_total  # shape (k,)

    term_totals = doc_term_matrix.sum(axis=0).astype(float).to_numpy()  # (V,)
    counts = doc_term_matrix.astype(float).to_numpy()  # shape (k, V)
    occurs = term_totals > 0

    with np.errstate(divide="ignore", invalid="ignore"):
        observed = counts / term_totals  # broadcasts over rows
    # Terms with zero total produce NaN above; substitute the expected
    # shares so their deviation, and hence DP, is exactly zero.
    observed = np.where(occurs, observed, expected[:, None])
    dp = 0.5 * np.abs(observed - expected[:, None]).sum(axis=0)
    return pd.Series(dp, index=terms, name="dispersion_dp")
|