pycorpdiff 0.1.0a6__tar.gz → 0.1.0a8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130) hide show
  1. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/.gitignore +0 -3
  2. pycorpdiff-0.1.0a8/CHANGELOG.md +71 -0
  3. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/CITATION.cff +1 -1
  4. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/PKG-INFO +42 -24
  5. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/README.md +39 -21
  6. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/pyproject.toml +13 -9
  7. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/__init__.py +6 -5
  8. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/compare.py +3 -1
  9. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/corpus.py +9 -0
  10. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/io/duckdb.py +13 -1
  11. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/keyness/bayes.py +10 -2
  12. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/results.py +25 -8
  13. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/semantic/shift.py +24 -0
  14. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/integration/test_crossval_histwords.py +29 -15
  15. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/integration/test_crossval_quanteda.py +29 -23
  16. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/integration/test_sbert_slow.py +13 -2
  17. pycorpdiff-0.1.0a8/tests/unit/test_audit_a7_fixes.py +133 -0
  18. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_loglikelihood.py +46 -0
  19. pycorpdiff-0.1.0a6/CHANGELOG.md +0 -44
  20. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/LICENSE +0 -0
  21. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/_backends/__init__.py +0 -0
  22. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/_backends/pandas.py +0 -0
  23. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/_backends/polars.py +0 -0
  24. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/collocation/__init__.py +0 -0
  25. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/collocation/cooccurrence.py +0 -0
  26. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/collocation/measures.py +0 -0
  27. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/collocation/network.py +0 -0
  28. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/collocation/shift.py +0 -0
  29. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/datasets/__init__.py +0 -0
  30. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/datasets/_data/hansard_sample.parquet +0 -0
  31. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/datasets/_generate_hansard.py +0 -0
  32. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/datasets/hansard.py +0 -0
  33. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/datasets/histwords.py +0 -0
  34. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/explain.py +0 -0
  35. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/io/__init__.py +0 -0
  36. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/io/huggingface.py +0 -0
  37. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/io/readers.py +0 -0
  38. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/keyness/__init__.py +0 -0
  39. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/keyness/chi_squared.py +0 -0
  40. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/keyness/correction.py +0 -0
  41. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/keyness/dispersion.py +0 -0
  42. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/keyness/effect_sizes.py +0 -0
  43. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/keyness/loglikelihood.py +0 -0
  44. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/keyness/multicorpus.py +0 -0
  45. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/keyness/permutation.py +0 -0
  46. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/py.typed +0 -0
  47. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/semantic/__init__.py +0 -0
  48. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/semantic/alignment.py +0 -0
  49. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/semantic/embed.py +0 -0
  50. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/semantic/trajectory.py +0 -0
  51. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/stats.py +0 -0
  52. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/temporal/__init__.py +0 -0
  53. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/temporal/bocpd.py +0 -0
  54. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/temporal/causal_impact.py +0 -0
  55. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/temporal/changepoint.py +0 -0
  56. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/temporal/forecast.py +0 -0
  57. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/temporal/its.py +0 -0
  58. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/temporal/slicing.py +0 -0
  59. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/tokenize.py +0 -0
  60. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/viz/__init__.py +0 -0
  61. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/viz/bocpd.py +0 -0
  62. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/viz/causal_impact.py +0 -0
  63. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/viz/collocation.py +0 -0
  64. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/viz/dispersion.py +0 -0
  65. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/viz/forecast.py +0 -0
  66. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/viz/keyness.py +0 -0
  67. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/viz/network.py +0 -0
  68. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/viz/scattertext.py +0 -0
  69. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/viz/semantic_forecast.py +0 -0
  70. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/viz/trajectory.py +0 -0
  71. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/__init__.py +0 -0
  72. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/conftest.py +0 -0
  73. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/fixtures/__init__.py +0 -0
  74. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/integration/__init__.py +0 -0
  75. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/integration/test_collocation_integration.py +0 -0
  76. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/integration/test_crossval_nltk.py +0 -0
  77. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/integration/test_crossval_rayson.py +0 -0
  78. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/integration/test_crossval_scattertext.py +0 -0
  79. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/integration/test_explain_integration.py +0 -0
  80. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/integration/test_keyness_integration.py +0 -0
  81. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/integration/test_semantic_integration.py +0 -0
  82. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/integration/test_stop_words.py +0 -0
  83. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/integration/test_temporal_stats.py +0 -0
  84. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/integration/test_viz.py +0 -0
  85. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/property/__init__.py +0 -0
  86. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/property/test_collocation_properties.py +0 -0
  87. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/property/test_keyness_properties.py +0 -0
  88. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/property/test_temporal_properties.py +0 -0
  89. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/__init__.py +0 -0
  90. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_bayes_factor.py +0 -0
  91. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_bocpd.py +0 -0
  92. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_causal_impact.py +0 -0
  93. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_changepoint.py +0 -0
  94. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_chi_squared.py +0 -0
  95. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_collocation_cooccurrence.py +0 -0
  96. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_collocation_measures.py +0 -0
  97. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_collocation_shift.py +0 -0
  98. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_comparison_concordance.py +0 -0
  99. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_cooccurrence_network.py +0 -0
  100. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_corpus_hash.py +0 -0
  101. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_corpus_vocab.py +0 -0
  102. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_correction.py +0 -0
  103. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_datasets_hansard.py +0 -0
  104. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_dispersion.py +0 -0
  105. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_dispersion_plot.py +0 -0
  106. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_doc_term_counts_sparse.py +0 -0
  107. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_effect_sizes.py +0 -0
  108. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_embedders.py +0 -0
  109. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_explain.py +0 -0
  110. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_forecast.py +0 -0
  111. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_forecast_semantic_drift.py +0 -0
  112. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_from_huggingface.py +0 -0
  113. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_hansard_fetcher.py +0 -0
  114. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_histwords_loader.py +0 -0
  115. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_its.py +0 -0
  116. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_keyness_multi.py +0 -0
  117. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_ngram_tokenizer.py +0 -0
  118. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_permutation_keyness.py +0 -0
  119. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_polars_interop.py +0 -0
  120. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_procrustes.py +0 -0
  121. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_read_duckdb.py +0 -0
  122. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_read_txt_line_mode.py +0 -0
  123. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_result_exports.py +0 -0
  124. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_scattertext_plot.py +0 -0
  125. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_semantic_neighbours.py +0 -0
  126. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_semantic_shift.py +0 -0
  127. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_semantic_trajectory.py +0 -0
  128. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_smoke.py +0 -0
  129. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_temporal.py +0 -0
  130. {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_wilson_ci.py +0 -0
@@ -33,9 +33,6 @@ Thumbs.db
33
33
  # Hypothesis example database (auto-managed)
34
34
  .hypothesis/
35
35
 
36
- # Local tooling
37
- .claude/
38
-
39
36
  # Jupyter checkpoints
40
37
  .ipynb_checkpoints/
41
38
 
@@ -0,0 +1,71 @@
1
+ # Changelog
2
+
3
+ All notable changes to `pycorpdiff` are documented in this file. The format
4
+ follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this
5
+ project adheres to [Semantic Versioning](https://semver.org/).
6
+
7
+ ## [0.1.0a8] — first public release
8
+
9
+ The first public alpha of `pycorpdiff` — comparative corpus analysis
10
+ for modern Python workflows. Three public verbs (`compare`, `track`,
11
+ `compare.before_after`), nine `Result` dataclasses each implementing
12
+ the relevant subset of `.to_df / .plot / .explain / .summary /
13
+ .to_html / .to_json` (see `docs/design.md` for the per-Result method
14
+ matrix), two `typing.Protocol` extension points (`Tokenizer`,
15
+ `Embedder`), and opt-in extras for visualisation, semantic embedding,
16
+ temporal modelling, polars interop, DuckDB ingestion, 🤗 Datasets,
17
+ and notebook rendering.
18
+
19
+ ### Analytical surface
20
+
21
+ - **Keyness**: signed log-likelihood G² with selectable formula
22
+ (`formula="rayson"` 2-cell shortcut, default; matches the UCREL
23
+ LL Wizard. `formula="dunning"` 4-cell G²; matches NLTK +
24
+ `quanteda::textstat_keyness(measure="lr")` to floating-point tolerance). Pearson
25
+ χ², Hardie LogRatio, Gabrielatos %DIFF, BIC-approximated Bayes
26
+ factor (also tracks the `formula=` choice), Juilland D / Gries DP
27
+ dispersion flagging, Benjamini–Hochberg correction, stop-word
28
+ filtering, empirical permutation *p*-values, N-way contingency G²
29
+ via `keyness_multi`.
30
+ - **Collocations**: logDice, PMI, t-score, MI³ with Laplace smoothing;
31
+ cross-corpus `collocation_shift`; co-occurrence networks via
32
+ `cooccurrence_network`.
33
+ - **Semantic shift**: averaged contextual embeddings, Procrustes
34
+ alignment, multi-period `semantic_trajectory`, `neighborhood_drift`.
35
+ Embedder output shape is validated to catch silently-broken
36
+ embedders before they produce nonsense.
37
+ - **Temporal**: Wilson-CI trajectories, offline PELT changepoints,
38
+ online Bayesian changepoint detection, segmented-OLS interrupted
39
+ time series, Bayesian structural time-series causal impact,
40
+ state-space exponential-smoothing forecasting.
41
+
42
+ ### Cross-validated
43
+
44
+ The package is checked against standard tools by automated test:
45
+
46
+ - **Rayson's LL Wizard** — hand-derived contingency-table reference
47
+ triples (fast tier; runs on every push).
48
+ - **NLTK** `BigramAssocMeasures` — PMI + t-score agreement to ≤ 1e-12
49
+ on every adjacent bigram (slow tier).
50
+ - **Scattertext (Kessler 2017)** — behavioural agreement on the 2012
51
+ US Conventions corpus (slow tier).
52
+ - **quanteda (R)** via `rpy2` — G² agreement to ≤ 1e-10 with
53
+ `formula="dunning"` (slow tier).
54
+ - **HistWords (Hamilton et al. 2016)** — known-shifter / stable-word
55
+ sanity check on Stanford SNAP COHA decade embeddings; skips
56
+ gracefully when the archive isn't reachable (slow tier).
57
+
58
+ ### Extras
59
+
60
+ `[viz]`, `[semantic]`, `[temporal]`, `[polars]`, `[duckdb]`, `[nlp]`,
61
+ `[huggingface]`, `[notebooks]`, `[all]` are MIT-compatible. A separate
62
+ `[showcase]` extra pulls in `pysofra` (GPL-3.0-or-later) for
63
+ JAMA-style table polish in the showcase notebook — opt in explicitly
64
+ if you accept that licence.
65
+
66
+ ### Infrastructure
67
+
68
+ Hundreds of tests, `ruff` + `mypy --strict` clean across the source
69
+ tree, matrix CI on three Python versions × two operating systems,
70
+ plus a slow-tier CI job exercising the cross-validation receipts
71
+ against NLTK + quanteda on main pushes.
@@ -4,7 +4,7 @@ message: >
4
4
  entry. GitHub renders a "Cite this repository" widget directly from
5
5
  this file.
6
6
  title: "pycorpdiff: Comparative Corpus Analysis for Modern Python Workflows"
7
- version: 0.1.0a6
7
+ version: 0.1.0a8
8
8
  date-released: 2026-05-25
9
9
  authors:
10
10
  - family-names: Turner
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pycorpdiff
3
- Version: 0.1.0a6
3
+ Version: 0.1.0a8
4
4
  Summary: Comparative corpus analysis for Python: keyness, collocations, semantic shift, temporal trajectories with changepoints + causal inference.
5
5
  Project-URL: Homepage, https://github.com/jturner-uofl/pycorpdiff
6
6
  Project-URL: Documentation, https://github.com/jturner-uofl/pycorpdiff
@@ -54,7 +54,6 @@ Requires-Dist: matplotlib>=3.8; extra == 'all'
54
54
  Requires-Dist: networkx>=3.1; extra == 'all'
55
55
  Requires-Dist: polars>=1.0; extra == 'all'
56
56
  Requires-Dist: pyarrow>=15; extra == 'all'
57
- Requires-Dist: pysofra>=0.1.0a3; extra == 'all'
58
57
  Requires-Dist: ruptures>=1.1; extra == 'all'
59
58
  Requires-Dist: scikit-learn>=1.3; extra == 'all'
60
59
  Requires-Dist: sentence-transformers>=2.2; extra == 'all'
@@ -77,7 +76,6 @@ Provides-Extra: nlp
77
76
  Requires-Dist: spacy>=3.7; extra == 'nlp'
78
77
  Provides-Extra: notebooks
79
78
  Requires-Dist: jupyter>=1.0; extra == 'notebooks'
80
- Requires-Dist: pysofra>=0.1.0a3; extra == 'notebooks'
81
79
  Requires-Dist: vl-convert-python>=1.5; extra == 'notebooks'
82
80
  Provides-Extra: polars
83
81
  Requires-Dist: polars>=1.0; extra == 'polars'
@@ -85,6 +83,8 @@ Requires-Dist: pyarrow>=15; extra == 'polars'
85
83
  Provides-Extra: semantic
86
84
  Requires-Dist: scikit-learn>=1.3; extra == 'semantic'
87
85
  Requires-Dist: sentence-transformers>=2.2; extra == 'semantic'
86
+ Provides-Extra: showcase
87
+ Requires-Dist: pysofra>=0.1.0a3; extra == 'showcase'
88
88
  Provides-Extra: temporal
89
89
  Requires-Dist: ruptures>=1.1; extra == 'temporal'
90
90
  Requires-Dist: statsmodels>=0.14; extra == 'temporal'
@@ -127,11 +127,11 @@ and computational social science routinely have:
127
127
  `pycorpdiff` is positioned as **orchestration**, not reinvention.
128
128
  Tokenizers (`spaCy`, `Stanza`, `jieba`, `fugashi`) and embedders (any
129
129
  `SBERT`-compatible model) plug in via two `typing.Protocol` extension
130
- points — one-line adapters, no plugin registry. The base install pulls
131
- only `numpy`, `pandas`, `scipy`, and `pyarrow`; everything else is opt-in
132
- via extras.
130
+ points — one-line adapters, no plugin registry. The base install's
131
+ direct runtime dependencies are `numpy`, `pandas`, `scipy`, and
132
+ `pyarrow`; everything else is opt-in via extras.
133
133
 
134
- > **Status: alpha (0.1.0a6).** Public API is stable for the features
134
+ > **Status: alpha (0.1.0a8).** Public API is stable for the features
135
135
  > described below; on PyPI as `pip install pycorpdiff`.
136
136
 
137
137
  ## The three-layer architecture
@@ -178,7 +178,8 @@ for the full feature tour, or the cheat sheet below for one-line API previews.
178
178
 
179
179
  ```python
180
180
  # Compare verbs (returns Result objects; methods exposed vary by Result)
181
- pcd.compare(a, b).keyness()
181
+ pcd.compare(a, b).keyness() # default formula="rayson" (LL Wizard)
182
+ pcd.compare(a, b).keyness(formula="dunning") # full 4-cell G² (matches quanteda / NLTK)
182
183
  pcd.compare(a, b).collocation_shift("immigrant")
183
184
  pcd.compare(a, b).semantic_shift("immigrant", embedder=pcd.SBERTEmbedder()) # [semantic]
184
185
  # SBERTEmbedder downloads a sentence-transformers model on first call;
@@ -190,7 +191,7 @@ tr.changepoints() # offline PELT
190
191
  tr.changepoints_online(hazard=1/24) # Bayesian online (Adams & MacKay 2007)
191
192
  tr.interrupted_time_series(event_date="2016") # segmented OLS
192
193
  tr.causal_impact(event_date="2016") # Bayesian counterfactual (Brodersen 2015)
193
- tr.forecast(horizon=4) # state-space ETS
194
+ tr.forecast(horizon=4) # 4 periods at the over_time freq (state-space ETS)
194
195
 
195
196
  # Before / after a known event
196
197
  pcd.compare.before_after(corpus, event_date="2016-06-23").keyness()
@@ -209,17 +210,20 @@ every analytical surface.
209
210
  ## Installation
210
211
 
211
212
  ```bash
212
- pip install pycorpdiff # lexical-comparative core
213
- pip install "pycorpdiff[viz]" # + altair / matplotlib / networkx
214
- pip install "pycorpdiff[semantic]" # + sentence-transformers
215
- pip install "pycorpdiff[temporal]" # + ruptures / statsmodels
216
- pip install "pycorpdiff[notebooks]" # + jupyter / vl-convert / pysofra
217
- pip install "pycorpdiff[all]" # everything
213
+ pip install pycorpdiff # lexical-comparative core (MIT)
214
+ pip install "pycorpdiff[viz]" # + altair / matplotlib / networkx
215
+ pip install "pycorpdiff[semantic]" # + sentence-transformers
216
+ pip install "pycorpdiff[temporal]" # + ruptures / statsmodels
217
+ pip install "pycorpdiff[notebooks]" # + jupyter / vl-convert
218
+ pip install "pycorpdiff[all]" # everything MIT-compatible
219
+ pip install "pycorpdiff[all,showcase]" # + pysofra (GPL-3.0-or-later) for the JAMA-style showcase
218
220
  ```
219
221
 
220
- The base install keeps a small dependency footprint (`numpy`, `pandas`,
221
- `scipy`, `pyarrow`); optional extras land per analytical layer so you
222
- only pay for what you use.
222
+ The base install's direct runtime dependencies are `numpy`, `pandas`,
223
+ `scipy`, and `pyarrow`; optional extras land per analytical layer so
224
+ you only pay for what you use. `[showcase]` is broken out separately
225
+ because `pysofra` is GPL-3.0-or-later — pure `pycorpdiff` use without
226
+ that extra remains MIT-only.
223
227
 
224
228
  To work from source:
225
229
 
@@ -232,13 +236,27 @@ pytest -q
232
236
 
233
237
  ## Cross-validation receipts
234
238
 
235
- The math agrees with the standard tools by automated test:
239
+ The math is checked against standard tools by automated test. The
240
+ fast tier runs on every push (matrix CI); the slow tier needs heavy
241
+ optional dependencies (R + quanteda, NLTK, rpy2, Stanford SNAP
242
+ downloads) and runs on main pushes only.
236
243
 
237
- - **Rayson's LL Wizard** — hand-derived contingency-table reference triples
238
- - **NLTK** `BigramAssocMeasures` — PMI + t-score to ≤ 1e-12 on every adjacent bigram
239
- - **Scattertext (Kessler 2017)** — behavioural agreement on the 2012 US Conventions corpus
240
- - **quanteda (R)** via `rpy2` — byte-for-byte G² agreement with `formula="dunning"` (slow tier)
241
- - **HistWords (Hamilton et al. 2016)** — diachronic cosine displacements on COHA (slow tier)
244
+ Fast tier:
245
+
246
+ - **Rayson's LL Wizard** — hand-derived contingency-table reference
247
+ triples ([`tests/integration/test_crossval_rayson.py`](https://github.com/jturner-uofl/pycorpdiff/blob/main/tests/integration/test_crossval_rayson.py))
248
+
249
+ Slow tier:
250
+
251
+ - **NLTK** `BigramAssocMeasures` — PMI + t-score agreement to ≤ 1e-12
252
+ on every adjacent bigram
253
+ - **Scattertext (Kessler 2017)** — behavioural agreement on the 2012
254
+ US Conventions corpus
255
+ - **quanteda (R)** via `rpy2` — G² agreement to ≤ 1e-10 with
256
+ `formula="dunning"`
257
+ - **HistWords (Hamilton et al. 2016)** — known-shifter / stable-word
258
+ sanity check on Stanford SNAP COHA decade embeddings (skips
259
+ gracefully if the archive isn't reachable)
242
260
 
243
261
  ## Citation
244
262
 
@@ -31,11 +31,11 @@ and computational social science routinely have:
31
31
  `pycorpdiff` is positioned as **orchestration**, not reinvention.
32
32
  Tokenizers (`spaCy`, `Stanza`, `jieba`, `fugashi`) and embedders (any
33
33
  `SBERT`-compatible model) plug in via two `typing.Protocol` extension
34
- points — one-line adapters, no plugin registry. The base install pulls
35
- only `numpy`, `pandas`, `scipy`, and `pyarrow`; everything else is opt-in
36
- via extras.
34
+ points — one-line adapters, no plugin registry. The base install's
35
+ direct runtime dependencies are `numpy`, `pandas`, `scipy`, and
36
+ `pyarrow`; everything else is opt-in via extras.
37
37
 
38
- > **Status: alpha (0.1.0a6).** Public API is stable for the features
38
+ > **Status: alpha (0.1.0a8).** Public API is stable for the features
39
39
  > described below; on PyPI as `pip install pycorpdiff`.
40
40
 
41
41
  ## The three-layer architecture
@@ -82,7 +82,8 @@ for the full feature tour, or the cheat sheet below for one-line API previews.
82
82
 
83
83
  ```python
84
84
  # Compare verbs (returns Result objects; methods exposed vary by Result)
85
- pcd.compare(a, b).keyness()
85
+ pcd.compare(a, b).keyness() # default formula="rayson" (LL Wizard)
86
+ pcd.compare(a, b).keyness(formula="dunning") # full 4-cell G² (matches quanteda / NLTK)
86
87
  pcd.compare(a, b).collocation_shift("immigrant")
87
88
  pcd.compare(a, b).semantic_shift("immigrant", embedder=pcd.SBERTEmbedder()) # [semantic]
88
89
  # SBERTEmbedder downloads a sentence-transformers model on first call;
@@ -94,7 +95,7 @@ tr.changepoints() # offline PELT
94
95
  tr.changepoints_online(hazard=1/24) # Bayesian online (Adams & MacKay 2007)
95
96
  tr.interrupted_time_series(event_date="2016") # segmented OLS
96
97
  tr.causal_impact(event_date="2016") # Bayesian counterfactual (Brodersen 2015)
97
- tr.forecast(horizon=4) # state-space ETS
98
+ tr.forecast(horizon=4) # 4 periods at the over_time freq (state-space ETS)
98
99
 
99
100
  # Before / after a known event
100
101
  pcd.compare.before_after(corpus, event_date="2016-06-23").keyness()
@@ -113,17 +114,20 @@ every analytical surface.
113
114
  ## Installation
114
115
 
115
116
  ```bash
116
- pip install pycorpdiff # lexical-comparative core
117
- pip install "pycorpdiff[viz]" # + altair / matplotlib / networkx
118
- pip install "pycorpdiff[semantic]" # + sentence-transformers
119
- pip install "pycorpdiff[temporal]" # + ruptures / statsmodels
120
- pip install "pycorpdiff[notebooks]" # + jupyter / vl-convert / pysofra
121
- pip install "pycorpdiff[all]" # everything
117
+ pip install pycorpdiff # lexical-comparative core (MIT)
118
+ pip install "pycorpdiff[viz]" # + altair / matplotlib / networkx
119
+ pip install "pycorpdiff[semantic]" # + sentence-transformers
120
+ pip install "pycorpdiff[temporal]" # + ruptures / statsmodels
121
+ pip install "pycorpdiff[notebooks]" # + jupyter / vl-convert
122
+ pip install "pycorpdiff[all]" # everything MIT-compatible
123
+ pip install "pycorpdiff[all,showcase]" # + pysofra (GPL-3.0-or-later) for the JAMA-style showcase
122
124
  ```
123
125
 
124
- The base install keeps a small dependency footprint (`numpy`, `pandas`,
125
- `scipy`, `pyarrow`); optional extras land per analytical layer so you
126
- only pay for what you use.
126
+ The base install's direct runtime dependencies are `numpy`, `pandas`,
127
+ `scipy`, and `pyarrow`; optional extras land per analytical layer so
128
+ you only pay for what you use. `[showcase]` is broken out separately
129
+ because `pysofra` is GPL-3.0-or-later — pure `pycorpdiff` use without
130
+ that extra remains MIT-only.
127
131
 
128
132
  To work from source:
129
133
 
@@ -136,13 +140,27 @@ pytest -q
136
140
 
137
141
  ## Cross-validation receipts
138
142
 
139
- The math agrees with the standard tools by automated test:
143
+ The math is checked against standard tools by automated test. The
144
+ fast tier runs on every push (matrix CI); the slow tier needs heavy
145
+ optional dependencies (R + quanteda, NLTK, rpy2, Stanford SNAP
146
+ downloads) and runs on main pushes only.
140
147
 
141
- - **Rayson's LL Wizard** — hand-derived contingency-table reference triples
142
- - **NLTK** `BigramAssocMeasures` — PMI + t-score to ≤ 1e-12 on every adjacent bigram
143
- - **Scattertext (Kessler 2017)** — behavioural agreement on the 2012 US Conventions corpus
144
- - **quanteda (R)** via `rpy2` — byte-for-byte G² agreement with `formula="dunning"` (slow tier)
145
- - **HistWords (Hamilton et al. 2016)** — diachronic cosine displacements on COHA (slow tier)
148
+ Fast tier:
149
+
150
+ - **Rayson's LL Wizard** — hand-derived contingency-table reference
151
+ triples ([`tests/integration/test_crossval_rayson.py`](https://github.com/jturner-uofl/pycorpdiff/blob/main/tests/integration/test_crossval_rayson.py))
152
+
153
+ Slow tier:
154
+
155
+ - **NLTK** `BigramAssocMeasures` — PMI + t-score agreement to ≤ 1e-12
156
+ on every adjacent bigram
157
+ - **Scattertext (Kessler 2017)** — behavioural agreement on the 2012
158
+ US Conventions corpus
159
+ - **quanteda (R)** via `rpy2` — G² agreement to ≤ 1e-10 with
160
+ `formula="dunning"`
161
+ - **HistWords (Hamilton et al. 2016)** — known-shifter / stable-word
162
+ sanity check on Stanford SNAP COHA decade embeddings (skips
163
+ gracefully if the archive isn't reachable)
146
164
 
147
165
  ## Citation
148
166
 
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "pycorpdiff"
7
- version = "0.1.0a6"
7
+ version = "0.1.0a8"
8
8
  description = "Comparative corpus analysis for Python: keyness, collocations, semantic shift, temporal trajectories with changepoints + causal inference."
9
9
  readme = "README.md"
10
10
  license = { file = "LICENSE" }
@@ -62,13 +62,18 @@ nlp = ["spacy>=3.7"]
62
62
  # Public-text-corpus hub. Heavy (pulls pyarrow, fsspec, requests, aiohttp),
63
63
  # so opt-in only — base install stays small.
64
64
  huggingface = ["datasets>=2.14"]
65
- # Needed if you want to execute the showcase notebook or regenerate the
66
- # rendered HTML examples. `jupyter` runs the notebook, `vl-convert` does
67
- # static SVG/PNG export of altair charts, `pysofra` renders the showcase's
68
- # result tables in JAMA-style typography.
69
- notebooks = ["jupyter>=1.0", "vl-convert-python>=1.5", "pysofra>=0.1.0a3"]
70
- # Meta-extra: `pycorpdiff[all]` pulls in every optional code path
71
- # including the notebook runtime.
65
+ # Needed if you want to execute the example notebooks. `jupyter` runs
66
+ # the notebook; `vl-convert` does static SVG/PNG export of altair charts.
67
+ # Kept MIT-clean; see `showcase` below for the JAMA-style table polish.
68
+ notebooks = ["jupyter>=1.0", "vl-convert-python>=1.5"]
69
+ # Adds `pysofra` for the showcase notebook's JAMA-style typography.
70
+ # IMPORTANT: pysofra is GPL-3.0-or-later. Opting in to `[showcase]` (or
71
+ # installing pysofra directly) brings GPL into your environment; pure
72
+ # pycorpdiff use without this extra remains MIT-only.
73
+ showcase = ["pysofra>=0.1.0a3"]
74
+ # Meta-extra: every MIT-compatible optional code path. Does NOT include
75
+ # `[showcase]` because pysofra is GPL-3.0-or-later; install
76
+ # `pycorpdiff[all,showcase]` explicitly if you accept that licence.
72
77
  all = [
73
78
  "altair>=5",
74
79
  "matplotlib>=3.8",
@@ -84,7 +89,6 @@ all = [
84
89
  "spacy>=3.7",
85
90
  "jupyter>=1.0",
86
91
  "vl-convert-python>=1.5",
87
- "pysofra>=0.1.0a3",
88
92
  ]
89
93
  dev = [
90
94
  "pytest>=8",
@@ -6,20 +6,21 @@ result objects (:class:`KeynessResult`, :class:`CollocationShiftResult`,
6
6
  :class:`SemanticShiftResult`, :class:`TemporalTrajectory`,
7
7
  :class:`NetworkResult`, :class:`ForecastResult`,
8
8
  :class:`CausalImpactResult`, :class:`BocpdResult`,
9
- :class:`ConcordanceResult`), each implementing the same
10
- ``.to_df / .plot / .explain / .summary / .to_html / .to_json`` contract.
9
+ :class:`ConcordanceResult`), each implementing the relevant subset of
10
+ the ``.to_df / .plot / .explain / .summary / .to_html / .to_json``
11
+ contract. See ``docs/design.md`` for the per-Result method matrix.
11
12
 
12
13
  Example
13
14
  -------
14
15
 
15
16
  >>> import pycorpdiff as pcd
16
- >>> pcd.__version__
17
- '0.1.0a6'
17
+ >>> isinstance(pcd.__version__, str)
18
+ True
18
19
  """
19
20
 
20
21
  from __future__ import annotations
21
22
 
22
- __version__ = "0.1.0a6"
23
+ __version__ = "0.1.0a8"
23
24
 
24
25
  from .collocation.network import NetworkResult, cooccurrence_network
25
26
  from .compare import Comparison, compare
@@ -149,7 +149,9 @@ class Comparison:
149
149
  if effect_size:
150
150
  table["log_ratio"] = _log_ratio(a_kept, b_kept, n_a, n_b)
151
151
  table["percent_diff"] = _percent_diff(a_kept, b_kept, n_a, n_b)
152
- table["bayes_factor"] = _bayes_factor(a_kept, b_kept, n_a, n_b)
152
+ table["bayes_factor"] = _bayes_factor(
153
+ a_kept, b_kept, n_a, n_b, formula=formula
154
+ )
153
155
 
154
156
  if dispersion:
155
157
  kept_terms = table.index
@@ -242,6 +242,15 @@ class Corpus:
242
242
  """
243
243
  from .temporal.slicing import TemporalCorpus # local import to break cycle
244
244
 
245
+ if len(self.docs) == 0:
246
+ raise ValueError(
247
+ "by_time() requires a non-empty corpus; got 0 documents."
248
+ )
249
+ if col not in self.docs.columns:
250
+ raise ValueError(
251
+ f"by_time(col={col!r}, ...): column not found in corpus. "
252
+ f"Available columns: {list(self.docs.columns)!r}."
253
+ )
245
254
  return TemporalCorpus(parent=self, time_col=col, freq=freq)
246
255
 
247
256
  def with_tokenizer(self, tokenizer: Tokenizer) -> Corpus:
@@ -71,12 +71,24 @@ def read_duckdb(
71
71
  ... )
72
72
  """
73
73
  try:
74
- import duckdb # noqa: F401
74
+ import duckdb
75
75
  except ImportError as exc: # pragma: no cover
76
76
  raise ImportError(
77
77
  "read_duckdb requires duckdb. Install with: pip install 'pycorpdiff[duckdb]'"
78
78
  ) from exc
79
79
 
80
+ if isinstance(connection, str):
81
+ raise TypeError(
82
+ "read_duckdb expects a DuckDB connection, not a file path. "
83
+ f"Got connection={connection!r}. Open one first: "
84
+ f'duckdb.connect({connection!r}), or pcd.read_duckdb(duckdb.connect(), "...")'
85
+ )
86
+ if not isinstance(connection, duckdb.DuckDBPyConnection):
87
+ raise TypeError(
88
+ "read_duckdb expects a duckdb.DuckDBPyConnection; got "
89
+ f"{type(connection).__name__}. Open one via duckdb.connect(...)."
90
+ )
91
+
80
92
  cursor = connection.execute(query, params) if params is not None else connection.execute(query)
81
93
  df = cursor.df()
82
94
  if text_col not in df.columns:
@@ -15,7 +15,7 @@ from __future__ import annotations
15
15
  import numpy as np
16
16
  import pandas as pd
17
17
 
18
- from .loglikelihood import log_likelihood
18
+ from .loglikelihood import LLFormula, log_likelihood
19
19
 
20
20
 
21
21
  def bayes_factor(
@@ -23,6 +23,8 @@ def bayes_factor(
23
23
  counts_b: pd.Series,
24
24
  total_a: int,
25
25
  total_b: int,
26
+ *,
27
+ formula: LLFormula = "rayson",
26
28
  ) -> pd.Series:
27
29
  """BIC-approximated Bayes factor for each term's frequency difference.
28
30
 
@@ -31,6 +33,12 @@ def bayes_factor(
31
33
  the unsigned log-likelihood. The Bayes factor is then
32
34
  ``exp(BIC / 2)``. Wilson (2013) is the keyness application.
33
35
 
36
+ ``formula`` selects which G² flavour feeds the BF: ``"rayson"`` (the
37
+ 2-cell shortcut, default; matches the LL Wizard) or ``"dunning"``
38
+ (the full 4-cell G²; matches quanteda/NLTK). Use the same
39
+ ``formula=`` as the ``keyness()`` call that produced the row so the
40
+ G² and the Bayes factor in a single row describe the same statistic.
41
+
34
42
  Interpret with Kass & Raftery (1995):
35
43
 
36
44
  - ``BF > 2`` : positive evidence
@@ -43,7 +51,7 @@ def bayes_factor(
43
51
  plots / sorts handle it.
44
52
  """
45
53
  terms = counts_a.index.union(counts_b.index)
46
- ll_table = log_likelihood(counts_a, counts_b, total_a, total_b)
54
+ ll_table = log_likelihood(counts_a, counts_b, total_a, total_b, formula=formula)
47
55
  g2_abs = ll_table["g2"].abs()
48
56
  bic = g2_abs - np.log(total_a + total_b)
49
57
  with np.errstate(over="ignore"):
@@ -10,7 +10,7 @@ contract:
10
10
  - ``.summary()`` returns a short human-readable string.
11
11
  - ``.explain(term, n)`` returns a :class:`ConcordanceResult` with
12
12
  KWIC evidence for one row of the result. Defined only on
13
- comparison-based Results (``KeynessResult``, ``CollocationShiftResult``)
13
+ term-ranked Results (``KeynessResult``, ``CollocationShiftResult``)
14
14
  where "one row of the result" maps to a target term.
15
15
 
16
16
  See ``docs/design.md`` for the per-Result method matrix. This contract
@@ -257,15 +257,32 @@ class SemanticShiftResult:
257
257
  return _table_to_json(self.table, path, **kw)
258
258
 
259
259
  def plot(self, **kw: Any) -> alt.Chart:
260
- """Plotting for SemanticShiftResult is not yet implemented.
260
+ """Horizontal bar chart of cosine distance per target term.
261
261
 
262
- For a forward-looking trajectory of cosine distances, use
263
- :func:`pycorpdiff.semantic_trajectory` and pass the resulting
264
- DataFrame to :func:`pycorpdiff.viz.semantic_forecast_plot`.
262
+ For a multi-period trajectory of cosine distances (an across-
263
+ time view rather than a single A-vs-B snapshot), use
264
+ :func:`pycorpdiff.semantic_trajectory` paired with
265
+ :func:`pycorpdiff.viz.semantic_forecast_plot`.
266
+
267
+ Extra keyword arguments forward to :meth:`altair.Chart.properties`.
265
268
  """
266
- raise NotImplementedError(
267
- "SemanticShiftResult.plot() is not yet implemented; "
268
- "use .table or pcd.viz.semantic_forecast_plot() instead"
269
+ import altair as alt
270
+
271
+ return ( # type: ignore[no-any-return]
272
+ alt.Chart(self.table)
273
+ .mark_bar(color="#0b6e7c")
274
+ .encode(
275
+ x=alt.X("cosine_distance:Q", title="Cosine distance (A → B)"),
276
+ y=alt.Y("target:N", sort="-x", title=None),
277
+ tooltip=[
278
+ "target",
279
+ alt.Tooltip("cosine_similarity:Q", format=".4f"),
280
+ alt.Tooltip("cosine_distance:Q", format=".4f"),
281
+ "n_contexts_a",
282
+ "n_contexts_b",
283
+ ],
284
+ )
285
+ .properties(width=400, **kw)
269
286
  )
270
287
 
271
288
  def neighbors_before(
@@ -46,6 +46,28 @@ def _centroid(vectors: np.ndarray[Any, Any]) -> np.ndarray[Any, Any]:
46
46
  return out
47
47
 
48
48
 
49
+ def _validate_embeddings(
50
+ vecs: np.ndarray[Any, Any], expected_rows: int, side: str
51
+ ) -> None:
52
+ """Catch mis-shaped embedder output before it produces silent nonsense.
53
+
54
+ A 1-D return from ``embedder.encode`` would otherwise be averaged into
55
+ a scalar centroid and yield ``cosine_similarity == 1.0`` for any
56
+ comparison — a silently wrong result.
57
+ """
58
+ if vecs.ndim != 2:
59
+ raise ValueError(
60
+ f"embedder.encode() for corpus {side!r} returned an array of "
61
+ f"rank {vecs.ndim}; expected 2 (shape (n_windows, d)). "
62
+ f"Got shape {vecs.shape}."
63
+ )
64
+ if vecs.shape[0] != expected_rows:
65
+ raise ValueError(
66
+ f"embedder.encode() for corpus {side!r} returned "
67
+ f"{vecs.shape[0]} rows; expected {expected_rows} (one per window)."
68
+ )
69
+
70
+
49
71
  def semantic_shift(
50
72
  a: Corpus | CorpusSlice,
51
73
  b: Corpus | CorpusSlice,
@@ -103,6 +125,8 @@ def semantic_shift(
103
125
 
104
126
  vecs_a = np.asarray(embedder.encode(wins_a), dtype=np.float64)
105
127
  vecs_b = np.asarray(embedder.encode(wins_b), dtype=np.float64)
128
+ _validate_embeddings(vecs_a, expected_rows=len(wins_a), side="a")
129
+ _validate_embeddings(vecs_b, expected_rows=len(wins_b), side="b")
106
130
 
107
131
  if align == "procrustes":
108
132
  # Procrustes wants two matrices of the same shape. Pad / truncate
@@ -71,9 +71,12 @@ def test_fetch_coha_1990_returns_real_vocab(histwords_cache_dir: Path) -> None:
71
71
  everyday words. Doesn't check vector values — that's the next test."""
72
72
  if not _has_internet():
73
73
  pytest.skip("offline")
74
- vecs = pcd.fetch_histwords_decade(
75
- 1990, source="coha", cache_dir=histwords_cache_dir
76
- )
74
+ try:
75
+ vecs = pcd.fetch_histwords_decade(
76
+ 1990, source="coha", cache_dir=histwords_cache_dir
77
+ )
78
+ except FileNotFoundError as exc:
79
+ pytest.skip(f"COHA 1990s not available: {exc}")
77
80
  # COHA 1990s vocab is large (~50k+ words). Expect basic English words.
78
81
  for word in ("the", "and", "of", "is", "people"):
79
82
  assert word in vecs, f"expected {word!r} in 1990s COHA vocab"
@@ -98,6 +101,8 @@ def test_known_shifters_show_high_cosine_distance(
98
101
  )
99
102
  except KeyError:
100
103
  pytest.skip(f"{word!r} missing from COHA 1900s or 1990s vocab")
104
+ except FileNotFoundError as exc:
105
+ pytest.skip(f"COHA decade data not available: {exc}")
101
106
  assert d > 0.3, (
102
107
  f"expected {word!r} to show cosine distance > 0.3 "
103
108
  f"between 1900s and 1990s COHA; got {d:.3f}"
@@ -115,9 +120,12 @@ def test_stable_function_words_show_low_cosine_distance(
115
120
  pytest.skip("offline")
116
121
  stable = ["the", "and", "of"]
117
122
  for word in stable:
118
- d = pcd.histwords_cosine_shift(
119
- 1900, 1990, word, source="coha", cache_dir=histwords_cache_dir
120
- )
123
+ try:
124
+ d = pcd.histwords_cosine_shift(
125
+ 1900, 1990, word, source="coha", cache_dir=histwords_cache_dir
126
+ )
127
+ except FileNotFoundError as exc:
128
+ pytest.skip(f"COHA decade data not available: {exc}")
121
129
  assert d < 0.30, (
122
130
  f"expected {word!r} to be stable across decades "
123
131
  f"(cosine distance < 0.30); got {d:.3f}"
@@ -137,19 +145,25 @@ def test_shifter_distance_exceeds_stable_distance_by_meaningful_margin(
137
145
  shifter_distances = []
138
146
  for word in ("gay", "broadcast", "awful"):
139
147
  with contextlib.suppress(KeyError):
140
- shifter_distances.append(
141
- pcd.histwords_cosine_shift(
142
- 1900, 1990, word, source="coha",
143
- cache_dir=histwords_cache_dir,
148
+ try:
149
+ shifter_distances.append(
150
+ pcd.histwords_cosine_shift(
151
+ 1900, 1990, word, source="coha",
152
+ cache_dir=histwords_cache_dir,
153
+ )
144
154
  )
145
- )
155
+ except FileNotFoundError as exc:
156
+ pytest.skip(f"COHA decade data not available: {exc}")
146
157
  stable_distances = []
147
158
  for word in ("the", "and", "of"):
148
- stable_distances.append(
149
- pcd.histwords_cosine_shift(
150
- 1900, 1990, word, source="coha", cache_dir=histwords_cache_dir
159
+ try:
160
+ stable_distances.append(
161
+ pcd.histwords_cosine_shift(
162
+ 1900, 1990, word, source="coha", cache_dir=histwords_cache_dir
163
+ )
151
164
  )
152
- )
165
+ except FileNotFoundError as exc:
166
+ pytest.skip(f"COHA decade data not available: {exc}")
153
167
  if not shifter_distances:
154
168
  pytest.skip("no shifters available in COHA vocab")
155
169
  avg_shift = sum(shifter_distances) / len(shifter_distances)