pycorpdiff 0.1.0a0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (128) hide show
  1. pycorpdiff-0.1.0a0/.gitignore +60 -0
  2. pycorpdiff-0.1.0a0/CHANGELOG.md +44 -0
  3. pycorpdiff-0.1.0a0/CITATION.cff +49 -0
  4. pycorpdiff-0.1.0a0/LICENSE +21 -0
  5. pycorpdiff-0.1.0a0/PKG-INFO +230 -0
  6. pycorpdiff-0.1.0a0/README.md +135 -0
  7. pycorpdiff-0.1.0a0/pyproject.toml +200 -0
  8. pycorpdiff-0.1.0a0/src/pycorpdiff/__init__.py +126 -0
  9. pycorpdiff-0.1.0a0/src/pycorpdiff/_backends/__init__.py +3 -0
  10. pycorpdiff-0.1.0a0/src/pycorpdiff/_backends/pandas.py +3 -0
  11. pycorpdiff-0.1.0a0/src/pycorpdiff/_backends/polars.py +3 -0
  12. pycorpdiff-0.1.0a0/src/pycorpdiff/collocation/__init__.py +19 -0
  13. pycorpdiff-0.1.0a0/src/pycorpdiff/collocation/cooccurrence.py +65 -0
  14. pycorpdiff-0.1.0a0/src/pycorpdiff/collocation/measures.py +102 -0
  15. pycorpdiff-0.1.0a0/src/pycorpdiff/collocation/network.py +233 -0
  16. pycorpdiff-0.1.0a0/src/pycorpdiff/collocation/shift.py +146 -0
  17. pycorpdiff-0.1.0a0/src/pycorpdiff/compare.py +345 -0
  18. pycorpdiff-0.1.0a0/src/pycorpdiff/corpus.py +411 -0
  19. pycorpdiff-0.1.0a0/src/pycorpdiff/datasets/__init__.py +27 -0
  20. pycorpdiff-0.1.0a0/src/pycorpdiff/datasets/_data/hansard_sample.parquet +0 -0
  21. pycorpdiff-0.1.0a0/src/pycorpdiff/datasets/_generate_hansard.py +221 -0
  22. pycorpdiff-0.1.0a0/src/pycorpdiff/datasets/hansard.py +235 -0
  23. pycorpdiff-0.1.0a0/src/pycorpdiff/datasets/histwords.py +221 -0
  24. pycorpdiff-0.1.0a0/src/pycorpdiff/explain.py +177 -0
  25. pycorpdiff-0.1.0a0/src/pycorpdiff/io/__init__.py +16 -0
  26. pycorpdiff-0.1.0a0/src/pycorpdiff/io/duckdb.py +92 -0
  27. pycorpdiff-0.1.0a0/src/pycorpdiff/io/huggingface.py +142 -0
  28. pycorpdiff-0.1.0a0/src/pycorpdiff/io/readers.py +138 -0
  29. pycorpdiff-0.1.0a0/src/pycorpdiff/keyness/__init__.py +26 -0
  30. pycorpdiff-0.1.0a0/src/pycorpdiff/keyness/bayes.py +50 -0
  31. pycorpdiff-0.1.0a0/src/pycorpdiff/keyness/chi_squared.py +94 -0
  32. pycorpdiff-0.1.0a0/src/pycorpdiff/keyness/correction.py +34 -0
  33. pycorpdiff-0.1.0a0/src/pycorpdiff/keyness/dispersion.py +89 -0
  34. pycorpdiff-0.1.0a0/src/pycorpdiff/keyness/effect_sizes.py +65 -0
  35. pycorpdiff-0.1.0a0/src/pycorpdiff/keyness/loglikelihood.py +92 -0
  36. pycorpdiff-0.1.0a0/src/pycorpdiff/keyness/multicorpus.py +143 -0
  37. pycorpdiff-0.1.0a0/src/pycorpdiff/keyness/permutation.py +154 -0
  38. pycorpdiff-0.1.0a0/src/pycorpdiff/py.typed +0 -0
  39. pycorpdiff-0.1.0a0/src/pycorpdiff/results.py +635 -0
  40. pycorpdiff-0.1.0a0/src/pycorpdiff/semantic/__init__.py +18 -0
  41. pycorpdiff-0.1.0a0/src/pycorpdiff/semantic/alignment.py +53 -0
  42. pycorpdiff-0.1.0a0/src/pycorpdiff/semantic/embed.py +84 -0
  43. pycorpdiff-0.1.0a0/src/pycorpdiff/semantic/shift.py +224 -0
  44. pycorpdiff-0.1.0a0/src/pycorpdiff/semantic/trajectory.py +166 -0
  45. pycorpdiff-0.1.0a0/src/pycorpdiff/stats.py +69 -0
  46. pycorpdiff-0.1.0a0/src/pycorpdiff/temporal/__init__.py +15 -0
  47. pycorpdiff-0.1.0a0/src/pycorpdiff/temporal/bocpd.py +233 -0
  48. pycorpdiff-0.1.0a0/src/pycorpdiff/temporal/causal_impact.py +293 -0
  49. pycorpdiff-0.1.0a0/src/pycorpdiff/temporal/changepoint.py +92 -0
  50. pycorpdiff-0.1.0a0/src/pycorpdiff/temporal/forecast.py +405 -0
  51. pycorpdiff-0.1.0a0/src/pycorpdiff/temporal/its.py +123 -0
  52. pycorpdiff-0.1.0a0/src/pycorpdiff/temporal/slicing.py +174 -0
  53. pycorpdiff-0.1.0a0/src/pycorpdiff/tokenize.py +110 -0
  54. pycorpdiff-0.1.0a0/src/pycorpdiff/viz/__init__.py +37 -0
  55. pycorpdiff-0.1.0a0/src/pycorpdiff/viz/bocpd.py +173 -0
  56. pycorpdiff-0.1.0a0/src/pycorpdiff/viz/causal_impact.py +142 -0
  57. pycorpdiff-0.1.0a0/src/pycorpdiff/viz/collocation.py +48 -0
  58. pycorpdiff-0.1.0a0/src/pycorpdiff/viz/dispersion.py +117 -0
  59. pycorpdiff-0.1.0a0/src/pycorpdiff/viz/forecast.py +129 -0
  60. pycorpdiff-0.1.0a0/src/pycorpdiff/viz/keyness.py +96 -0
  61. pycorpdiff-0.1.0a0/src/pycorpdiff/viz/network.py +186 -0
  62. pycorpdiff-0.1.0a0/src/pycorpdiff/viz/scattertext.py +160 -0
  63. pycorpdiff-0.1.0a0/src/pycorpdiff/viz/semantic_forecast.py +114 -0
  64. pycorpdiff-0.1.0a0/src/pycorpdiff/viz/trajectory.py +48 -0
  65. pycorpdiff-0.1.0a0/tests/__init__.py +0 -0
  66. pycorpdiff-0.1.0a0/tests/conftest.py +29 -0
  67. pycorpdiff-0.1.0a0/tests/fixtures/__init__.py +0 -0
  68. pycorpdiff-0.1.0a0/tests/integration/__init__.py +0 -0
  69. pycorpdiff-0.1.0a0/tests/integration/test_collocation_integration.py +85 -0
  70. pycorpdiff-0.1.0a0/tests/integration/test_crossval_histwords.py +174 -0
  71. pycorpdiff-0.1.0a0/tests/integration/test_crossval_nltk.py +157 -0
  72. pycorpdiff-0.1.0a0/tests/integration/test_crossval_quanteda.py +129 -0
  73. pycorpdiff-0.1.0a0/tests/integration/test_crossval_rayson.py +171 -0
  74. pycorpdiff-0.1.0a0/tests/integration/test_crossval_scattertext.py +110 -0
  75. pycorpdiff-0.1.0a0/tests/integration/test_explain_integration.py +94 -0
  76. pycorpdiff-0.1.0a0/tests/integration/test_keyness_integration.py +145 -0
  77. pycorpdiff-0.1.0a0/tests/integration/test_sbert_slow.py +121 -0
  78. pycorpdiff-0.1.0a0/tests/integration/test_semantic_integration.py +65 -0
  79. pycorpdiff-0.1.0a0/tests/integration/test_stop_words.py +118 -0
  80. pycorpdiff-0.1.0a0/tests/integration/test_temporal_stats.py +80 -0
  81. pycorpdiff-0.1.0a0/tests/integration/test_viz.py +143 -0
  82. pycorpdiff-0.1.0a0/tests/property/__init__.py +0 -0
  83. pycorpdiff-0.1.0a0/tests/property/test_collocation_properties.py +106 -0
  84. pycorpdiff-0.1.0a0/tests/property/test_keyness_properties.py +123 -0
  85. pycorpdiff-0.1.0a0/tests/property/test_temporal_properties.py +101 -0
  86. pycorpdiff-0.1.0a0/tests/unit/__init__.py +0 -0
  87. pycorpdiff-0.1.0a0/tests/unit/test_bayes_factor.py +51 -0
  88. pycorpdiff-0.1.0a0/tests/unit/test_bocpd.py +238 -0
  89. pycorpdiff-0.1.0a0/tests/unit/test_causal_impact.py +283 -0
  90. pycorpdiff-0.1.0a0/tests/unit/test_changepoint.py +63 -0
  91. pycorpdiff-0.1.0a0/tests/unit/test_chi_squared.py +95 -0
  92. pycorpdiff-0.1.0a0/tests/unit/test_collocation_cooccurrence.py +78 -0
  93. pycorpdiff-0.1.0a0/tests/unit/test_collocation_measures.py +121 -0
  94. pycorpdiff-0.1.0a0/tests/unit/test_collocation_shift.py +117 -0
  95. pycorpdiff-0.1.0a0/tests/unit/test_comparison_concordance.py +200 -0
  96. pycorpdiff-0.1.0a0/tests/unit/test_cooccurrence_network.py +218 -0
  97. pycorpdiff-0.1.0a0/tests/unit/test_corpus_hash.py +82 -0
  98. pycorpdiff-0.1.0a0/tests/unit/test_corpus_vocab.py +51 -0
  99. pycorpdiff-0.1.0a0/tests/unit/test_correction.py +48 -0
  100. pycorpdiff-0.1.0a0/tests/unit/test_datasets_hansard.py +80 -0
  101. pycorpdiff-0.1.0a0/tests/unit/test_dispersion.py +74 -0
  102. pycorpdiff-0.1.0a0/tests/unit/test_dispersion_plot.py +97 -0
  103. pycorpdiff-0.1.0a0/tests/unit/test_doc_term_counts_sparse.py +135 -0
  104. pycorpdiff-0.1.0a0/tests/unit/test_effect_sizes.py +80 -0
  105. pycorpdiff-0.1.0a0/tests/unit/test_embedders.py +78 -0
  106. pycorpdiff-0.1.0a0/tests/unit/test_explain.py +135 -0
  107. pycorpdiff-0.1.0a0/tests/unit/test_forecast.py +296 -0
  108. pycorpdiff-0.1.0a0/tests/unit/test_forecast_semantic_drift.py +206 -0
  109. pycorpdiff-0.1.0a0/tests/unit/test_from_huggingface.py +153 -0
  110. pycorpdiff-0.1.0a0/tests/unit/test_hansard_fetcher.py +222 -0
  111. pycorpdiff-0.1.0a0/tests/unit/test_histwords_loader.py +188 -0
  112. pycorpdiff-0.1.0a0/tests/unit/test_its.py +80 -0
  113. pycorpdiff-0.1.0a0/tests/unit/test_keyness_multi.py +183 -0
  114. pycorpdiff-0.1.0a0/tests/unit/test_loglikelihood.py +136 -0
  115. pycorpdiff-0.1.0a0/tests/unit/test_ngram_tokenizer.py +167 -0
  116. pycorpdiff-0.1.0a0/tests/unit/test_permutation_keyness.py +156 -0
  117. pycorpdiff-0.1.0a0/tests/unit/test_polars_interop.py +136 -0
  118. pycorpdiff-0.1.0a0/tests/unit/test_procrustes.py +58 -0
  119. pycorpdiff-0.1.0a0/tests/unit/test_read_duckdb.py +148 -0
  120. pycorpdiff-0.1.0a0/tests/unit/test_read_txt_line_mode.py +62 -0
  121. pycorpdiff-0.1.0a0/tests/unit/test_result_exports.py +111 -0
  122. pycorpdiff-0.1.0a0/tests/unit/test_scattertext_plot.py +217 -0
  123. pycorpdiff-0.1.0a0/tests/unit/test_semantic_neighbours.py +93 -0
  124. pycorpdiff-0.1.0a0/tests/unit/test_semantic_shift.py +123 -0
  125. pycorpdiff-0.1.0a0/tests/unit/test_semantic_trajectory.py +173 -0
  126. pycorpdiff-0.1.0a0/tests/unit/test_smoke.py +144 -0
  127. pycorpdiff-0.1.0a0/tests/unit/test_temporal.py +147 -0
  128. pycorpdiff-0.1.0a0/tests/unit/test_wilson_ci.py +71 -0
@@ -0,0 +1,60 @@
1
+ # Python build artefacts
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.egg-info/
5
+ dist/
6
+ build/
7
+ .eggs/
8
+
9
+ # Virtual environments
10
+ .venv/
11
+ venv/
12
+ env/
13
+ .env
14
+
15
+ # Test / type-check / lint caches
16
+ .pytest_cache/
17
+ .mypy_cache/
18
+ .ruff_cache/
19
+ .coverage
20
+ .coverage.*
21
+ htmlcov/
22
+ .tox/
23
+
24
+ # Editor / OS cruft
25
+ .DS_Store
26
+ Thumbs.db
27
+ .idea/
28
+ .vscode/
29
+ *.swp
30
+ *.swo
31
+ *~
32
+
33
+ # AI workflow artefacts (kept local, never published)
34
+ .claude/
35
+
36
+ # Hypothesis example database (auto-managed)
37
+ .hypothesis/
38
+
39
+ # Jupyter checkpoints
40
+ .ipynb_checkpoints/
41
+
42
+ # Notebook outputs that aren't reviewed-as-source; the canonical notebooks
43
+ # are executed in CI, not hand-edited with stale outputs.
44
+ examples/*_executed.ipynb
45
+
46
+ # Temp notebooks produced by scripts/render_notebooks_to_html.py
47
+ examples/*.patched.ipynb
48
+
49
+ # pyenv local override
50
+ .python-version
51
+
52
+ # Misc temp files
53
+ *.tmp
54
+ *.bak
55
+
56
+ # Stray uv lockfiles created outside the repo root
57
+ **/uv.lock.tmp
58
+
59
+ # Mkdocs build output (legacy; mkdocs.yml itself is gone)
60
+ site/
@@ -0,0 +1,44 @@
1
+ # Changelog
2
+
3
+ All notable changes to `pycorpdiff` are documented in this file. The format
4
+ follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this
5
+ project adheres to [Semantic Versioning](https://semver.org/).
6
+
7
+ ## [0.1.0a0] — 2026-05-22 — initial release
8
+
9
+ The initial public release of `pycorpdiff` — comparative corpus analysis
10
+ for modern Python workflows. Three public verbs (`compare`, `track`,
11
+ `compare.before_after`), nine `Result` dataclasses with a uniform
12
+ six-method contract (`.to_df / .plot / .explain / .summary / .to_html /
13
+ .to_json`), two `typing.Protocol` extension points (`Tokenizer`,
14
+ `Embedder`), and opt-in extras for visualisation, semantic embedding,
15
+ temporal modelling, polars interop, DuckDB ingestion, and 🤗 Datasets.
16
+
17
+ ### Analytical surface
18
+
19
+ - **Keyness**: signed Dunning G², Pearson χ², Hardie LogRatio,
20
+ Gabrielatos %DIFF, BIC-Bayes factor, Juilland D / Gries DP dispersion
21
+ flagging, Benjamini–Hochberg correction, stop-word filtering,
22
+ empirical permutation *p*-values, N-way contingency G² via
23
+ `keyness_multi`.
24
+ - **Collocations**: logDice, PMI, t-score, MI³ with Laplace smoothing;
25
+ cross-corpus `collocation_shift`; co-occurrence networks via
26
+ `cooccurrence_network`.
27
+ - **Semantic shift**: averaged contextual embeddings, Procrustes
28
+ alignment, multi-period `semantic_trajectory`, `neighborhood_drift`.
29
+ - **Temporal**: Wilson-CI trajectories, offline PELT changepoints,
30
+ online Bayesian changepoint detection, segmented-OLS interrupted
31
+ time series, Bayesian structural time-series causal impact,
32
+ state-space exponential-smoothing forecasting.
33
+
34
+ ### Cross-validated
35
+
36
+ Numerically agrees with Rayson's LL Wizard (15 reference triples),
37
+ NLTK's `BigramAssocMeasures` (≤ 1e-12 on PMI / t-score / MI³),
38
+ Scattertext on the 2012 US conventions, `quanteda` via `rpy2`, and
39
+ the HistWords COHA replication.
40
+
41
+ ### Infrastructure
42
+
43
+ 519 tests, `ruff` + `mypy --strict` clean across 55 source files,
44
+ matrix CI on three Python versions × two operating systems.
@@ -0,0 +1,49 @@
1
+ cff-version: 1.2.0
2
+ message: >
3
+ If you use pycorpdiff in academic work, please cite both the
4
+ software (this entry) and the accompanying Journal of Statistical
5
+ Software paper once it appears. The JSS manuscript is in
6
+ preparation; the draft will live in this repository as paper/paper.tex.
7
+ title: "pycorpdiff: Comparative Corpus Analysis for Modern Python Workflows"
8
+ version: 0.1.0a0
9
+ date-released: 2026-05-22
10
+ authors:
11
+ - family-names: Turner
12
+ given-names: Jason
13
+ email: jason.s.turner@gmail.com
14
+ license: MIT
15
+ repository-code: "https://github.com/jturner-uofl/pycorpdiff"
16
+ keywords:
17
+ - corpus linguistics
18
+ - comparative corpus analysis
19
+ - keyness
20
+ - collocation
21
+ - semantic change
22
+ - diachronic nlp
23
+ - digital humanities
24
+ - computational social science
25
+ - reproducible research
26
+ - python
27
+ abstract: >
28
+ pycorpdiff is a Python package for comparative and temporal corpus
29
+ analysis. It provides a coherent comparative layer over the
30
+ existing PyData and NLP stacks, unifying classical corpus
31
+ linguistics methods (keyness, collocations, dispersion) with
32
+ embedding-based semantic-shift analysis under a single, composable
33
+ API. The package targets corpus linguistics, digital humanities,
34
+ computational social science, and discourse analysis research,
35
+ emphasising interpretability, explainability, statistical rigour,
36
+ and reproducibility.
37
+ preferred-citation:
38
+ type: article
39
+ authors:
40
+ - family-names: Turner
41
+ given-names: Jason
42
+ title: "pycorpdiff: Comparative Corpus Analysis for Modern Python Workflows"
43
+ journal: "Journal of Statistical Software"
44
+ year: 2026
45
+ status: in-preparation
46
+ identifiers:
47
+ - type: url
48
+ value: "https://github.com/jturner-uofl/pycorpdiff"
49
+ description: Project repository
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Jason Turner
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,230 @@
1
+ Metadata-Version: 2.4
2
+ Name: pycorpdiff
3
+ Version: 0.1.0a0
4
+ Summary: Comparative corpus analysis for Python: keyness, collocations, semantic shift, temporal trajectories with changepoints + causal inference.
5
+ Project-URL: Homepage, https://github.com/jturner-uofl/pycorpdiff
6
+ Project-URL: Documentation, https://github.com/jturner-uofl/pycorpdiff
7
+ Project-URL: Repository, https://github.com/jturner-uofl/pycorpdiff
8
+ Project-URL: Issues, https://github.com/jturner-uofl/pycorpdiff/issues
9
+ Author-email: Jason Turner <jason.s.turner@gmail.com>
10
+ License: MIT License
11
+
12
+ Copyright (c) 2026 Jason Turner
13
+
14
+ Permission is hereby granted, free of charge, to any person obtaining a copy
15
+ of this software and associated documentation files (the "Software"), to deal
16
+ in the Software without restriction, including without limitation the rights
17
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
18
+ copies of the Software, and to permit persons to whom the Software is
19
+ furnished to do so, subject to the following conditions:
20
+
21
+ The above copyright notice and this permission notice shall be included in all
22
+ copies or substantial portions of the Software.
23
+
24
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
29
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30
+ SOFTWARE.
31
+ License-File: LICENSE
32
+ Keywords: collocation,comparative corpus analysis,computational social science,corpus linguistics,diachronic nlp,digital humanities,discourse analysis,keyness,semantic change,temporal text analysis
33
+ Classifier: Development Status :: 2 - Pre-Alpha
34
+ Classifier: Intended Audience :: Science/Research
35
+ Classifier: License :: OSI Approved :: MIT License
36
+ Classifier: Programming Language :: Python :: 3
37
+ Classifier: Programming Language :: Python :: 3 :: Only
38
+ Classifier: Programming Language :: Python :: 3.11
39
+ Classifier: Programming Language :: Python :: 3.12
40
+ Classifier: Programming Language :: Python :: 3.13
41
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
42
+ Classifier: Topic :: Text Processing :: Linguistic
43
+ Requires-Python: >=3.11
44
+ Requires-Dist: numpy>=1.24
45
+ Requires-Dist: pandas<3,>=2.0
46
+ Requires-Dist: pyarrow>=14
47
+ Requires-Dist: scipy>=1.11
48
+ Provides-Extra: all
49
+ Requires-Dist: altair>=5; extra == 'all'
50
+ Requires-Dist: datasets>=2.14; extra == 'all'
51
+ Requires-Dist: duckdb>=0.10; extra == 'all'
52
+ Requires-Dist: matplotlib>=3.8; extra == 'all'
53
+ Requires-Dist: networkx>=3.1; extra == 'all'
54
+ Requires-Dist: polars>=1.0; extra == 'all'
55
+ Requires-Dist: pyarrow>=15; extra == 'all'
56
+ Requires-Dist: pysofra>=0.1.0a2; extra == 'all'
57
+ Requires-Dist: ruptures>=1.1; extra == 'all'
58
+ Requires-Dist: scikit-learn>=1.3; extra == 'all'
59
+ Requires-Dist: sentence-transformers>=2.2; extra == 'all'
60
+ Requires-Dist: spacy>=3.7; extra == 'all'
61
+ Requires-Dist: statsmodels>=0.14; extra == 'all'
62
+ Requires-Dist: vl-convert-python>=1.5; extra == 'all'
63
+ Provides-Extra: dev
64
+ Requires-Dist: hypothesis>=6.100; extra == 'dev'
65
+ Requires-Dist: mypy>=1.8; extra == 'dev'
66
+ Requires-Dist: pandas-stubs>=2.2; extra == 'dev'
67
+ Requires-Dist: pre-commit>=3.6; extra == 'dev'
68
+ Requires-Dist: pytest-cov>=4.1; extra == 'dev'
69
+ Requires-Dist: pytest>=8; extra == 'dev'
70
+ Requires-Dist: ruff>=0.4; extra == 'dev'
71
+ Provides-Extra: duckdb
72
+ Requires-Dist: duckdb>=0.10; extra == 'duckdb'
73
+ Provides-Extra: huggingface
74
+ Requires-Dist: datasets>=2.14; extra == 'huggingface'
75
+ Provides-Extra: nlp
76
+ Requires-Dist: spacy>=3.7; extra == 'nlp'
77
+ Provides-Extra: notebooks
78
+ Requires-Dist: jupyter>=1.0; extra == 'notebooks'
79
+ Requires-Dist: pysofra>=0.1.0a2; extra == 'notebooks'
80
+ Requires-Dist: vl-convert-python>=1.5; extra == 'notebooks'
81
+ Provides-Extra: polars
82
+ Requires-Dist: polars>=1.0; extra == 'polars'
83
+ Requires-Dist: pyarrow>=15; extra == 'polars'
84
+ Provides-Extra: semantic
85
+ Requires-Dist: scikit-learn>=1.3; extra == 'semantic'
86
+ Requires-Dist: sentence-transformers>=2.2; extra == 'semantic'
87
+ Provides-Extra: temporal
88
+ Requires-Dist: ruptures>=1.1; extra == 'temporal'
89
+ Requires-Dist: statsmodels>=0.14; extra == 'temporal'
90
+ Provides-Extra: viz
91
+ Requires-Dist: altair>=5; extra == 'viz'
92
+ Requires-Dist: matplotlib>=3.8; extra == 'viz'
93
+ Requires-Dist: networkx>=3.1; extra == 'viz'
94
+ Description-Content-Type: text/markdown
95
+
96
+ # pycorpdiff
97
+
98
+ <!--
99
+ TODO post-publish (Phase 5 — once GitHub repo public + PyPI published + Zenodo DOI minted):
100
+
101
+ [![PyPI](https://img.shields.io/pypi/v/pycorpdiff.svg)](https://pypi.org/project/pycorpdiff/)
102
+ [![Python versions](https://img.shields.io/pypi/pyversions/pycorpdiff.svg)](https://pypi.org/project/pycorpdiff/)
103
+ [![CI](https://github.com/jturner-uofl/pycorpdiff/actions/workflows/ci.yml/badge.svg)](https://github.com/jturner-uofl/pycorpdiff/actions/workflows/ci.yml)
104
+ [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.<RECORD>.svg)](https://doi.org/10.5281/zenodo.<RECORD>)
105
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
106
+ -->
107
+
108
+ **Comparative corpus analysis for modern Python workflows.**
109
+
110
+ `pycorpdiff` is the **missing comparative layer** between R's
111
+ [`quanteda`](https://quanteda.io/), the closed-source Sketch Engine
112
+ platform, and the fragmented Python NLP stack
113
+ (`nltk`/`spaCy`/`gensim`/`sentence-transformers`). Three public verbs
114
+ — `compare(a, b)`, `track(c, term)`, `compare.before_after(c, event)` —
115
+ consolidate keyness, collocations, dispersion, temporal trajectories,
116
+ changepoint detection, interrupted time series, causal-impact analysis,
117
+ forecasting, online changepoint detection, and embedding-based semantic
118
+ shift under a single notebook-native API. Every result carries its own
119
+ KWIC evidence: `.explain(term)` returns the source-text concordances
120
+ behind any ranked term.
121
+
122
+ The package answers the questions that researchers in corpus linguistics,
123
+ digital humanities, and computational social science routinely ask:
124
+
125
+ - *How does corpus A differ from corpus B?* — `compare(a, b).keyness()`
126
+ - *How has discourse around X evolved over time?* — `track(c, "x").over_time()`
127
+ - *What did "migrant" mean in 2005 vs 2023?* — `compare(...).semantic_shift("migrant", embedder=...)`
128
+ - *Did this event actually shift the conversation?* — `track(...).causal_impact(event_date=...)`
129
+ - *Where is the discourse heading?* — `track(...).forecast(horizon=4)`
130
+
131
+ `pycorpdiff` is positioned as **orchestration**, not reinvention.
132
+ Tokenizers (`spaCy`, `Stanza`, `jieba`, `fugashi`) and embedders (any
133
+ `SBERT`-compatible model) plug in via two `typing.Protocol` extension
134
+ points — one-line adapters, no plugin registry. The base install pulls
135
+ only `numpy`, `pandas`, `scipy`, and `pyarrow`; everything else is opt-in
136
+ via extras.
137
+
138
+ > **Status: pre-release alpha (0.1.0a0).** Public API is stable for the
139
+ > features described below; PyPI publication is the next milestone.
140
+
141
+ ## The three-layer architecture
142
+
143
+ | Layer | Purpose | Key surface |
144
+ |---|---|---|
145
+ | **1 — Ingestion + `Corpus`** | get text in, slice it, hash it | `from_dataframe`, `read_csv`, `read_parquet`, `read_txt`, `read_duckdb`, `from_huggingface`, `fetch_hansard`, `Corpus.slice/by_time/__hash__/doc_term_counts(_sparse)/to_polars` |
146
+ | **2 — Pure math** | statistics with no I/O | `keyness.{log_likelihood,chi_squared,log_ratio,percent_diff,bayes_factor,permutation_pvalues,keyness_multi,juilland_d,benjamini_hochberg}`; `collocation.{logdice,pmi,t_score,mi_three,collocation_shift,cooccurrence_network}`; `semantic.{HashEmbedder,SBERTEmbedder,semantic_trajectory,neighborhood_drift}`; `temporal.{changepoints,interrupted_time_series,forecast,causal_impact,bocpd}` |
147
+ | **3 — Verbs + Results** | public API | `compare`, `track`, `compare.before_after`, `keyness_multi`, plus 9 frozen-dataclass Result types each with `.to_df() / .plot() / .explain() / .summary() / .to_html() / .to_json()` |
148
+
149
+ ## Quick start
150
+
151
+ ```python
152
+ import pycorpdiff as pcd
153
+
154
+ news = pcd.from_dataframe(df, text_col="body", meta_cols=("outlet", "date"))
155
+
156
+ # Compare — three verbs
157
+ k = pcd.compare(news.slice(outlet="Guardian"), news.slice(outlet="Mail")).keyness()
158
+ c = pcd.compare(a, b).collocation_shift("migrant")
159
+ s = pcd.compare(a, b).semantic_shift("migrant", embedder=pcd.SBERTEmbedder())
160
+
161
+ # Track over time
162
+ tr = pcd.track(news, "migrant").over_time(freq="Y")
163
+ tr.changepoints() # offline PELT
164
+ tr.changepoints_online(hazard=1/24) # Bayesian online (Adams & MacKay 2007)
165
+ tr.interrupted_time_series(event_date="2016-06-23") # segmented OLS
166
+ tr.causal_impact(event_date="2016-06-23") # Bayesian counterfactual (Brodersen et al. 2015)
167
+ tr.forecast(horizon=4) # state-space ETS
168
+
169
+ # Before / after a known event
170
+ pcd.compare.before_after(news, event_date="2016-06-23").keyness()
171
+
172
+ # N-way (≥ 2 corpora)
173
+ pcd.keyness_multi([gu, ma, te, mi], labels=["Guardian", "Mail", "Telegraph", "Mirror"])
174
+
175
+ # The discourse as a graph
176
+ pcd.cooccurrence_network(news, top_n=50).plot()
177
+
178
+ # Every Result: .to_df() · .plot() · .explain() · .summary() · .to_html() · .to_json()
179
+ ```
180
+
181
+ See [`examples/pycorpdiff_showcase.ipynb`](examples/pycorpdiff_showcase.ipynb)
182
+ ([rendered HTML](docs/rendered/pycorpdiff_showcase.html)) for a
183
+ walkthrough on a synthetic UK Hansard corpus exercising every analytical
184
+ surface.
185
+
186
+ ## Installation
187
+
188
+ <!-- TODO post-publish: replace this block with the PyPI install commands once published. -->
189
+
190
+ Currently a pre-release alpha. From a local clone:
191
+
192
+ ```bash
193
+ git clone https://github.com/jturner-uofl/pycorpdiff
194
+ cd pycorpdiff
195
+ pip install -e ".[dev]"
196
+ pytest -q # 519 default tests, ~7s
197
+ ```
198
+
199
+ Optional extras: `[viz]` (altair + matplotlib + networkx), `[semantic]`
200
+ (sentence-transformers + scikit-learn), `[temporal]` (ruptures +
201
+ statsmodels), `[polars]`, `[duckdb]`, `[huggingface]`, `[nlp]` (spaCy),
202
+ `[notebooks]` (jupyter + vl-convert + pysofra, for the showcase),
203
+ or `[all]`.
204
+
205
+ ## Cross-validation receipts
206
+
207
+ The math agrees with the standard tools — by automated test:
208
+
209
+ - **Rayson's LL Wizard** — 15 hand-derived contingency-table reference triples
210
+ - **NLTK** `BigramAssocMeasures` — PMI + t-score to ≤ 1e-12 on every adjacent bigram
211
+ - **Scattertext (Kessler 2017)** — behavioural agreement on the 2012 US Conventions corpus
212
+ - **quanteda (R)** via `rpy2` — byte-for-byte G² agreement (slow tier)
213
+ - **HistWords (Hamilton et al. 2016)** — diachronic cosine displacements on COHA (slow tier)
214
+
215
+ ## Citation
216
+
217
+ If you use `pycorpdiff` in academic work, please cite the software via
218
+ the `CITATION.cff` file in this repository — GitHub renders a "Cite this
219
+ repository" widget directly from it.
220
+
221
+ ## License
222
+
223
+ MIT — see [LICENSE](LICENSE).
224
+
225
+ ## Further reading
226
+
227
+ - [`docs/design.md`](docs/design.md) — three-layer architecture
228
+ - [`docs/statistical-methods.md`](docs/statistical-methods.md) — every metric's formula + citation
229
+ - [`examples/pycorpdiff_showcase.ipynb`](examples/pycorpdiff_showcase.ipynb) — full feature tour as a notebook
230
+ - [`docs/rendered/`](docs/rendered/) — self-contained HTML renders of the example notebooks
@@ -0,0 +1,135 @@
1
+ # pycorpdiff
2
+
3
+ <!--
4
+ TODO post-publish (Phase 5 — once GitHub repo public + PyPI published + Zenodo DOI minted):
5
+
6
+ [![PyPI](https://img.shields.io/pypi/v/pycorpdiff.svg)](https://pypi.org/project/pycorpdiff/)
7
+ [![Python versions](https://img.shields.io/pypi/pyversions/pycorpdiff.svg)](https://pypi.org/project/pycorpdiff/)
8
+ [![CI](https://github.com/jturner-uofl/pycorpdiff/actions/workflows/ci.yml/badge.svg)](https://github.com/jturner-uofl/pycorpdiff/actions/workflows/ci.yml)
9
+ [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.<RECORD>.svg)](https://doi.org/10.5281/zenodo.<RECORD>)
10
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
11
+ -->
12
+
13
+ **Comparative corpus analysis for modern Python workflows.**
14
+
15
+ `pycorpdiff` is the **missing comparative layer** between R's
16
+ [`quanteda`](https://quanteda.io/), the closed-source Sketch Engine
17
+ platform, and the fragmented Python NLP stack
18
+ (`nltk`/`spaCy`/`gensim`/`sentence-transformers`). Three public verbs
19
+ — `compare(a, b)`, `track(c, term)`, `compare.before_after(c, event)` —
20
+ consolidate keyness, collocations, dispersion, temporal trajectories,
21
+ changepoint detection, interrupted time series, causal-impact analysis,
22
+ forecasting, online changepoint detection, and embedding-based semantic
23
+ shift under a single notebook-native API. Every result carries its own
24
+ KWIC evidence: `.explain(term)` returns the source-text concordances
25
+ behind any ranked term.
26
+
27
+ The package answers the questions that researchers in corpus linguistics,
28
+ digital humanities, and computational social science routinely ask:
29
+
30
+ - *How does corpus A differ from corpus B?* — `compare(a, b).keyness()`
31
+ - *How has discourse around X evolved over time?* — `track(c, "x").over_time()`
32
+ - *What did "migrant" mean in 2005 vs 2023?* — `compare(...).semantic_shift("migrant", embedder=...)`
33
+ - *Did this event actually shift the conversation?* — `track(...).causal_impact(event_date=...)`
34
+ - *Where is the discourse heading?* — `track(...).forecast(horizon=4)`
35
+
36
+ `pycorpdiff` is positioned as **orchestration**, not reinvention.
37
+ Tokenizers (`spaCy`, `Stanza`, `jieba`, `fugashi`) and embedders (any
38
+ `SBERT`-compatible model) plug in via two `typing.Protocol` extension
39
+ points — one-line adapters, no plugin registry. The base install pulls
40
+ only `numpy`, `pandas`, `scipy`, and `pyarrow`; everything else is opt-in
41
+ via extras.
42
+
43
+ > **Status: pre-release alpha (0.1.0a0).** Public API is stable for the
44
+ > features described below; PyPI publication is the next milestone.
45
+
46
+ ## The three-layer architecture
47
+
48
+ | Layer | Purpose | Key surface |
49
+ |---|---|---|
50
+ | **1 — Ingestion + `Corpus`** | get text in, slice it, hash it | `from_dataframe`, `read_csv`, `read_parquet`, `read_txt`, `read_duckdb`, `from_huggingface`, `fetch_hansard`, `Corpus.slice/by_time/__hash__/doc_term_counts(_sparse)/to_polars` |
51
+ | **2 — Pure math** | statistics with no I/O | `keyness.{log_likelihood,chi_squared,log_ratio,percent_diff,bayes_factor,permutation_pvalues,keyness_multi,juilland_d,benjamini_hochberg}`; `collocation.{logdice,pmi,t_score,mi_three,collocation_shift,cooccurrence_network}`; `semantic.{HashEmbedder,SBERTEmbedder,semantic_trajectory,neighborhood_drift}`; `temporal.{changepoints,interrupted_time_series,forecast,causal_impact,bocpd}` |
52
+ | **3 — Verbs + Results** | public API | `compare`, `track`, `compare.before_after`, `keyness_multi`, plus 9 frozen-dataclass Result types each with `.to_df() / .plot() / .explain() / .summary() / .to_html() / .to_json()` |
53
+
54
+ ## Quick start
55
+
56
+ ```python
57
+ import pycorpdiff as pcd
58
+
59
+ news = pcd.from_dataframe(df, text_col="body", meta_cols=("outlet", "date"))
60
+
61
+ # Compare — one verb, three analyses
62
+ k = pcd.compare(news.slice(outlet="Guardian"), news.slice(outlet="Mail")).keyness()
63
+ c = pcd.compare(a, b).collocation_shift("migrant")
64
+ s = pcd.compare(a, b).semantic_shift("migrant", embedder=pcd.SBERTEmbedder())
65
+
66
+ # Track over time
67
+ tr = pcd.track(news, "migrant").over_time(freq="Y")
68
+ tr.changepoints() # offline PELT
69
+ tr.changepoints_online(hazard=1/24) # Bayesian online (Adams & MacKay 2007)
70
+ tr.interrupted_time_series(event_date="2016-06-23") # segmented OLS
71
+ tr.causal_impact(event_date="2016-06-23") # Bayesian counterfactual (Brodersen et al. 2015)
72
+ tr.forecast(horizon=4) # state-space ETS
73
+
74
+ # Before / after a known event
75
+ pcd.compare.before_after(news, event_date="2016-06-23").keyness()
76
+
77
+ # N-way (≥ 2 corpora)
78
+ pcd.keyness_multi([gu, ma, te, mi], labels=["Guardian", "Mail", "Telegraph", "Mirror"])
79
+
80
+ # The discourse as a graph
81
+ pcd.cooccurrence_network(news, top_n=50).plot()
82
+
83
+ # Every Result: .to_df() · .plot() · .explain() · .summary() · .to_html() · .to_json()
84
+ ```
85
+
86
+ See [`examples/pycorpdiff_showcase.ipynb`](examples/pycorpdiff_showcase.ipynb)
87
+ ([rendered HTML](docs/rendered/pycorpdiff_showcase.html)) for a
88
+ walkthrough on a synthetic UK Hansard corpus exercising every analytical
89
+ surface.
90
+
91
+ ## Installation
92
+
93
+ <!-- TODO post-publish: replace this block with the PyPI install commands once published. -->
94
+
95
+ Currently a pre-release alpha. From a local clone:
96
+
97
+ ```bash
98
+ git clone https://github.com/jturner-uofl/pycorpdiff
99
+ cd pycorpdiff
100
+ pip install -e ".[dev]"
101
+ pytest -q # 519 default tests, ~7s
102
+ ```
103
+
104
+ Optional extras: `[viz]` (altair + matplotlib + networkx), `[semantic]`
105
+ (sentence-transformers + scikit-learn), `[temporal]` (ruptures +
106
+ statsmodels), `[polars]`, `[duckdb]`, `[huggingface]`, `[nlp]` (spaCy),
107
+ `[notebooks]` (jupyter + vl-convert + pysofra, for the showcase),
108
+ or `[all]`.
109
+
110
+ ## Cross-validation receipts
111
+
112
+ The math agrees with the standard tools — by automated test:
113
+
114
+ - **Rayson's LL Wizard** — 15 hand-derived contingency-table reference triples
115
+ - **NLTK** `BigramAssocMeasures` — PMI + t-score to ≤ 1e-12 on every adjacent bigram
116
+ - **Scattertext (Kessler 2017)** — behavioural agreement on the 2012 US Conventions corpus
117
+ - **quanteda (R)** via `rpy2` — byte-for-byte G² agreement (slow tier)
118
+ - **HistWords (Hamilton et al. 2016)** — diachronic cosine displacements on COHA (slow tier)
119
+
120
+ ## Citation
121
+
122
+ If you use `pycorpdiff` in academic work, please cite the software via
123
+ the `CITATION.cff` file in this repository — GitHub renders a "Cite this
124
+ repository" widget directly from it.
125
+
126
+ ## License
127
+
128
+ MIT — see [LICENSE](LICENSE).
129
+
130
+ ## Further reading
131
+
132
+ - [`docs/design.md`](docs/design.md) — three-layer architecture
133
+ - [`docs/statistical-methods.md`](docs/statistical-methods.md) — every metric's formula + citation
134
+ - [`examples/pycorpdiff_showcase.ipynb`](examples/pycorpdiff_showcase.ipynb) — full feature tour as a notebook
135
+ - [`docs/rendered/`](docs/rendered/) — self-contained HTML renders of the example notebooks