pycorpdiff 0.1.0a3__tar.gz → 0.1.0a5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (128)
  1. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/.gitignore +4 -4
  2. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/CHANGELOG.md +8 -8
  3. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/CITATION.cff +2 -2
  4. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/PKG-INFO +25 -23
  5. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/README.md +23 -22
  6. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/pyproject.toml +5 -3
  7. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/__init__.py +2 -2
  8. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/collocation/network.py +14 -1
  9. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/compare.py +1 -1
  10. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/datasets/__init__.py +1 -1
  11. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/datasets/_generate_hansard.py +1 -1
  12. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/keyness/bayes.py +4 -3
  13. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/viz/__init__.py +1 -1
  14. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/integration/test_crossval_rayson.py +4 -2
  15. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/LICENSE +0 -0
  16. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/_backends/__init__.py +0 -0
  17. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/_backends/pandas.py +0 -0
  18. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/_backends/polars.py +0 -0
  19. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/collocation/__init__.py +0 -0
  20. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/collocation/cooccurrence.py +0 -0
  21. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/collocation/measures.py +0 -0
  22. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/collocation/shift.py +0 -0
  23. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/corpus.py +0 -0
  24. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/datasets/_data/hansard_sample.parquet +0 -0
  25. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/datasets/hansard.py +0 -0
  26. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/datasets/histwords.py +0 -0
  27. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/explain.py +0 -0
  28. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/io/__init__.py +0 -0
  29. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/io/duckdb.py +0 -0
  30. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/io/huggingface.py +0 -0
  31. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/io/readers.py +0 -0
  32. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/keyness/__init__.py +0 -0
  33. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/keyness/chi_squared.py +0 -0
  34. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/keyness/correction.py +0 -0
  35. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/keyness/dispersion.py +0 -0
  36. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/keyness/effect_sizes.py +0 -0
  37. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/keyness/loglikelihood.py +0 -0
  38. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/keyness/multicorpus.py +0 -0
  39. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/keyness/permutation.py +0 -0
  40. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/py.typed +0 -0
  41. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/results.py +0 -0
  42. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/semantic/__init__.py +0 -0
  43. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/semantic/alignment.py +0 -0
  44. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/semantic/embed.py +0 -0
  45. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/semantic/shift.py +0 -0
  46. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/semantic/trajectory.py +0 -0
  47. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/stats.py +0 -0
  48. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/temporal/__init__.py +0 -0
  49. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/temporal/bocpd.py +0 -0
  50. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/temporal/causal_impact.py +0 -0
  51. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/temporal/changepoint.py +0 -0
  52. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/temporal/forecast.py +0 -0
  53. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/temporal/its.py +0 -0
  54. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/temporal/slicing.py +0 -0
  55. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/tokenize.py +0 -0
  56. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/viz/bocpd.py +0 -0
  57. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/viz/causal_impact.py +0 -0
  58. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/viz/collocation.py +0 -0
  59. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/viz/dispersion.py +0 -0
  60. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/viz/forecast.py +0 -0
  61. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/viz/keyness.py +0 -0
  62. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/viz/network.py +0 -0
  63. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/viz/scattertext.py +0 -0
  64. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/viz/semantic_forecast.py +0 -0
  65. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/src/pycorpdiff/viz/trajectory.py +0 -0
  66. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/__init__.py +0 -0
  67. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/conftest.py +0 -0
  68. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/fixtures/__init__.py +0 -0
  69. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/integration/__init__.py +0 -0
  70. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/integration/test_collocation_integration.py +0 -0
  71. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/integration/test_crossval_histwords.py +0 -0
  72. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/integration/test_crossval_nltk.py +0 -0
  73. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/integration/test_crossval_quanteda.py +0 -0
  74. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/integration/test_crossval_scattertext.py +0 -0
  75. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/integration/test_explain_integration.py +0 -0
  76. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/integration/test_keyness_integration.py +0 -0
  77. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/integration/test_sbert_slow.py +0 -0
  78. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/integration/test_semantic_integration.py +0 -0
  79. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/integration/test_stop_words.py +0 -0
  80. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/integration/test_temporal_stats.py +0 -0
  81. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/integration/test_viz.py +0 -0
  82. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/property/__init__.py +0 -0
  83. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/property/test_collocation_properties.py +0 -0
  84. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/property/test_keyness_properties.py +0 -0
  85. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/property/test_temporal_properties.py +0 -0
  86. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/unit/__init__.py +0 -0
  87. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/unit/test_bayes_factor.py +0 -0
  88. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/unit/test_bocpd.py +0 -0
  89. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/unit/test_causal_impact.py +0 -0
  90. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/unit/test_changepoint.py +0 -0
  91. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/unit/test_chi_squared.py +0 -0
  92. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/unit/test_collocation_cooccurrence.py +0 -0
  93. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/unit/test_collocation_measures.py +0 -0
  94. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/unit/test_collocation_shift.py +0 -0
  95. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/unit/test_comparison_concordance.py +0 -0
  96. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/unit/test_cooccurrence_network.py +0 -0
  97. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/unit/test_corpus_hash.py +0 -0
  98. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/unit/test_corpus_vocab.py +0 -0
  99. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/unit/test_correction.py +0 -0
  100. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/unit/test_datasets_hansard.py +0 -0
  101. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/unit/test_dispersion.py +0 -0
  102. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/unit/test_dispersion_plot.py +0 -0
  103. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/unit/test_doc_term_counts_sparse.py +0 -0
  104. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/unit/test_effect_sizes.py +0 -0
  105. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/unit/test_embedders.py +0 -0
  106. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/unit/test_explain.py +0 -0
  107. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/unit/test_forecast.py +0 -0
  108. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/unit/test_forecast_semantic_drift.py +0 -0
  109. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/unit/test_from_huggingface.py +0 -0
  110. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/unit/test_hansard_fetcher.py +0 -0
  111. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/unit/test_histwords_loader.py +0 -0
  112. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/unit/test_its.py +0 -0
  113. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/unit/test_keyness_multi.py +0 -0
  114. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/unit/test_loglikelihood.py +0 -0
  115. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/unit/test_ngram_tokenizer.py +0 -0
  116. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/unit/test_permutation_keyness.py +0 -0
  117. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/unit/test_polars_interop.py +0 -0
  118. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/unit/test_procrustes.py +0 -0
  119. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/unit/test_read_duckdb.py +0 -0
  120. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/unit/test_read_txt_line_mode.py +0 -0
  121. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/unit/test_result_exports.py +0 -0
  122. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/unit/test_scattertext_plot.py +0 -0
  123. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/unit/test_semantic_neighbours.py +0 -0
  124. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/unit/test_semantic_shift.py +0 -0
  125. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/unit/test_semantic_trajectory.py +0 -0
  126. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/unit/test_smoke.py +0 -0
  127. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/unit/test_temporal.py +0 -0
  128. {pycorpdiff-0.1.0a3 → pycorpdiff-0.1.0a5}/tests/unit/test_wilson_ci.py +0 -0
@@ -30,12 +30,12 @@ Thumbs.db
30
30
  *.swo
31
31
  *~
32
32
 
33
- # AI workflow artefacts (kept local, never published)
34
- .claude/
35
-
36
33
  # Hypothesis example database (auto-managed)
37
34
  .hypothesis/
38
35
 
36
+ # Local tooling
37
+ .claude/
38
+
39
39
  # Jupyter checkpoints
40
40
  .ipynb_checkpoints/
41
41
 
@@ -56,5 +56,5 @@ examples/*.patched.ipynb
56
56
  # Stray uv lockfiles created outside the repo root
57
57
  **/uv.lock.tmp
58
58
 
59
- # Mkdocs build output (legacy; mkdocs.yml itself is gone)
59
+ # Static site build output
60
60
  site/
@@ -4,13 +4,13 @@ All notable changes to `pycorpdiff` are documented in this file. The format
4
4
  follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this
5
5
  project adheres to [Semantic Versioning](https://semver.org/).
6
6
 
7
- ## [0.1.0a3] — initial release
7
+ ## [0.1.0a5] — initial release
8
8
 
9
9
  The initial public release of `pycorpdiff` — comparative corpus analysis
10
10
  for modern Python workflows. Three public verbs (`compare`, `track`,
11
- `compare.before_after`), nine `Result` dataclasses with a uniform
12
- six-method contract (`.to_df / .plot / .explain / .summary / .to_html /
13
- .to_json`), two `typing.Protocol` extension points (`Tokenizer`,
11
+ `compare.before_after`), nine `Result` dataclasses each implementing the
12
+ relevant subset of `.to_df / .plot / .explain / .summary / .to_html /
13
+ .to_json`, two `typing.Protocol` extension points (`Tokenizer`,
14
14
  `Embedder`), and opt-in extras for visualisation, semantic embedding,
15
15
  temporal modelling, polars interop, DuckDB ingestion, and 🤗 Datasets.
16
16
 
@@ -33,12 +33,12 @@ temporal modelling, polars interop, DuckDB ingestion, and 🤗 Datasets.
33
33
 
34
34
  ### Cross-validated
35
35
 
36
- Numerically agrees with Rayson's LL Wizard (15 reference triples),
37
- NLTK's `BigramAssocMeasures` (≤ 1e-12 on PMI / t-score / MI³),
36
+ Numerically agrees with Rayson's LL Wizard on hand-derived reference
37
+ triples, NLTK's `BigramAssocMeasures` (≤ 1e-12 on PMI / t-score / MI³),
38
38
  Scattertext on the 2012 US conventions, `quanteda` via `rpy2`, and
39
39
  the HistWords COHA replication.
40
40
 
41
41
  ### Infrastructure
42
42
 
43
- 519 tests, `ruff` + `mypy --strict` clean across 55 source files,
44
- matrix CI on three Python versions × two operating systems.
43
+ Hundreds of tests, `ruff` + `mypy --strict` clean across the source
44
+ tree, matrix CI on three Python versions × two operating systems.
@@ -4,8 +4,8 @@ message: >
4
4
  entry. GitHub renders a "Cite this repository" widget directly from
5
5
  this file.
6
6
  title: "pycorpdiff: Comparative Corpus Analysis for Modern Python Workflows"
7
- version: 0.1.0a3
8
- date-released: 2026-05-22
7
+ version: 0.1.0a5
8
+ date-released: 2026-05-25
9
9
  authors:
10
10
  - family-names: Turner
11
11
  given-names: Jason
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pycorpdiff
3
- Version: 0.1.0a3
3
+ Version: 0.1.0a5
4
4
  Summary: Comparative corpus analysis for Python: keyness, collocations, semantic shift, temporal trajectories with changepoints + causal inference.
5
5
  Project-URL: Homepage, https://github.com/jturner-uofl/pycorpdiff
6
6
  Project-URL: Documentation, https://github.com/jturner-uofl/pycorpdiff
@@ -49,6 +49,7 @@ Provides-Extra: all
49
49
  Requires-Dist: altair>=5; extra == 'all'
50
50
  Requires-Dist: datasets>=2.14; extra == 'all'
51
51
  Requires-Dist: duckdb>=0.10; extra == 'all'
52
+ Requires-Dist: jupyter>=1.0; extra == 'all'
52
53
  Requires-Dist: matplotlib>=3.8; extra == 'all'
53
54
  Requires-Dist: networkx>=3.1; extra == 'all'
54
55
  Requires-Dist: polars>=1.0; extra == 'all'
@@ -110,9 +111,9 @@ platform, and the fragmented Python NLP stack
110
111
  consolidate keyness, collocations, dispersion, temporal trajectories,
111
112
  changepoint detection, interrupted time series, causal-impact analysis,
112
113
  forecasting, online changepoint detection, and embedding-based semantic
113
- shift under a single notebook-native API. Every result carries its own
114
- KWIC evidence: `.explain(term)` returns the source-text concordances
115
- behind any ranked term.
114
+ shift under a single notebook-native API. Keyness and collocation
115
+ results carry their own KWIC evidence: `.explain(term)` returns the
116
+ source-text concordances behind any ranked term.
116
117
 
117
118
  The package answers the questions corpus linguistics, digital humanities,
118
119
  and computational social science routinely have:
@@ -130,7 +131,7 @@ points — one-line adapters, no plugin registry. The base install pulls
130
131
  only `numpy`, `pandas`, `scipy`, and `pyarrow`; everything else is opt-in
131
132
  via extras.
132
133
 
133
- > **Status: alpha (0.1.0a3).** Public API is stable for the features
134
+ > **Status: alpha (0.1.0a5).** Public API is stable for the features
134
135
  > described below; on PyPI as `pip install pycorpdiff`.
135
136
 
136
137
  ## The three-layer architecture
@@ -139,7 +140,7 @@ via extras.
139
140
  |---|---|---|
140
141
  | **1 — Ingestion + `Corpus`** | get text in, slice it, hash it | `from_dataframe`, `read_csv`, `read_parquet`, `read_txt`, `read_duckdb`, `from_huggingface`, `fetch_hansard`, `Corpus.slice/by_time/__hash__/doc_term_counts(_sparse)/to_polars` |
141
142
  | **2 — Pure math** | statistics with no I/O | `keyness.{log_likelihood,chi_squared,log_ratio,percent_diff,bayes_factor,permutation_pvalues,keyness_multi,juilland_d,benjamini_hochberg}`; `collocation.{logdice,pmi,t_score,mi_three,collocation_shift,cooccurrence_network}`; `semantic.{HashEmbedder,SBERTEmbedder,semantic_trajectory,neighborhood_drift}`; `temporal.{changepoints,interrupted_time_series,forecast,causal_impact,bocpd}` |
142
- | **3 — Verbs + Results** | public API | `compare`, `track`, `compare.before_after`, `keyness_multi`, plus 9 frozen-dataclass Result types each with `.to_df() / .plot() / .explain() / .summary() / .to_html() / .to_json()` |
143
+ | **3 — Verbs + Results** | public API | `compare`, `track`, `compare.before_after`, `keyness_multi`, plus 9 frozen-dataclass Result types each implementing the relevant subset of `.to_df() / .plot() / .explain() / .summary() / .to_html() / .to_json()` |
143
144
 
144
145
  ## Quick start
145
146
 
@@ -150,7 +151,7 @@ pip install "pycorpdiff[viz]"
150
151
  ```python
151
152
  import pycorpdiff as pcd
152
153
 
153
- # Bundled UK-Hansard sample — runs offline, no data download.
154
+ # Bundled synthetic Hansard-style sample — runs offline, no data download.
154
155
  corpus = pcd.load_hansard_sample()
155
156
  immigration = corpus.slice(topic="immigration")
156
157
 
@@ -170,19 +171,21 @@ compare two slices, plot the result. Every other analytical method —
170
171
  collocation shifts, semantic drift, temporal trajectories, changepoint
171
172
  detection, causal-impact analysis, forecasting, co-occurrence networks,
172
173
  N-way keyness — follows the same shape. See
173
- [the showcase notebook](docs/rendered/pycorpdiff_showcase.html) for the
174
- full feature tour, or the cheat sheet below for one-line API previews.
174
+ [the showcase notebook](https://github.com/jturner-uofl/pycorpdiff/blob/main/examples/pycorpdiff_showcase.ipynb)
175
+ for the full feature tour, or the cheat sheet below for one-line API previews.
175
176
 
176
177
  ### Cheat sheet — every analytical surface in one block
177
178
 
178
179
  ```python
179
- # Compare verbs (returns Result objects with .plot / .to_df / .explain / .summary)
180
+ # Compare verbs (returns Result objects; methods exposed vary by Result)
180
181
  pcd.compare(a, b).keyness()
181
- pcd.compare(a, b).collocation_shift("migrant")
182
- pcd.compare(a, b).semantic_shift("migrant", embedder=pcd.SBERTEmbedder()) # [semantic]
182
+ pcd.compare(a, b).collocation_shift("immigrant")
183
+ pcd.compare(a, b).semantic_shift("immigrant", embedder=pcd.SBERTEmbedder()) # [semantic]
184
+ # SBERTEmbedder downloads a sentence-transformers model on first call;
185
+ # use pcd.HashEmbedder() for offline / deterministic-test settings.
183
186
 
184
187
  # Track over time (requires [temporal] for the changepoint + ITS + forecast + causal_impact methods)
185
- tr = pcd.track(corpus, "migrant").over_time(freq="Y")
188
+ tr = pcd.track(corpus, "immigrant").over_time(freq="Y")
186
189
  tr.changepoints() # offline PELT
187
190
  tr.changepoints_online(hazard=1/24) # Bayesian online (Adams & MacKay 2007)
188
191
  tr.interrupted_time_series(event_date="2016") # segmented OLS
@@ -199,10 +202,9 @@ pcd.keyness_multi([a, b, c, d], labels=["A", "B", "C", "D"])
199
202
  pcd.cooccurrence_network(corpus, top_n=30).plot()
200
203
  ```
201
204
 
202
- See [`examples/pycorpdiff_showcase.ipynb`](examples/pycorpdiff_showcase.ipynb)
203
- ([rendered HTML](docs/rendered/pycorpdiff_showcase.html)) for a
204
- walkthrough on a synthetic UK Hansard corpus exercising every analytical
205
- surface.
205
+ See [`examples/pycorpdiff_showcase.ipynb`](https://github.com/jturner-uofl/pycorpdiff/blob/main/examples/pycorpdiff_showcase.ipynb)
206
+ for a walkthrough on the synthetic Hansard-style corpus exercising
207
+ every analytical surface.
206
208
 
207
209
  ## Installation
208
210
 
@@ -232,7 +234,7 @@ pytest -q
232
234
 
233
235
  The math agrees with the standard tools — by automated test:
234
236
 
235
- - **Rayson's LL Wizard** — 15 hand-derived contingency-table reference triples
237
+ - **Rayson's LL Wizard** — hand-derived contingency-table reference triples
236
238
  - **NLTK** `BigramAssocMeasures` — PMI + t-score to ≤ 1e-12 on every adjacent bigram
237
239
  - **Scattertext (Kessler 2017)** — behavioural agreement on the 2012 US Conventions corpus
238
240
  - **quanteda (R)** via `rpy2` — byte-for-byte G² agreement (slow tier)
@@ -246,11 +248,11 @@ repository" widget directly from it.
246
248
 
247
249
  ## License
248
250
 
249
- MIT — see [LICENSE](LICENSE).
251
+ MIT — see [LICENSE](https://github.com/jturner-uofl/pycorpdiff/blob/main/LICENSE).
250
252
 
251
253
  ## Further reading
252
254
 
253
- - [`docs/design.md`](docs/design.md) — three-layer architecture
254
- - [`docs/statistical-methods.md`](docs/statistical-methods.md) — every metric's formula + citation
255
- - [`examples/pycorpdiff_showcase.ipynb`](examples/pycorpdiff_showcase.ipynb) — full feature tour as a notebook
256
- - [`docs/rendered/`](docs/rendered/) — self-contained HTML renders of the example notebooks
255
+ - [`docs/design.md`](https://github.com/jturner-uofl/pycorpdiff/blob/main/docs/design.md) — three-layer architecture
256
+ - [`docs/statistical-methods.md`](https://github.com/jturner-uofl/pycorpdiff/blob/main/docs/statistical-methods.md) — every metric's formula + citation
257
+ - [`examples/pycorpdiff_showcase.ipynb`](https://github.com/jturner-uofl/pycorpdiff/blob/main/examples/pycorpdiff_showcase.ipynb) — full feature tour as a notebook
258
+ - [`docs/rendered/`](https://github.com/jturner-uofl/pycorpdiff/tree/main/docs/rendered) — static HTML renders for offline viewing
@@ -15,9 +15,9 @@ platform, and the fragmented Python NLP stack
15
15
  consolidate keyness, collocations, dispersion, temporal trajectories,
16
16
  changepoint detection, interrupted time series, causal-impact analysis,
17
17
  forecasting, online changepoint detection, and embedding-based semantic
18
- shift under a single notebook-native API. Every result carries its own
19
- KWIC evidence: `.explain(term)` returns the source-text concordances
20
- behind any ranked term.
18
+ shift under a single notebook-native API. Keyness and collocation
19
+ results carry their own KWIC evidence: `.explain(term)` returns the
20
+ source-text concordances behind any ranked term.
21
21
 
22
22
  The package answers the questions corpus linguistics, digital humanities,
23
23
  and computational social science routinely have:
@@ -35,7 +35,7 @@ points — one-line adapters, no plugin registry. The base install pulls
35
35
  only `numpy`, `pandas`, `scipy`, and `pyarrow`; everything else is opt-in
36
36
  via extras.
37
37
 
38
- > **Status: alpha (0.1.0a3).** Public API is stable for the features
38
+ > **Status: alpha (0.1.0a5).** Public API is stable for the features
39
39
  > described below; on PyPI as `pip install pycorpdiff`.
40
40
 
41
41
  ## The three-layer architecture
@@ -44,7 +44,7 @@ via extras.
44
44
  |---|---|---|
45
45
  | **1 — Ingestion + `Corpus`** | get text in, slice it, hash it | `from_dataframe`, `read_csv`, `read_parquet`, `read_txt`, `read_duckdb`, `from_huggingface`, `fetch_hansard`, `Corpus.slice/by_time/__hash__/doc_term_counts(_sparse)/to_polars` |
46
46
  | **2 — Pure math** | statistics with no I/O | `keyness.{log_likelihood,chi_squared,log_ratio,percent_diff,bayes_factor,permutation_pvalues,keyness_multi,juilland_d,benjamini_hochberg}`; `collocation.{logdice,pmi,t_score,mi_three,collocation_shift,cooccurrence_network}`; `semantic.{HashEmbedder,SBERTEmbedder,semantic_trajectory,neighborhood_drift}`; `temporal.{changepoints,interrupted_time_series,forecast,causal_impact,bocpd}` |
47
- | **3 — Verbs + Results** | public API | `compare`, `track`, `compare.before_after`, `keyness_multi`, plus 9 frozen-dataclass Result types each with `.to_df() / .plot() / .explain() / .summary() / .to_html() / .to_json()` |
47
+ | **3 — Verbs + Results** | public API | `compare`, `track`, `compare.before_after`, `keyness_multi`, plus 9 frozen-dataclass Result types each implementing the relevant subset of `.to_df() / .plot() / .explain() / .summary() / .to_html() / .to_json()` |
48
48
 
49
49
  ## Quick start
50
50
 
@@ -55,7 +55,7 @@ pip install "pycorpdiff[viz]"
55
55
  ```python
56
56
  import pycorpdiff as pcd
57
57
 
58
- # Bundled UK-Hansard sample — runs offline, no data download.
58
+ # Bundled synthetic Hansard-style sample — runs offline, no data download.
59
59
  corpus = pcd.load_hansard_sample()
60
60
  immigration = corpus.slice(topic="immigration")
61
61
 
@@ -75,19 +75,21 @@ compare two slices, plot the result. Every other analytical method —
75
75
  collocation shifts, semantic drift, temporal trajectories, changepoint
76
76
  detection, causal-impact analysis, forecasting, co-occurrence networks,
77
77
  N-way keyness — follows the same shape. See
78
- [the showcase notebook](docs/rendered/pycorpdiff_showcase.html) for the
79
- full feature tour, or the cheat sheet below for one-line API previews.
78
+ [the showcase notebook](https://github.com/jturner-uofl/pycorpdiff/blob/main/examples/pycorpdiff_showcase.ipynb)
79
+ for the full feature tour, or the cheat sheet below for one-line API previews.
80
80
 
81
81
  ### Cheat sheet — every analytical surface in one block
82
82
 
83
83
  ```python
84
- # Compare verbs (returns Result objects with .plot / .to_df / .explain / .summary)
84
+ # Compare verbs (returns Result objects; methods exposed vary by Result)
85
85
  pcd.compare(a, b).keyness()
86
- pcd.compare(a, b).collocation_shift("migrant")
87
- pcd.compare(a, b).semantic_shift("migrant", embedder=pcd.SBERTEmbedder()) # [semantic]
86
+ pcd.compare(a, b).collocation_shift("immigrant")
87
+ pcd.compare(a, b).semantic_shift("immigrant", embedder=pcd.SBERTEmbedder()) # [semantic]
88
+ # SBERTEmbedder downloads a sentence-transformers model on first call;
89
+ # use pcd.HashEmbedder() for offline / deterministic-test settings.
88
90
 
89
91
  # Track over time (requires [temporal] for the changepoint + ITS + forecast + causal_impact methods)
90
- tr = pcd.track(corpus, "migrant").over_time(freq="Y")
92
+ tr = pcd.track(corpus, "immigrant").over_time(freq="Y")
91
93
  tr.changepoints() # offline PELT
92
94
  tr.changepoints_online(hazard=1/24) # Bayesian online (Adams & MacKay 2007)
93
95
  tr.interrupted_time_series(event_date="2016") # segmented OLS
@@ -104,10 +106,9 @@ pcd.keyness_multi([a, b, c, d], labels=["A", "B", "C", "D"])
104
106
  pcd.cooccurrence_network(corpus, top_n=30).plot()
105
107
  ```
106
108
 
107
- See [`examples/pycorpdiff_showcase.ipynb`](examples/pycorpdiff_showcase.ipynb)
108
- ([rendered HTML](docs/rendered/pycorpdiff_showcase.html)) for a
109
- walkthrough on a synthetic UK Hansard corpus exercising every analytical
110
- surface.
109
+ See [`examples/pycorpdiff_showcase.ipynb`](https://github.com/jturner-uofl/pycorpdiff/blob/main/examples/pycorpdiff_showcase.ipynb)
110
+ for a walkthrough on the synthetic Hansard-style corpus exercising
111
+ every analytical surface.
111
112
 
112
113
  ## Installation
113
114
 
@@ -137,7 +138,7 @@ pytest -q
137
138
 
138
139
  The math agrees with the standard tools — by automated test:
139
140
 
140
- - **Rayson's LL Wizard** — 15 hand-derived contingency-table reference triples
141
+ - **Rayson's LL Wizard** — hand-derived contingency-table reference triples
141
142
  - **NLTK** `BigramAssocMeasures` — PMI + t-score to ≤ 1e-12 on every adjacent bigram
142
143
  - **Scattertext (Kessler 2017)** — behavioural agreement on the 2012 US Conventions corpus
143
144
  - **quanteda (R)** via `rpy2` — byte-for-byte G² agreement (slow tier)
@@ -151,11 +152,11 @@ repository" widget directly from it.
151
152
 
152
153
  ## License
153
154
 
154
- MIT — see [LICENSE](LICENSE).
155
+ MIT — see [LICENSE](https://github.com/jturner-uofl/pycorpdiff/blob/main/LICENSE).
155
156
 
156
157
  ## Further reading
157
158
 
158
- - [`docs/design.md`](docs/design.md) — three-layer architecture
159
- - [`docs/statistical-methods.md`](docs/statistical-methods.md) — every metric's formula + citation
160
- - [`examples/pycorpdiff_showcase.ipynb`](examples/pycorpdiff_showcase.ipynb) — full feature tour as a notebook
161
- - [`docs/rendered/`](docs/rendered/) — self-contained HTML renders of the example notebooks
159
+ - [`docs/design.md`](https://github.com/jturner-uofl/pycorpdiff/blob/main/docs/design.md) — three-layer architecture
160
+ - [`docs/statistical-methods.md`](https://github.com/jturner-uofl/pycorpdiff/blob/main/docs/statistical-methods.md) — every metric's formula + citation
161
+ - [`examples/pycorpdiff_showcase.ipynb`](https://github.com/jturner-uofl/pycorpdiff/blob/main/examples/pycorpdiff_showcase.ipynb) — full feature tour as a notebook
162
+ - [`docs/rendered/`](https://github.com/jturner-uofl/pycorpdiff/tree/main/docs/rendered) — static HTML renders for offline viewing
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "pycorpdiff"
7
- version = "0.1.0a3"
7
+ version = "0.1.0a5"
8
8
  description = "Comparative corpus analysis for Python: keyness, collocations, semantic shift, temporal trajectories with changepoints + causal inference."
9
9
  readme = "README.md"
10
10
  license = { file = "LICENSE" }
@@ -45,7 +45,7 @@ dependencies = [
45
45
  ]
46
46
 
47
47
  [project.optional-dependencies]
48
- # Visualisation: altair-first, matplotlib retained for paper-grade figures.
48
+ # Visualisation: altair-first, matplotlib retained for publication-quality figures.
49
49
  viz = ["altair>=5", "matplotlib>=3.8", "networkx>=3.1"]
50
50
  # Embedding-based semantic shift. sentence-transformers pulls torch
51
51
  # transitively, which is why this is opt-in rather than a base dep.
@@ -67,7 +67,8 @@ huggingface = ["datasets>=2.14"]
67
67
  # static SVG/PNG export of altair charts, `pysofra` renders the showcase's
68
68
  # result tables in JAMA-style typography.
69
69
  notebooks = ["jupyter>=1.0", "vl-convert-python>=1.5", "pysofra>=0.1.0a3"]
70
- # Meta-extra so `pycorpdiff[all]` exercises every optional code path.
70
+ # Meta-extra: `pycorpdiff[all]` pulls in every optional code path
71
+ # including the notebook runtime.
71
72
  all = [
72
73
  "altair>=5",
73
74
  "matplotlib>=3.8",
@@ -81,6 +82,7 @@ all = [
81
82
  "pyarrow>=15",
82
83
  "duckdb>=0.10",
83
84
  "spacy>=3.7",
85
+ "jupyter>=1.0",
84
86
  "vl-convert-python>=1.5",
85
87
  "pysofra>=0.1.0a3",
86
88
  ]
@@ -14,12 +14,12 @@ Example
14
14
 
15
15
  >>> import pycorpdiff as pcd
16
16
  >>> pcd.__version__
17
- '0.1.0a3'
17
+ '0.1.0a5'
18
18
  """
19
19
 
20
20
  from __future__ import annotations
21
21
 
22
- __version__ = "0.1.0a3"
22
+ __version__ = "0.1.0a5"
23
23
 
24
24
  from .collocation.network import NetworkResult, cooccurrence_network
25
25
  from .compare import Comparison, compare
@@ -16,12 +16,14 @@ from __future__ import annotations
16
16
  from collections import Counter
17
17
  from collections.abc import Sequence
18
18
  from dataclasses import dataclass, field
19
- from typing import TYPE_CHECKING, Literal
19
+ from pathlib import Path
20
+ from typing import TYPE_CHECKING, Any, Literal
20
21
 
21
22
  import numpy as np
22
23
  import pandas as pd
23
24
 
24
25
  from ..corpus import Corpus, CorpusSlice
26
+ from ..results import _table_to_html, _table_to_json
25
27
  from .measures import logdice, mi_three, pmi, t_score
26
28
 
27
29
  if TYPE_CHECKING:
@@ -54,6 +56,17 @@ class NetworkResult:
54
56
  """Return the edges as a flat tidy DataFrame (for round-trips)."""
55
57
  return self.edges.copy()
56
58
 
59
+ def to_html(self, path: str | Path | None = None, **kw: Any) -> str:
60
+ """Render the edge table as HTML (returns the string and,
61
+ optionally, writes to ``path``). Extra kwargs forward to
62
+ :meth:`pandas.DataFrame.to_html`."""
63
+ return _table_to_html(self.edges, path, **kw)
64
+
65
+ def to_json(self, path: str | Path | None = None, **kw: Any) -> str:
66
+ """Render the edge table as JSON (default ``orient="records"``).
67
+ Returns the JSON string and, optionally, writes to ``path``."""
68
+ return _table_to_json(self.edges, path, **kw)
69
+
57
70
  def summary(self) -> str:
58
71
  return (
59
72
  f"NetworkResult(measure={self.measure}, window={self.window}, "
@@ -66,7 +66,7 @@ class Comparison:
66
66
  require ``effect_size=True`` and sort by that column.
67
67
  effect_size
68
68
  If True (default), also compute LogRatio (Hardie),
69
- %DIFF (Gabrielatos), and the BIC-Bayes factor (Wilson).
69
+ %DIFF (Gabrielatos), and the BIC-approximated Bayes factor.
70
70
  dispersion
71
71
  If True, compute Juilland's D for both corpora and flag
72
72
  terms where ``D < 0.5`` in either — the canonical "this is
@@ -3,7 +3,7 @@
3
3
  What ships with the package
4
4
  ---------------------------
5
5
 
6
- - :func:`load_hansard_sample` — a 200-speech synthetic corpus designed
6
+ - :func:`load_hansard_sample` — a 193-speech synthetic corpus designed
7
7
  to mimic UK Hansard's structure across two decades, four topics, and
8
8
  four parties, with topical language shifts around real-world events
9
9
  (Brexit referendum, COVID-19, the climate-emergency declarations).
@@ -172,7 +172,7 @@ TOPICS = ["immigration", "brexit", "nhs", "climate"]
172
172
 
173
173
 
174
174
  def generate(seed: int = 20260522) -> pd.DataFrame:
175
- """Return a deterministic 200-speech synthetic Hansard sample."""
175
+ """Return a deterministic 193-speech synthetic Hansard sample."""
176
176
  rng = np.random.default_rng(seed)
177
177
  rows: list[dict[str, object]] = []
178
178
  speech_id = 0
@@ -26,9 +26,10 @@ def bayes_factor(
26
26
  ) -> pd.Series:
27
27
  """BIC-approximated Bayes factor for each term's frequency difference.
28
28
 
29
- Uses Wilson's BIC approximation: ``BIC = |G²| - ln(N)`` where ``N``
30
- is the total tokens across both corpora and ``G²`` is the unsigned
31
- log-likelihood. The Bayes factor is then ``exp(BIC / 2)``.
29
+ The BIC approximation (Kass & Raftery 1995): ``BIC = |G²| - ln(N)``
30
+ where ``N`` is the total tokens across both corpora and ``G²`` is
31
+ the unsigned log-likelihood. The Bayes factor is then
32
+ ``exp(BIC / 2)``. Wilson (2013) is the keyness application.
32
33
 
33
34
  Interpret with Kass & Raftery (1995):
34
35
 
@@ -1,4 +1,4 @@
1
- """Visualisation helpers — altair-first, matplotlib for paper-grade figures.
1
+ """Visualisation helpers — altair-first, matplotlib for publication-quality figures.
2
2
 
3
3
  Every Result type's ``.plot()`` method delegates here. Plot functions
4
4
  also accept a bare DataFrame so users can call
@@ -6,8 +6,10 @@ single-cell keyness computation in corpus linguistics. Every value
6
6
  asserted below was either computed from Rayson's exact formula or
7
7
  copy-pasted from his calculator on a clean dataset.
8
8
 
9
- This file extends ``test_loglikelihood.py`` with the broader sweep
10
- called for by the audit's #15 item (cross-validation receipts).
9
+ This file extends ``test_loglikelihood.py`` with a broader sweep of
10
+ canonical reference triples covering edge cases (lopsided counts,
11
+ sparse cells, mid-sized over-representation) so that any future
12
+ refactor of the LL formula trips multiple assertions simultaneously.
11
13
  """
12
14
 
13
15
  from __future__ import annotations
File without changes