pystylometry-1.3.1.tar.gz
This diff shows the content of publicly released package versions as they appear in the supported public registries, and is provided for informational purposes only.
- pystylometry-1.3.1/LICENSE +21 -0
- pystylometry-1.3.1/PKG-INFO +79 -0
- pystylometry-1.3.1/README.md +51 -0
- pystylometry-1.3.1/pyproject.toml +116 -0
- pystylometry-1.3.1/pystylometry/README.md +42 -0
- pystylometry-1.3.1/pystylometry/__init__.py +247 -0
- pystylometry-1.3.1/pystylometry/_normalize.py +277 -0
- pystylometry-1.3.1/pystylometry/_types.py +2304 -0
- pystylometry-1.3.1/pystylometry/_utils.py +201 -0
- pystylometry-1.3.1/pystylometry/authorship/README.md +21 -0
- pystylometry-1.3.1/pystylometry/authorship/__init__.py +38 -0
- pystylometry-1.3.1/pystylometry/authorship/additional_methods.py +320 -0
- pystylometry-1.3.1/pystylometry/authorship/burrows_delta.py +152 -0
- pystylometry-1.3.1/pystylometry/authorship/compression.py +175 -0
- pystylometry-1.3.1/pystylometry/authorship/kilgarriff.py +354 -0
- pystylometry-1.3.1/pystylometry/authorship/zeta.py +109 -0
- pystylometry-1.3.1/pystylometry/character/README.md +17 -0
- pystylometry-1.3.1/pystylometry/character/__init__.py +15 -0
- pystylometry-1.3.1/pystylometry/character/character_metrics.py +389 -0
- pystylometry-1.3.1/pystylometry/cli.py +427 -0
- pystylometry-1.3.1/pystylometry/consistency/README.md +27 -0
- pystylometry-1.3.1/pystylometry/consistency/__init__.py +57 -0
- pystylometry-1.3.1/pystylometry/consistency/_thresholds.py +162 -0
- pystylometry-1.3.1/pystylometry/consistency/drift.py +549 -0
- pystylometry-1.3.1/pystylometry/dialect/README.md +26 -0
- pystylometry-1.3.1/pystylometry/dialect/__init__.py +65 -0
- pystylometry-1.3.1/pystylometry/dialect/_data/dialect_markers.json +1134 -0
- pystylometry-1.3.1/pystylometry/dialect/_loader.py +360 -0
- pystylometry-1.3.1/pystylometry/dialect/detector.py +533 -0
- pystylometry-1.3.1/pystylometry/lexical/README.md +23 -0
- pystylometry-1.3.1/pystylometry/lexical/__init__.py +27 -0
- pystylometry-1.3.1/pystylometry/lexical/advanced_diversity.py +680 -0
- pystylometry-1.3.1/pystylometry/lexical/function_words.py +590 -0
- pystylometry-1.3.1/pystylometry/lexical/hapax.py +352 -0
- pystylometry-1.3.1/pystylometry/lexical/mtld.py +219 -0
- pystylometry-1.3.1/pystylometry/lexical/repetition.py +506 -0
- pystylometry-1.3.1/pystylometry/lexical/ttr.py +149 -0
- pystylometry-1.3.1/pystylometry/lexical/word_frequency_sophistication.py +1805 -0
- pystylometry-1.3.1/pystylometry/lexical/yule.py +179 -0
- pystylometry-1.3.1/pystylometry/ngrams/README.md +18 -0
- pystylometry-1.3.1/pystylometry/ngrams/__init__.py +15 -0
- pystylometry-1.3.1/pystylometry/ngrams/entropy.py +231 -0
- pystylometry-1.3.1/pystylometry/ngrams/extended_ngrams.py +480 -0
- pystylometry-1.3.1/pystylometry/prosody/README.md +17 -0
- pystylometry-1.3.1/pystylometry/prosody/__init__.py +12 -0
- pystylometry-1.3.1/pystylometry/prosody/rhythm_prosody.py +815 -0
- pystylometry-1.3.1/pystylometry/readability/README.md +23 -0
- pystylometry-1.3.1/pystylometry/readability/__init__.py +27 -0
- pystylometry-1.3.1/pystylometry/readability/additional_formulas.py +2110 -0
- pystylometry-1.3.1/pystylometry/readability/ari.py +208 -0
- pystylometry-1.3.1/pystylometry/readability/coleman_liau.py +187 -0
- pystylometry-1.3.1/pystylometry/readability/complex_words.py +531 -0
- pystylometry-1.3.1/pystylometry/readability/flesch.py +230 -0
- pystylometry-1.3.1/pystylometry/readability/gunning_fog.py +236 -0
- pystylometry-1.3.1/pystylometry/readability/smog.py +169 -0
- pystylometry-1.3.1/pystylometry/readability/syllables.py +161 -0
- pystylometry-1.3.1/pystylometry/stylistic/README.md +20 -0
- pystylometry-1.3.1/pystylometry/stylistic/__init__.py +20 -0
- pystylometry-1.3.1/pystylometry/stylistic/cohesion_coherence.py +701 -0
- pystylometry-1.3.1/pystylometry/stylistic/genre_register.py +1588 -0
- pystylometry-1.3.1/pystylometry/stylistic/markers.py +725 -0
- pystylometry-1.3.1/pystylometry/stylistic/vocabulary_overlap.py +388 -0
- pystylometry-1.3.1/pystylometry/syntactic/README.md +20 -0
- pystylometry-1.3.1/pystylometry/syntactic/__init__.py +13 -0
- pystylometry-1.3.1/pystylometry/syntactic/advanced_syntactic.py +494 -0
- pystylometry-1.3.1/pystylometry/syntactic/pos_ratios.py +216 -0
- pystylometry-1.3.1/pystylometry/syntactic/sentence_stats.py +147 -0
- pystylometry-1.3.1/pystylometry/syntactic/sentence_types.py +526 -0
- pystylometry-1.3.1/pystylometry/tokenizer.py +598 -0
- pystylometry-1.3.1/pystylometry/viz/README.md +27 -0
- pystylometry-1.3.1/pystylometry/viz/__init__.py +71 -0
- pystylometry-1.3.1/pystylometry/viz/drift.py +589 -0
- pystylometry-1.3.1/pystylometry/viz/jsx/__init__.py +31 -0
- pystylometry-1.3.1/pystylometry/viz/jsx/_base.py +144 -0
- pystylometry-1.3.1/pystylometry/viz/jsx/report.py +677 -0
- pystylometry-1.3.1/pystylometry/viz/jsx/timeline.py +716 -0
- pystylometry-1.3.1/pystylometry/viz/jsx/viewer.py +1032 -0
@@ -0,0 +1,21 @@ pystylometry-1.3.1/LICENSE
+MIT License
+
+Copyright (c) 2025 Craig Trim
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
@@ -0,0 +1,79 @@ pystylometry-1.3.1/PKG-INFO
+Metadata-Version: 2.1
+Name: pystylometry
+Version: 1.3.1
+Summary: Comprehensive Python package for stylometric analysis
+License: MIT
+Keywords: stylometry,nlp,text-analysis,authorship,readability,lexical-diversity,readability-metrics
+Author: Craig Trim
+Author-email: craigtrim@gmail.com
+Requires-Python: >=3.9,<4.0
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Scientific/Engineering :: Information Analysis
+Classifier: Topic :: Text Processing :: Linguistic
+Classifier: Typing :: Typed
+Requires-Dist: stylometry-ttr (>=1.0.3,<2.0.0)
+Project-URL: Homepage, https://github.com/craigtrim/pystylometry
+Project-URL: Issues, https://github.com/craigtrim/pystylometry/issues
+Project-URL: Repository, https://github.com/craigtrim/pystylometry
+Description-Content-Type: text/markdown
+
+# pystylometry
+
+[](https://badge.fury.io/py/pystylometry)
+[](https://pepy.tech/project/pystylometry)
+[](https://pepy.tech/project/pystylometry)
+[](https://www.python.org/downloads/)
+[](https://opensource.org/licenses/MIT)
+[]()
+
+Stylometric analysis and authorship attribution for Python. 50+ metrics across 11 modules, from vocabulary diversity to AI-generation detection.
+
+## Install
+
+```bash
+pip install pystylometry # Core (lexical metrics)
+pip install pystylometry[all] # Everything
+```
+
+## Modules
+
+| Module | Metrics | Description |
+|--------|---------|-------------|
+| [**lexical**](https://github.com/craigtrim/pystylometry/tree/master/pystylometry/lexical) | TTR, MTLD, Yule's K/I, Hapax, MATTR, VocD-D, HD-D, MSTTR, function words, word frequency | Vocabulary diversity and richness |
+| [**readability**](https://github.com/craigtrim/pystylometry/tree/master/pystylometry/readability) | Flesch, Flesch-Kincaid, SMOG, Gunning Fog, Coleman-Liau, ARI, Dale-Chall, Fry, FORCAST, Linsear Write, Powers-Sumner-Kearl | Grade-level and difficulty scoring |
+| [**syntactic**](https://github.com/craigtrim/pystylometry/tree/master/pystylometry/syntactic) | POS ratios, sentence types, parse tree depth, clausal density, passive voice, T-units, dependency distance | Sentence and parse structure (requires spaCy) |
+| [**authorship**](https://github.com/craigtrim/pystylometry/tree/master/pystylometry/authorship) | Burrows' Delta, Cosine Delta, Zeta, Kilgarriff chi-squared, MinMax, John's Delta, NCD | Author attribution and text comparison |
+| [**stylistic**](https://github.com/craigtrim/pystylometry/tree/master/pystylometry/stylistic) | Contractions, hedges, intensifiers, modals, punctuation, vocabulary overlap (Jaccard/Dice/Cosine/KL), cohesion, genre/register | Style markers and text similarity |
+| [**character**](https://github.com/craigtrim/pystylometry/tree/master/pystylometry/character) | Letter frequencies, digit/uppercase ratios, special characters, whitespace | Character-level fingerprinting |
+| [**ngrams**](https://github.com/craigtrim/pystylometry/tree/master/pystylometry/ngrams) | Word/character/POS n-grams, Shannon entropy, skipgrams | N-gram profiles and entropy |
+| [**dialect**](https://github.com/craigtrim/pystylometry/tree/master/pystylometry/dialect) | British/American classification, spelling/grammar/vocabulary markers, markedness | Regional dialect detection |
+| [**consistency**](https://github.com/craigtrim/pystylometry/tree/master/pystylometry/consistency) | Sliding-window chi-squared drift, pattern classification | Intra-document style analysis |
+| [**prosody**](https://github.com/craigtrim/pystylometry/tree/master/pystylometry/prosody) | Syllable stress, rhythm regularity | Prose rhythm (requires spaCy) |
+| [**viz**](https://github.com/craigtrim/pystylometry/tree/master/pystylometry/viz) | Timeline, scatter, report (PNG + interactive HTML) | Drift detection visualization |
+
+## Development
+
+```bash
+git clone https://github.com/craigtrim/pystylometry && cd pystylometry
+pip install -e ".[dev,all]"
+make test # 1022 tests
+make lint # ruff + mypy
+make all # lint + test + build
+```
+
+## License
+
+MIT
+
+## Author
+
+Craig Trim -- craigtrim@gmail.com
+
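The metadata above can be cross-checked against a local install with the standard library alone. A minimal sketch, assuming `pystylometry` is already installed in the current environment (Python 3.9+):

```python
# Read the installed distribution's metadata (stdlib only).
# Assumes `pip install pystylometry` has been run in this environment.
from importlib.metadata import metadata, requires, version

print(version("pystylometry"))   # expected: 1.3.1
info = metadata("pystylometry")
print(info["Summary"])           # Comprehensive Python package for stylometric analysis
print(info["Requires-Python"])   # >=3.9,<4.0
print(requires("pystylometry"))  # core requirement: stylometry-ttr (>=1.0.3,<2.0.0)
```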
@@ -0,0 +1,51 @@ pystylometry-1.3.1/README.md
+# pystylometry
+
+[](https://badge.fury.io/py/pystylometry)
+[](https://pepy.tech/project/pystylometry)
+[](https://pepy.tech/project/pystylometry)
+[](https://www.python.org/downloads/)
+[](https://opensource.org/licenses/MIT)
+[]()
+
+Stylometric analysis and authorship attribution for Python. 50+ metrics across 11 modules, from vocabulary diversity to AI-generation detection.
+
+## Install
+
+```bash
+pip install pystylometry # Core (lexical metrics)
+pip install pystylometry[all] # Everything
+```
+
+## Modules
+
+| Module | Metrics | Description |
+|--------|---------|-------------|
+| [**lexical**](https://github.com/craigtrim/pystylometry/tree/master/pystylometry/lexical) | TTR, MTLD, Yule's K/I, Hapax, MATTR, VocD-D, HD-D, MSTTR, function words, word frequency | Vocabulary diversity and richness |
+| [**readability**](https://github.com/craigtrim/pystylometry/tree/master/pystylometry/readability) | Flesch, Flesch-Kincaid, SMOG, Gunning Fog, Coleman-Liau, ARI, Dale-Chall, Fry, FORCAST, Linsear Write, Powers-Sumner-Kearl | Grade-level and difficulty scoring |
+| [**syntactic**](https://github.com/craigtrim/pystylometry/tree/master/pystylometry/syntactic) | POS ratios, sentence types, parse tree depth, clausal density, passive voice, T-units, dependency distance | Sentence and parse structure (requires spaCy) |
+| [**authorship**](https://github.com/craigtrim/pystylometry/tree/master/pystylometry/authorship) | Burrows' Delta, Cosine Delta, Zeta, Kilgarriff chi-squared, MinMax, John's Delta, NCD | Author attribution and text comparison |
+| [**stylistic**](https://github.com/craigtrim/pystylometry/tree/master/pystylometry/stylistic) | Contractions, hedges, intensifiers, modals, punctuation, vocabulary overlap (Jaccard/Dice/Cosine/KL), cohesion, genre/register | Style markers and text similarity |
+| [**character**](https://github.com/craigtrim/pystylometry/tree/master/pystylometry/character) | Letter frequencies, digit/uppercase ratios, special characters, whitespace | Character-level fingerprinting |
+| [**ngrams**](https://github.com/craigtrim/pystylometry/tree/master/pystylometry/ngrams) | Word/character/POS n-grams, Shannon entropy, skipgrams | N-gram profiles and entropy |
+| [**dialect**](https://github.com/craigtrim/pystylometry/tree/master/pystylometry/dialect) | British/American classification, spelling/grammar/vocabulary markers, markedness | Regional dialect detection |
+| [**consistency**](https://github.com/craigtrim/pystylometry/tree/master/pystylometry/consistency) | Sliding-window chi-squared drift, pattern classification | Intra-document style analysis |
+| [**prosody**](https://github.com/craigtrim/pystylometry/tree/master/pystylometry/prosody) | Syllable stress, rhythm regularity | Prose rhythm (requires spaCy) |
+| [**viz**](https://github.com/craigtrim/pystylometry/tree/master/pystylometry/viz) | Timeline, scatter, report (PNG + interactive HTML) | Drift detection visualization |
+
+## Development
+
+```bash
+git clone https://github.com/craigtrim/pystylometry && cd pystylometry
+pip install -e ".[dev,all]"
+make test # 1022 tests
+make lint # ruff + mypy
+make all # lint + test + build
+```
+
+## License
+
+MIT
+
+## Author
+
+Craig Trim -- craigtrim@gmail.com
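The module table above maps to top-level `compute_*` functions. A short sketch of the direct, per-module calls, using function and result-field names taken from the package's own README and `__init__.py` docstrings (exact result fields are as documented there, not verified here):

```python
# Per-module calls, following the usage the package documents for itself.
# Result fields (mtld_average, dialect) come from the __init__.py docstring
# shown later in this diff.
from pystylometry.dialect import compute_dialect
from pystylometry.lexical import compute_mtld, compute_ttr

text = "The colour of the programme was brilliant."

print(compute_ttr(text))                # type-token ratio result object
print(compute_mtld(text).mtld_average)  # MTLD lexical diversity score
print(compute_dialect(text).dialect)    # e.g. 'british'
```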
@@ -0,0 +1,116 @@ pystylometry-1.3.1/pyproject.toml
+[tool.poetry]
+name = "pystylometry"
+version = "1.3.1"
+description = "Comprehensive Python package for stylometric analysis"
+authors = ["Craig Trim <craigtrim@gmail.com>"]
+readme = "README.md"
+license = "MIT"
+packages = [{ include = "pystylometry" }]
+keywords = [
+    "stylometry",
+    "nlp",
+    "text-analysis",
+    "authorship",
+    "readability",
+    "lexical-diversity",
+    "readability-metrics",
+]
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "Intended Audience :: Science/Research",
+    "Intended Audience :: Developers",
+    "License :: OSI Approved :: MIT License",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Topic :: Text Processing :: Linguistic",
+    "Topic :: Scientific/Engineering :: Information Analysis",
+    "Typing :: Typed",
+]
+
+[tool.poetry.scripts]
+# pystylometry-drift manuscript.txt --window-size=500 --stride=250
+pystylometry-drift = "pystylometry.cli:drift_cli"
+# pystylometry-viewer drift_analyzer.html
+pystylometry-viewer = "pystylometry.cli:viewer_cli"
+
+[tool.poetry.urls]
+Homepage = "https://github.com/craigtrim/pystylometry"
+Repository = "https://github.com/craigtrim/pystylometry"
+Issues = "https://github.com/craigtrim/pystylometry/issues"
+
+[tool.poetry.dependencies]
+python = "^3.9"
+stylometry-ttr = "^1.0.3"
+
+[tool.poetry.group.readability.dependencies]
+pronouncing = "^0.2.0"
+# spaCy added for NLP-enhanced Gunning Fog Index (PR #4)
+# Enables accurate proper noun detection (POS tagging) and inflection handling (lemmatization)
+# See: https://github.com/craigtrim/pystylometry/pull/4
+spacy = "^3.8.0"
+
+[tool.poetry.group.syntactic.dependencies]
+spacy = "^3.8.0"
+
+[tool.poetry.group.lexical.dependencies]
+# Lexical lookup tools for rare word and neologism detection
+# Used in hapax legomena analysis to distinguish:
+# - True neologisms (not in WordNet, not in BNC)
+# - Rare words (in BNC but not WordNet, or vice versa)
+# - Common words (in both lexicons)
+bnc-lookup = ">=1.3.0"
+wordnet-lookup = "*"
+
+[tool.poetry.group.viz.dependencies]
+# Visualization support for drift detection and stylometric analysis
+# Install with: pip install pystylometry[viz] or poetry install --with viz
+matplotlib = "^3.8.0"
+seaborn = "^0.13.0"
+
+[tool.poetry.group.dev.dependencies]
+pytest = "^8.0"
+pytest-cov = "^4.0"
+ruff = "^0.1"
+mypy = "^1.0"
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+python_files = ["test_*.py"]
+addopts = "-v --cov=pystylometry --cov-report=term-missing"
+
+[tool.ruff]
+line-length = 100
+target-version = "py39"
+
+[tool.ruff.lint]
+select = ["E", "F", "I", "N", "W"]
+ignore = []
+
+[tool.ruff.lint.per-file-ignores]
+# JSX template files contain embedded JavaScript that can't be easily line-wrapped
+"pystylometry/viz/jsx/*.py" = ["E501"]
+# Test files may have long assertions and test data
+"tests/*.py" = ["E501"]
+
+[tool.mypy]
+python_version = "3.9"
+warn_return_any = true
+warn_unused_configs = true
+disallow_untyped_defs = true
+
+[[tool.mypy.overrides]]
+module = [
+    "seaborn",
+    "seaborn.*",
+    "bnc_lookup",
+    "wordnet_lookup",
+    "matplotlib.*",
+]
+ignore_missing_imports = true
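The `[tool.poetry.scripts]` table registers two console scripts, `pystylometry-drift` and `pystylometry-viewer`. A small, version-portable stdlib check (a sketch, not part of the package) to confirm they are present after installation:

```python
# List the console scripts this distribution registers under
# [tool.poetry.scripts]. Handles both the Python 3.9 dict-style API and
# the 3.10+ EntryPoints.select() API of importlib.metadata.
from importlib.metadata import entry_points

eps = entry_points()
if hasattr(eps, "select"):                      # Python 3.10+
    scripts = eps.select(group="console_scripts")
else:                                           # Python 3.9
    scripts = eps.get("console_scripts", [])

for ep in scripts:
    if ep.name.startswith("pystylometry"):
        print(f"{ep.name} -> {ep.value}")
        # expected: pystylometry-drift -> pystylometry.cli:drift_cli
        #           pystylometry-viewer -> pystylometry.cli:viewer_cli
```

The file's own comments document the intended invocation (`pystylometry-drift manuscript.txt --window-size=500 --stride=250`) and note that the visualization dependencies can be pulled in with `poetry install --with viz`.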
@@ -0,0 +1,42 @@ pystylometry-1.3.1/pystylometry/README.md
+# pystylometry
+
+
+
+
+Core package for stylometric analysis and authorship attribution.
+
+## Module Map
+
+| Module | Purpose | Key Functions |
+|--------|---------|---------------|
+| [`lexical/`](lexical/) | Vocabulary diversity & richness | `compute_mtld`, `compute_yule`, `compute_ttr`, `compute_hapax_ratios` |
+| [`readability/`](readability/) | Text readability scoring | `compute_flesch`, `compute_gunning_fog`, `compute_ari`, `compute_smog` |
+| [`syntactic/`](syntactic/) | Sentence & parse structure | `compute_pos_ratios`, `compute_sentence_types`, `compute_advanced_syntactic` |
+| [`authorship/`](authorship/) | Author attribution & comparison | `compute_burrows_delta`, `compute_kilgarriff`, `compute_compression_distance` |
+| [`stylistic/`](stylistic/) | Style markers & vocabulary overlap | `compute_stylistic_markers`, `compute_vocabulary_overlap`, `compute_genre_register` |
+| [`character/`](character/) | Character-level features | `compute_character_metrics` |
+| [`ngrams/`](ngrams/) | N-gram entropy & sequences | `compute_extended_ngrams`, `compute_ngram_entropy` |
+| [`dialect/`](dialect/) | Regional dialect detection | `compute_dialect` |
+| [`consistency/`](consistency/) | Intra-document drift detection | `compute_kilgarriff_drift` |
+| [`prosody/`](prosody/) | Rhythm & stress patterns | `compute_rhythm_prosody` |
+| [`viz/`](viz/) | Visualization (PNG & interactive HTML) | `plot_drift_timeline`, `export_drift_report_jsx` |
+
+## Shared Internals
+
+| File | Purpose |
+|------|---------|
+| `_types.py` | All dataclass result types (e.g. `FleschResult`, `MTLDResult`, `KilgarriffDriftResult`) |
+| `_normalize.py` | Text normalization for readability and stylometry pipelines |
+| `_utils.py` | Shared tokenization and helper functions |
+| `tokenizer.py` | Configurable tokenizer with sentence/word splitting |
+| `cli.py` | Command-line interface (`pystylometry analyze`) |
+
+## Installation Extras
+
+```
+pip install pystylometry # Core (lexical only)
+pip install pystylometry[readability] # + readability
+pip install pystylometry[syntactic] # + syntactic (requires spaCy)
+pip install pystylometry[authorship] # + authorship attribution
+pip install pystylometry[all] # Everything
+```
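Because several modules depend on those optional extras, imports can be gated on availability. A sketch based on the `get_available_modules()` helper documented in `__init__.py` (next hunk); the short sample strings are illustrative:

```python
# Feature-gated imports: only touch modules whose optional dependencies are
# installed, mirroring the get_available_modules() docstring example.
from pystylometry import get_available_modules

available = get_available_modules()  # e.g. {'lexical': True, 'readability': False, ...}

if available["readability"]:
    from pystylometry.readability import compute_flesch
    print(compute_flesch("A short sample sentence for scoring.").reading_ease)

if available["consistency"]:
    from pystylometry.consistency import compute_kilgarriff_drift
    print(compute_kilgarriff_drift("A longer document would go here.").pattern)
```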
@@ -0,0 +1,247 @@ pystylometry-1.3.1/pystylometry/__init__.py
+"""
+pystylometry - Comprehensive Python package for stylometric analysis.
+
+A modular package for text analysis with lexical, readability, syntactic,
+authorship, n-gram, dialect detection, and consistency analysis metrics.
+
+Installation:
+    pip install pystylometry # Core (lexical only)
+    pip install pystylometry[readability] # With readability metrics
+    pip install pystylometry[syntactic] # With syntactic analysis
+    pip install pystylometry[authorship] # With authorship attribution
+    pip install pystylometry[all] # Everything
+
+Usage:
+    # Direct module imports
+    from pystylometry.lexical import compute_mtld, compute_yule
+    from pystylometry.readability import compute_flesch
+    from pystylometry.syntactic import compute_pos_ratios
+    from pystylometry.authorship import compute_burrows_delta, compute_kilgarriff
+    from pystylometry.consistency import compute_kilgarriff_drift
+    from pystylometry.dialect import compute_dialect
+
+    # Or use the unified analyze() function
+    from pystylometry import analyze
+
+    results = analyze(text, lexical_metrics=True, readability_metrics=True)
+    print(results.lexical['mtld'].mtld_average)
+    print(results.readability['flesch'].reading_ease)
+
+    # Dialect detection
+    result = compute_dialect("The colour of the programme was brilliant.")
+    print(result.dialect) # 'british'
+    print(result.british_score) # 0.85
+
+    # Consistency analysis (Style Drift Detector - Issue #36)
+    from pystylometry.consistency import compute_kilgarriff_drift
+
+    result = compute_kilgarriff_drift(long_document)
+    print(result.pattern) # 'consistent', 'sudden_spike', 'suspiciously_uniform', etc.
+    print(result.pattern_confidence)
+"""
+
+from ._types import AnalysisResult
+
+# Version
+__version__ = "0.1.0"
+
+# Core exports - always available
+from . import lexical
+
+# Optional exports - may raise ImportError if dependencies not installed
+try:
+    from . import readability  # noqa: F401
+
+    _READABILITY_AVAILABLE = True
+except ImportError:
+    _READABILITY_AVAILABLE = False
+
+try:
+    from . import syntactic  # noqa: F401
+
+    _SYNTACTIC_AVAILABLE = True
+except ImportError:
+    _SYNTACTIC_AVAILABLE = False
+
+# Prosody requires pronouncing (CMU dictionary) - same dependency as readability
+try:
+    from . import prosody  # noqa: F401 - Rhythm and prosody metrics (Issue #25)
+
+    _PROSODY_AVAILABLE = True
+except ImportError:
+    _PROSODY_AVAILABLE = False
+
+# Authorship, ngrams, dialect, consistency, and stylistic use only stdlib (no external dependencies)
+from . import (
+    authorship,  # noqa: F401
+    consistency,  # noqa: F401 - Style drift detection (Issue #36)
+    dialect,  # noqa: F401
+    ngrams,  # noqa: F401
+    stylistic,  # noqa: F401 - Vocabulary overlap and similarity (Issue #21)
+)
+
+_AUTHORSHIP_AVAILABLE = True
+_NGRAMS_AVAILABLE = True
+_DIALECT_AVAILABLE = True
+_CONSISTENCY_AVAILABLE = True
+_STYLISTIC_AVAILABLE = True
+
+
+def analyze(
+    text: str,
+    lexical_metrics: bool = True,
+    readability_metrics: bool = False,
+    syntactic_metrics: bool = False,
+    authorship_metrics: bool = False,
+    ngram_metrics: bool = False,
+) -> AnalysisResult:
+    """
+    Unified interface to compute multiple stylometric metrics at once.
+
+    This is a convenience function that calls all requested metric computations
+    and returns a unified result object. Only computes metrics for which the
+    required optional dependencies are installed.
+
+    Args:
+        text: Input text to analyze
+        lexical_metrics: Compute lexical diversity metrics (default: True)
+        readability_metrics: Compute readability metrics (default: False)
+        syntactic_metrics: Compute syntactic metrics (default: False)
+        authorship_metrics: Compute authorship metrics (default: False)
+            Note: Authorship metrics typically require multiple texts for comparison.
+            This will compute features that can be used for authorship analysis.
+        ngram_metrics: Compute n-gram entropy metrics (default: False)
+
+    Returns:
+        AnalysisResult with requested metrics in nested dictionaries
+
+    Raises:
+        ImportError: If requested analysis requires uninstalled dependencies
+
+    Example:
+        >>> from pystylometry import analyze
+        >>> results = analyze(text, lexical_metrics=True, readability_metrics=True)
+        >>> print(results.lexical['mtld'].mtld_average)
+        >>> print(results.readability['flesch'].reading_ease)
+
+    Example with all metrics:
+        >>> results = analyze(text, lexical_metrics=True, readability_metrics=True,
+        ...                   syntactic_metrics=True, ngram_metrics=True)
+        >>> print(f"MTLD: {results.lexical['mtld'].mtld_average:.2f}")
+        >>> print(f"Flesch: {results.readability['flesch'].reading_ease:.1f}")
+        >>> print(f"Noun ratio: {results.syntactic['pos'].noun_ratio:.3f}")
+        >>> print(f"Bigram entropy: {results.ngrams['word_bigram'].entropy:.3f}")
+    """
+    result = AnalysisResult(metadata={"text_length": len(text)})
+
+    # Lexical metrics (always available)
+    if lexical_metrics:
+        result.lexical = {}
+        result.lexical["ttr"] = lexical.compute_ttr(text)
+        result.lexical["mtld"] = lexical.compute_mtld(text)
+        result.lexical["yule"] = lexical.compute_yule(text)
+        result.lexical["hapax"] = lexical.compute_hapax_ratios(text)
+
+    # Readability metrics (optional dependency)
+    if readability_metrics:
+        if not _READABILITY_AVAILABLE:
+            raise ImportError(
+                "Readability metrics require optional dependencies. "
+                "Install with: pip install pystylometry[readability]"
+            )
+        # Import locally to avoid name conflict
+        from . import readability as readability_module
+
+        result.readability = {}
+        result.readability["flesch"] = readability_module.compute_flesch(text)
+        result.readability["smog"] = readability_module.compute_smog(text)
+        result.readability["gunning_fog"] = readability_module.compute_gunning_fog(text)
+        result.readability["coleman_liau"] = readability_module.compute_coleman_liau(text)
+        result.readability["ari"] = readability_module.compute_ari(text)
+
+    # Syntactic metrics (optional dependency)
+    if syntactic_metrics:
+        if not _SYNTACTIC_AVAILABLE:
+            raise ImportError(
+                "Syntactic metrics require optional dependencies. "
+                "Install with: pip install pystylometry[syntactic]"
+            )
+        # Import locally to avoid name conflict
+        from . import syntactic as syntactic_module
+
+        result.syntactic = {}
+        result.syntactic["pos"] = syntactic_module.compute_pos_ratios(text)
+        result.syntactic["sentence_stats"] = syntactic_module.compute_sentence_stats(text)
+
+    # Authorship metrics (uses stdlib only)
+    # Note: These are typically used for comparison between texts
+    # Here we just note that they're available but don't compute them
+    # since they require multiple texts as input
+    if authorship_metrics:
+        result.authorship = {
+            "note": "Authorship metrics require multiple texts for comparison. "
+            "Use pystylometry.authorship.compute_burrows_delta(text1, text2) directly."
+        }
+
+    # N-gram metrics (uses stdlib only)
+    if ngram_metrics:
+        result.ngrams = {}
+        result.ngrams["character_bigram"] = ngrams.compute_character_bigram_entropy(text)
+        result.ngrams["word_bigram"] = ngrams.compute_word_bigram_entropy(text)
+
+    return result
+
+
+# Convenient access to availability flags
+def get_available_modules() -> dict[str, bool]:
+    """
+    Get dictionary of available optional modules.
+
+    Returns:
+        Dictionary mapping module names to availability status
+
+    Example:
+        >>> from pystylometry import get_available_modules
+        >>> available = get_available_modules()
+        >>> if available['readability']:
+        ...     from pystylometry.readability import compute_flesch
+        >>> if available['consistency']:
+        ...     from pystylometry.consistency import compute_kilgarriff_drift
+    """
+    return {
+        "lexical": True,  # Always available
+        "readability": _READABILITY_AVAILABLE,
+        "syntactic": _SYNTACTIC_AVAILABLE,
+        "authorship": _AUTHORSHIP_AVAILABLE,
+        "ngrams": _NGRAMS_AVAILABLE,
+        "dialect": _DIALECT_AVAILABLE,
+        "consistency": _CONSISTENCY_AVAILABLE,  # Style drift detection (Issue #36)
+        "stylistic": _STYLISTIC_AVAILABLE,  # Vocabulary overlap (Issue #21)
+        "prosody": _PROSODY_AVAILABLE,  # Rhythm and prosody (Issue #25)
+    }
+
+
+__all__ = [
+    "__version__",
+    "analyze",
+    "get_available_modules",
+    "lexical",
+]
+
+# Conditionally add to __all__ based on availability
+if _READABILITY_AVAILABLE:
+    __all__.append("readability")
+if _SYNTACTIC_AVAILABLE:
+    __all__.append("syntactic")
+if _AUTHORSHIP_AVAILABLE:
+    __all__.append("authorship")
+if _NGRAMS_AVAILABLE:
+    __all__.append("ngrams")
+if _DIALECT_AVAILABLE:
+    __all__.append("dialect")
+if _CONSISTENCY_AVAILABLE:
+    __all__.append("consistency")
+if _STYLISTIC_AVAILABLE:
+    __all__.append("stylistic")
+if _PROSODY_AVAILABLE:
+    __all__.append("prosody")
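Putting the unified entry point to work: a sketch that uses the keyword names from the `analyze()` signature above and mirrors the documented `ImportError` behaviour when optional dependencies are missing. The file path is illustrative only.

```python
# Call analyze() with the keyword names from its actual signature.
# readability_metrics needs the optional readability dependencies;
# analyze() raises ImportError if they are not installed.
from pystylometry import analyze

with open("manuscript.txt", encoding="utf-8") as fh:  # illustrative path
    text = fh.read()

try:
    results = analyze(
        text,
        lexical_metrics=True,
        readability_metrics=True,
        ngram_metrics=True,
    )
except ImportError as exc:
    print(f"Optional dependency missing: {exc}")
else:
    print(results.lexical["mtld"].mtld_average)
    print(results.readability["flesch"].reading_ease)
    print(results.ngrams["word_bigram"].entropy)
```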