pystylometry 0.1.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/__init__.py +30 -5
- pystylometry/_normalize.py +277 -0
- pystylometry/_types.py +1954 -28
- pystylometry/_utils.py +4 -0
- pystylometry/authorship/__init__.py +26 -1
- pystylometry/authorship/additional_methods.py +75 -0
- pystylometry/authorship/kilgarriff.py +347 -0
- pystylometry/character/__init__.py +15 -0
- pystylometry/character/character_metrics.py +389 -0
- pystylometry/cli.py +427 -0
- pystylometry/consistency/__init__.py +57 -0
- pystylometry/consistency/_thresholds.py +162 -0
- pystylometry/consistency/drift.py +549 -0
- pystylometry/dialect/__init__.py +65 -0
- pystylometry/dialect/_data/dialect_markers.json +1134 -0
- pystylometry/dialect/_loader.py +360 -0
- pystylometry/dialect/detector.py +533 -0
- pystylometry/lexical/__init__.py +13 -6
- pystylometry/lexical/advanced_diversity.py +680 -0
- pystylometry/lexical/function_words.py +590 -0
- pystylometry/lexical/hapax.py +310 -33
- pystylometry/lexical/mtld.py +180 -22
- pystylometry/lexical/ttr.py +149 -0
- pystylometry/lexical/word_frequency_sophistication.py +1805 -0
- pystylometry/lexical/yule.py +142 -29
- pystylometry/ngrams/__init__.py +2 -0
- pystylometry/ngrams/entropy.py +150 -49
- pystylometry/ngrams/extended_ngrams.py +235 -0
- pystylometry/prosody/__init__.py +12 -0
- pystylometry/prosody/rhythm_prosody.py +53 -0
- pystylometry/readability/__init__.py +12 -0
- pystylometry/readability/additional_formulas.py +2110 -0
- pystylometry/readability/ari.py +173 -35
- pystylometry/readability/coleman_liau.py +150 -30
- pystylometry/readability/complex_words.py +531 -0
- pystylometry/readability/flesch.py +181 -32
- pystylometry/readability/gunning_fog.py +208 -35
- pystylometry/readability/smog.py +126 -28
- pystylometry/readability/syllables.py +137 -30
- pystylometry/stylistic/__init__.py +20 -0
- pystylometry/stylistic/cohesion_coherence.py +45 -0
- pystylometry/stylistic/genre_register.py +45 -0
- pystylometry/stylistic/markers.py +131 -0
- pystylometry/stylistic/vocabulary_overlap.py +47 -0
- pystylometry/syntactic/__init__.py +4 -0
- pystylometry/syntactic/advanced_syntactic.py +494 -0
- pystylometry/syntactic/pos_ratios.py +172 -17
- pystylometry/syntactic/sentence_stats.py +105 -18
- pystylometry/syntactic/sentence_types.py +526 -0
- pystylometry/viz/__init__.py +71 -0
- pystylometry/viz/drift.py +589 -0
- pystylometry/viz/jsx/__init__.py +31 -0
- pystylometry/viz/jsx/_base.py +144 -0
- pystylometry/viz/jsx/report.py +677 -0
- pystylometry/viz/jsx/timeline.py +716 -0
- pystylometry/viz/jsx/viewer.py +1032 -0
- {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/METADATA +49 -9
- pystylometry-1.1.0.dist-info/RECORD +63 -0
- pystylometry-1.1.0.dist-info/entry_points.txt +4 -0
- pystylometry-0.1.0.dist-info/RECORD +0 -26
- {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
"""Type-Token Ratio (TTR) analysis using the stylometry-ttr package.
|
|
2
|
+
|
|
3
|
+
This module provides a facade wrapper around the stylometry-ttr package,
|
|
4
|
+
maintaining consistent API patterns with other pystylometry metrics.
|
|
5
|
+
|
|
6
|
+
Related GitHub Issue:
|
|
7
|
+
#27 - Native chunked analysis with Distribution dataclass
|
|
8
|
+
https://github.com/craigtrim/pystylometry/issues/27
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from .._types import Distribution, TTRResult, make_distribution
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _single_value_distribution(value: float | None) -> Distribution:
    """Wrap one aggregate metric in a Distribution.

    Args:
        value: A metric reported by stylometry-ttr, or None when the package
            did not report one.

    Returns:
        ``make_distribution([value])`` when a value is present; otherwise an
        empty Distribution whose mean and median are NaN and whose std, range
        and iqr are 0.0.
    """
    if value is None:
        return Distribution(
            values=[],
            mean=float("nan"),
            median=float("nan"),
            std=0.0,
            range=0.0,
            iqr=0.0,
        )
    return make_distribution([value])


def compute_ttr(text: str, text_id: str | None = None, chunk_size: int = 1000) -> TTRResult:
    """
    Compute Type-Token Ratio (TTR) metrics for vocabulary richness.

    Thin facade over the stylometry-ttr package: the text is handed to
    ``stylometry_ttr.compute_ttr`` and the values it reports are repackaged
    into a pystylometry ``TTRResult``. TTR is the ratio of unique words
    (types) to total words (tokens).

    Metrics exposed on the result:
        - Raw TTR: unique_words / total_words
        - Root TTR (Guiraud's index): unique_words / sqrt(total_words)
        - Log TTR (Herdan's C): log(unique_words) / log(total_words)
        - STTR: Standardized TTR across fixed-size chunks (reduces length bias)
        - Delta Std: Standard deviation of TTR across chunks (vocabulary
          consistency)

    Handling of missing values:
        - ``ttr``, ``root_ttr`` and ``log_ttr`` become NaN when stylometry-ttr
          reports None for them.
        - ``sttr`` and ``delta_std`` become 0.0 when stylometry-ttr reports
          None; check ``metadata["sttr_available"]`` and
          ``metadata["delta_std_available"]`` to tell a real 0.0 from a
          missing value.
        - Every ``*_dist`` field holds a single-value Distribution, or an
          empty one (NaN mean/median) when the value is missing.

    Related GitHub Issue:
        #27 - Native chunked analysis with Distribution dataclass
        https://github.com/craigtrim/pystylometry/issues/27

    References:
        Guiraud, P. (1960). Problèmes et méthodes de la statistique linguistique.
        Herdan, G. (1960). Type-token Mathematics: A Textbook of Mathematical
            Linguistics. Mouton.
        Johnson, W. (1944). Studies in language behavior: I. A program of research.
            Psychological Monographs, 56(2), 1-15.

    Args:
        text: Input text to analyze.
        text_id: Optional identifier for the text (for tracking purposes).
            None is forwarded to stylometry-ttr, and stored in the metadata,
            as an empty string.
        chunk_size: Number of words per chunk (default: 1000). This value is
            only recorded on the returned result; it is not passed to
            stylometry-ttr, which handles its own internal chunking.

    Returns:
        TTRResult with all TTR variants, their Distribution wrappers,
        ``chunk_count`` fixed at 1 (only aggregate values are available) and
        a metadata dict with ``text_id``, ``source``, ``sttr_available`` and
        ``delta_std_available``.

    Raises:
        ImportError: If the stylometry-ttr package cannot be imported.

    Example:
        >>> # Illustrative output; exact values depend on stylometry-ttr.
        >>> result = compute_ttr("The quick brown fox jumps over the lazy dog.")
        >>> print(f"Raw TTR: {result.ttr:.3f}")
        Raw TTR: 0.900
        >>> print(f"Root TTR: {result.root_ttr:.3f}")
        Root TTR: 2.846
        >>> if result.metadata["sttr_available"]:
        ...     print(f"STTR: {result.sttr:.3f}")

        >>> # With text identifier
        >>> result = compute_ttr("Sample text here.", text_id="sample-001")
        >>> print(result.metadata["text_id"])
        sample-001
    """
    # Imported lazily so a missing package surfaces as an actionable message
    # at call time.
    try:
        from stylometry_ttr import compute_ttr as _compute_ttr
    except ImportError as e:
        raise ImportError(
            "TTR metrics require the stylometry-ttr package. "
            "This should have been installed as a core dependency. "
            "Install with: pip install stylometry-ttr"
        ) from e

    # stylometry-ttr requires text_id to be a string, so None becomes "".
    resolved_id = text_id or ""
    upstream = _compute_ttr(text, text_id=resolved_id)

    # Raw values as reported upstream; any of them may be None.
    raw_ttr = upstream.ttr
    raw_root = upstream.root_ttr
    raw_log = upstream.log_ttr
    raw_sttr = upstream.sttr
    raw_delta = upstream.delta_std

    has_sttr = raw_sttr is not None
    has_delta = raw_delta is not None

    # Only aggregate values are available, so each Distribution holds at most
    # one observation.
    ttr_dist = _single_value_distribution(raw_ttr)
    root_ttr_dist = _single_value_distribution(raw_root)
    log_ttr_dist = _single_value_distribution(raw_log)
    sttr_dist = _single_value_distribution(raw_sttr)
    delta_std_dist = _single_value_distribution(raw_delta)

    # Scalar fallbacks: NaN for the whole-text ratios, 0.0 for the
    # chunk-based statistics.
    ttr_scalar = float("nan") if raw_ttr is None else raw_ttr
    root_scalar = float("nan") if raw_root is None else raw_root
    log_scalar = float("nan") if raw_log is None else raw_log
    sttr_scalar = raw_sttr if has_sttr else 0.0
    delta_scalar = raw_delta if has_delta else 0.0

    metadata = {
        "text_id": resolved_id,
        "source": "stylometry-ttr",
        "sttr_available": has_sttr,
        "delta_std_available": has_delta,
    }

    return TTRResult(
        total_words=upstream.total_words,
        unique_words=upstream.unique_words,
        ttr=ttr_scalar,
        root_ttr=root_scalar,
        log_ttr=log_scalar,
        sttr=sttr_scalar,
        delta_std=delta_scalar,
        ttr_dist=ttr_dist,
        root_ttr_dist=root_ttr_dist,
        log_ttr_dist=log_ttr_dist,
        sttr_dist=sttr_dist,
        delta_std_dist=delta_std_dist,
        chunk_size=chunk_size,
        chunk_count=1,  # stylometry-ttr returns aggregate results
        metadata=metadata,
    )
|