pystylometry 0.1.0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/__init__.py +1 -2
- pystylometry/_normalize.py +277 -0
- pystylometry/_types.py +1224 -2
- pystylometry/_utils.py +4 -0
- pystylometry/authorship/__init__.py +4 -0
- pystylometry/authorship/additional_methods.py +100 -0
- pystylometry/character/__init__.py +15 -0
- pystylometry/character/character_metrics.py +301 -0
- pystylometry/lexical/__init__.py +13 -6
- pystylometry/lexical/advanced_diversity.py +641 -0
- pystylometry/lexical/function_words.py +391 -0
- pystylometry/lexical/hapax.py +154 -7
- pystylometry/lexical/mtld.py +83 -7
- pystylometry/lexical/ttr.py +83 -0
- pystylometry/lexical/word_frequency_sophistication.py +581 -0
- pystylometry/lexical/yule.py +34 -7
- pystylometry/ngrams/__init__.py +2 -0
- pystylometry/ngrams/extended_ngrams.py +235 -0
- pystylometry/prosody/__init__.py +12 -0
- pystylometry/prosody/rhythm_prosody.py +53 -0
- pystylometry/readability/__init__.py +12 -0
- pystylometry/readability/additional_formulas.py +985 -0
- pystylometry/readability/ari.py +93 -17
- pystylometry/readability/coleman_liau.py +102 -9
- pystylometry/readability/complex_words.py +531 -0
- pystylometry/readability/flesch.py +59 -14
- pystylometry/readability/gunning_fog.py +194 -25
- pystylometry/readability/smog.py +31 -14
- pystylometry/readability/syllables.py +137 -30
- pystylometry/stylistic/__init__.py +20 -0
- pystylometry/stylistic/cohesion_coherence.py +45 -0
- pystylometry/stylistic/genre_register.py +45 -0
- pystylometry/stylistic/markers.py +131 -0
- pystylometry/stylistic/vocabulary_overlap.py +47 -0
- pystylometry/syntactic/__init__.py +4 -0
- pystylometry/syntactic/advanced_syntactic.py +432 -0
- pystylometry/syntactic/pos_ratios.py +104 -13
- pystylometry/syntactic/sentence_stats.py +57 -13
- pystylometry/syntactic/sentence_types.py +470 -0
- {pystylometry-0.1.0.dist-info → pystylometry-1.0.0.dist-info}/METADATA +49 -12
- pystylometry-1.0.0.dist-info/RECORD +46 -0
- {pystylometry-0.1.0.dist-info → pystylometry-1.0.0.dist-info}/WHEEL +1 -1
- pystylometry-0.1.0.dist-info/RECORD +0 -26
pystylometry/lexical/mtld.py
CHANGED
|
@@ -4,6 +4,64 @@ from .._types import MTLDResult
|
|
|
4
4
|
from .._utils import tokenize
|
|
5
5
|
|
|
6
6
|
|
|
7
|
+
def _calculate_mtld_direction(tokens: list[str], threshold: float, forward: bool) -> float:
    """
    Compute a one-directional MTLD score over a token sequence.

    The tokens are walked in the requested direction while a running
    type-token ratio (TTR) is tracked for the current segment. Each time
    the running TTR falls strictly below ``threshold`` a full factor is
    counted and the segment restarts. Whatever is left at the end
    contributes a fractional factor. The score is the total token count
    divided by the accumulated factor count.

    Args:
        tokens: Tokens to analyze.
        threshold: TTR cut-off that closes a factor. Expected to lie in
            (0, 1); this helper does not validate it.
        forward: True walks the tokens first-to-last, False last-to-first.

    Returns:
        The MTLD score for the chosen direction. 0.0 for an empty token
        list; the token count itself when no factor credit accumulated.
    """
    total = len(tokens)
    if not total:
        return 0.0

    # Only iteration order matters, so a reverse iterator is sufficient
    ordered = tokens if forward else reversed(tokens)

    factor_count = 0.0
    segment_len = 0
    seen: set[str] = set()

    for word in ordered:
        segment_len += 1
        seen.add(word)

        # Running TTR fell under the cut-off: close this factor and restart
        if len(seen) / segment_len < threshold:
            factor_count += 1.0
            segment_len = 0
            seen = set()

    # Trailing segment that never closed a factor earns partial credit:
    # (1 - ttr) / (1 - threshold), i.e. how far it got toward the cut-off.
    if segment_len:
        leftover_ttr = len(seen) / segment_len
        # The loop resets whenever TTR drops below threshold, so this should
        # always hold; the guard is kept as a defensive check.
        if leftover_ttr >= threshold:
            factor_count += (1.0 - leftover_ttr) / (1.0 - threshold)

    # No factor credit at all (TTR stayed at 1.0 throughout): fall back to
    # the text length instead of dividing by zero.
    return total / factor_count if factor_count > 0 else float(total)
|
|
63
|
+
|
|
64
|
+
|
|
7
65
|
def compute_mtld(
|
|
8
66
|
text: str,
|
|
9
67
|
threshold: float = 0.72,
|
|
@@ -16,8 +74,10 @@ def compute_mtld(
|
|
|
16
74
|
varying lengths.
|
|
17
75
|
|
|
18
76
|
Formula:
|
|
19
|
-
MTLD =
|
|
20
|
-
where
|
|
77
|
+
MTLD = total_tokens / factor_count
|
|
78
|
+
where factor_count includes:
|
|
79
|
+
- Completed factors (segments where TTR dropped below threshold)
|
|
80
|
+
- Partial factor for any remaining incomplete segment (weighted by proximity to threshold)
|
|
21
81
|
|
|
22
82
|
References:
|
|
23
83
|
McCarthy, P. M., & Jarvis, S. (2010). MTLD, vocd-D, and HD-D:
|
|
@@ -26,16 +86,28 @@ def compute_mtld(
|
|
|
26
86
|
|
|
27
87
|
Args:
|
|
28
88
|
text: Input text to analyze
|
|
29
|
-
threshold: TTR threshold to maintain (default: 0.72)
|
|
89
|
+
threshold: TTR threshold to maintain (default: 0.72, must be in range (0, 1))
|
|
30
90
|
|
|
31
91
|
Returns:
|
|
32
92
|
MTLDResult with forward, backward, and average MTLD scores
|
|
33
93
|
|
|
94
|
+
Raises:
|
|
95
|
+
ValueError: If threshold is not in range (0, 1)
|
|
96
|
+
|
|
34
97
|
Example:
|
|
35
98
|
>>> result = compute_mtld("The quick brown fox jumps over the lazy dog...")
|
|
36
99
|
>>> print(f"MTLD: {result.mtld_average:.2f}")
|
|
37
100
|
"""
|
|
38
|
-
|
|
101
|
+
# Validate threshold parameter
|
|
102
|
+
if not (0 < threshold < 1):
|
|
103
|
+
raise ValueError(
|
|
104
|
+
f"Threshold must be in range (0, 1), got {threshold}. "
|
|
105
|
+
"Common values: 0.72 (default), 0.5-0.8"
|
|
106
|
+
)
|
|
107
|
+
|
|
108
|
+
# Case-insensitive tokenization for consistency with other lexical metrics
|
|
109
|
+
# (compute_yule, compute_hapax_ratios both use text.lower())
|
|
110
|
+
tokens = tokenize(text.lower())
|
|
39
111
|
|
|
40
112
|
if len(tokens) == 0:
|
|
41
113
|
return MTLDResult(
|
|
@@ -45,9 +117,13 @@ def compute_mtld(
|
|
|
45
117
|
metadata={"token_count": 0, "threshold": threshold},
|
|
46
118
|
)
|
|
47
119
|
|
|
48
|
-
#
|
|
49
|
-
mtld_forward =
|
|
50
|
-
|
|
120
|
+
# Calculate MTLD in forward direction
|
|
121
|
+
mtld_forward = _calculate_mtld_direction(tokens, threshold, forward=True)
|
|
122
|
+
|
|
123
|
+
# Calculate MTLD in backward direction
|
|
124
|
+
mtld_backward = _calculate_mtld_direction(tokens, threshold, forward=False)
|
|
125
|
+
|
|
126
|
+
# Average of forward and backward
|
|
51
127
|
mtld_average = (mtld_forward + mtld_backward) / 2
|
|
52
128
|
|
|
53
129
|
return MTLDResult(
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
"""Type-Token Ratio (TTR) analysis using stylometry-ttr package.
|
|
2
|
+
|
|
3
|
+
This module provides a facade wrapper around the stylometry-ttr package,
|
|
4
|
+
maintaining consistent API patterns with other pystylometry metrics.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from .._types import TTRResult
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def compute_ttr(text: str, text_id: str | None = None) -> TTRResult:
    """
    Compute Type-Token Ratio (TTR) metrics by delegating to stylometry-ttr.

    This function is a thin facade: it calls ``stylometry_ttr.compute_ttr``
    and repackages the result as a ``TTRResult`` so the API matches the
    other pystylometry metrics.

    Fields copied from the underlying result:
        - total_words / unique_words: token and type counts
        - ttr: raw TTR (unique_words / total_words)
        - root_ttr: Root TTR, Guiraud's index (unique_words / sqrt(total_words))
        - log_ttr: Log TTR, Herdan's C (log(unique_words) / log(total_words))
        - sttr: Standardized TTR across fixed-size chunks
        - delta_std: standard deviation of TTR across chunks

    ``sttr`` and ``delta_std`` may come back as None from stylometry-ttr
    (e.g. for short texts). They are then reported as 0.0, and the
    ``sttr_available`` / ``delta_std_available`` metadata flags are False.

    References:
        Guiraud, P. (1960). Problèmes et méthodes de la statistique linguistique.
        Herdan, G. (1960). Type-token Mathematics: A Textbook of Mathematical
            Linguistics. Mouton.
        Johnson, W. (1944). Studies in language behavior: I. A program of research.
            Psychological Monographs, 56(2), 1-15.

    Args:
        text: Input text to analyze
        text_id: Optional identifier for the text; None is passed on (and
            recorded in metadata) as an empty string

    Returns:
        TTRResult with all TTR variants and metadata

    Raises:
        ImportError: If the stylometry-ttr package cannot be imported

    Example:
        >>> result = compute_ttr("Sample text here.", text_id="sample-001")
        >>> result.metadata["text_id"]
        'sample-001'
    """
    try:
        from stylometry_ttr import compute_ttr as _compute_ttr
    except ImportError as exc:
        message = (
            "TTR metrics require the stylometry-ttr package. "
            "This should have been installed as a core dependency. "
            "Install with: pip install stylometry-ttr"
        )
        raise ImportError(message) from exc

    # stylometry-ttr requires text_id to be a string, so None becomes ""
    label = text_id or ""
    raw = _compute_ttr(text, text_id=label)

    # Optional chunk-based metrics; None means the backend did not supply them
    sttr = raw.sttr
    delta_std = raw.delta_std
    has_sttr = sttr is not None
    has_delta_std = delta_std is not None

    metadata = {
        "text_id": label,
        "source": "stylometry-ttr",
        "sttr_available": has_sttr,
        "delta_std_available": has_delta_std,
    }

    return TTRResult(
        total_words=raw.total_words,
        unique_words=raw.unique_words,
        ttr=raw.ttr,
        root_ttr=raw.root_ttr,
        log_ttr=raw.log_ttr,
        sttr=sttr if has_sttr else 0.0,
        delta_std=delta_std if has_delta_std else 0.0,
        metadata=metadata,
    )
|