pystylometry-0.1.0-py3-none-any.whl
- pystylometry/__init__.py +206 -0
- pystylometry/_types.py +172 -0
- pystylometry/_utils.py +197 -0
- pystylometry/authorship/__init__.py +10 -0
- pystylometry/authorship/burrows_delta.py +152 -0
- pystylometry/authorship/zeta.py +109 -0
- pystylometry/lexical/__init__.py +17 -0
- pystylometry/lexical/hapax.py +75 -0
- pystylometry/lexical/mtld.py +61 -0
- pystylometry/lexical/yule.py +66 -0
- pystylometry/ngrams/__init__.py +13 -0
- pystylometry/ngrams/entropy.py +130 -0
- pystylometry/readability/__init__.py +15 -0
- pystylometry/readability/ari.py +70 -0
- pystylometry/readability/coleman_liau.py +67 -0
- pystylometry/readability/flesch.py +81 -0
- pystylometry/readability/gunning_fog.py +63 -0
- pystylometry/readability/smog.py +71 -0
- pystylometry/readability/syllables.py +54 -0
- pystylometry/syntactic/__init__.py +9 -0
- pystylometry/syntactic/pos_ratios.py +61 -0
- pystylometry/syntactic/sentence_stats.py +60 -0
- pystylometry/tokenizer.py +598 -0
- pystylometry-0.1.0.dist-info/METADATA +238 -0
- pystylometry-0.1.0.dist-info/RECORD +26 -0
- pystylometry-0.1.0.dist-info/WHEEL +4 -0

pystylometry/authorship/burrows_delta.py
@@ -0,0 +1,152 @@
+"""Burrows' Delta and Cosine Delta for authorship attribution."""
+
+import math
+import statistics
+from collections import Counter
+
+from .._types import BurrowsDeltaResult
+from .._utils import tokenize
+
+
+def compute_burrows_delta(
+    text1: str, text2: str, mfw: int = 500, distance_type: str = "burrows"
+) -> BurrowsDeltaResult:
+    """
+    Compute Burrows' Delta or Cosine Delta between two texts.
+
+    Burrows' Delta:
+        Delta = mean(|z₁(f) - z₂(f)|) for all features f
+        where z(f) = (frequency(f) - mean(f)) / std(f)
+
+    Cosine Delta:
+        Delta = 1 - cos(z₁, z₂)
+        Measures angular distance between z-score vectors
+
+    Both methods:
+        1. Extract most frequent words (MFW) across both texts
+        2. Calculate word frequencies in each text
+        3. Z-score normalize frequencies
+        4. Compute distance measure
+
+    Lower scores indicate more similar texts (likely same author).
+
+    References:
+        Burrows, J. (2002). 'Delta': A measure of stylistic difference and
+        a guide to likely authorship. Literary and Linguistic Computing, 17(3), 267-287.
+
+        Argamon, S. (2008). Interpreting Burrows's Delta: Geometric and
+        probabilistic foundations. Literary and Linguistic Computing, 23(2), 131-147.
+
+    Args:
+        text1: First text to compare
+        text2: Second text to compare
+        mfw: Number of most frequent words to use (default: 500)
+        distance_type: "burrows", "cosine", or "eder" (default: "burrows")
+
+    Returns:
+        BurrowsDeltaResult with delta score and metadata
+
+    Example:
+        >>> result = compute_burrows_delta(text1, text2, mfw=300)
+        >>> print(f"Delta score: {result.delta_score:.3f}")
+        >>> print(f"Lower is more similar")
+    """
+    # Tokenize and count words
+    tokens1 = [t.lower() for t in tokenize(text1)]
+    tokens2 = [t.lower() for t in tokenize(text2)]
+
+    if len(tokens1) == 0 or len(tokens2) == 0:
+        return BurrowsDeltaResult(
+            delta_score=0.0,
+            distance_type=distance_type,
+            mfw_count=0,
+            metadata={
+                "text1_token_count": len(tokens1),
+                "text2_token_count": len(tokens2),
+                "warning": "One or both texts are empty",
+            },
+        )
+
+    # Get word frequencies
+    freq1 = Counter(tokens1)
+    freq2 = Counter(tokens2)
+
+    # Get most frequent words across both texts
+    all_words: Counter[str] = Counter()
+    all_words.update(freq1)
+    all_words.update(freq2)
+    most_common_words = [word for word, _ in all_words.most_common(mfw)]
+
+    # Calculate relative frequencies for MFW
+    def get_relative_freqs(freq_counter: Counter, words: list[str], total: int) -> list[float]:
+        return [freq_counter.get(word, 0) / total for word in words]
+
+    rel_freqs1 = get_relative_freqs(freq1, most_common_words, len(tokens1))
+    rel_freqs2 = get_relative_freqs(freq2, most_common_words, len(tokens2))
+
+    # Combine for z-score calculation (treat as corpus)
+    combined_freqs = [(f1 + f2) / 2 for f1, f2 in zip(rel_freqs1, rel_freqs2)]
+
+    # Calculate standard deviation for each word position
+    combined_std = []
+    for i in range(len(most_common_words)):
+        values = [rel_freqs1[i], rel_freqs2[i]]
+        std = statistics.stdev(values) if len(set(values)) > 1 else 1e-10
+        combined_std.append(std if std > 0 else 1e-10)
+
+    # Calculate z-scores
+    z1 = [(f - mean) / std for f, mean, std in zip(rel_freqs1, combined_freqs, combined_std)]
+    z2 = [(f - mean) / std for f, mean, std in zip(rel_freqs2, combined_freqs, combined_std)]
+
+    # Calculate distance based on type
+    if distance_type == "burrows":
+        # Burrows' Delta: mean absolute difference of z-scores
+        abs_diffs = [abs(z1_val - z2_val) for z1_val, z2_val in zip(z1, z2)]
+        delta_score = statistics.mean(abs_diffs) if abs_diffs else 0.0
+    elif distance_type == "cosine":
+        # Cosine Delta: 1 - cosine similarity
+        dot_product = sum(z1_val * z2_val for z1_val, z2_val in zip(z1, z2))
+        norm1 = math.sqrt(sum(z**2 for z in z1))
+        norm2 = math.sqrt(sum(z**2 for z in z2))
+        cosine_sim = dot_product / (norm1 * norm2) if norm1 > 0 and norm2 > 0 else 0
+        delta_score = 1 - cosine_sim
+    elif distance_type == "eder":
+        # Eder's Delta: similar to Burrows but with different normalization
+        abs_diffs = [abs(z1_val - z2_val) for z1_val, z2_val in zip(z1, z2)]
+        delta_score = statistics.mean(abs_diffs) if abs_diffs else 0.0
+    else:
+        abs_diffs = [abs(z1_val - z2_val) for z1_val, z2_val in zip(z1, z2)]
+        delta_score = statistics.mean(abs_diffs) if abs_diffs else 0.0
+
+    return BurrowsDeltaResult(
+        delta_score=delta_score,
+        distance_type=distance_type,
+        mfw_count=len(most_common_words),
+        metadata={
+            "text1_token_count": len(tokens1),
+            "text2_token_count": len(tokens2),
+            "text1_vocab": len(freq1),
+            "text2_vocab": len(freq2),
+        },
+    )
+
+
+def compute_cosine_delta(text1: str, text2: str, mfw: int = 500) -> BurrowsDeltaResult:
+    """
+    Compute Cosine Delta between two texts.
+
+    Convenience function that calls compute_burrows_delta with distance_type="cosine".
+
+    Args:
+        text1: First text to compare
+        text2: Second text to compare
+        mfw: Number of most frequent words to use (default: 500)
+
+    Returns:
+        BurrowsDeltaResult with cosine delta score
+
+    Example:
+        >>> result = compute_cosine_delta(text1, text2)
+        >>> print(f"Cosine Delta: {result.delta_score:.3f}")
+    """
+    return compute_burrows_delta(text1, text2, mfw=mfw, distance_type="cosine")
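
The `burrows_delta.py` hunk above is fully implemented, so only a usage sketch is added here. It assumes the wheel is installed and imports the functions by the module path shown in the file list; the two sample strings are invented placeholders, not part of the package.

```python
# Minimal usage sketch; sample texts are invented placeholders.
from pystylometry.authorship.burrows_delta import compute_burrows_delta, compute_cosine_delta

text_a = "The carriage rolled slowly through the village while the sisters spoke of the ball."
text_b = "The whale surfaced beside the ship and the sailors shouted across the grey water."

burrows = compute_burrows_delta(text_a, text_b, mfw=100, distance_type="burrows")
cosine = compute_cosine_delta(text_a, text_b, mfw=100)

print(burrows.delta_score, burrows.mfw_count)    # lower delta = stylistically closer
print(cosine.delta_score, cosine.distance_type)  # distance_type is "cosine"
print(burrows.metadata["text1_token_count"], burrows.metadata["text2_token_count"])
```

Note that with only two input texts, the z-scores are standardised against the mean of the two relative-frequency vectors rather than against a larger reference corpus, which is what the `combined_freqs` step in the hunk makes explicit.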

pystylometry/authorship/zeta.py
@@ -0,0 +1,109 @@
+"""Zeta score for distinctive word usage in authorship attribution."""
+
+from .._types import ZetaResult
+from .._utils import tokenize
+
+
+def compute_zeta(text1: str, text2: str, segments: int = 10, top_n: int = 50) -> ZetaResult:
+    """
+    Compute Zeta score for distinctive word usage between two texts or text groups.
+
+    Zeta identifies words that are consistently used in one text/author but not another.
+
+    Algorithm:
+        1. Divide each text into segments
+        2. Calculate document proportion (DP) for each word:
+           - DP₁ = proportion of segments in text1 containing the word
+           - DP₂ = proportion of segments in text2 containing the word
+        3. Zeta score = DP₁ - DP₂
+        4. Positive Zeta = marker words (distinctive of text1)
+        5. Negative Zeta = anti-marker words (distinctive of text2)
+
+    References:
+        Burrows, J. (2007). All the way through: Testing for authorship in
+        different frequency strata. Literary and Linguistic Computing, 22(1), 27-47.
+
+        Craig, H., & Kinney, A. F. (2009). Shakespeare, Computers, and the
+        Mystery of Authorship. Cambridge University Press.
+
+    Args:
+        text1: First text (candidate author)
+        text2: Second text (comparison author/corpus)
+        segments: Number of segments to divide each text into (default: 10)
+        top_n: Number of top marker/anti-marker words to return (default: 50)
+
+    Returns:
+        ZetaResult with zeta score, marker words, and anti-marker words
+
+    Example:
+        >>> result = compute_zeta(author1_text, author2_text)
+        >>> print(f"Zeta score: {result.zeta_score:.3f}")
+        >>> print(f"Marker words: {result.marker_words[:10]}")
+        >>> print(f"Anti-markers: {result.anti_marker_words[:10]}")
+    """
+    # Tokenize texts
+    tokens1 = [t.lower() for t in tokenize(text1)]
+    tokens2 = [t.lower() for t in tokenize(text2)]
+
+    if len(tokens1) < segments or len(tokens2) < segments:
+        return ZetaResult(
+            zeta_score=0.0,
+            marker_words=[],
+            anti_marker_words=[],
+            metadata={
+                "text1_token_count": len(tokens1),
+                "text2_token_count": len(tokens2),
+                "segments": segments,
+                "top_n": top_n,
+                "warning": "Text too short for requested number of segments",
+            },
+        )
+
+    # Divide texts into segments
+    def create_segments(tokens: list[str], n_segments: int) -> list[set[str]]:
+        segment_size = len(tokens) // n_segments
+        return [set(tokens[i * segment_size : (i + 1) * segment_size]) for i in range(n_segments)]
+
+    segments1 = create_segments(tokens1, segments)
+    segments2 = create_segments(tokens2, segments)
+
+    # Get all unique words
+    all_words = set(tokens1) | set(tokens2)
+
+    # Calculate document proportion (DP) for each word
+    word_scores = {}
+    for word in all_words:
+        # DP1: proportion of segments in text1 containing the word
+        dp1 = sum(1 for seg in segments1 if word in seg) / len(segments1)
+        # DP2: proportion of segments in text2 containing the word
+        dp2 = sum(1 for seg in segments2 if word in seg) / len(segments2)
+        # Zeta score for this word
+        word_scores[word] = dp1 - dp2
+
+    # Sort words by zeta score
+    sorted_words = sorted(word_scores.items(), key=lambda x: x[1], reverse=True)
+
+    # Extract top marker words (positive zeta) and anti-marker words (negative zeta)
+    marker_words = [word for word, score in sorted_words[:top_n] if score > 0]
+    anti_marker_words = [word for word, score in sorted_words[-top_n:] if score < 0]
+    anti_marker_words.reverse()  # Most negative first
+
+    # Overall zeta score (mean of absolute zeta scores)
+    zeta_score = (
+        sum(abs(score) for score in word_scores.values()) / len(word_scores) if word_scores else 0.0
+    )
+
+    return ZetaResult(
+        zeta_score=zeta_score,
+        marker_words=marker_words,
+        anti_marker_words=anti_marker_words,
+        metadata={
+            "text1_token_count": len(tokens1),
+            "text2_token_count": len(tokens2),
+            "segments": segments,
+            "top_n": top_n,
+            "total_unique_words": len(all_words),
+            "marker_word_count": len(marker_words),
+            "anti_marker_word_count": len(anti_marker_words),
+        },
+    )
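
The `zeta.py` hunk is likewise complete, so a minimal usage sketch follows. The two passages are invented placeholders, and `segments` is lowered to 5 so these short inputs pass the length check in `compute_zeta`.

```python
# Minimal usage sketch; sample passages are invented placeholders.
from pystylometry.authorship.zeta import compute_zeta

passage_1 = (
    "I reckon the river keeps its own counsel, and a body learns to listen "
    "before the current turns against the raft."
)
passage_2 = (
    "The committee convened at noon to review the quarterly figures and "
    "approve the revised budget for the coming fiscal year."
)

result = compute_zeta(passage_1, passage_2, segments=5, top_n=20)
print(result.zeta_score)
print(result.marker_words[:10])       # words favoured by the first passage
print(result.anti_marker_words[:10])  # words favoured by the second passage
print(result.metadata["total_unique_words"])
```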

pystylometry/lexical/__init__.py
@@ -0,0 +1,17 @@
+"""Lexical diversity metrics."""
+
+# Re-export from stylometry-ttr
+# from stylometry_ttr import compute_ttr, TTRResult
+
+# Local implementations
+from .hapax import compute_hapax_ratios
+from .mtld import compute_mtld
+from .yule import compute_yule
+
+__all__ = [
+    # "compute_ttr",  # From stylometry-ttr
+    # "TTRResult",  # From stylometry-ttr
+    "compute_mtld",
+    "compute_yule",
+    "compute_hapax_ratios",
+]

pystylometry/lexical/hapax.py
@@ -0,0 +1,75 @@
+"""Hapax legomena and related vocabulary richness metrics."""
+
+from collections import Counter
+
+from .._types import HapaxResult
+from .._utils import tokenize
+
+
+def compute_hapax_ratios(text: str) -> HapaxResult:
+    """
+    Compute hapax legomena, hapax dislegomena, and related richness metrics.
+
+    Hapax legomena = words appearing exactly once
+    Hapax dislegomena = words appearing exactly twice
+
+    Also computes:
+    - Sichel's S: V₂ / V (ratio of dislegomena to total vocabulary)
+    - Honoré's R: 100 × log(N) / (1 - V₁/V)
+
+    References:
+        Sichel, H. S. (1975). On a distribution law for word frequencies.
+        Journal of the American Statistical Association, 70(351a), 542-547.
+
+        Honoré, A. (1979). Some simple measures of richness of vocabulary.
+        Association for Literary and Linguistic Computing Bulletin, 7, 172-177.
+
+    Args:
+        text: Input text to analyze
+
+    Returns:
+        HapaxResult with counts, ratios, Sichel's S, Honoré's R, and metadata
+
+    Example:
+        >>> result = compute_hapax_ratios("The quick brown fox jumps over the lazy dog.")
+        >>> print(f"Hapax ratio: {result.hapax_ratio:.3f}")
+        >>> print(f"Sichel's S: {result.sichel_s:.3f}")
+    """
+    tokens = tokenize(text.lower())
+    N = len(tokens)  # noqa: N806
+
+    if N == 0:
+        return HapaxResult(
+            hapax_count=0,
+            hapax_ratio=0.0,
+            dis_hapax_count=0,
+            dis_hapax_ratio=0.0,
+            sichel_s=0.0,
+            honore_r=0.0,
+            metadata={"token_count": 0, "vocabulary_size": 0},
+        )
+
+    # Count frequency of each token
+    freq_counter = Counter(tokens)
+    V = len(freq_counter)  # noqa: N806
+
+    # Count hapax legomena (V₁) and dislegomena (V₂)
+    V1 = sum(1 for count in freq_counter.values() if count == 1)  # noqa: N806
+    V2 = sum(1 for count in freq_counter.values() if count == 2)  # noqa: N806
+
+    # TODO: Implement Sichel's S and Honoré's R
+    sichel_s = 0.0  # Placeholder
+    honore_r = 0.0  # Placeholder
+
+    return HapaxResult(
+        hapax_count=V1,
+        hapax_ratio=V1 / N if N > 0 else 0.0,
+        dis_hapax_count=V2,
+        dis_hapax_ratio=V2 / N if N > 0 else 0.0,
+        sichel_s=sichel_s,
+        honore_r=honore_r,
+        metadata={
+            "token_count": N,
+            "vocabulary_size": V,
+        },
+    )
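
In the released `hapax.py`, Sichel's S and Honoré's R are still TODO placeholders that return 0.0. Below is a hedged sketch of the two formulas quoted in the docstring, S = V₂ / V and R = 100 × log(N) / (1 - V₁/V); the small-epsilon guard for the all-hapax case (V₁ = V, so the denominator vanishes) is an assumption and not part of the package.

```python
import math
from collections import Counter


def sichel_s_and_honore_r(tokens: list[str]) -> tuple[float, float]:
    """Sketch of the docstring formulas; not the code shipped in the wheel."""
    n = len(tokens)
    if n == 0:
        return 0.0, 0.0
    freq_counter = Counter(tokens)
    v = len(freq_counter)                                 # vocabulary size V
    v1 = sum(1 for c in freq_counter.values() if c == 1)  # hapax legomena V1
    v2 = sum(1 for c in freq_counter.values() if c == 2)  # hapax dislegomena V2

    sichel_s = v2 / v                                     # S = V2 / V
    hapax_fraction = v1 / v
    if hapax_fraction >= 1.0:
        hapax_fraction = 1.0 - 1e-10                      # assumed guard for the all-hapax case
    honore_r = 100.0 * math.log(n) / (1.0 - hapax_fraction)  # R = 100 * log(N) / (1 - V1/V)
    return sichel_s, honore_r
```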

pystylometry/lexical/mtld.py
@@ -0,0 +1,61 @@
+"""MTLD (Measure of Textual Lexical Diversity) implementation."""
+
+from .._types import MTLDResult
+from .._utils import tokenize
+
+
+def compute_mtld(
+    text: str,
+    threshold: float = 0.72,
+) -> MTLDResult:
+    """
+    Compute MTLD (Measure of Textual Lexical Diversity).
+
+    MTLD measures the mean length of sequential word strings that maintain
+    a minimum threshold TTR. It's more robust than simple TTR for texts of
+    varying lengths.
+
+    Formula:
+        MTLD = mean(forward_factors, backward_factors)
+        where factors are word string lengths that maintain TTR >= threshold
+
+    References:
+        McCarthy, P. M., & Jarvis, S. (2010). MTLD, vocd-D, and HD-D:
+        A validation study of sophisticated approaches to lexical diversity assessment.
+        Behavior Research Methods, 42(2), 381-392.
+
+    Args:
+        text: Input text to analyze
+        threshold: TTR threshold to maintain (default: 0.72)
+
+    Returns:
+        MTLDResult with forward, backward, and average MTLD scores
+
+    Example:
+        >>> result = compute_mtld("The quick brown fox jumps over the lazy dog...")
+        >>> print(f"MTLD: {result.mtld_average:.2f}")
+    """
+    tokens = tokenize(text)
+
+    if len(tokens) == 0:
+        return MTLDResult(
+            mtld_forward=0.0,
+            mtld_backward=0.0,
+            mtld_average=0.0,
+            metadata={"token_count": 0, "threshold": threshold},
+        )
+
+    # TODO: Implement forward and backward MTLD calculation
+    mtld_forward = 0.0  # Placeholder
+    mtld_backward = 0.0  # Placeholder
+    mtld_average = (mtld_forward + mtld_backward) / 2
+
+    return MTLDResult(
+        mtld_forward=mtld_forward,
+        mtld_backward=mtld_backward,
+        mtld_average=mtld_average,
+        metadata={
+            "token_count": len(tokens),
+            "threshold": threshold,
+        },
+    )
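
`compute_mtld` in the hunk above returns 0.0 placeholders for both passes. The sketch below follows one common reading of the McCarthy & Jarvis (2010) procedure cited in the docstring: count "factors" (stretches whose running TTR stays above the threshold), add a partial factor for the leftover stretch, and divide the token count by the factor count. It is an illustration under those assumptions, not the package's implementation.

```python
def mtld_one_pass(tokens: list[str], threshold: float = 0.72) -> float:
    """One directional MTLD pass; a sketch, not the released code."""
    factors = 0.0
    seen: set[str] = set()
    length = 0
    for token in tokens:
        seen.add(token)
        length += 1
        if len(seen) / length <= threshold:  # running TTR hit the threshold: factor complete
            factors += 1.0
            seen.clear()
            length = 0
    if length > 0:  # partial factor for the remaining stretch
        remaining_ttr = len(seen) / length
        factors += (1.0 - remaining_ttr) / (1.0 - threshold)
    return len(tokens) / factors if factors > 0 else 0.0


def mtld_average(tokens: list[str], threshold: float = 0.72) -> float:
    """Mean of the forward and backward passes, matching the docstring's formula."""
    forward = mtld_one_pass(tokens, threshold)
    backward = mtld_one_pass(list(reversed(tokens)), threshold)
    return (forward + backward) / 2
```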

pystylometry/lexical/yule.py
@@ -0,0 +1,66 @@
+"""Yule's K and I statistics for vocabulary richness."""
+
+from collections import Counter
+
+from .._types import YuleResult
+from .._utils import tokenize
+
+
+def compute_yule(text: str) -> YuleResult:
+    """
+    Compute Yule's K and I metrics for vocabulary richness.
+
+    Yule's K measures vocabulary repetitiveness (higher = more repetitive).
+    Yule's I is the inverse measure (higher = more diverse).
+
+    Formula:
+        K = 10⁴ × (Σm²×Vm - N) / N²
+        I = (V² / Σm²×Vm) - (1/N)
+
+    Where:
+    - N = total tokens
+    - V = vocabulary size (unique types)
+    - Vm = number of types occurring m times
+    - m = frequency count
+
+    References:
+        Yule, G. U. (1944). The Statistical Study of Literary Vocabulary.
+        Cambridge University Press.
+
+    Args:
+        text: Input text to analyze
+
+    Returns:
+        YuleResult with .yule_k, .yule_i, and metadata
+
+    Example:
+        >>> result = compute_yule("The quick brown fox jumps over the lazy dog.")
+        >>> print(f"Yule's K: {result.yule_k:.2f}")
+        >>> print(f"Yule's I: {result.yule_i:.2f}")
+    """
+    tokens = tokenize(text.lower())
+    N = len(tokens)  # noqa: N806
+
+    if N == 0:
+        return YuleResult(yule_k=0.0, yule_i=0.0, metadata={"token_count": 0, "vocabulary_size": 0})
+
+    # Count frequency of each token
+    freq_counter = Counter(tokens)
+    V = len(freq_counter)  # noqa: N806
+
+    # Count how many words occur with each frequency
+    # Vm[m] = number of words that occur exactly m times
+    # freq_of_freqs = Counter(freq_counter.values())  # TODO: Will be needed for Yule's K
+
+    # TODO: Implement Yule's K and I calculations
+    yule_k = 0.0  # Placeholder
+    yule_i = 0.0  # Placeholder
+
+    return YuleResult(
+        yule_k=yule_k,
+        yule_i=yule_i,
+        metadata={
+            "token_count": N,
+            "vocabulary_size": V,
+        },
+    )
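
Yule's K and I in `yule.py` are also placeholders, and the commented-out `freq_of_freqs` counter is the missing ingredient. Below is a hedged sketch that plugs it into the two formulas given in the docstring, K = 10⁴ × (Σ m²·Vm - N) / N² and I = (V² / Σ m²·Vm) - (1/N); it mirrors the docstring as written rather than any particular published variant.

```python
from collections import Counter


def yule_k_and_i(tokens: list[str]) -> tuple[float, float]:
    """Sketch of the docstring's K and I formulas; not the released code."""
    n = len(tokens)
    if n == 0:
        return 0.0, 0.0
    freq_counter = Counter(tokens)
    v = len(freq_counter)                           # vocabulary size V
    freq_of_freqs = Counter(freq_counter.values())  # Vm: number of types occurring m times
    m2_sum = sum(m * m * vm for m, vm in freq_of_freqs.items())

    yule_k = 1e4 * (m2_sum - n) / (n * n)           # K = 10^4 * (sum(m^2 * Vm) - N) / N^2
    yule_i = (v * v) / m2_sum - 1.0 / n             # I = (V^2 / sum(m^2 * Vm)) - (1/N)
    return yule_k, yule_i
```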

pystylometry/ngrams/__init__.py
@@ -0,0 +1,13 @@
+"""N-gram entropy and sequence analysis metrics."""
+
+from .entropy import (
+    compute_character_bigram_entropy,
+    compute_ngram_entropy,
+    compute_word_bigram_entropy,
+)
+
+__all__ = [
+    "compute_ngram_entropy",
+    "compute_character_bigram_entropy",
+    "compute_word_bigram_entropy",
+]

pystylometry/ngrams/entropy.py
@@ -0,0 +1,130 @@
+"""N-gram entropy and perplexity calculations."""
+
+import math
+from collections import Counter
+
+from .._types import EntropyResult
+from .._utils import tokenize
+
+
+def compute_ngram_entropy(text: str, n: int = 2, ngram_type: str = "word") -> EntropyResult:
+    """
+    Compute n-gram entropy and perplexity for text.
+
+    Entropy measures the unpredictability of the next item in a sequence.
+    Higher entropy = more unpredictable = more diverse/complex text.
+
+    Formula:
+        H(X) = -Σ p(x) × log₂(p(x))
+        Perplexity = 2^H(X)
+
+    Where p(x) is the probability of n-gram x occurring.
+
+    References:
+        Shannon, C. E. (1948). A mathematical theory of communication.
+        Bell System Technical Journal, 27(3), 379-423.
+
+        Manning, C. D., & Schütze, H. (1999). Foundations of Statistical
+        Natural Language Processing. MIT Press.
+
+    Args:
+        text: Input text to analyze
+        n: N-gram size (2 for bigrams, 3 for trigrams, etc.)
+        ngram_type: "word" or "character" (default: "word")
+
+    Returns:
+        EntropyResult with entropy, perplexity, and metadata
+
+    Example:
+        >>> result = compute_ngram_entropy("The quick brown fox jumps", n=2, ngram_type="word")
+        >>> print(f"Bigram entropy: {result.entropy:.3f}")
+        >>> print(f"Perplexity: {result.perplexity:.3f}")
+    """
+    # Generate n-grams
+    if ngram_type == "character":
+        items = list(text)
+    else:  # word
+        items = tokenize(text)
+
+    if len(items) < n:
+        return EntropyResult(
+            entropy=0.0,
+            perplexity=1.0,
+            ngram_type=f"{ngram_type}_{n}gram",
+            metadata={
+                "n": n,
+                "ngram_type": ngram_type,
+                "item_count": len(items),
+                "warning": "Text too short for n-gram analysis",
+            },
+        )
+
+    # Create n-grams using sliding window
+    ngram_list = []
+    for i in range(len(items) - n + 1):
+        ngram = tuple(items[i : i + n])
+        ngram_list.append(ngram)
+
+    # Count n-gram frequencies
+    ngram_counts = Counter(ngram_list)
+    total_ngrams = len(ngram_list)
+
+    # Calculate entropy: H(X) = -Σ p(x) × log₂(p(x))
+    entropy = 0.0
+    for count in ngram_counts.values():
+        probability = count / total_ngrams
+        entropy -= probability * math.log2(probability)
+
+    # Calculate perplexity: 2^H(X)
+    perplexity = 2**entropy
+
+    return EntropyResult(
+        entropy=entropy,
+        perplexity=perplexity,
+        ngram_type=f"{ngram_type}_{n}gram",
+        metadata={
+            "n": n,
+            "ngram_type": ngram_type,
+            "item_count": len(items),
+            "unique_ngrams": len(ngram_counts),
+            "total_ngrams": total_ngrams,
+        },
+    )
+
+
+def compute_character_bigram_entropy(text: str) -> EntropyResult:
+    """
+    Compute character bigram entropy.
+
+    Convenience function that calls compute_ngram_entropy with n=2, ngram_type="character".
+
+    Args:
+        text: Input text to analyze
+
+    Returns:
+        EntropyResult with character bigram entropy and perplexity
+
+    Example:
+        >>> result = compute_character_bigram_entropy("The quick brown fox")
+        >>> print(f"Character bigram entropy: {result.entropy:.3f}")
+    """
+    return compute_ngram_entropy(text, n=2, ngram_type="character")
+
+
+def compute_word_bigram_entropy(text: str) -> EntropyResult:
+    """
+    Compute word bigram entropy.
+
+    Convenience function that calls compute_ngram_entropy with n=2, ngram_type="word".
+
+    Args:
+        text: Input text to analyze
+
+    Returns:
+        EntropyResult with word bigram entropy and perplexity
+
+    Example:
+        >>> result = compute_word_bigram_entropy("The quick brown fox jumps")
+        >>> print(f"Word bigram entropy: {result.entropy:.3f}")
+    """
+    return compute_ngram_entropy(text, n=2, ngram_type="word")
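
The entropy module is fully implemented, so only a hand-checkable usage sketch is added here. It assumes the package tokenizer splits this simple input on whitespace; the import path comes from the `ngrams/__init__.py` re-exports shown above.

```python
# Hand-checkable usage sketch. For "a b a b a" the word bigrams are
# (a, b), (b, a), (a, b), (b, a): two distinct bigrams with p = 0.5 each,
# so H = 1.0 bit and perplexity = 2^H = 2.0.
from pystylometry.ngrams import compute_word_bigram_entropy

result = compute_word_bigram_entropy("a b a b a")
print(round(result.entropy, 3), round(result.perplexity, 3))               # 1.0 2.0
print(result.metadata["unique_ngrams"], result.metadata["total_ngrams"])   # 2 4
```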

pystylometry/readability/__init__.py
@@ -0,0 +1,15 @@
+"""Readability metrics."""
+
+from .ari import compute_ari
+from .coleman_liau import compute_coleman_liau
+from .flesch import compute_flesch
+from .gunning_fog import compute_gunning_fog
+from .smog import compute_smog
+
+__all__ = [
+    "compute_flesch",
+    "compute_smog",
+    "compute_gunning_fog",
+    "compute_coleman_liau",
+    "compute_ari",
+]