pystylometry 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,152 @@
+ """Burrows' Delta and Cosine Delta for authorship attribution."""
+
+ import math
+ import statistics
+ from collections import Counter
+
+ from .._types import BurrowsDeltaResult
+ from .._utils import tokenize
+
+
+ def compute_burrows_delta(
+     text1: str, text2: str, mfw: int = 500, distance_type: str = "burrows"
+ ) -> BurrowsDeltaResult:
+     """
+     Compute Burrows' Delta or Cosine Delta between two texts.
+
+     Burrows' Delta:
+         Delta = mean(|z₁(f) - z₂(f)|) for all features f
+         where z(f) = (frequency(f) - mean(f)) / std(f)
+
+     Cosine Delta:
+         Delta = 1 - cos(z₁, z₂)
+         Measures angular distance between z-score vectors
+
+     Both methods:
+         1. Extract most frequent words (MFW) across both texts
+         2. Calculate word frequencies in each text
+         3. Z-score normalize frequencies
+         4. Compute distance measure
+
+     Lower scores indicate more similar texts (likely same author).
+
+     References:
+         Burrows, J. (2002). 'Delta': A measure of stylistic difference and
+         a guide to likely authorship. Literary and Linguistic Computing, 17(3), 267-287.
+
+         Argamon, S. (2008). Interpreting Burrows's Delta: Geometric and
+         probabilistic foundations. Literary and Linguistic Computing, 23(2), 131-147.
+
+     Args:
+         text1: First text to compare
+         text2: Second text to compare
+         mfw: Number of most frequent words to use (default: 500)
+         distance_type: "burrows", "cosine", or "eder" (default: "burrows")
+
+     Returns:
+         BurrowsDeltaResult with delta score and metadata
+
+     Example:
+         >>> result = compute_burrows_delta(text1, text2, mfw=300)
+         >>> print(f"Delta score: {result.delta_score:.3f}")
+         >>> print("Lower is more similar")
+     """
+     # Tokenize and count words
+     tokens1 = [t.lower() for t in tokenize(text1)]
+     tokens2 = [t.lower() for t in tokenize(text2)]
+
+     if len(tokens1) == 0 or len(tokens2) == 0:
+         return BurrowsDeltaResult(
+             delta_score=0.0,
+             distance_type=distance_type,
+             mfw_count=0,
+             metadata={
+                 "text1_token_count": len(tokens1),
+                 "text2_token_count": len(tokens2),
+                 "warning": "One or both texts are empty",
+             },
+         )
+
+     # Get word frequencies
+     freq1 = Counter(tokens1)
+     freq2 = Counter(tokens2)
+
+     # Get most frequent words across both texts
+     all_words: Counter[str] = Counter()
+     all_words.update(freq1)
+     all_words.update(freq2)
+     most_common_words = [word for word, _ in all_words.most_common(mfw)]
+
+     # Calculate relative frequencies for MFW
+     def get_relative_freqs(freq_counter: Counter, words: list[str], total: int) -> list[float]:
+         return [freq_counter.get(word, 0) / total for word in words]
+
+     rel_freqs1 = get_relative_freqs(freq1, most_common_words, len(tokens1))
+     rel_freqs2 = get_relative_freqs(freq2, most_common_words, len(tokens2))
+
+     # Combine for z-score calculation (treat as corpus)
+     combined_freqs = [(f1 + f2) / 2 for f1, f2 in zip(rel_freqs1, rel_freqs2)]
+
+     # Calculate standard deviation for each word position
+     combined_std = []
+     for i in range(len(most_common_words)):
+         values = [rel_freqs1[i], rel_freqs2[i]]
+         std = statistics.stdev(values) if len(set(values)) > 1 else 1e-10
+         combined_std.append(std if std > 0 else 1e-10)
+
+     # Calculate z-scores
+     z1 = [(f - mean) / std for f, mean, std in zip(rel_freqs1, combined_freqs, combined_std)]
+     z2 = [(f - mean) / std for f, mean, std in zip(rel_freqs2, combined_freqs, combined_std)]
+
+     # Calculate distance based on type
+     if distance_type == "burrows":
+         # Burrows' Delta: mean absolute difference of z-scores
+         abs_diffs = [abs(z1_val - z2_val) for z1_val, z2_val in zip(z1, z2)]
+         delta_score = statistics.mean(abs_diffs) if abs_diffs else 0.0
+     elif distance_type == "cosine":
+         # Cosine Delta: 1 - cosine similarity
+         dot_product = sum(z1_val * z2_val for z1_val, z2_val in zip(z1, z2))
+         norm1 = math.sqrt(sum(z**2 for z in z1))
+         norm2 = math.sqrt(sum(z**2 for z in z2))
+         cosine_sim = dot_product / (norm1 * norm2) if norm1 > 0 and norm2 > 0 else 0
+         delta_score = 1 - cosine_sim
+     elif distance_type == "eder":
+         # Eder's Delta: intended to use a different normalization; not yet applied, so this currently matches Burrows' Delta
+         abs_diffs = [abs(z1_val - z2_val) for z1_val, z2_val in zip(z1, z2)]
+         delta_score = statistics.mean(abs_diffs) if abs_diffs else 0.0
+     else:  # Unrecognized distance_type: fall back to Burrows' Delta
+         abs_diffs = [abs(z1_val - z2_val) for z1_val, z2_val in zip(z1, z2)]
+         delta_score = statistics.mean(abs_diffs) if abs_diffs else 0.0
+
+     return BurrowsDeltaResult(
+         delta_score=delta_score,
+         distance_type=distance_type,
+         mfw_count=len(most_common_words),
+         metadata={
+             "text1_token_count": len(tokens1),
+             "text2_token_count": len(tokens2),
+             "text1_vocab": len(freq1),
+             "text2_vocab": len(freq2),
+         },
+     )
+
+
+ def compute_cosine_delta(text1: str, text2: str, mfw: int = 500) -> BurrowsDeltaResult:
+     """
+     Compute Cosine Delta between two texts.
+
+     Convenience function that calls compute_burrows_delta with distance_type="cosine".
+
+     Args:
+         text1: First text to compare
+         text2: Second text to compare
+         mfw: Number of most frequent words to use (default: 500)
+
+     Returns:
+         BurrowsDeltaResult with cosine delta score
+
+     Example:
+         >>> result = compute_cosine_delta(text1, text2)
+         >>> print(f"Cosine Delta: {result.delta_score:.3f}")
+     """
+     return compute_burrows_delta(text1, text2, mfw=mfw, distance_type="cosine")
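
The three distance_type options above share the same tokenization and z-score pipeline, so they can be compared directly on one pair of texts. A minimal usage sketch; the import path is an assumption (adjust it to wherever the installed package exposes compute_burrows_delta):

# Hypothetical import path; adjust to the installed package's layout.
from pystylometry.authorship.delta import compute_burrows_delta

text_a = "It is a truth universally acknowledged that a single man in possession of a good fortune must be in want of a wife."
text_b = "Whether I shall turn out to be the hero of my own life, or whether that station will be held by anybody else, these pages must show."

# Compare the same pair of texts under each supported distance type.
for distance_type in ("burrows", "cosine", "eder"):
    result = compute_burrows_delta(text_a, text_b, mfw=100, distance_type=distance_type)
    print(distance_type, round(result.delta_score, 3), result.mfw_count)

Because only two texts are available, the per-word means and standard deviations are estimated from exactly two samples, so the scores are best read comparatively (same pair, different settings) rather than as absolute thresholds.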
@@ -0,0 +1,109 @@
+ """Zeta score for distinctive word usage in authorship attribution."""
+
+ from .._types import ZetaResult
+ from .._utils import tokenize
+
+
+ def compute_zeta(text1: str, text2: str, segments: int = 10, top_n: int = 50) -> ZetaResult:
+     """
+     Compute Zeta score for distinctive word usage between two texts or text groups.
+
+     Zeta identifies words that are consistently used in one text/author but not another.
+
+     Algorithm:
+         1. Divide each text into segments
+         2. Calculate document proportion (DP) for each word:
+            - DP₁ = proportion of segments in text1 containing the word
+            - DP₂ = proportion of segments in text2 containing the word
+         3. Zeta score = DP₁ - DP₂
+         4. Positive Zeta = marker words (distinctive of text1)
+         5. Negative Zeta = anti-marker words (distinctive of text2)
+
+     References:
+         Burrows, J. (2007). All the way through: Testing for authorship in
+         different frequency strata. Literary and Linguistic Computing, 22(1), 27-47.
+
+         Craig, H., & Kinney, A. F. (2009). Shakespeare, Computers, and the
+         Mystery of Authorship. Cambridge University Press.
+
+     Args:
+         text1: First text (candidate author)
+         text2: Second text (comparison author/corpus)
+         segments: Number of segments to divide each text into (default: 10)
+         top_n: Number of top marker/anti-marker words to return (default: 50)
+
+     Returns:
+         ZetaResult with zeta score, marker words, and anti-marker words
+
+     Example:
+         >>> result = compute_zeta(author1_text, author2_text)
+         >>> print(f"Zeta score: {result.zeta_score:.3f}")
+         >>> print(f"Marker words: {result.marker_words[:10]}")
+         >>> print(f"Anti-markers: {result.anti_marker_words[:10]}")
+     """
+     # Tokenize texts
+     tokens1 = [t.lower() for t in tokenize(text1)]
+     tokens2 = [t.lower() for t in tokenize(text2)]
+
+     if len(tokens1) < segments or len(tokens2) < segments:
+         return ZetaResult(
+             zeta_score=0.0,
+             marker_words=[],
+             anti_marker_words=[],
+             metadata={
+                 "text1_token_count": len(tokens1),
+                 "text2_token_count": len(tokens2),
+                 "segments": segments,
+                 "top_n": top_n,
+                 "warning": "Text too short for requested number of segments",
+             },
+         )
+
+     # Divide texts into segments
+     def create_segments(tokens: list[str], n_segments: int) -> list[set[str]]:
+         segment_size = len(tokens) // n_segments
+         return [set(tokens[i * segment_size : (i + 1) * segment_size]) for i in range(n_segments)]
+
+     segments1 = create_segments(tokens1, segments)
+     segments2 = create_segments(tokens2, segments)
+
+     # Get all unique words
+     all_words = set(tokens1) | set(tokens2)
+
+     # Calculate document proportion (DP) for each word
+     word_scores = {}
+     for word in all_words:
+         # DP1: proportion of segments in text1 containing the word
+         dp1 = sum(1 for seg in segments1 if word in seg) / len(segments1)
+         # DP2: proportion of segments in text2 containing the word
+         dp2 = sum(1 for seg in segments2 if word in seg) / len(segments2)
+         # Zeta score for this word
+         word_scores[word] = dp1 - dp2
+
+     # Sort words by zeta score
+     sorted_words = sorted(word_scores.items(), key=lambda x: x[1], reverse=True)
+
+     # Extract top marker words (positive zeta) and anti-marker words (negative zeta)
+     marker_words = [word for word, score in sorted_words[:top_n] if score > 0]
+     anti_marker_words = [word for word, score in sorted_words[-top_n:] if score < 0]
+     anti_marker_words.reverse()  # Most negative first
+
+     # Overall zeta score (mean of absolute zeta scores)
+     zeta_score = (
+         sum(abs(score) for score in word_scores.values()) / len(word_scores) if word_scores else 0.0
+     )
+
+     return ZetaResult(
+         zeta_score=zeta_score,
+         marker_words=marker_words,
+         anti_marker_words=anti_marker_words,
+         metadata={
+             "text1_token_count": len(tokens1),
+             "text2_token_count": len(tokens2),
+             "segments": segments,
+             "top_n": top_n,
+             "total_unique_words": len(all_words),
+             "marker_word_count": len(marker_words),
+             "anti_marker_word_count": len(anti_marker_words),
+         },
+     )
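
A quick way to sanity-check the segment-based document proportions is to feed compute_zeta two texts built around clearly different vocabulary. A minimal sketch; the import path is an assumption:

# Hypothetical import path; adjust to the installed package's layout.
from pystylometry.authorship.zeta import compute_zeta

author1 = " ".join(["the ship sailed across the harbour at dawn"] * 20)
author2 = " ".join(["the garden bloomed with roses every spring morning"] * 20)

result = compute_zeta(author1, author2, segments=5, top_n=10)
print(f"Zeta score: {result.zeta_score:.3f}")
print("text1 markers:", result.marker_words[:5])        # nautical vocabulary
print("text2 markers:", result.anti_marker_words[:5])   # garden vocabulary

Words that appear in every segment of one text and in no segment of the other score +1.0 or -1.0, while shared function words such as "the" score near zero, which is the contrast Zeta is designed to surface.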
@@ -0,0 +1,17 @@
+ """Lexical diversity metrics."""
+
+ # Re-export from stylometry-ttr
+ # from stylometry_ttr import compute_ttr, TTRResult
+
+ # Local implementations
+ from .hapax import compute_hapax_ratios
+ from .mtld import compute_mtld
+ from .yule import compute_yule
+
+ __all__ = [
+     # "compute_ttr",  # From stylometry-ttr
+     # "TTRResult",  # From stylometry-ttr
+     "compute_mtld",
+     "compute_yule",
+     "compute_hapax_ratios",
+ ]
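
The three locally implemented metrics exported here can be run together on a single text. A minimal usage sketch; the top-level import path is an assumption, and note that in this 0.1.0 snapshot the MTLD and Yule values are still the placeholder zeros set by the TODOs in the modules that follow, so only the hapax figures are meaningful yet:

# Hypothetical import path; adjust to the installed package's layout.
from pystylometry.lexical import compute_hapax_ratios, compute_mtld, compute_yule

text = "The quick brown fox jumps over the lazy dog while the cat sleeps."

hapax = compute_hapax_ratios(text)
mtld = compute_mtld(text)
yule = compute_yule(text)

print(f"Hapax ratio: {hapax.hapax_ratio:.3f}")
print(f"MTLD (average): {mtld.mtld_average:.2f}")   # placeholder 0.00 in this release
print(f"Yule's K: {yule.yule_k:.2f}")               # placeholder 0.00 in this release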
@@ -0,0 +1,75 @@
+ """Hapax legomena and related vocabulary richness metrics."""
+
+ from collections import Counter
+
+ from .._types import HapaxResult
+ from .._utils import tokenize
+
+
+ def compute_hapax_ratios(text: str) -> HapaxResult:
+     """
+     Compute hapax legomena, hapax dislegomena, and related richness metrics.
+
+     Hapax legomena = words appearing exactly once
+     Hapax dislegomena = words appearing exactly twice
+
+     Also computes:
+         - Sichel's S: V₂ / V (ratio of dislegomena to total vocabulary)
+         - Honoré's R: 100 × log(N) / (1 - V₁/V)
+
+     References:
+         Sichel, H. S. (1975). On a distribution law for word frequencies.
+         Journal of the American Statistical Association, 70(351a), 542-547.
+
+         Honoré, A. (1979). Some simple measures of richness of vocabulary.
+         Association for Literary and Linguistic Computing Bulletin, 7, 172-177.
+
+     Args:
+         text: Input text to analyze
+
+     Returns:
+         HapaxResult with counts, ratios, Sichel's S, Honoré's R, and metadata
+
+     Example:
+         >>> result = compute_hapax_ratios("The quick brown fox jumps over the lazy dog.")
+         >>> print(f"Hapax ratio: {result.hapax_ratio:.3f}")
+         >>> print(f"Sichel's S: {result.sichel_s:.3f}")
+     """
+     tokens = tokenize(text.lower())
+     N = len(tokens)  # noqa: N806
+
+     if N == 0:
+         return HapaxResult(
+             hapax_count=0,
+             hapax_ratio=0.0,
+             dis_hapax_count=0,
+             dis_hapax_ratio=0.0,
+             sichel_s=0.0,
+             honore_r=0.0,
+             metadata={"token_count": 0, "vocabulary_size": 0},
+         )
+
+     # Count frequency of each token
+     freq_counter = Counter(tokens)
+     V = len(freq_counter)  # noqa: N806
+
+     # Count hapax legomena (V₁) and dislegomena (V₂)
+     V1 = sum(1 for count in freq_counter.values() if count == 1)  # noqa: N806
+     V2 = sum(1 for count in freq_counter.values() if count == 2)  # noqa: N806
+
+     # TODO: Implement Sichel's S and Honoré's R
+     sichel_s = 0.0  # Placeholder
+     honore_r = 0.0  # Placeholder
+
+     return HapaxResult(
+         hapax_count=V1,
+         hapax_ratio=V1 / N if N > 0 else 0.0,
+         dis_hapax_count=V2,
+         dis_hapax_ratio=V2 / N if N > 0 else 0.0,
+         sichel_s=sichel_s,
+         honore_r=honore_r,
+         metadata={
+             "token_count": N,
+             "vocabulary_size": V,
+         },
+     )
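
Sichel's S and Honoré's R are left as TODO placeholders above, but the docstring already states both formulas (S = V₂ / V and R = 100 × log(N) / (1 - V₁/V)). A hedged sketch of how they could be computed, written as a hypothetical standalone helper rather than a patch to the released module; the small clamp on the denominator is my own addition for the all-hapax case where V₁ = V:

import math
from collections import Counter


def sichel_and_honore(tokens: list[str]) -> tuple[float, float]:
    """Hypothetical helper: Sichel's S = V2 / V, Honoré's R = 100 * log(N) / (1 - V1 / V)."""
    N = len(tokens)
    freq_counter = Counter(tokens)
    V = len(freq_counter)
    V1 = sum(1 for count in freq_counter.values() if count == 1)
    V2 = sum(1 for count in freq_counter.values() if count == 2)

    sichel_s = V2 / V if V > 0 else 0.0
    # When every word is a hapax (V1 == V) the denominator is zero, so clamp it.
    denominator = max(1 - V1 / V, 1e-10) if V > 0 else 1e-10
    honore_r = 100 * math.log(N) / denominator if N > 0 else 0.0
    return sichel_s, honore_r


print(sichel_and_honore("the quick brown fox jumps over the lazy dog".split()))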
@@ -0,0 +1,61 @@
+ """MTLD (Measure of Textual Lexical Diversity) implementation."""
+
+ from .._types import MTLDResult
+ from .._utils import tokenize
+
+
+ def compute_mtld(
+     text: str,
+     threshold: float = 0.72,
+ ) -> MTLDResult:
+     """
+     Compute MTLD (Measure of Textual Lexical Diversity).
+
+     MTLD measures the mean length of sequential word strings that maintain
+     a minimum threshold TTR. It's more robust than simple TTR for texts of
+     varying lengths.
+
+     Formula:
+         MTLD = mean(forward_factors, backward_factors)
+         where factors are word string lengths that maintain TTR >= threshold
+
+     References:
+         McCarthy, P. M., & Jarvis, S. (2010). MTLD, vocd-D, and HD-D:
+         A validation study of sophisticated approaches to lexical diversity assessment.
+         Behavior Research Methods, 42(2), 381-392.
+
+     Args:
+         text: Input text to analyze
+         threshold: TTR threshold to maintain (default: 0.72)
+
+     Returns:
+         MTLDResult with forward, backward, and average MTLD scores
+
+     Example:
+         >>> result = compute_mtld("The quick brown fox jumps over the lazy dog...")
+         >>> print(f"MTLD: {result.mtld_average:.2f}")
+     """
+     tokens = tokenize(text)
+
+     if len(tokens) == 0:
+         return MTLDResult(
+             mtld_forward=0.0,
+             mtld_backward=0.0,
+             mtld_average=0.0,
+             metadata={"token_count": 0, "threshold": threshold},
+         )
+
+     # TODO: Implement forward and backward MTLD calculation
+     mtld_forward = 0.0  # Placeholder
+     mtld_backward = 0.0  # Placeholder
+     mtld_average = (mtld_forward + mtld_backward) / 2
+
+     return MTLDResult(
+         mtld_forward=mtld_forward,
+         mtld_backward=mtld_backward,
+         mtld_average=mtld_average,
+         metadata={
+             "token_count": len(tokens),
+             "threshold": threshold,
+         },
+     )
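
The forward and backward passes are also still placeholders. For reference, the McCarthy & Jarvis (2010) procedure walks the token stream, closes a "factor" whenever the running TTR of the current stretch falls to the threshold, adds a partial factor for whatever stretch is left over, and divides the total token count by the factor count; the backward score repeats this on the reversed token list. A hedged sketch of one directional pass, as a hypothetical standalone helper rather than the released implementation:

def mtld_one_direction(tokens: list[str], threshold: float = 0.72) -> float:
    """Hypothetical helper: one directional MTLD pass (token count / factor count)."""
    factors = 0.0
    types: set[str] = set()
    stretch_length = 0

    for token in tokens:
        stretch_length += 1
        types.add(token)
        ttr = len(types) / stretch_length
        if ttr <= threshold:
            # The running TTR of this stretch hit the threshold: close a full factor.
            factors += 1.0
            types.clear()
            stretch_length = 0

    if stretch_length > 0:
        # Partial factor for the leftover stretch, scaled by how far its TTR fell.
        ttr = len(types) / stretch_length
        factors += (1.0 - ttr) / (1.0 - threshold)

    return len(tokens) / factors if factors > 0 else 0.0


tokens = "the quick brown fox jumps over the lazy dog and the quick cat".split()
forward = mtld_one_direction(tokens)
backward = mtld_one_direction(list(reversed(tokens)))
print(round((forward + backward) / 2, 2))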
@@ -0,0 +1,66 @@
+ """Yule's K and I statistics for vocabulary richness."""
+
+ from collections import Counter
+
+ from .._types import YuleResult
+ from .._utils import tokenize
+
+
+ def compute_yule(text: str) -> YuleResult:
+     """
+     Compute Yule's K and I metrics for vocabulary richness.
+
+     Yule's K measures vocabulary repetitiveness (higher = more repetitive).
+     Yule's I is the inverse measure (higher = more diverse).
+
+     Formula:
+         K = 10⁴ × (Σm²×Vm - N) / N²
+         I = (V² / Σm²×Vm) - (1/N)
+
+     Where:
+         - N = total tokens
+         - V = vocabulary size (unique types)
+         - Vm = number of types occurring m times
+         - m = frequency count
+
+     References:
+         Yule, G. U. (1944). The Statistical Study of Literary Vocabulary.
+         Cambridge University Press.
+
+     Args:
+         text: Input text to analyze
+
+     Returns:
+         YuleResult with .yule_k, .yule_i, and metadata
+
+     Example:
+         >>> result = compute_yule("The quick brown fox jumps over the lazy dog.")
+         >>> print(f"Yule's K: {result.yule_k:.2f}")
+         >>> print(f"Yule's I: {result.yule_i:.2f}")
+     """
+     tokens = tokenize(text.lower())
+     N = len(tokens)  # noqa: N806
+
+     if N == 0:
+         return YuleResult(yule_k=0.0, yule_i=0.0, metadata={"token_count": 0, "vocabulary_size": 0})
+
+     # Count frequency of each token
+     freq_counter = Counter(tokens)
+     V = len(freq_counter)  # noqa: N806
+
+     # Count how many words occur with each frequency
+     # Vm[m] = number of words that occur exactly m times
+     # freq_of_freqs = Counter(freq_counter.values())  # TODO: Will be needed for Yule's K
+
+     # TODO: Implement Yule's K and I calculations
+     yule_k = 0.0  # Placeholder
+     yule_i = 0.0  # Placeholder
+
+     return YuleResult(
+         yule_k=yule_k,
+         yule_i=yule_i,
+         metadata={
+             "token_count": N,
+             "vocabulary_size": V,
+         },
+     )
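
Both Yule statistics can be filled in directly from the formulas quoted in the docstring, using the frequency-of-frequencies Counter already sketched in the commented-out line above. A hedged sketch as a hypothetical standalone helper; it follows the docstring's formula for I as written rather than any other variant found in the literature:

from collections import Counter


def yule_k_and_i(tokens: list[str]) -> tuple[float, float]:
    """Hypothetical helper: Yule's K and I from the docstring formulas."""
    N = len(tokens)
    freq_counter = Counter(tokens)
    V = len(freq_counter)
    # Vm: how many types occur exactly m times.
    freq_of_freqs = Counter(freq_counter.values())
    sum_m2_vm = sum(m * m * vm for m, vm in freq_of_freqs.items())

    yule_k = 10_000 * (sum_m2_vm - N) / (N * N) if N > 0 else 0.0
    yule_i = (V * V / sum_m2_vm) - (1 / N) if N > 0 and sum_m2_vm > 0 else 0.0
    return yule_k, yule_i


print(yule_k_and_i("the quick brown fox jumps over the lazy dog".lower().split()))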
@@ -0,0 +1,13 @@
+ """N-gram entropy and sequence analysis metrics."""
+
+ from .entropy import (
+     compute_character_bigram_entropy,
+     compute_ngram_entropy,
+     compute_word_bigram_entropy,
+ )
+
+ __all__ = [
+     "compute_ngram_entropy",
+     "compute_character_bigram_entropy",
+     "compute_word_bigram_entropy",
+ ]
@@ -0,0 +1,130 @@
+ """N-gram entropy and perplexity calculations."""
+
+ import math
+ from collections import Counter
+
+ from .._types import EntropyResult
+ from .._utils import tokenize
+
+
+ def compute_ngram_entropy(text: str, n: int = 2, ngram_type: str = "word") -> EntropyResult:
+     """
+     Compute n-gram entropy and perplexity for text.
+
+     Entropy measures the unpredictability of the next item in a sequence.
+     Higher entropy = more unpredictable = more diverse/complex text.
+
+     Formula:
+         H(X) = -Σ p(x) × log₂(p(x))
+         Perplexity = 2^H(X)
+
+     Where p(x) is the probability of n-gram x occurring.
+
+     References:
+         Shannon, C. E. (1948). A mathematical theory of communication.
+         Bell System Technical Journal, 27(3), 379-423.
+
+         Manning, C. D., & Schütze, H. (1999). Foundations of Statistical
+         Natural Language Processing. MIT Press.
+
+     Args:
+         text: Input text to analyze
+         n: N-gram size (2 for bigrams, 3 for trigrams, etc.)
+         ngram_type: "word" or "character" (default: "word")
+
+     Returns:
+         EntropyResult with entropy, perplexity, and metadata
+
+     Example:
+         >>> result = compute_ngram_entropy("The quick brown fox jumps", n=2, ngram_type="word")
+         >>> print(f"Bigram entropy: {result.entropy:.3f}")
+         >>> print(f"Perplexity: {result.perplexity:.3f}")
+     """
+     # Generate n-grams
+     if ngram_type == "character":
+         items = list(text)
+     else:  # word
+         items = tokenize(text)
+
+     if len(items) < n:
+         return EntropyResult(
+             entropy=0.0,
+             perplexity=1.0,
+             ngram_type=f"{ngram_type}_{n}gram",
+             metadata={
+                 "n": n,
+                 "ngram_type": ngram_type,
+                 "item_count": len(items),
+                 "warning": "Text too short for n-gram analysis",
+             },
+         )
+
+     # Create n-grams using sliding window
+     ngram_list = []
+     for i in range(len(items) - n + 1):
+         ngram = tuple(items[i : i + n])
+         ngram_list.append(ngram)
+
+     # Count n-gram frequencies
+     ngram_counts = Counter(ngram_list)
+     total_ngrams = len(ngram_list)
+
+     # Calculate entropy: H(X) = -Σ p(x) × log₂(p(x))
+     entropy = 0.0
+     for count in ngram_counts.values():
+         probability = count / total_ngrams
+         entropy -= probability * math.log2(probability)
+
+     # Calculate perplexity: 2^H(X)
+     perplexity = 2**entropy
+
+     return EntropyResult(
+         entropy=entropy,
+         perplexity=perplexity,
+         ngram_type=f"{ngram_type}_{n}gram",
+         metadata={
+             "n": n,
+             "ngram_type": ngram_type,
+             "item_count": len(items),
+             "unique_ngrams": len(ngram_counts),
+             "total_ngrams": total_ngrams,
+         },
+     )
+
+
+ def compute_character_bigram_entropy(text: str) -> EntropyResult:
+     """
+     Compute character bigram entropy.
+
+     Convenience function that calls compute_ngram_entropy with n=2, ngram_type="character".
+
+     Args:
+         text: Input text to analyze
+
+     Returns:
+         EntropyResult with character bigram entropy and perplexity
+
+     Example:
+         >>> result = compute_character_bigram_entropy("The quick brown fox")
+         >>> print(f"Character bigram entropy: {result.entropy:.3f}")
+     """
+     return compute_ngram_entropy(text, n=2, ngram_type="character")
+
+
+ def compute_word_bigram_entropy(text: str) -> EntropyResult:
+     """
+     Compute word bigram entropy.
+
+     Convenience function that calls compute_ngram_entropy with n=2, ngram_type="word".
+
+     Args:
+         text: Input text to analyze
+
+     Returns:
+         EntropyResult with word bigram entropy and perplexity
+
+     Example:
+         >>> result = compute_word_bigram_entropy("The quick brown fox jumps")
+         >>> print(f"Word bigram entropy: {result.entropy:.3f}")
+     """
+     return compute_ngram_entropy(text, n=2, ngram_type="word")
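
A small worked check of the entropy/perplexity relationship above: when every bigram in a text is distinct, each has probability 1/T, so H = log₂(T) and the perplexity equals the number of bigrams. A minimal usage sketch; the import path is an assumption:

# Hypothetical import path; adjust to the installed package's layout.
from pystylometry.ngrams.entropy import compute_word_bigram_entropy

# Five words give four distinct bigrams, so H = log2(4) = 2.0 and perplexity = 4.0.
result = compute_word_bigram_entropy("the quick brown fox jumps")
print(result.entropy, result.perplexity)    # expected: 2.0 4.0
print(result.metadata["unique_ngrams"])     # expected: 4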
@@ -0,0 +1,15 @@
+ """Readability metrics."""
+
+ from .ari import compute_ari
+ from .coleman_liau import compute_coleman_liau
+ from .flesch import compute_flesch
+ from .gunning_fog import compute_gunning_fog
+ from .smog import compute_smog
+
+ __all__ = [
+     "compute_flesch",
+     "compute_smog",
+     "compute_gunning_fog",
+     "compute_coleman_liau",
+     "compute_ari",
+ ]