pystylometry-1.0.0-py3-none-any.whl → pystylometry-1.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. pystylometry/README.md +42 -0
  2. pystylometry/__init__.py +45 -3
  3. pystylometry/_types.py +1017 -259
  4. pystylometry/authorship/README.md +21 -0
  5. pystylometry/authorship/__init__.py +28 -4
  6. pystylometry/authorship/additional_methods.py +260 -40
  7. pystylometry/authorship/compression.py +175 -0
  8. pystylometry/authorship/kilgarriff.py +354 -0
  9. pystylometry/character/README.md +17 -0
  10. pystylometry/character/character_metrics.py +267 -179
  11. pystylometry/cli.py +427 -0
  12. pystylometry/consistency/README.md +27 -0
  13. pystylometry/consistency/__init__.py +57 -0
  14. pystylometry/consistency/_thresholds.py +162 -0
  15. pystylometry/consistency/drift.py +549 -0
  16. pystylometry/dialect/README.md +26 -0
  17. pystylometry/dialect/__init__.py +65 -0
  18. pystylometry/dialect/_data/dialect_markers.json +1134 -0
  19. pystylometry/dialect/_loader.py +360 -0
  20. pystylometry/dialect/detector.py +533 -0
  21. pystylometry/lexical/README.md +23 -0
  22. pystylometry/lexical/advanced_diversity.py +61 -22
  23. pystylometry/lexical/function_words.py +255 -56
  24. pystylometry/lexical/hapax.py +182 -52
  25. pystylometry/lexical/mtld.py +108 -26
  26. pystylometry/lexical/ttr.py +76 -10
  27. pystylometry/lexical/word_frequency_sophistication.py +1522 -298
  28. pystylometry/lexical/yule.py +136 -50
  29. pystylometry/ngrams/README.md +18 -0
  30. pystylometry/ngrams/entropy.py +150 -49
  31. pystylometry/ngrams/extended_ngrams.py +314 -69
  32. pystylometry/prosody/README.md +17 -0
  33. pystylometry/prosody/rhythm_prosody.py +773 -11
  34. pystylometry/readability/README.md +23 -0
  35. pystylometry/readability/additional_formulas.py +1887 -762
  36. pystylometry/readability/ari.py +144 -82
  37. pystylometry/readability/coleman_liau.py +136 -109
  38. pystylometry/readability/flesch.py +177 -73
  39. pystylometry/readability/gunning_fog.py +165 -161
  40. pystylometry/readability/smog.py +123 -42
  41. pystylometry/stylistic/README.md +20 -0
  42. pystylometry/stylistic/cohesion_coherence.py +669 -13
  43. pystylometry/stylistic/genre_register.py +1560 -17
  44. pystylometry/stylistic/markers.py +611 -17
  45. pystylometry/stylistic/vocabulary_overlap.py +354 -13
  46. pystylometry/syntactic/README.md +20 -0
  47. pystylometry/syntactic/advanced_syntactic.py +76 -14
  48. pystylometry/syntactic/pos_ratios.py +70 -6
  49. pystylometry/syntactic/sentence_stats.py +55 -12
  50. pystylometry/syntactic/sentence_types.py +71 -15
  51. pystylometry/viz/README.md +27 -0
  52. pystylometry/viz/__init__.py +71 -0
  53. pystylometry/viz/drift.py +589 -0
  54. pystylometry/viz/jsx/__init__.py +31 -0
  55. pystylometry/viz/jsx/_base.py +144 -0
  56. pystylometry/viz/jsx/report.py +677 -0
  57. pystylometry/viz/jsx/timeline.py +716 -0
  58. pystylometry/viz/jsx/viewer.py +1032 -0
  59. pystylometry-1.3.0.dist-info/METADATA +136 -0
  60. pystylometry-1.3.0.dist-info/RECORD +76 -0
  61. {pystylometry-1.0.0.dist-info → pystylometry-1.3.0.dist-info}/WHEEL +1 -1
  62. pystylometry-1.3.0.dist-info/entry_points.txt +4 -0
  63. pystylometry-1.0.0.dist-info/METADATA +0 -275
  64. pystylometry-1.0.0.dist-info/RECORD +0 -46
pystylometry/lexical/yule.py
@@ -1,15 +1,71 @@
-"""Yule's K and I statistics for vocabulary richness."""
+"""Yule's K and I statistics for vocabulary richness.
 
+This module implements Yule's K and I metrics with native chunked analysis
+for stylometric fingerprinting.
+
+Related GitHub Issue:
+    #27 - Native chunked analysis with Distribution dataclass
+    https://github.com/craigtrim/pystylometry/issues/27
+"""
+
+import math
 from collections import Counter
 
-from .._types import YuleResult
+from .._types import Distribution, YuleResult, chunk_text, make_distribution
 from .._utils import tokenize
 
 
-def compute_yule(text: str) -> YuleResult:
+def _compute_yule_single(text: str) -> tuple[float, float, dict]:
+    """Compute Yule's K and I for a single chunk of text.
+
+    Returns:
+        Tuple of (yule_k, yule_i, metadata_dict).
+        Returns (nan, nan, metadata) for empty/invalid input.
+    """
+    tokens = tokenize(text.lower())
+    N = len(tokens)  # noqa: N806
+
+    if N == 0:
+        return (
+            float("nan"),
+            float("nan"),
+            {"token_count": 0, "vocabulary_size": 0},
+        )
+
+    # Count frequency of each token
+    freq_counter = Counter(tokens)
+    V = len(freq_counter)  # noqa: N806
+
+    # Count how many words occur with each frequency
+    freq_of_freqs = Counter(freq_counter.values())
+
+    # Calculate Σm²×Vm
+    sum_m2_vm = sum(m * m * vm for m, vm in freq_of_freqs.items())
+
+    # Yule's K: 10⁴ × (Σm²×Vm - N) / N²
+    yule_k = 10_000 * (sum_m2_vm - N) / (N * N)
+
+    # Yule's I: V² / (Σm²×Vm - N)
+    denominator = sum_m2_vm - N
+    if denominator == 0:
+        yule_i = float("nan")
+    else:
+        yule_i = (V * V) / denominator
+
+    return (
+        yule_k,
+        yule_i,
+        {"token_count": N, "vocabulary_size": V},
+    )
+
+
+def compute_yule(text: str, chunk_size: int = 1000) -> YuleResult:
     """
     Compute Yule's K and I metrics for vocabulary richness.
 
+    This function uses native chunked analysis to capture variance and patterns
+    across the text, which is essential for stylometric fingerprinting.
+
     Yule's K measures vocabulary repetitiveness (higher = more repetitive).
     Yule's I is the inverse measure (higher = more diverse).
 
@@ -23,71 +79,101 @@ def compute_yule(text: str) -> YuleResult:
     - Vm = number of types occurring m times
     - m = frequency count
 
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
     References:
         Yule, G. U. (1944). The Statistical Study of Literary Vocabulary.
         Cambridge University Press.
 
     Args:
         text: Input text to analyze
+        chunk_size: Number of words per chunk (default: 1000)
 
     Returns:
-        YuleResult with .yule_k, .yule_i, and metadata
-
-    Note: For empty input or when Σm²×Vm = N (perfectly uniform vocabulary),
-    metrics will be float('nan') to indicate undefined values.
+        YuleResult with yule_k, yule_i, distributions, and metadata
 
     Example:
-        >>> result = compute_yule("The quick brown fox jumps over the lazy dog.")
-        >>> print(f"Yule's K: {result.yule_k:.2f}")
-        >>> print(f"Yule's I: {result.yule_i:.2f}")
-
-        >>> # Empty input returns NaN
-        >>> import math
-        >>> result_empty = compute_yule("")
-        >>> math.isnan(result_empty.yule_k)
-        True
+        >>> result = compute_yule("Long text here...", chunk_size=1000)
+        >>> result.yule_k  # Mean across chunks
+        120.5
+        >>> result.yule_k_dist.std  # Variance reveals fingerprint
+        15.2
     """
-    tokens = tokenize(text.lower())
-    N = len(tokens)  # noqa: N806
-
-    if N == 0:
+    # Chunk the text
+    chunks = chunk_text(text, chunk_size)
+
+    # Compute metrics per chunk
+    yule_k_values = []
+    yule_i_values = []
+    total_tokens = 0
+    total_vocab = 0
+
+    for chunk in chunks:
+        k, i, meta = _compute_yule_single(chunk)
+        if not math.isnan(k):
+            yule_k_values.append(k)
+        if not math.isnan(i):
+            yule_i_values.append(i)
+        total_tokens += meta.get("token_count", 0)
+        total_vocab += meta.get("vocabulary_size", 0)
+
+    # Handle empty or all-invalid chunks
+    if not yule_k_values:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
         return YuleResult(
             yule_k=float("nan"),
             yule_i=float("nan"),
-            metadata={"token_count": 0, "vocabulary_size": 0},
+            yule_k_dist=empty_dist,
+            yule_i_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=len(chunks),
+            metadata={
+                # Backward-compatible keys
+                "token_count": 0,
+                "vocabulary_size": 0,
+                # New prefixed keys for consistency
+                "total_token_count": 0,
+                "total_vocabulary_size": 0,
+            },
         )
 
-    # Count frequency of each token
-    freq_counter = Counter(tokens)
-    V = len(freq_counter)  # noqa: N806
-
-    # Count how many words occur with each frequency
-    # Vm[m] = number of words that occur exactly m times
-    freq_of_freqs = Counter(freq_counter.values())
-
-    # Calculate Σm²×Vm (sum of m-squared times Vm for all m)
-    # This is the sum across all frequency levels of:
-    #   (frequency)² × (count of words at that frequency)
-    sum_m2_vm = sum(m * m * vm for m, vm in freq_of_freqs.items())
-
-    # Yule's K: 10⁴ × (Σm²×Vm - N) / N²
-    # K measures vocabulary repetitiveness (higher K = more repetitive)
-    yule_k = 10_000 * (sum_m2_vm - N) / (N * N)
-
-    # Yule's I: V² / (Σm²×Vm - N)
-    # I is the inverse measure (higher I = more diverse)
-    # If Σm²×Vm = N (perfectly uniform vocabulary), denominator is 0, return NaN
-    denominator = sum_m2_vm - N
-    if denominator == 0:
-        yule_i = float("nan")
-    else:
-        yule_i = (V * V) / denominator
+    # Build distributions
+    yule_k_dist = make_distribution(yule_k_values)
+    yule_i_dist = (
+        make_distribution(yule_i_values)
+        if yule_i_values
+        else Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
+    )
 
     return YuleResult(
-        yule_k=yule_k,
-        yule_i=yule_i,
+        yule_k=yule_k_dist.mean,
+        yule_i=yule_i_dist.mean,
+        yule_k_dist=yule_k_dist,
+        yule_i_dist=yule_i_dist,
+        chunk_size=chunk_size,
+        chunk_count=len(chunks),
         metadata={
-            "token_count": N,
-            "vocabulary_size": V,
+            # Backward-compatible keys
+            "token_count": total_tokens,
+            "vocabulary_size": total_vocab,
+            # New prefixed keys for consistency
+            "total_token_count": total_tokens,
+            "total_vocabulary_size": total_vocab,
         },
     )
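
For context on how the new chunked API is meant to be consumed, here is a minimal sketch that uses only names visible in the diff above (`compute_yule`, `chunk_size`, `yule_k_dist`, `chunk_count`, the metadata keys); the sample text and the module import path are illustrative assumptions, not part of the diff.

```python
# Minimal sketch, assuming pystylometry 1.3.0 and the fields shown in the diff above.
from pystylometry.lexical.yule import compute_yule

# Made-up sample: ~3,250 words so it spans several 1,000-word chunks.
sample = "the cat sat on the mat and the dog slept by the door " * 250

result = compute_yule(sample, chunk_size=1000)

print(result.yule_k)             # mean Yule's K across chunks (higher = more repetitive)
print(result.yule_i)             # mean Yule's I (higher = more diverse)
print(result.yule_k_dist.std)    # chunk-to-chunk spread, the "fingerprint" signal
print(result.chunk_count)        # number of 1,000-word chunks analysed
print(result.metadata["total_token_count"])  # tokens summed over all chunks
```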
pystylometry/ngrams/README.md
@@ -0,0 +1,18 @@
+# ngrams
+
+![4 public functions](https://img.shields.io/badge/functions-4-blue)
+![No external deps](https://img.shields.io/badge/deps-none-brightgreen)
+
+N-gram generation, entropy computation, and sequence analysis.
+
+## Catalogue
+
+| File | Functions | What It Measures |
+|------|-----------|-----------------|
+| `entropy.py` | `compute_ngram_entropy`, `compute_character_bigram_entropy`, `compute_word_bigram_entropy` | Shannon entropy at character and word n-gram levels |
+| `extended_ngrams.py` | `compute_extended_ngrams` | Word, character, and POS n-gram profiles with frequency distributions |
+
+## See Also
+
+- [`syntactic/`](../syntactic/) provides POS tags consumed by `compute_extended_ngrams(text, pos=True)`
+- [`character/`](../character/) for character-level features without n-gram structure
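
The `entropy.py` row above refers to Shannon entropy over n-gram frequencies. As a standalone illustration of that quantity (plain Python, no pystylometry imports; the toy bigram list is made up):

```python
# Worked example of H(X) = -Σ p(x)·log2(p(x)) and perplexity = 2**H(X),
# the definitions implemented by entropy.py in the diff below.
import math
from collections import Counter

bigrams = [("the", "cat"), ("cat", "sat"), ("the", "cat"), ("sat", "down")]
counts = Counter(bigrams)   # ("the", "cat") occurs twice, the others once
total = len(bigrams)        # 4 bigrams in total

entropy = -sum((c / total) * math.log2(c / total) for c in counts.values())
perplexity = 2 ** entropy

print(round(entropy, 3))     # 1.5   (probabilities 0.5, 0.25, 0.25)
print(round(perplexity, 3))  # 2.828 (effective number of equally likely bigrams)
```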
pystylometry/ngrams/entropy.py
@@ -1,16 +1,83 @@
-"""N-gram entropy and perplexity calculations."""
+"""N-gram entropy and perplexity calculations.
+
+This module implements n-gram entropy computation with native chunked analysis
+for stylometric fingerprinting.
+
+Related GitHub Issue:
+    #27 - Native chunked analysis with Distribution dataclass
+    https://github.com/craigtrim/pystylometry/issues/27
+"""
 
 import math
 from collections import Counter
 
-from .._types import EntropyResult
+from .._types import Distribution, EntropyResult, chunk_text, make_distribution
 from .._utils import tokenize
 
 
-def compute_ngram_entropy(text: str, n: int = 2, ngram_type: str = "word") -> EntropyResult:
+def _compute_ngram_entropy_single(text: str, n: int, ngram_type: str) -> tuple[float, float, dict]:
+    """Compute n-gram entropy for a single chunk of text.
+
+    Returns:
+        Tuple of (entropy, perplexity, metadata_dict).
+        Returns (nan, nan, metadata) for empty/invalid input.
+    """
+    # Generate n-grams
+    if ngram_type == "character":
+        items = list(text)
+    else:  # word
+        items = tokenize(text)
+
+    if len(items) < n:
+        return (
+            float("nan"),
+            float("nan"),
+            {
+                "item_count": len(items),
+                "unique_ngrams": 0,
+                "total_ngrams": 0,
+            },
+        )
+
+    # Create n-grams using sliding window
+    ngram_list = []
+    for i in range(len(items) - n + 1):
+        ngram = tuple(items[i : i + n])
+        ngram_list.append(ngram)
+
+    # Count n-gram frequencies
+    ngram_counts = Counter(ngram_list)
+    total_ngrams = len(ngram_list)
+
+    # Calculate entropy: H(X) = -Σ p(x) × log₂(p(x))
+    entropy = 0.0
+    for count in ngram_counts.values():
+        probability = count / total_ngrams
+        entropy -= probability * math.log2(probability)
+
+    # Calculate perplexity: 2^H(X)
+    perplexity = 2**entropy
+
+    return (
+        entropy,
+        perplexity,
+        {
+            "item_count": len(items),
+            "unique_ngrams": len(ngram_counts),
+            "total_ngrams": total_ngrams,
+        },
+    )
+
+
+def compute_ngram_entropy(
+    text: str, n: int = 2, ngram_type: str = "word", chunk_size: int = 1000
+) -> EntropyResult:
     """
     Compute n-gram entropy and perplexity for text.
 
+    This function uses native chunked analysis to capture variance and patterns
+    across the text, which is essential for stylometric fingerprinting.
+
     Entropy measures the unpredictability of the next item in a sequence.
     Higher entropy = more unpredictable = more diverse/complex text.
 
@@ -20,6 +87,10 @@ def compute_ngram_entropy(text: str, n: int = 2, ngram_type: str = "word") -> En
 
     Where p(x) is the probability of n-gram x occurring.
 
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
     References:
         Shannon, C. E. (1948). A mathematical theory of communication.
         Bell System Technical Journal, 27(3), 379-423.
@@ -31,100 +102,130 @@ def compute_ngram_entropy(text: str, n: int = 2, ngram_type: str = "word") -> En
         text: Input text to analyze
         n: N-gram size (2 for bigrams, 3 for trigrams, etc.)
         ngram_type: "word" or "character" (default: "word")
+        chunk_size: Number of words per chunk (default: 1000)
 
     Returns:
-        EntropyResult with entropy, perplexity, and metadata
+        EntropyResult with entropy, perplexity, distributions, and metadata
 
     Example:
-        >>> result = compute_ngram_entropy("The quick brown fox jumps", n=2, ngram_type="word")
-        >>> print(f"Bigram entropy: {result.entropy:.3f}")
-        >>> print(f"Perplexity: {result.perplexity:.3f}")
+        >>> result = compute_ngram_entropy("Long text here...", n=2, chunk_size=1000)
+        >>> result.entropy  # Mean across chunks
+        5.2
+        >>> result.entropy_dist.std  # Variance reveals fingerprint
+        0.3
     """
-    # Generate n-grams
-    if ngram_type == "character":
-        items = list(text)
-    else:  # word
-        items = tokenize(text)
-
-    if len(items) < n:
+    # Chunk the text
+    chunks = chunk_text(text, chunk_size)
+
+    # Compute metrics per chunk
+    entropy_values = []
+    perplexity_values = []
+    total_items = 0
+    total_unique_ngrams = 0
+    total_ngrams = 0
+
+    for chunk in chunks:
+        ent, perp, meta = _compute_ngram_entropy_single(chunk, n, ngram_type)
+        if not math.isnan(ent):
+            entropy_values.append(ent)
+            perplexity_values.append(perp)
+        total_items += meta.get("item_count", 0)
+        total_unique_ngrams += meta.get("unique_ngrams", 0)
+        total_ngrams += meta.get("total_ngrams", 0)
+
+    # Handle empty or all-invalid chunks
+    if not entropy_values:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
         return EntropyResult(
-            entropy=0.0,
-            perplexity=1.0,
+            entropy=float("nan"),
+            perplexity=float("nan"),
             ngram_type=f"{ngram_type}_{n}gram",
+            entropy_dist=empty_dist,
+            perplexity_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=len(chunks),
             metadata={
                 "n": n,
                 "ngram_type": ngram_type,
-                "item_count": len(items),
+                "total_item_count": total_items,
                 "warning": "Text too short for n-gram analysis",
             },
         )
 
-    # Create n-grams using sliding window
-    ngram_list = []
-    for i in range(len(items) - n + 1):
-        ngram = tuple(items[i : i + n])
-        ngram_list.append(ngram)
-
-    # Count n-gram frequencies
-    ngram_counts = Counter(ngram_list)
-    total_ngrams = len(ngram_list)
-
-    # Calculate entropy: H(X) = -Σ p(x) × log₂(p(x))
-    entropy = 0.0
-    for count in ngram_counts.values():
-        probability = count / total_ngrams
-        entropy -= probability * math.log2(probability)
-
-    # Calculate perplexity: 2^H(X)
-    perplexity = 2**entropy
+    # Build distributions
+    entropy_dist = make_distribution(entropy_values)
+    perplexity_dist = make_distribution(perplexity_values)
 
     return EntropyResult(
-        entropy=entropy,
-        perplexity=perplexity,
+        entropy=entropy_dist.mean,
+        perplexity=perplexity_dist.mean,
         ngram_type=f"{ngram_type}_{n}gram",
+        entropy_dist=entropy_dist,
+        perplexity_dist=perplexity_dist,
+        chunk_size=chunk_size,
+        chunk_count=len(chunks),
         metadata={
             "n": n,
             "ngram_type": ngram_type,
-            "item_count": len(items),
-            "unique_ngrams": len(ngram_counts),
+            "total_item_count": total_items,
+            "total_unique_ngrams": total_unique_ngrams,
             "total_ngrams": total_ngrams,
         },
     )
 
 
-def compute_character_bigram_entropy(text: str) -> EntropyResult:
+def compute_character_bigram_entropy(text: str, chunk_size: int = 1000) -> EntropyResult:
     """
     Compute character bigram entropy.
 
     Convenience function that calls compute_ngram_entropy with n=2, ngram_type="character".
 
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
     Args:
         text: Input text to analyze
+        chunk_size: Number of words per chunk (default: 1000)
 
     Returns:
-        EntropyResult with character bigram entropy and perplexity
+        EntropyResult with character bigram entropy, perplexity, and distributions
 
     Example:
-        >>> result = compute_character_bigram_entropy("The quick brown fox")
-        >>> print(f"Character bigram entropy: {result.entropy:.3f}")
+        >>> result = compute_character_bigram_entropy("Long text here...", chunk_size=1000)
+        >>> result.entropy  # Mean across chunks
+        3.8
     """
-    return compute_ngram_entropy(text, n=2, ngram_type="character")
+    return compute_ngram_entropy(text, n=2, ngram_type="character", chunk_size=chunk_size)
 
 
-def compute_word_bigram_entropy(text: str) -> EntropyResult:
+def compute_word_bigram_entropy(text: str, chunk_size: int = 1000) -> EntropyResult:
     """
     Compute word bigram entropy.
 
     Convenience function that calls compute_ngram_entropy with n=2, ngram_type="word".
 
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
     Args:
         text: Input text to analyze
+        chunk_size: Number of words per chunk (default: 1000)
 
     Returns:
-        EntropyResult with word bigram entropy and perplexity
+        EntropyResult with word bigram entropy, perplexity, and distributions
 
     Example:
-        >>> result = compute_word_bigram_entropy("The quick brown fox jumps")
-        >>> print(f"Word bigram entropy: {result.entropy:.3f}")
+        >>> result = compute_word_bigram_entropy("Long text here...", chunk_size=1000)
+        >>> result.entropy  # Mean across chunks
+        5.2
     """
-    return compute_ngram_entropy(text, n=2, ngram_type="word")
+    return compute_ngram_entropy(text, n=2, ngram_type="word", chunk_size=chunk_size)
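
As with `compute_yule`, the chunked entropy functions now report a mean plus a per-chunk distribution. A minimal usage sketch, assuming pystylometry 1.3.0 and using only the names visible in the diff above; the sample text is invented.

```python
# Minimal sketch of the chunked entropy API shown in the diff above.
from pystylometry.ngrams.entropy import (
    compute_ngram_entropy,
    compute_word_bigram_entropy,
)

# Made-up sample long enough to produce multiple chunks.
sample = "She walked to the market and bought fresh bread every morning. " * 200

result = compute_ngram_entropy(sample, n=3, ngram_type="word", chunk_size=1000)
print(result.entropy)                          # mean word-trigram entropy across chunks
print(result.entropy_dist.std)                 # chunk-to-chunk variation
print(result.metadata["total_unique_ngrams"])  # summed over chunks

bigram = compute_word_bigram_entropy(sample, chunk_size=500)  # convenience wrapper
print(bigram.ngram_type)   # "word_2gram"
print(bigram.chunk_count)  # number of 500-word chunks analysed
```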