pystylometry 1.0.0__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. pystylometry/README.md +42 -0
  2. pystylometry/__init__.py +45 -3
  3. pystylometry/_types.py +1017 -259
  4. pystylometry/authorship/README.md +21 -0
  5. pystylometry/authorship/__init__.py +28 -4
  6. pystylometry/authorship/additional_methods.py +260 -40
  7. pystylometry/authorship/compression.py +175 -0
  8. pystylometry/authorship/kilgarriff.py +354 -0
  9. pystylometry/character/README.md +17 -0
  10. pystylometry/character/character_metrics.py +267 -179
  11. pystylometry/cli.py +427 -0
  12. pystylometry/consistency/README.md +27 -0
  13. pystylometry/consistency/__init__.py +57 -0
  14. pystylometry/consistency/_thresholds.py +162 -0
  15. pystylometry/consistency/drift.py +549 -0
  16. pystylometry/dialect/README.md +26 -0
  17. pystylometry/dialect/__init__.py +65 -0
  18. pystylometry/dialect/_data/dialect_markers.json +1134 -0
  19. pystylometry/dialect/_loader.py +360 -0
  20. pystylometry/dialect/detector.py +533 -0
  21. pystylometry/lexical/README.md +23 -0
  22. pystylometry/lexical/advanced_diversity.py +61 -22
  23. pystylometry/lexical/function_words.py +255 -56
  24. pystylometry/lexical/hapax.py +182 -52
  25. pystylometry/lexical/mtld.py +108 -26
  26. pystylometry/lexical/ttr.py +76 -10
  27. pystylometry/lexical/word_frequency_sophistication.py +1522 -298
  28. pystylometry/lexical/yule.py +136 -50
  29. pystylometry/ngrams/README.md +18 -0
  30. pystylometry/ngrams/entropy.py +150 -49
  31. pystylometry/ngrams/extended_ngrams.py +314 -69
  32. pystylometry/prosody/README.md +17 -0
  33. pystylometry/prosody/rhythm_prosody.py +773 -11
  34. pystylometry/readability/README.md +23 -0
  35. pystylometry/readability/additional_formulas.py +1887 -762
  36. pystylometry/readability/ari.py +144 -82
  37. pystylometry/readability/coleman_liau.py +136 -109
  38. pystylometry/readability/flesch.py +177 -73
  39. pystylometry/readability/gunning_fog.py +165 -161
  40. pystylometry/readability/smog.py +123 -42
  41. pystylometry/stylistic/README.md +20 -0
  42. pystylometry/stylistic/cohesion_coherence.py +669 -13
  43. pystylometry/stylistic/genre_register.py +1560 -17
  44. pystylometry/stylistic/markers.py +611 -17
  45. pystylometry/stylistic/vocabulary_overlap.py +354 -13
  46. pystylometry/syntactic/README.md +20 -0
  47. pystylometry/syntactic/advanced_syntactic.py +76 -14
  48. pystylometry/syntactic/pos_ratios.py +70 -6
  49. pystylometry/syntactic/sentence_stats.py +55 -12
  50. pystylometry/syntactic/sentence_types.py +71 -15
  51. pystylometry/viz/README.md +27 -0
  52. pystylometry/viz/__init__.py +71 -0
  53. pystylometry/viz/drift.py +589 -0
  54. pystylometry/viz/jsx/__init__.py +31 -0
  55. pystylometry/viz/jsx/_base.py +144 -0
  56. pystylometry/viz/jsx/report.py +677 -0
  57. pystylometry/viz/jsx/timeline.py +716 -0
  58. pystylometry/viz/jsx/viewer.py +1032 -0
  59. pystylometry-1.3.0.dist-info/METADATA +136 -0
  60. pystylometry-1.3.0.dist-info/RECORD +76 -0
  61. {pystylometry-1.0.0.dist-info → pystylometry-1.3.0.dist-info}/WHEEL +1 -1
  62. pystylometry-1.3.0.dist-info/entry_points.txt +4 -0
  63. pystylometry-1.0.0.dist-info/METADATA +0 -275
  64. pystylometry-1.0.0.dist-info/RECORD +0 -46
pystylometry/lexical/hapax.py
@@ -1,16 +1,87 @@
- """Hapax legomena and related vocabulary richness metrics."""
+ """Hapax legomena and related vocabulary richness metrics.
+
+ This module implements hapax metrics with native chunked analysis for
+ stylometric fingerprinting.
+
+ Related GitHub Issue:
+     #27 - Native chunked analysis with Distribution dataclass
+     https://github.com/craigtrim/pystylometry/issues/27
+ """

  import math
  from collections import Counter

- from .._types import HapaxLexiconResult, HapaxResult, LexiconCategories
+ from .._types import (
+     Distribution,
+     HapaxLexiconResult,
+     HapaxResult,
+     LexiconCategories,
+     chunk_text,
+     make_distribution,
+ )
  from .._utils import check_optional_dependency, tokenize


- def compute_hapax_ratios(text: str) -> HapaxResult:
+ def _compute_hapax_single(text: str) -> tuple[int, float, int, float, float, float, dict]:
+     """Compute hapax metrics for a single chunk of text.
+
+     Returns:
+         Tuple of (hapax_count, hapax_ratio, dis_hapax_count, dis_hapax_ratio,
+         sichel_s, honore_r, metadata_dict).
+         Returns nans for ratios on empty input.
+     """
+     tokens = tokenize(text.lower())
+     N = len(tokens)  # noqa: N806
+
+     if N == 0:
+         return (
+             0,
+             float("nan"),
+             0,
+             float("nan"),
+             float("nan"),
+             float("nan"),
+             {"token_count": 0, "vocabulary_size": 0},
+         )
+
+     # Count frequency of each token
+     freq_counter = Counter(tokens)
+     V = len(freq_counter)  # noqa: N806
+
+     # Count hapax legomena (V₁) and dislegomena (V₂)
+     V1 = sum(1 for count in freq_counter.values() if count == 1)  # noqa: N806
+     V2 = sum(1 for count in freq_counter.values() if count == 2)  # noqa: N806
+
+     # Sichel's S: ratio of dislegomena to vocabulary size
+     sichel_s = V2 / V if V > 0 else 0.0
+
+     # Honoré's R: 100 × log(N) / (1 - V₁/V)
+     if V1 == V:
+         honore_r = float("inf")
+     else:
+         honore_r = 100 * math.log(N) / (1 - V1 / V)
+
+     hapax_ratio = V1 / N if N > 0 else 0.0
+     dis_hapax_ratio = V2 / N if N > 0 else 0.0
+
+     return (
+         V1,
+         hapax_ratio,
+         V2,
+         dis_hapax_ratio,
+         sichel_s,
+         honore_r,
+         {"token_count": N, "vocabulary_size": V},
+     )
+
+
+ def compute_hapax_ratios(text: str, chunk_size: int = 1000) -> HapaxResult:
      """
      Compute hapax legomena, hapax dislegomena, and related richness metrics.

+     This function uses native chunked analysis to capture variance and patterns
+     across the text, which is essential for stylometric fingerprinting.
+
      Hapax legomena = words appearing exactly once
      Hapax dislegomena = words appearing exactly twice

@@ -18,6 +89,10 @@ def compute_hapax_ratios(text: str) -> HapaxResult:
      - Sichel's S: V₂ / V (ratio of dislegomena to total vocabulary)
      - Honoré's R: 100 × log(N) / (1 - V₁/V)

+     Related GitHub Issue:
+         #27 - Native chunked analysis with Distribution dataclass
+         https://github.com/craigtrim/pystylometry/issues/27
+
      References:
          Sichel, H. S. (1975). On a distribution law for word frequencies.
          Journal of the American Statistical Association, 70(351a), 542-547.
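As a quick sanity check on the two formulas quoted in the docstring above, here is a small worked example. It is illustrative only; the sentence and variable names below are not taken from the package or its tests.

import math
from collections import Counter

tokens = "the cat sat on the mat and the dog sat".split()
freq = Counter(tokens)                          # the: 3, sat: 2, five words once each

N = len(tokens)                                 # 10 tokens
V = len(freq)                                   # 7 distinct words
V1 = sum(1 for c in freq.values() if c == 1)    # 5 hapax legomena
V2 = sum(1 for c in freq.values() if c == 2)    # 1 dislegomenon ("sat")

sichel_s = V2 / V                               # 1 / 7 ≈ 0.143
honore_r = 100 * math.log(N) / (1 - V1 / V)     # 100 * ln(10) / (2 / 7) ≈ 805.9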
@@ -27,68 +102,123 @@ def compute_hapax_ratios(text: str) -> HapaxResult:

      Args:
          text: Input text to analyze
+         chunk_size: Number of words per chunk (default: 1000)

      Returns:
-         HapaxResult with counts, ratios, Sichel's S, Honoré's R, and metadata
-
-     Note: When all words are unique (V₁ = V), Honoré's R returns float('inf')
-     to indicate maximal vocabulary richness (division by zero case).
+         HapaxResult with counts, ratios, distributions, and metadata

      Example:
-         >>> text = "The quick brown fox jumps over the lazy dog"
-         >>> result = compute_hapax_ratios(text)
-         >>> result.hapax_count  # Words appearing once
-         7
-         >>> result.dis_hapax_count  # Words appearing twice
-         1
-         >>> print(f"Sichel's S: {result.sichel_s:.3f}")
-         Sichel's S: 0.125
+         >>> result = compute_hapax_ratios("Long text here...", chunk_size=1000)
+         >>> result.hapax_ratio  # Mean across chunks
+         0.45
+         >>> result.hapax_ratio_dist.std  # Variance reveals fingerprint
+         0.08
      """
-     tokens = tokenize(text.lower())
-     N = len(tokens)  # noqa: N806
-
-     if N == 0:
+     # Chunk the text
+     chunks = chunk_text(text, chunk_size)
+
+     # Compute metrics per chunk
+     hapax_ratio_values = []
+     dis_hapax_ratio_values = []
+     sichel_s_values = []
+     honore_r_values = []
+     honore_r_inf_count = 0  # Track chunks where all words are unique (V₁ = V)
+     total_hapax_count = 0
+     total_dis_hapax_count = 0
+     total_tokens = 0
+     total_vocab = 0
+     valid_chunk_count = 0
+
+     for chunk in chunks:
+         h_cnt, h_rat, dh_cnt, dh_rat, sichel, honore, meta = _compute_hapax_single(chunk)
+         total_hapax_count += h_cnt
+         total_dis_hapax_count += dh_cnt
+         total_tokens += meta.get("token_count", 0)
+         total_vocab += meta.get("vocabulary_size", 0)
+
+         if not math.isnan(h_rat):
+             hapax_ratio_values.append(h_rat)
+             valid_chunk_count += 1
+         if not math.isnan(dh_rat):
+             dis_hapax_ratio_values.append(dh_rat)
+         if not math.isnan(sichel):
+             sichel_s_values.append(sichel)
+         if math.isinf(honore):
+             # Track infinite values (when V₁ = V, maximal vocabulary richness)
+             honore_r_inf_count += 1
+         elif not math.isnan(honore):
+             honore_r_values.append(honore)
+
+     # Handle empty or all-invalid chunks
+     if not hapax_ratio_values:
+         empty_dist = Distribution(
+             values=[],
+             mean=float("nan"),
+             median=float("nan"),
+             std=0.0,
+             range=0.0,
+             iqr=0.0,
+         )
          return HapaxResult(
              hapax_count=0,
-             hapax_ratio=0.0,
+             hapax_ratio=float("nan"),
              dis_hapax_count=0,
-             dis_hapax_ratio=0.0,
-             sichel_s=0.0,
-             honore_r=0.0,
-             metadata={"token_count": 0, "vocabulary_size": 0},
+             dis_hapax_ratio=float("nan"),
+             sichel_s=float("nan"),
+             honore_r=float("nan"),
+             hapax_ratio_dist=empty_dist,
+             dis_hapax_ratio_dist=empty_dist,
+             sichel_s_dist=empty_dist,
+             honore_r_dist=empty_dist,
+             chunk_size=chunk_size,
+             chunk_count=len(chunks),
+             metadata={"total_token_count": 0, "total_vocabulary_size": 0},
          )

-     # Count frequency of each token
-     freq_counter = Counter(tokens)
-     V = len(freq_counter)  # noqa: N806
-
-     # Count hapax legomena (V₁) and dislegomena (V₂)
-     V1 = sum(1 for count in freq_counter.values() if count == 1)  # noqa: N806
-     V2 = sum(1 for count in freq_counter.values() if count == 2)  # noqa: N806
-
-     # Sichel's S: ratio of dislegomena to vocabulary size
-     # S = V₂ / V
-     sichel_s = V2 / V if V > 0 else 0.0
+     # Build distributions
+     hapax_ratio_dist = make_distribution(hapax_ratio_values)
+     dis_hapax_ratio_dist = make_distribution(dis_hapax_ratio_values)
+     sichel_s_dist = (
+         make_distribution(sichel_s_values)
+         if sichel_s_values
+         else Distribution(
+             values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
+         )
+     )

-     # Honoré's R: 100 × log(N) / (1 - V₁/V)
-     # R = 100 × log(N) / (1 - V₁/V)
-     # If V₁ = V (all words appear once), denominator is 0, return infinity
-     # This indicates maximal vocabulary richness (every word unique)
-     if V1 == V:
-         honore_r = float("inf")
+     # Handle honore_r specially: if all valid chunks had V₁ = V (all unique words),
+     # return infinity to indicate maximal vocabulary richness
+     if honore_r_values:
+         honore_r_dist = make_distribution(honore_r_values)
+         honore_r_final = honore_r_dist.mean
+     elif honore_r_inf_count > 0 and honore_r_inf_count == valid_chunk_count:
+         # All valid chunks had infinite honore_r (all words unique)
+         honore_r_dist = Distribution(
+             values=[], mean=float("inf"), median=float("inf"), std=0.0, range=0.0, iqr=0.0
+         )
+         honore_r_final = float("inf")
      else:
-         honore_r = 100 * math.log(N) / (1 - V1 / V)
+         honore_r_dist = Distribution(
+             values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
+         )
+         honore_r_final = float("nan")

      return HapaxResult(
-         hapax_count=V1,
-         hapax_ratio=V1 / N if N > 0 else 0.0,
-         dis_hapax_count=V2,
-         dis_hapax_ratio=V2 / N if N > 0 else 0.0,
-         sichel_s=sichel_s,
-         honore_r=honore_r,
+         hapax_count=total_hapax_count,
+         hapax_ratio=hapax_ratio_dist.mean,
+         dis_hapax_count=total_dis_hapax_count,
+         dis_hapax_ratio=dis_hapax_ratio_dist.mean,
+         sichel_s=sichel_s_dist.mean,
+         honore_r=honore_r_final,
+         hapax_ratio_dist=hapax_ratio_dist,
+         dis_hapax_ratio_dist=dis_hapax_ratio_dist,
+         sichel_s_dist=sichel_s_dist,
+         honore_r_dist=honore_r_dist,
+         chunk_size=chunk_size,
+         chunk_count=len(chunks),
          metadata={
-             "token_count": N,
-             "vocabulary_size": V,
+             "total_token_count": total_tokens,
+             "total_vocabulary_size": total_vocab,
          },
      )

@@ -148,8 +278,8 @@ def compute_hapax_with_lexicon_analysis(text: str) -> HapaxLexiconResult:
      check_optional_dependency("bnc_lookup", "lexical")
      check_optional_dependency("wordnet_lookup", "lexical")

-     from bnc_lookup import is_bnc_term  # type: ignore[import-not-found]
-     from wordnet_lookup import is_wordnet_term  # type: ignore[import-not-found]
+     from bnc_lookup import exists as is_bnc_term  # type: ignore[import-untyped]
+     from wordnet_lookup import is_wordnet_term  # type: ignore[import-untyped]

      # First compute standard hapax metrics
      hapax_result = compute_hapax_ratios(text)
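The chunked version above leans on Distribution and make_distribution from pystylometry/_types.py, which this diff does not show. A minimal sketch consistent with the fields used here (values, mean, median, std, range, iqr) might look like the following; the real helpers in _types.py may compute these differently.

import statistics
from dataclasses import dataclass, field

@dataclass
class Distribution:
    # Per-chunk summary statistics (sketch; field set inferred from the diff above)
    values: list[float] = field(default_factory=list)
    mean: float = float("nan")
    median: float = float("nan")
    std: float = 0.0
    range: float = 0.0
    iqr: float = 0.0

def make_distribution(values: list[float]) -> Distribution:
    # Sketch: collapse a list of per-chunk values into summary statistics
    if not values:
        return Distribution()
    quartiles = (
        statistics.quantiles(values, n=4) if len(values) >= 2 else [values[0]] * 3
    )
    return Distribution(
        values=list(values),
        mean=statistics.fmean(values),
        median=statistics.median(values),
        std=statistics.pstdev(values),  # the package may use sample std instead
        range=max(values) - min(values),
        iqr=quartiles[2] - quartiles[0],
    )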
pystylometry/lexical/mtld.py
@@ -1,6 +1,16 @@
- """MTLD (Measure of Textual Lexical Diversity) implementation."""
+ """MTLD (Measure of Textual Lexical Diversity) implementation.

- from .._types import MTLDResult
+ This module implements MTLD with native chunked analysis for stylometric
+ fingerprinting.
+
+ Related GitHub Issue:
+     #27 - Native chunked analysis with Distribution dataclass
+     https://github.com/craigtrim/pystylometry/issues/27
+ """
+
+ import math
+
+ from .._types import Distribution, MTLDResult, chunk_text, make_distribution
  from .._utils import tokenize


@@ -62,13 +72,46 @@ def _calculate_mtld_direction(tokens: list[str], threshold: float, forward: bool
      return float(len(tokens))


+ def _compute_mtld_single(text: str, threshold: float) -> tuple[float, float, float, dict]:
+     """Compute MTLD for a single chunk of text.
+
+     Returns:
+         Tuple of (mtld_forward, mtld_backward, mtld_average, metadata_dict).
+         Returns (nan, nan, nan, metadata) for empty input.
+     """
+     tokens = tokenize(text.lower())
+
+     if len(tokens) == 0:
+         return (
+             float("nan"),
+             float("nan"),
+             float("nan"),
+             {"token_count": 0},
+         )
+
+     mtld_forward = _calculate_mtld_direction(tokens, threshold, forward=True)
+     mtld_backward = _calculate_mtld_direction(tokens, threshold, forward=False)
+     mtld_average = (mtld_forward + mtld_backward) / 2
+
+     return (
+         mtld_forward,
+         mtld_backward,
+         mtld_average,
+         {"token_count": len(tokens)},
+     )
+
+
  def compute_mtld(
      text: str,
      threshold: float = 0.72,
+     chunk_size: int = 1000,
  ) -> MTLDResult:
      """
      Compute MTLD (Measure of Textual Lexical Diversity).

+     This function uses native chunked analysis to capture variance and patterns
+     across the text, which is essential for stylometric fingerprinting.
+
      MTLD measures the mean length of sequential word strings that maintain
      a minimum threshold TTR. It's more robust than simple TTR for texts of
      varying lengths.
@@ -79,6 +122,10 @@ def compute_mtld(
      - Completed factors (segments where TTR dropped below threshold)
      - Partial factor for any remaining incomplete segment (weighted by proximity to threshold)

+     Related GitHub Issue:
+         #27 - Native chunked analysis with Distribution dataclass
+         https://github.com/craigtrim/pystylometry/issues/27
+
      References:
          McCarthy, P. M., & Jarvis, S. (2010). MTLD, vocd-D, and HD-D:
          A validation study of sophisticated approaches to lexical diversity assessment.
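The _calculate_mtld_direction helper referenced above is not included in this diff. A minimal sketch of the standard MTLD factor-count pass it presumably performs (after McCarthy & Jarvis, 2010) is shown below; the function name mtld_one_direction is hypothetical, and the package's actual implementation may differ in edge-case handling.

def mtld_one_direction(tokens: list[str], threshold: float = 0.72) -> float:
    # Sketch of one MTLD pass: total tokens divided by the factor count
    factors = 0.0
    types: set[str] = set()
    seen = 0
    for tok in tokens:
        seen += 1
        types.add(tok)
        ttr = len(types) / seen
        if ttr <= threshold:
            # Completed factor: running TTR dropped to the threshold
            factors += 1
            types.clear()
            seen = 0
    if seen > 0:
        # Partial factor for the leftover segment, weighted by proximity to the threshold
        ttr = len(types) / seen
        factors += (1 - ttr) / (1 - threshold)
    return len(tokens) / factors if factors > 0 else float(len(tokens))

The backward score would be the same pass over reversed(tokens); the reported value averages the two directions, as _compute_mtld_single does above.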
@@ -87,16 +134,20 @@ def compute_mtld(
      Args:
          text: Input text to analyze
          threshold: TTR threshold to maintain (default: 0.72, must be in range (0, 1))
+         chunk_size: Number of words per chunk (default: 1000)

      Returns:
-         MTLDResult with forward, backward, and average MTLD scores
+         MTLDResult with forward, backward, average MTLD scores and distributions

      Raises:
          ValueError: If threshold is not in range (0, 1)

      Example:
-         >>> result = compute_mtld("The quick brown fox jumps over the lazy dog...")
-         >>> print(f"MTLD: {result.mtld_average:.2f}")
+         >>> result = compute_mtld("Long text here...", chunk_size=1000)
+         >>> result.mtld_average  # Mean across chunks
+         72.5
+         >>> result.mtld_average_dist.std  # Variance reveals fingerprint
+         8.3
      """
      # Validate threshold parameter
      if not (0 < threshold < 1):
@@ -105,33 +156,64 @@ def compute_mtld(
              "Common values: 0.72 (default), 0.5-0.8"
          )

-     # Case-insensitive tokenization for consistency with other lexical metrics
-     # (compute_yule, compute_hapax_ratios both use text.lower())
-     tokens = tokenize(text.lower())
-
-     if len(tokens) == 0:
+     # Chunk the text
+     chunks = chunk_text(text, chunk_size)
+
+     # Compute metrics per chunk
+     forward_values = []
+     backward_values = []
+     average_values = []
+     total_tokens = 0
+
+     for chunk in chunks:
+         fwd, bwd, avg, meta = _compute_mtld_single(chunk, threshold)
+         if not math.isnan(fwd):
+             forward_values.append(fwd)
+             backward_values.append(bwd)
+             average_values.append(avg)
+         total_tokens += meta.get("token_count", 0)
+
+     # Handle empty or all-invalid chunks
+     if not forward_values:
+         empty_dist = Distribution(
+             values=[],
+             mean=float("nan"),
+             median=float("nan"),
+             std=0.0,
+             range=0.0,
+             iqr=0.0,
+         )
          return MTLDResult(
-             mtld_forward=0.0,
-             mtld_backward=0.0,
-             mtld_average=0.0,
-             metadata={"token_count": 0, "threshold": threshold},
+             mtld_forward=float("nan"),
+             mtld_backward=float("nan"),
+             mtld_average=float("nan"),
+             mtld_forward_dist=empty_dist,
+             mtld_backward_dist=empty_dist,
+             mtld_average_dist=empty_dist,
+             chunk_size=chunk_size,
+             chunk_count=len(chunks),
+             metadata={
+                 "total_token_count": 0,
+                 "threshold": threshold,
+             },
          )

-     # Calculate MTLD in forward direction
-     mtld_forward = _calculate_mtld_direction(tokens, threshold, forward=True)
-
-     # Calculate MTLD in backward direction
-     mtld_backward = _calculate_mtld_direction(tokens, threshold, forward=False)
-
-     # Average of forward and backward
-     mtld_average = (mtld_forward + mtld_backward) / 2
+     # Build distributions
+     forward_dist = make_distribution(forward_values)
+     backward_dist = make_distribution(backward_values)
+     average_dist = make_distribution(average_values)

      return MTLDResult(
-         mtld_forward=mtld_forward,
-         mtld_backward=mtld_backward,
-         mtld_average=mtld_average,
+         mtld_forward=forward_dist.mean,
+         mtld_backward=backward_dist.mean,
+         mtld_average=average_dist.mean,
+         mtld_forward_dist=forward_dist,
+         mtld_backward_dist=backward_dist,
+         mtld_average_dist=average_dist,
+         chunk_size=chunk_size,
+         chunk_count=len(chunks),
          metadata={
-             "token_count": len(tokens),
+             "total_token_count": total_tokens,
              "threshold": threshold,
          },
      )
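Both hapax.py and mtld.py call chunk_text from pystylometry/_types.py, which the diffs above import but do not show. Given that the docstrings describe chunk_size as the number of words per chunk, a plausible sketch is the following; the actual helper may handle boundaries differently.

def chunk_text(text: str, chunk_size: int = 1000) -> list[str]:
    # Sketch: split on whitespace and regroup into chunks of at most chunk_size words
    words = text.split()
    if not words:
        return []
    return [
        " ".join(words[i : i + chunk_size])
        for i in range(0, len(words), chunk_size)
    ]

Whether a short trailing chunk is kept, merged, or dropped is not visible from this diff; that choice affects chunk_count and the spread of each Distribution.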
pystylometry/lexical/ttr.py
@@ -2,12 +2,18 @@

  This module provides a facade wrapper around the stylometry-ttr package,
  maintaining consistent API patterns with other pystylometry metrics.
+
+ Related GitHub Issue:
+     #27 - Native chunked analysis with Distribution dataclass
+     https://github.com/craigtrim/pystylometry/issues/27
  """

- from .._types import TTRResult
+ from __future__ import annotations
+
+ from .._types import Distribution, TTRResult, make_distribution


- def compute_ttr(text: str, text_id: str | None = None) -> TTRResult:
+ def compute_ttr(text: str, text_id: str | None = None, chunk_size: int = 1000) -> TTRResult:
      """
      Compute Type-Token Ratio (TTR) metrics for vocabulary richness.

@@ -22,6 +28,10 @@ def compute_ttr(text: str, text_id: str | None = None) -> TTRResult:
      - STTR: Standardized TTR across fixed-size chunks (reduces length bias)
      - Delta Std: Standard deviation of TTR across chunks (vocabulary consistency)

+     Related GitHub Issue:
+         #27 - Native chunked analysis with Distribution dataclass
+         https://github.com/craigtrim/pystylometry/issues/27
+
      References:
          Guiraud, P. (1960). Problèmes et méthodes de la statistique linguistique.
          Herdan, G. (1960). Type-token Mathematics: A Textbook of Mathematical
@@ -32,9 +42,14 @@ def compute_ttr(text: str, text_id: str | None = None) -> TTRResult:
      Args:
          text: Input text to analyze
          text_id: Optional identifier for the text (for tracking purposes)
+         chunk_size: Number of words per chunk (default: 1000).
+             Note: The stylometry-ttr package handles its own internal chunking,
+             so this parameter is included for API consistency but actual chunking
+             behavior is delegated to stylometry-ttr.

      Returns:
-         TTRResult with all TTR variants and metadata
+         TTRResult with all TTR variants and metadata, including Distribution
+         objects for stylometric fingerprinting.

      Example:
          >>> result = compute_ttr("The quick brown fox jumps over the lazy dog.")
@@ -63,17 +78,68 @@ def compute_ttr(text: str, text_id: str | None = None) -> TTRResult:
      # Note: stylometry-ttr requires text_id to be a string, not None
      ttr_result = _compute_ttr(text, text_id=text_id or "")

+     # Extract values, handling None for short texts
+     ttr_val = ttr_result.ttr
+     root_ttr_val = ttr_result.root_ttr
+     log_ttr_val = ttr_result.log_ttr
+     sttr_val = ttr_result.sttr if ttr_result.sttr is not None else 0.0
+     delta_std_val = ttr_result.delta_std if ttr_result.delta_std is not None else 0.0
+
+     # Create single-value distributions from stylometry-ttr results
+     # The stylometry-ttr package handles its own internal chunking for STTR
+     # so we wrap the aggregate results in Distribution objects
+     ttr_dist = (
+         make_distribution([ttr_val])
+         if ttr_val is not None
+         else Distribution(
+             values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
+         )
+     )
+     root_ttr_dist = (
+         make_distribution([root_ttr_val])
+         if root_ttr_val is not None
+         else Distribution(
+             values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
+         )
+     )
+     log_ttr_dist = (
+         make_distribution([log_ttr_val])
+         if log_ttr_val is not None
+         else Distribution(
+             values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
+         )
+     )
+     sttr_dist = (
+         make_distribution([sttr_val])
+         if ttr_result.sttr is not None
+         else Distribution(
+             values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
+         )
+     )
+     delta_std_dist = (
+         make_distribution([delta_std_val])
+         if ttr_result.delta_std is not None
+         else Distribution(
+             values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
+         )
+     )
+
      # Convert to our TTRResult dataclass
-     # The stylometry-ttr result has attributes we can access
-     # Some fields (sttr, delta_std) may be None for short texts
      return TTRResult(
          total_words=ttr_result.total_words,
          unique_words=ttr_result.unique_words,
-         ttr=ttr_result.ttr,
-         root_ttr=ttr_result.root_ttr,
-         log_ttr=ttr_result.log_ttr,
-         sttr=ttr_result.sttr if ttr_result.sttr is not None else 0.0,
-         delta_std=ttr_result.delta_std if ttr_result.delta_std is not None else 0.0,
+         ttr=ttr_val if ttr_val is not None else float("nan"),
+         root_ttr=root_ttr_val if root_ttr_val is not None else float("nan"),
+         log_ttr=log_ttr_val if log_ttr_val is not None else float("nan"),
+         sttr=sttr_val,
+         delta_std=delta_std_val,
+         ttr_dist=ttr_dist,
+         root_ttr_dist=root_ttr_dist,
+         log_ttr_dist=log_ttr_dist,
+         sttr_dist=sttr_dist,
+         delta_std_dist=delta_std_dist,
+         chunk_size=chunk_size,
+         chunk_count=1,  # stylometry-ttr returns aggregate results
          metadata={
              "text_id": text_id or "",
              "source": "stylometry-ttr",