pystylometry 0.1.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. pystylometry/__init__.py +1 -2
  2. pystylometry/_normalize.py +277 -0
  3. pystylometry/_types.py +1224 -2
  4. pystylometry/_utils.py +4 -0
  5. pystylometry/authorship/__init__.py +4 -0
  6. pystylometry/authorship/additional_methods.py +100 -0
  7. pystylometry/character/__init__.py +15 -0
  8. pystylometry/character/character_metrics.py +301 -0
  9. pystylometry/lexical/__init__.py +13 -6
  10. pystylometry/lexical/advanced_diversity.py +641 -0
  11. pystylometry/lexical/function_words.py +391 -0
  12. pystylometry/lexical/hapax.py +154 -7
  13. pystylometry/lexical/mtld.py +83 -7
  14. pystylometry/lexical/ttr.py +83 -0
  15. pystylometry/lexical/word_frequency_sophistication.py +581 -0
  16. pystylometry/lexical/yule.py +34 -7
  17. pystylometry/ngrams/__init__.py +2 -0
  18. pystylometry/ngrams/extended_ngrams.py +235 -0
  19. pystylometry/prosody/__init__.py +12 -0
  20. pystylometry/prosody/rhythm_prosody.py +53 -0
  21. pystylometry/readability/__init__.py +12 -0
  22. pystylometry/readability/additional_formulas.py +985 -0
  23. pystylometry/readability/ari.py +93 -17
  24. pystylometry/readability/coleman_liau.py +102 -9
  25. pystylometry/readability/complex_words.py +531 -0
  26. pystylometry/readability/flesch.py +59 -14
  27. pystylometry/readability/gunning_fog.py +194 -25
  28. pystylometry/readability/smog.py +31 -14
  29. pystylometry/readability/syllables.py +137 -30
  30. pystylometry/stylistic/__init__.py +20 -0
  31. pystylometry/stylistic/cohesion_coherence.py +45 -0
  32. pystylometry/stylistic/genre_register.py +45 -0
  33. pystylometry/stylistic/markers.py +131 -0
  34. pystylometry/stylistic/vocabulary_overlap.py +47 -0
  35. pystylometry/syntactic/__init__.py +4 -0
  36. pystylometry/syntactic/advanced_syntactic.py +432 -0
  37. pystylometry/syntactic/pos_ratios.py +104 -13
  38. pystylometry/syntactic/sentence_stats.py +57 -13
  39. pystylometry/syntactic/sentence_types.py +470 -0
  40. {pystylometry-0.1.0.dist-info → pystylometry-1.0.0.dist-info}/METADATA +49 -12
  41. pystylometry-1.0.0.dist-info/RECORD +46 -0
  42. {pystylometry-0.1.0.dist-info → pystylometry-1.0.0.dist-info}/WHEEL +1 -1
  43. pystylometry-0.1.0.dist-info/RECORD +0 -26
@@ -4,6 +4,64 @@ from .._types import MTLDResult
4
4
  from .._utils import tokenize
5
5
 
6
6
 
7
+ def _calculate_mtld_direction(tokens: list[str], threshold: float, forward: bool) -> float:
8
+ """
9
+ Calculate MTLD in one direction (forward or backward).
10
+
11
+ Args:
12
+ tokens: List of tokens to analyze
13
+ threshold: TTR threshold to maintain (must be in range (0, 1))
14
+ forward: If True, process forward; if False, process backward
15
+
16
+ Returns:
17
+ MTLD score for this direction
18
+ """
19
+ if len(tokens) == 0:
20
+ return 0.0
21
+
22
+ # Process tokens in the specified direction
23
+ token_list = tokens if forward else tokens[::-1]
24
+
25
+ factors = 0.0
26
+ current_count = 0
27
+ current_types = set()
28
+
29
+ for token in token_list:
30
+ current_count += 1
31
+ current_types.add(token)
32
+
33
+ # Calculate current TTR
34
+ ttr = len(current_types) / current_count
35
+
36
+ # If TTR drops below threshold, we've completed a factor
37
+ if ttr < threshold:
38
+ factors += 1.0
39
+ current_count = 0
40
+ current_types = set()
41
+
42
+ # Handle remaining partial factor
43
+ # Add proportion of a complete factor based on how close we are to threshold
44
+ if current_count > 0:
45
+ ttr = len(current_types) / current_count
46
+ # If we're still above threshold, add partial factor credit
47
+ # Formula: (1 - current_ttr) / (1 - threshold)
48
+ # This represents how far we've progressed toward completing a factor
49
+ # In theory, ttr should always be >= threshold here because drops below
50
+ # threshold are handled in the loop above (which resets current_count).
51
+ # Adding defensive check to prevent mathematical errors.
52
+ if ttr >= threshold:
53
+ factors += (1.0 - ttr) / (1.0 - threshold)
54
+
55
+ # MTLD is the mean length of factors
56
+ # Total tokens / number of factors
57
+ if factors > 0:
58
+ return len(tokens) / factors
59
+ else:
60
+ # If no factors were completed, return the text length
61
+ # This happens when TTR stays above threshold for the entire text
62
+ return float(len(tokens))
63
+
64
+
7
65
  def compute_mtld(
8
66
  text: str,
9
67
  threshold: float = 0.72,
@@ -16,8 +74,10 @@ def compute_mtld(
16
74
  varying lengths.
17
75
 
18
76
  Formula:
19
- MTLD = mean(forward_factors, backward_factors)
20
- where factors are word string lengths that maintain TTR >= threshold
77
+ MTLD = total_tokens / factor_count
78
+ where factor_count includes:
79
+ - Completed factors (segments where TTR dropped below threshold)
80
+ - Partial factor for any remaining incomplete segment (weighted by proximity to threshold)
21
81
 
22
82
  References:
23
83
  McCarthy, P. M., & Jarvis, S. (2010). MTLD, vocd-D, and HD-D:
@@ -26,16 +86,28 @@ def compute_mtld(
26
86
 
27
87
  Args:
28
88
  text: Input text to analyze
29
- threshold: TTR threshold to maintain (default: 0.72)
89
+ threshold: TTR threshold to maintain (default: 0.72, must be in range (0, 1))
30
90
 
31
91
  Returns:
32
92
  MTLDResult with forward, backward, and average MTLD scores
33
93
 
94
+ Raises:
95
+ ValueError: If threshold is not in range (0, 1)
96
+
34
97
  Example:
35
98
  >>> result = compute_mtld("The quick brown fox jumps over the lazy dog...")
36
99
  >>> print(f"MTLD: {result.mtld_average:.2f}")
37
100
  """
38
- tokens = tokenize(text)
101
+ # Validate threshold parameter
102
+ if not (0 < threshold < 1):
103
+ raise ValueError(
104
+ f"Threshold must be in range (0, 1), got {threshold}. "
105
+ "Common values: 0.72 (default), 0.5-0.8"
106
+ )
107
+
108
+ # Case-insensitive tokenization for consistency with other lexical metrics
109
+ # (compute_yule, compute_hapax_ratios both use text.lower())
110
+ tokens = tokenize(text.lower())
39
111
 
40
112
  if len(tokens) == 0:
41
113
  return MTLDResult(
@@ -45,9 +117,13 @@ def compute_mtld(
45
117
  metadata={"token_count": 0, "threshold": threshold},
46
118
  )
47
119
 
48
- # TODO: Implement forward and backward MTLD calculation
49
- mtld_forward = 0.0 # Placeholder
50
- mtld_backward = 0.0 # Placeholder
120
+ # Calculate MTLD in forward direction
121
+ mtld_forward = _calculate_mtld_direction(tokens, threshold, forward=True)
122
+
123
+ # Calculate MTLD in backward direction
124
+ mtld_backward = _calculate_mtld_direction(tokens, threshold, forward=False)
125
+
126
+ # Average of forward and backward
51
127
  mtld_average = (mtld_forward + mtld_backward) / 2
52
128
 
53
129
  return MTLDResult(
@@ -0,0 +1,83 @@
1
+ """Type-Token Ratio (TTR) analysis using stylometry-ttr package.
2
+
3
+ This module provides a facade wrapper around the stylometry-ttr package,
4
+ maintaining consistent API patterns with other pystylometry metrics.
5
+ """
6
+
7
+ from .._types import TTRResult
8
+
9
+
10
def compute_ttr(text: str, text_id: str | None = None) -> TTRResult:
    """
    Compute Type-Token Ratio (TTR) metrics for vocabulary richness.

    Thin facade over ``stylometry_ttr.compute_ttr``: the text is handed to
    that package and its result is repackaged as a ``TTRResult``. TTR is the
    ratio of unique words (types) to total words (tokens).

    Metrics exposed (computed by the stylometry-ttr backend):
        - Raw TTR: unique_words / total_words
        - Root TTR (Guiraud's index): unique_words / sqrt(total_words)
        - Log TTR (Herdan's C): log(unique_words) / log(total_words)
        - STTR: Standardized TTR across fixed-size chunks (reduces length bias)
        - Delta Std: Standard deviation of TTR across chunks (vocabulary consistency)

    References:
        Guiraud, P. (1960). Problèmes et méthodes de la statistique linguistique.
        Herdan, G. (1960). Type-token Mathematics: A Textbook of Mathematical
            Linguistics. Mouton.
        Johnson, W. (1944). Studies in language behavior: I. A program of research.
            Psychological Monographs, 56(2), 1-15.

    Args:
        text: Input text to analyze
        text_id: Optional identifier for the text; ``None`` (or an empty
            string) is forwarded to the backend as ``""``

    Returns:
        TTRResult with all TTR variants and metadata. When the backend
        reports ``sttr`` or ``delta_std`` as ``None``, the field is set to
        0.0 and the matching ``metadata["sttr_available"]`` /
        ``metadata["delta_std_available"]`` flag is False.

    Raises:
        ImportError: If the stylometry-ttr package cannot be imported

    Example:
        >>> result = compute_ttr("The quick brown fox jumps over the lazy dog.")
        >>> result.metadata["source"]
        'stylometry-ttr'

        >>> # With text identifier
        >>> result = compute_ttr("Sample text here.", text_id="sample-001")
        >>> print(result.metadata["text_id"])
        sample-001
    """
    # Imported lazily so a missing backend surfaces as a clear error at call time.
    try:
        from stylometry_ttr import compute_ttr as _backend_compute_ttr
    except ImportError as e:
        raise ImportError(
            "TTR metrics require the stylometry-ttr package. "
            "This should have been installed as a core dependency. "
            "Install with: pip install stylometry-ttr"
        ) from e

    # The backend is called with a string text_id, never None.
    resolved_id = text_id or ""
    backend = _backend_compute_ttr(text, text_id=resolved_id)

    # These two may come back as None (noted upstream as happening for short
    # texts); record availability before substituting 0.0.
    raw_sttr = backend.sttr
    raw_delta_std = backend.delta_std
    has_sttr = raw_sttr is not None
    has_delta_std = raw_delta_std is not None

    info = {
        "text_id": resolved_id,
        "source": "stylometry-ttr",
        "sttr_available": has_sttr,
        "delta_std_available": has_delta_std,
    }

    return TTRResult(
        total_words=backend.total_words,
        unique_words=backend.unique_words,
        ttr=backend.ttr,
        root_ttr=backend.root_ttr,
        log_ttr=backend.log_ttr,
        sttr=raw_sttr if has_sttr else 0.0,
        delta_std=raw_delta_std if has_delta_std else 0.0,
        metadata=info,
    )