pystylometry 0.1.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

Files changed (43)
  1. pystylometry/__init__.py +1 -2
  2. pystylometry/_normalize.py +277 -0
  3. pystylometry/_types.py +1224 -2
  4. pystylometry/_utils.py +4 -0
  5. pystylometry/authorship/__init__.py +4 -0
  6. pystylometry/authorship/additional_methods.py +100 -0
  7. pystylometry/character/__init__.py +15 -0
  8. pystylometry/character/character_metrics.py +301 -0
  9. pystylometry/lexical/__init__.py +13 -6
  10. pystylometry/lexical/advanced_diversity.py +641 -0
  11. pystylometry/lexical/function_words.py +391 -0
  12. pystylometry/lexical/hapax.py +154 -7
  13. pystylometry/lexical/mtld.py +83 -7
  14. pystylometry/lexical/ttr.py +83 -0
  15. pystylometry/lexical/word_frequency_sophistication.py +581 -0
  16. pystylometry/lexical/yule.py +34 -7
  17. pystylometry/ngrams/__init__.py +2 -0
  18. pystylometry/ngrams/extended_ngrams.py +235 -0
  19. pystylometry/prosody/__init__.py +12 -0
  20. pystylometry/prosody/rhythm_prosody.py +53 -0
  21. pystylometry/readability/__init__.py +12 -0
  22. pystylometry/readability/additional_formulas.py +985 -0
  23. pystylometry/readability/ari.py +93 -17
  24. pystylometry/readability/coleman_liau.py +102 -9
  25. pystylometry/readability/complex_words.py +531 -0
  26. pystylometry/readability/flesch.py +59 -14
  27. pystylometry/readability/gunning_fog.py +194 -25
  28. pystylometry/readability/smog.py +31 -14
  29. pystylometry/readability/syllables.py +137 -30
  30. pystylometry/stylistic/__init__.py +20 -0
  31. pystylometry/stylistic/cohesion_coherence.py +45 -0
  32. pystylometry/stylistic/genre_register.py +45 -0
  33. pystylometry/stylistic/markers.py +131 -0
  34. pystylometry/stylistic/vocabulary_overlap.py +47 -0
  35. pystylometry/syntactic/__init__.py +4 -0
  36. pystylometry/syntactic/advanced_syntactic.py +432 -0
  37. pystylometry/syntactic/pos_ratios.py +104 -13
  38. pystylometry/syntactic/sentence_stats.py +57 -13
  39. pystylometry/syntactic/sentence_types.py +470 -0
  40. {pystylometry-0.1.0.dist-info → pystylometry-1.0.0.dist-info}/METADATA +49 -12
  41. pystylometry-1.0.0.dist-info/RECORD +46 -0
  42. {pystylometry-0.1.0.dist-info → pystylometry-1.0.0.dist-info}/WHEEL +1 -1
  43. pystylometry-0.1.0.dist-info/RECORD +0 -26
pystylometry/readability/ari.py

@@ -1,8 +1,47 @@
 """Automated Readability Index (ARI)."""
 
+import math
+
 from .._types import ARIResult
 from .._utils import split_sentences, tokenize
 
+# Formula coefficients from Senter & Smith (1967)
+# Reference: Senter, R. J., & Smith, E. A. (1967). Automated readability index.
+# AMRL-TR-6620. Aerospace Medical Research Laboratories.
+
+# Coefficient for characters per word
+_CHARACTER_COEFFICIENT = 4.71
+
+# Coefficient for words per sentence
+_WORD_COEFFICIENT = 0.5
+
+# Intercept to calibrate scale to U.S. grade levels
+_INTERCEPT = -21.43
+
+
+def _get_age_range(grade_level: int) -> str:
+    """
+    Map grade level to age range.
+
+    Args:
+        grade_level: U.S. grade level (0-20+)
+
+    Returns:
+        Age range string
+    """
+    if grade_level <= 0:
+        return "5-6 years (Kindergarten)"
+    elif grade_level <= 5:
+        return "6-11 years (Elementary)"
+    elif grade_level <= 8:
+        return "11-14 years (Middle School)"
+    elif grade_level <= 12:
+        return "14-18 years (High School)"
+    elif grade_level <= 14:
+        return "18-22 years (College)"
+    else:
+        return "22+ years (Graduate)"
+
 
 def compute_ari(text: str) -> ARIResult:
     """
@@ -11,15 +50,22 @@ def compute_ari(text: str) -> ARIResult:
     Formula:
         ARI = 4.71 × (characters/words) + 0.5 × (words/sentences) - 21.43
 
-    The ARI is designed to gauge the understandability of a text and produces
-    an approximate representation of the US grade level needed to comprehend the text.
+    The ARI uses character counts and word counts (similar to Coleman-Liau)
+    but adds sentence length as a factor. It produces an approximate
+    representation of the US grade level needed to comprehend the text.
+
+    **Implementation Notes:**
+    - Grade levels are clamped to [0, 20] range
+    - Uses round-half-up rounding for grade level calculation
+    - Character count includes alphanumeric characters only (letters and digits)
+    - Reliability heuristic: 100+ words recommended
 
     Grade Level to Age mapping:
-        1-5: 5-11 years
-        6-8: 11-14 years
-        9-12: 14-18 years
-        13-14: 18-22 years
-        14+: 22+ years (college level)
+        1-5: 6-11 years (Elementary)
+        6-8: 11-14 years (Middle School)
+        9-12: 14-18 years (High School)
+        13-14: 18-22 years (College)
+        15+: 22+ years (Graduate)
 
     References:
         Senter, R. J., & Smith, E. A. (1967). Automated readability index.
@@ -34,27 +80,56 @@ def compute_ari(text: str) -> ARIResult:
     Example:
         >>> result = compute_ari("The quick brown fox jumps over the lazy dog.")
         >>> print(f"ARI Score: {result.ari_score:.1f}")
+        ARI Score: 0.1
         >>> print(f"Grade Level: {result.grade_level}")
+        Grade Level: 0
         >>> print(f"Age Range: {result.age_range}")
+        Age Range: 5-6 years (Kindergarten)
+        >>> result.metadata["reliable"]
+        False
     """
     sentences = split_sentences(text)
     tokens = tokenize(text)
 
+    # Count characters (alphanumeric: letters and digits, excluding spaces/punctuation)
+    # Computed before early return to ensure metadata consistency
+    character_count = sum(1 for char in text if char.isalnum())
+
     if len(sentences) == 0 or len(tokens) == 0:
         return ARIResult(
             ari_score=0.0,
             grade_level=0,
-            age_range="Unknown",
-            metadata={"sentence_count": 0, "word_count": 0, "character_count": 0},
+            age_range="5-6 years (Kindergarten)",
+            metadata={
+                "sentence_count": len(sentences),
+                "word_count": len(tokens),
+                "character_count": character_count,
+                "characters_per_word": 0.0,
+                "words_per_sentence": 0.0,
+                "reliable": False,
+            },
         )
 
-    # Count characters (letters, numbers, excluding spaces and punctuation)
-    character_count = sum(1 for char in text if char.isalnum())
+    # Calculate ratios
+    chars_per_word = character_count / len(tokens)
+    words_per_sentence = len(tokens) / len(sentences)
+
+    # Apply ARI formula
+    ari_score = (
+        _CHARACTER_COEFFICIENT * chars_per_word
+        + _WORD_COEFFICIENT * words_per_sentence
+        + _INTERCEPT
+    )
+
+    # Use round-half-up rounding and clamp to valid grade range [0, 20]
+    # math.floor(x + 0.5) implements round-half-up for both positive and negative values
+    grade_level = max(0, min(20, math.floor(ari_score + 0.5)))
+
+    # Get age range from grade level
+    age_range = _get_age_range(grade_level)
 
-    # TODO: Implement ARI formula
-    ari_score = 0.0  # Placeholder
-    grade_level = 0  # Placeholder
-    age_range = "Unknown"  # Placeholder
+    # Reliability heuristic: like other readability metrics, 100+ words recommended
+    reliable = len(tokens) >= 100
 
     return ARIResult(
         ari_score=ari_score,
@@ -64,7 +139,8 @@ def compute_ari(text: str) -> ARIResult:
             "sentence_count": len(sentences),
             "word_count": len(tokens),
             "character_count": character_count,
-            "characters_per_word": character_count / len(tokens) if tokens else 0,
-            "words_per_sentence": len(tokens) / len(sentences) if sentences else 0,
+            "characters_per_word": chars_per_word,
+            "words_per_sentence": words_per_sentence,
+            "reliable": reliable,
         },
     )
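
The doctest output above (ARI Score: 0.1, Grade Level: 0) follows directly from the formula once the counts are fixed. Below is a minimal standalone sketch of the same arithmetic for the example sentence; the 10-token word count is an assumption about the package's `tokenize` helper (it appears to keep the trailing period as a token, which is the only way the 0.1 score reproduces), and `split_sentences`/`tokenize` themselves live in `pystylometry/_utils.py`, outside this diff.

    import math

    # Worked check of the ARI hunk above for the doctest sentence.
    text = "The quick brown fox jumps over the lazy dog."
    character_count = sum(1 for c in text if c.isalnum())  # 35 letters/digits
    word_count = 10      # assumption: pystylometry's tokenizer keeps "." as a token
    sentence_count = 1

    ari_score = (
        4.71 * (character_count / word_count)   # characters-per-word term
        + 0.5 * (word_count / sentence_count)   # words-per-sentence term
        - 21.43                                 # intercept
    )
    grade_level = max(0, min(20, math.floor(ari_score + 0.5)))  # round-half-up, clamp to [0, 20]

    print(f"{ari_score:.1f}", grade_level)  # 0.1 0 — matches the doctest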
pystylometry/readability/coleman_liau.py

@@ -1,8 +1,26 @@
 """Coleman-Liau Index."""
 
+import math
+
 from .._types import ColemanLiauResult
 from .._utils import split_sentences, tokenize
 
+# Regression coefficients from Coleman & Liau (1975)
+# Derived from empirical analysis of Cloze test results on graded texts
+# Reference: Coleman, M., & Liau, T. L. (1975). A computer readability formula
+# designed for machine scoring. Journal of Applied Psychology, 60(2), 283.
+
+# Coefficient for letters per 100 words
+# Represents impact of word length on reading difficulty
+_LETTER_COEFFICIENT = 0.0588
+
+# Coefficient for sentences per 100 words (negative: more sentences = easier)
+# Represents impact of sentence length on reading difficulty
+_SENTENCE_COEFFICIENT = -0.296
+
+# Intercept to calibrate scale to U.S. grade levels (1-16)
+_INTERCEPT = -15.8
+
 
 def compute_coleman_liau(text: str) -> ColemanLiauResult:
     """
@@ -16,7 +34,21 @@ def compute_coleman_liau(text: str) -> ColemanLiauResult:
         S = average number of sentences per 100 words
 
     The Coleman-Liau index relies on characters rather than syllables,
-    making it easier to compute and potentially more language-agnostic.
+    making it easier to compute and not requiring syllable-counting algorithms.
+
+    **Implementation Notes:**
+    - Grade levels are NOT clamped (removed upper bound of 20 per PR #2 review).
+      The original Coleman & Liau (1975) paper calibrated to grades 1-16 but did not
+      specify an upper bound. Post-graduate texts may exceed grade 20.
+    - Uses round-half-up rounding (not banker's rounding) for grade level calculation
+    - Letter counts (Unicode alphabetic characters only) computed from tokenized words
+      to ensure measurement consistency. Both letter count and word count use identical
+      tokenization logic, preventing divergence in edge cases (emails, URLs, hyphens).
+      See PR #2 review discussion: https://github.com/craigtrim/pystylometry/pull/2
+    - Reliability heuristic based on validation study passage lengths (~100 words);
+      shorter texts flagged in metadata
+    - English-centric sentence splitting and Unicode assumptions limit true
+      cross-language applicability
 
     References:
         Coleman, M., & Liau, T. L. (1975). A computer readability formula
@@ -31,28 +63,88 @@ def compute_coleman_liau(text: str) -> ColemanLiauResult:
     Example:
         >>> result = compute_coleman_liau("The quick brown fox jumps over the lazy dog.")
        >>> print(f"CLI Index: {result.cli_index:.1f}")
+        CLI Index: 3.8
         >>> print(f"Grade Level: {result.grade_level}")
+        Grade Level: 4
+        >>> result.metadata["reliable"]
+        False
     """
     sentences = split_sentences(text)
-    tokens = tokenize(text)
+    all_tokens = tokenize(text)
+
+    # Filter to only tokens that contain at least one alphabetic character
+    # This excludes pure punctuation (. ! ?) but keeps words with mixed content
+    # (Hello123, Test@example.com) to count their letters per Coleman-Liau spec.
+    # This is different from Gunning Fog which uses stricter normalization.
+    tokens = [token for token in all_tokens if any(char.isalpha() for char in token)]
+
+    # CRITICAL: Count letters from tokenized words, NOT from raw text
+    # ===============================================================
+    # Coleman & Liau (1975) define L as "average number of letters per 100 words"
+    # where both letters and words must be measured consistently from the same text units.
+    #
+    # Original implementation (buggy):
+    #     letter_count = sum(1 for char in text if char.isalpha())
+    # This counted letters from RAW text but words from TOKENIZED text
+    #
+    # Problem cases (PR #2 review https://github.com/craigtrim/pystylometry/pull/2):
+    # - "test@example.com" → tokenizer may split into ['test', '@', 'example', '.', 'com']
+    #   Raw letter count: 15 letters, Token count: 5 tokens → wrong ratio
+    # - "co-operate" → tokenizer may split into ['co', '-', 'operate']
+    #   Raw letter count: 9 letters, Token count: 3 tokens → wrong ratio
+    # - URLs, special tokens, etc. → similar inconsistencies
+    #
+    # Fixed implementation:
+    # Count only alphabetic characters that appear in valid word tokens (after normalization).
+    # This ensures both letter count and word count use identical tokenization logic,
+    # maintaining the mathematical integrity of the L term in the Coleman-Liau formula.
+    letter_count = sum(1 for token in tokens for char in token if char.isalpha())
 
     if len(sentences) == 0 or len(tokens) == 0:
         return ColemanLiauResult(
             cli_index=0.0,
             grade_level=0,
-            metadata={"sentence_count": 0, "word_count": 0, "letter_count": 0},
+            metadata={
+                "sentence_count": len(sentences),
+                "word_count": len(tokens),
+                "letter_count": letter_count,
+                "letters_per_100_words": 0.0,
+                "sentences_per_100_words": 0.0,
+                "reliable": False,
+            },
         )
 
-    # Count letters (excluding spaces and punctuation)
-    letter_count = sum(1 for char in text if char.isalpha())
-
     # Calculate per 100 words
     L = (letter_count / len(tokens)) * 100  # noqa: N806
     S = (len(sentences) / len(tokens)) * 100  # noqa: N806
 
-    # TODO: Implement Coleman-Liau formula
-    cli_index = 0.0  # Placeholder
-    grade_level = 0  # Placeholder
+    # Compute Coleman-Liau Index using empirically-derived coefficients
+    cli_index = _LETTER_COEFFICIENT * L + _SENTENCE_COEFFICIENT * S + _INTERCEPT
+
+    # Grade Level Calculation and Bounds
+    # ===================================
+    # Round-half-up rounding (not Python's default banker's rounding):
+    #     4.5 → 5 (always rounds up), not round-half-to-even
+    # math.floor(x + 0.5) implements this for both positive and negative values
+    #
+    # Lower bound (0): Prevent negative grades for very simple texts
+    # Coleman & Liau (1975) calibrated to U.S. grades 1-16, but simpler texts
+    # (e.g., "Go. Run. Stop.") can produce negative CLI values. We clamp to 0
+    # as there is no "negative grade level" in the educational system.
+    #
+    # Upper bound (REMOVED per PR #2 review):
+    # Original implementation clamped at grade 20, but this was arbitrary.
+    # Coleman & Liau (1975) did not specify an upper bound in their paper.
+    # Clamping discards information: PhD dissertations (grade 25) and complex
+    # legal documents (grade 30+) would both report as grade 20, making them
+    # indistinguishable. The empirical formula should determine the full range.
+    #
+    # See PR #2 discussion: https://github.com/craigtrim/pystylometry/pull/2
+    grade_level = max(0, math.floor(cli_index + 0.5))
+
+    # Reliability heuristic: validation study used ~100-word passages
+    # Not a hard minimum, but shorter texts may deviate from expected behavior
+    reliable = len(tokens) >= 100
 
     return ColemanLiauResult(
         cli_index=cli_index,
@@ -63,5 +155,6 @@ def compute_coleman_liau(text: str) -> ColemanLiauResult:
             "letter_count": letter_count,
             "letters_per_100_words": L,
             "sentences_per_100_words": S,
+            "reliable": reliable,
         },
     )
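
As with ARI, the Coleman-Liau doctest values can be reproduced by hand. A minimal sketch, assuming the alphabetic-token filter shown above leaves 9 word tokens and `split_sentences` finds one sentence (both helpers are defined in `pystylometry/_utils.py`, outside this diff):

    import math

    # Worked check of the Coleman-Liau hunk above for the doctest sentence.
    letter_count = 35    # alphabetic characters inside the word tokens
    word_count = 9       # assumption: punctuation-only tokens filtered out
    sentence_count = 1

    L = (letter_count / word_count) * 100     # ≈ 388.9 letters per 100 words
    S = (sentence_count / word_count) * 100   # ≈ 11.1 sentences per 100 words

    cli_index = 0.0588 * L - 0.296 * S - 15.8
    grade_level = max(0, math.floor(cli_index + 0.5))  # lower bound only, no upper clamp

    print(f"{cli_index:.1f}", grade_level)  # 3.8 4 — matches the doctest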