pystylometry 0.1.0__py3-none-any.whl

@@ -0,0 +1,70 @@
+ """Automated Readability Index (ARI)."""
+
+ from .._types import ARIResult
+ from .._utils import split_sentences, tokenize
+
+
+ def compute_ari(text: str) -> ARIResult:
+     """
+     Compute Automated Readability Index (ARI).
+
+     Formula:
+         ARI = 4.71 × (characters/words) + 0.5 × (words/sentences) - 21.43
+
+     The ARI is designed to gauge the understandability of a text; it approximates
+     the US grade level needed to comprehend the text.
+
+     Grade Level to Age mapping:
+         1-5: 5-11 years
+         6-8: 11-14 years
+         9-12: 14-18 years
+         13-14: 18-22 years (college)
+         15+: 22+ years (college graduate)
+
+     References:
+         Senter, R. J., & Smith, E. A. (1967). Automated readability index.
+         AMRL-TR-66-220. Aerospace Medical Research Laboratories.
+
+     Args:
+         text: Input text to analyze
+
+     Returns:
+         ARIResult with ARI score, grade level, and age range
+
+     Example:
+         >>> result = compute_ari("The quick brown fox jumps over the lazy dog.")
+         >>> print(f"ARI Score: {result.ari_score:.1f}")
+         >>> print(f"Grade Level: {result.grade_level}")
+         >>> print(f"Age Range: {result.age_range}")
+     """
+     sentences = split_sentences(text)
+     tokens = tokenize(text)
+
+     if len(sentences) == 0 or len(tokens) == 0:
+         return ARIResult(
+             ari_score=0.0,
+             grade_level=0,
+             age_range="Unknown",
+             metadata={"sentence_count": 0, "word_count": 0, "character_count": 0},
+         )
+
+     # Count characters (letters and digits; spaces and punctuation are excluded)
+     character_count = sum(1 for char in text if char.isalnum())
+
+     # TODO: Implement ARI formula
+     ari_score = 0.0  # Placeholder
+     grade_level = 0  # Placeholder
+     age_range = "Unknown"  # Placeholder
+
+     return ARIResult(
+         ari_score=ari_score,
+         grade_level=grade_level,
+         age_range=age_range,
+         metadata={
+             "sentence_count": len(sentences),
+             "word_count": len(tokens),
+             "character_count": character_count,
+             "characters_per_word": character_count / len(tokens) if tokens else 0,
+             "words_per_sentence": len(tokens) / len(sentences) if sentences else 0,
+         },
+     )
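The TODO above leaves the ARI computation as placeholder zeros. A minimal sketch of the missing step, following the formula quoted in the docstring; the helper name and the ceiling-based grade rounding are illustrative assumptions, not part of the package:

import math

def ari_from_counts(characters: int, words: int, sentences: int) -> tuple[float, int]:
    # ARI = 4.71 * (characters/words) + 0.5 * (words/sentences) - 21.43
    score = 4.71 * (characters / words) + 0.5 * (words / sentences) - 21.43
    # Grade level is often reported as the score rounded up to the next whole
    # grade (assumption; the package does not yet define its rounding rule).
    grade = max(1, math.ceil(score))
    return score, grade

The score and grade would then replace the placeholder values before ARIResult is built, with the age range looked up from the table in the docstring.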
@@ -0,0 +1,67 @@
+ """Coleman-Liau Index."""
+
+ from .._types import ColemanLiauResult
+ from .._utils import split_sentences, tokenize
+
+
+ def compute_coleman_liau(text: str) -> ColemanLiauResult:
+     """
+     Compute Coleman-Liau Index.
+
+     Formula:
+         CLI = 0.0588 × L - 0.296 × S - 15.8
+
+     Where:
+         L = average number of letters per 100 words
+         S = average number of sentences per 100 words
+
+     The Coleman-Liau index relies on characters rather than syllables,
+     making it easier to compute and potentially more language-agnostic.
+
+     References:
+         Coleman, M., & Liau, T. L. (1975). A computer readability formula
+         designed for machine scoring. Journal of Applied Psychology, 60(2), 283.
+
+     Args:
+         text: Input text to analyze
+
+     Returns:
+         ColemanLiauResult with CLI index and grade level
+
+     Example:
+         >>> result = compute_coleman_liau("The quick brown fox jumps over the lazy dog.")
+         >>> print(f"CLI Index: {result.cli_index:.1f}")
+         >>> print(f"Grade Level: {result.grade_level}")
+     """
+     sentences = split_sentences(text)
+     tokens = tokenize(text)
+
+     if len(sentences) == 0 or len(tokens) == 0:
+         return ColemanLiauResult(
+             cli_index=0.0,
+             grade_level=0,
+             metadata={"sentence_count": 0, "word_count": 0, "letter_count": 0},
+         )
+
+     # Count letters (excluding spaces and punctuation)
+     letter_count = sum(1 for char in text if char.isalpha())
+
+     # Calculate per 100 words
+     L = (letter_count / len(tokens)) * 100  # noqa: N806
+     S = (len(sentences) / len(tokens)) * 100  # noqa: N806
+
+     # TODO: Implement Coleman-Liau formula
+     cli_index = 0.0  # Placeholder
+     grade_level = 0  # Placeholder
+
+     return ColemanLiauResult(
+         cli_index=cli_index,
+         grade_level=grade_level,
+         metadata={
+             "sentence_count": len(sentences),
+             "word_count": len(tokens),
+             "letter_count": letter_count,
+             "letters_per_100_words": L,
+             "sentences_per_100_words": S,
+         },
+     )
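Because L and S are already computed above, the remaining TODO reduces to one expression. A hedged sketch, with a hypothetical helper name:

def cli_from_rates(L: float, S: float) -> float:
    # CLI = 0.0588 * L - 0.296 * S - 15.8, where L is letters per 100 words
    # and S is sentences per 100 words (both computed earlier in the function).
    return 0.0588 * L - 0.296 * S - 15.8

The grade level is conventionally the index rounded to the nearest whole grade, though the package does not yet state its rounding rule.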
@@ -0,0 +1,81 @@
+ """Flesch Reading Ease and Flesch-Kincaid Grade Level."""
+
+ from .._types import FleschResult
+ from .._utils import split_sentences, tokenize
+ from .syllables import count_syllables
+
+
+ def compute_flesch(text: str) -> FleschResult:
+     """
+     Compute Flesch Reading Ease and Flesch-Kincaid Grade Level.
+
+     Flesch Reading Ease:
+         Score = 206.835 - 1.015 × (words/sentences) - 84.6 × (syllables/words)
+         Higher scores = easier to read (0-100 scale)
+
+     Flesch-Kincaid Grade Level:
+         Grade = 0.39 × (words/sentences) + 11.8 × (syllables/words) - 15.59
+
+     Interpretation of Reading Ease:
+         90-100: Very Easy (5th grade)
+         80-89: Easy (6th grade)
+         70-79: Fairly Easy (7th grade)
+         60-69: Standard (8th-9th grade)
+         50-59: Fairly Difficult (10th-12th grade)
+         30-49: Difficult (College)
+         0-29: Very Difficult (College graduate)
+
+     References:
+         Flesch, R. (1948). A new readability yardstick.
+         Journal of Applied Psychology, 32(3), 221.
+
+         Kincaid, J. P., et al. (1975). Derivation of new readability formulas
+         for Navy enlisted personnel. Naval Technical Training Command.
+
+     Args:
+         text: Input text to analyze
+
+     Returns:
+         FleschResult with reading ease, grade level, and difficulty rating
+
+     Example:
+         >>> result = compute_flesch("The quick brown fox jumps over the lazy dog.")
+         >>> print(f"Reading Ease: {result.reading_ease:.1f}")
+         >>> print(f"Grade Level: {result.grade_level:.1f}")
+         >>> print(f"Difficulty: {result.difficulty}")
+     """
+     sentences = split_sentences(text)
+     tokens = tokenize(text)
+
+     if len(sentences) == 0 or len(tokens) == 0:
+         return FleschResult(
+             reading_ease=0.0,
+             grade_level=0.0,
+             difficulty="Unknown",
+             metadata={"sentence_count": 0, "word_count": 0, "syllable_count": 0},
+         )
+
+     # Count syllables
+     total_syllables = sum(count_syllables(word) for word in tokens)
+
+     # Calculate metrics
+     words_per_sentence = len(tokens) / len(sentences)
+     syllables_per_word = total_syllables / len(tokens)
+
+     # TODO: Implement Flesch formulas
+     reading_ease = 0.0  # Placeholder
+     grade_level = 0.0  # Placeholder
+     difficulty = "Unknown"  # Placeholder
+
+     return FleschResult(
+         reading_ease=reading_ease,
+         grade_level=grade_level,
+         difficulty=difficulty,
+         metadata={
+             "sentence_count": len(sentences),
+             "word_count": len(tokens),
+             "syllable_count": total_syllables,
+             "words_per_sentence": words_per_sentence,
+             "syllables_per_word": syllables_per_word,
+         },
+     )
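Both Flesch formulas and the difficulty label follow directly from the docstring. A sketch of the missing step; the helper name and the threshold bucketing are illustrative assumptions:

def flesch_from_rates(words_per_sentence: float, syllables_per_word: float) -> tuple[float, float, str]:
    reading_ease = 206.835 - 1.015 * words_per_sentence - 84.6 * syllables_per_word
    grade_level = 0.39 * words_per_sentence + 11.8 * syllables_per_word - 15.59
    # Map reading ease onto the labels from the interpretation table above.
    thresholds = [(90, "Very Easy"), (80, "Easy"), (70, "Fairly Easy"),
                  (60, "Standard"), (50, "Fairly Difficult"), (30, "Difficult")]
    difficulty = next((label for cutoff, label in thresholds if reading_ease >= cutoff),
                      "Very Difficult")
    return reading_ease, grade_level, difficulty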
@@ -0,0 +1,63 @@
+ """Gunning Fog Index."""
+
+ from .._types import GunningFogResult
+ from .._utils import split_sentences, tokenize
+ from .syllables import count_syllables
+
+
+ def compute_gunning_fog(text: str) -> GunningFogResult:
+     """
+     Compute Gunning Fog Index.
+
+     Formula:
+         Fog Index = 0.4 × [(words/sentences) + 100 × (complex words/words)]
+
+     Where complex words are defined as words with 3+ syllables, excluding proper nouns,
+     compound words, and words that reach three syllables only through -es/-ed/-ing endings.
+
+     The index estimates years of formal education needed to understand the text
+     on first reading.
+
+     References:
+         Gunning, R. (1952). The Technique of Clear Writing.
+         McGraw-Hill.
+
+     Args:
+         text: Input text to analyze
+
+     Returns:
+         GunningFogResult with fog index and grade level
+
+     Example:
+         >>> result = compute_gunning_fog("The quick brown fox jumps over the lazy dog.")
+         >>> print(f"Fog Index: {result.fog_index:.1f}")
+         >>> print(f"Grade Level: {result.grade_level}")
+     """
+     sentences = split_sentences(text)
+     tokens = tokenize(text)
+
+     if len(sentences) == 0 or len(tokens) == 0:
+         return GunningFogResult(
+             fog_index=0.0,
+             grade_level=0,
+             metadata={"sentence_count": 0, "word_count": 0, "complex_word_count": 0},
+         )
+
+     # Count complex words (3+ syllables)
+     # TODO: Exclude proper nouns, compound words, and -es/-ed/-ing endings
+     complex_word_count = sum(1 for word in tokens if count_syllables(word) >= 3)
+
+     # TODO: Implement Gunning Fog formula
+     fog_index = 0.0  # Placeholder
+     grade_level = 0  # Placeholder
+
+     return GunningFogResult(
+         fog_index=fog_index,
+         grade_level=grade_level,
+         metadata={
+             "sentence_count": len(sentences),
+             "word_count": len(tokens),
+             "complex_word_count": complex_word_count,
+             "complex_word_percentage": (complex_word_count / len(tokens) * 100) if tokens else 0,
+         },
+     )
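Once the counts above exist, the Fog formula itself is a single expression. A sketch with a hypothetical helper:

def fog_from_counts(words: int, sentences: int, complex_words: int) -> float:
    # Fog Index = 0.4 * ((words / sentences) + 100 * (complex_words / words))
    return 0.4 * ((words / sentences) + 100 * (complex_words / words))

The Fog index is normally read directly as the estimated grade level.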
@@ -0,0 +1,71 @@
+ """SMOG (Simple Measure of Gobbledygook) Index."""
+
+ from .._types import SMOGResult
+ from .._utils import split_sentences, tokenize
+ from .syllables import count_syllables
+
+
+ def compute_smog(text: str) -> SMOGResult:
+     """
+     Compute SMOG (Simple Measure of Gobbledygook) Index.
+
+     Formula:
+         SMOG = 1.043 × √(polysyllables × 30/sentences) + 3.1291
+
+     Where polysyllables are words with 3 or more syllables.
+
+     The SMOG index estimates the years of education needed to understand the text.
+     It's particularly useful for healthcare materials.
+
+     References:
+         McLaughlin, G. H. (1969). SMOG grading: A new readability formula.
+         Journal of Reading, 12(8), 639-646.
+
+     Args:
+         text: Input text to analyze
+
+     Returns:
+         SMOGResult with SMOG index and grade level
+
+     Example:
+         >>> result = compute_smog("The quick brown fox jumps over the lazy dog.")
+         >>> print(f"SMOG Index: {result.smog_index:.1f}")
+         >>> print(f"Grade Level: {result.grade_level}")
+     """
+     sentences = split_sentences(text)
+     tokens = tokenize(text)
+
+     if len(sentences) < 30:
+         # SMOG requires at least 30 sentences for accuracy
+         # We'll compute anyway but note in metadata
+         pass
+
+     if len(sentences) == 0 or len(tokens) == 0:
+         return SMOGResult(
+             smog_index=0.0,
+             grade_level=0,
+             metadata={
+                 "sentence_count": 0,
+                 "word_count": 0,
+                 "polysyllable_count": 0,
+                 "warning": "Insufficient text",
+             },
+         )
+
+     # Count polysyllables (words with 3+ syllables)
+     polysyllable_count = sum(1 for word in tokens if count_syllables(word) >= 3)
+
+     # TODO: Implement SMOG formula
+     smog_index = 0.0  # Placeholder
+     grade_level = 0  # Placeholder
+
+     return SMOGResult(
+         smog_index=smog_index,
+         grade_level=grade_level,
+         metadata={
+             "sentence_count": len(sentences),
+             "word_count": len(tokens),
+             "polysyllable_count": polysyllable_count,
+             "warning": "Less than 30 sentences" if len(sentences) < 30 else None,
+         },
+     )
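A sketch of the SMOG step using the formula quoted in the docstring; the helper name and the nearest-integer grade rounding are assumptions:

import math

def smog_from_counts(polysyllables: int, sentences: int) -> tuple[float, int]:
    # SMOG = 1.043 * sqrt(polysyllables * 30 / sentences) + 3.1291
    index = 1.043 * math.sqrt(polysyllables * (30 / sentences)) + 3.1291
    return index, round(index)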
@@ -0,0 +1,54 @@
+ """Syllable counting utilities using the CMU Pronouncing Dictionary."""
+
+
+ def count_syllables(word: str) -> int:
+     """
+     Count syllables in a word using the CMU Pronouncing Dictionary with a heuristic fallback.
+
+     Args:
+         word: The word to count syllables for
+
+     Returns:
+         Number of syllables in the word
+     """
+     # TODO: Implement lookup with the pronouncing library
+     # For now, fall back to the simple heuristic
+     return _heuristic_syllable_count(word)
+
+
+ def _heuristic_syllable_count(word: str) -> int:
+     """
+     Simple heuristic syllable counter used as a fallback.
+
+     This is a basic implementation that counts vowel groups.
+     Should be replaced with a CMU dict lookup once the pronouncing library is available.
+
+     Args:
+         word: The word to count syllables for
+
+     Returns:
+         Estimated number of syllables
+     """
+     word = word.lower().strip()
+     if len(word) == 0:
+         return 0
+
+     vowels = "aeiouy"
+     syllable_count = 0
+     previous_was_vowel = False
+
+     for char in word:
+         is_vowel = char in vowels
+         if is_vowel and not previous_was_vowel:
+             syllable_count += 1
+         previous_was_vowel = is_vowel
+
+     # Adjust for silent 'e'
+     if word.endswith("e") and syllable_count > 1:
+         syllable_count -= 1
+
+     # Ensure at least one syllable
+     if syllable_count == 0:
+         syllable_count = 1
+
+     return syllable_count
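The TODO above names the pronouncing library. A hedged sketch of what the CMU-dictionary path could look like, keeping the existing heuristic as the fallback; the function name is hypothetical and pronouncing would have to be an installed optional dependency:

import pronouncing

def count_syllables_cmu(word: str) -> int:
    phones = pronouncing.phones_for_word(word.lower())
    if phones:
        # Syllable count = number of stress-marked vowel phones in the first pronunciation.
        return pronouncing.syllable_count(phones[0])
    # Fall back to the vowel-group heuristic defined above.
    return _heuristic_syllable_count(word)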
@@ -0,0 +1,9 @@
+ """Syntactic analysis metrics (requires spaCy)."""
+
+ from .pos_ratios import compute_pos_ratios
+ from .sentence_stats import compute_sentence_stats
+
+ __all__ = [
+     "compute_pos_ratios",
+     "compute_sentence_stats",
+ ]
@@ -0,0 +1,61 @@
+ """Part-of-Speech ratio analysis using spaCy."""
+
+ from .._types import POSResult
+ from .._utils import check_optional_dependency
+
+
+ def compute_pos_ratios(text: str, model: str = "en_core_web_sm") -> POSResult:
+     """
+     Compute Part-of-Speech ratios and lexical density using spaCy.
+
+     Metrics computed:
+         - Noun ratio: nouns / total words
+         - Verb ratio: verbs / total words
+         - Adjective ratio: adjectives / total words
+         - Adverb ratio: adverbs / total words
+         - Noun-verb ratio: nouns / verbs
+         - Adjective-noun ratio: adjectives / nouns
+         - Lexical density: (nouns + verbs + adjectives + adverbs) / total words
+         - Function word ratio: (determiners + prepositions + conjunctions) / total words
+
+     References:
+         Biber, D. (1988). Variation across speech and writing.
+         Cambridge University Press.
+
+     Args:
+         text: Input text to analyze
+         model: spaCy model name (default: "en_core_web_sm")
+
+     Returns:
+         POSResult with all POS ratios and metadata
+
+     Raises:
+         ImportError: If spaCy is not installed
+
+     Example:
+         >>> result = compute_pos_ratios("The quick brown fox jumps over the lazy dog.")
+         >>> print(f"Noun ratio: {result.noun_ratio:.3f}")
+         >>> print(f"Verb ratio: {result.verb_ratio:.3f}")
+         >>> print(f"Lexical density: {result.lexical_density:.3f}")
+     """
+     check_optional_dependency("spacy", "syntactic")
+
+     # TODO: Implement spaCy-based POS analysis
+     # import spacy
+     # nlp = spacy.load(model)
+     # doc = nlp(text)
+
+     return POSResult(
+         noun_ratio=0.0,
+         verb_ratio=0.0,
+         adjective_ratio=0.0,
+         adverb_ratio=0.0,
+         noun_verb_ratio=0.0,
+         adjective_noun_ratio=0.0,
+         lexical_density=0.0,
+         function_word_ratio=0.0,
+         metadata={
+             "model": model,
+             "token_count": 0,
+         },
+     )
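The commented-out skeleton hints at the intended spaCy flow. A rough sketch of how a few of the ratios could be derived from coarse POS tags; the helper name and the tag groupings (e.g. counting PROPN with nouns) are assumptions, and the package's own definitions may differ:

from collections import Counter
import spacy

def pos_ratio_sketch(text: str, model: str = "en_core_web_sm") -> dict:
    nlp = spacy.load(model)
    doc = nlp(text)
    words = [tok for tok in doc if tok.is_alpha]
    counts = Counter(tok.pos_ for tok in words)
    total = len(words) or 1
    nouns = counts["NOUN"] + counts["PROPN"]
    verbs, adjs, advs = counts["VERB"], counts["ADJ"], counts["ADV"]
    return {
        "noun_ratio": nouns / total,
        "verb_ratio": verbs / total,
        "lexical_density": (nouns + verbs + adjs + advs) / total,
        "function_word_ratio": (counts["DET"] + counts["ADP"]
                                + counts["CCONJ"] + counts["SCONJ"]) / total,
    }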
@@ -0,0 +1,60 @@
+ """Sentence-level statistics using spaCy."""
+
+ from .._types import SentenceStatsResult
+ from .._utils import check_optional_dependency, split_sentences
+
+
+ def compute_sentence_stats(text: str, model: str = "en_core_web_sm") -> SentenceStatsResult:
+     """
+     Compute sentence-level statistics using spaCy.
+
+     Metrics computed:
+         - Mean sentence length (in words)
+         - Standard deviation of sentence lengths
+         - Range of sentence lengths (max - min)
+         - Minimum sentence length
+         - Maximum sentence length
+         - Total sentence count
+
+     References:
+         Hunt, K. W. (1965). Grammatical structures written at three grade levels.
+         NCTE Research Report No. 3.
+
+     Args:
+         text: Input text to analyze
+         model: spaCy model name (default: "en_core_web_sm")
+
+     Returns:
+         SentenceStatsResult with sentence statistics and metadata
+
+     Raises:
+         ImportError: If spaCy is not installed
+
+     Example:
+         >>> result = compute_sentence_stats("The quick brown fox. It jumps over the lazy dog.")
+         >>> print(f"Mean length: {result.mean_sentence_length:.1f} words")
+         >>> print(f"Std dev: {result.sentence_length_std:.1f}")
+         >>> print(f"Sentence count: {result.sentence_count}")
+     """
+     check_optional_dependency("spacy", "syntactic")
+
+     # TODO: Implement spaCy-based sentence analysis
+     # import spacy
+     # nlp = spacy.load(model)
+     # doc = nlp(text)
+     # sentences = list(doc.sents)
+
+     # For now, use simple fallback
+     sentences = split_sentences(text)
+
+     return SentenceStatsResult(
+         mean_sentence_length=0.0,
+         sentence_length_std=0.0,
+         sentence_length_range=0,
+         min_sentence_length=0,
+         max_sentence_length=0,
+         sentence_count=len(sentences),
+         metadata={
+             "model": model,
+         },
+     )
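Even with the simple fallback splitter, the descriptive statistics could be filled in from the standard library. A sketch; the helper name is hypothetical, a plain str.split stands in for the package's tokenize helper, and the population standard deviation is an assumption:

import statistics

def sentence_stats_sketch(sentences: list[str]) -> dict:
    lengths = [len(s.split()) for s in sentences]  # word count per sentence
    if not lengths:
        return {"sentence_count": 0}
    return {
        "mean_sentence_length": statistics.mean(lengths),
        "sentence_length_std": statistics.pstdev(lengths),
        "min_sentence_length": min(lengths),
        "max_sentence_length": max(lengths),
        "sentence_length_range": max(lengths) - min(lengths),
        "sentence_count": len(lengths),
    }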