pystylometry 0.1.0__py3-none-any.whl
This diff shows the contents of a publicly released package version as it appears in its public registry, and is provided for informational purposes only.
- pystylometry/__init__.py +206 -0
- pystylometry/_types.py +172 -0
- pystylometry/_utils.py +197 -0
- pystylometry/authorship/__init__.py +10 -0
- pystylometry/authorship/burrows_delta.py +152 -0
- pystylometry/authorship/zeta.py +109 -0
- pystylometry/lexical/__init__.py +17 -0
- pystylometry/lexical/hapax.py +75 -0
- pystylometry/lexical/mtld.py +61 -0
- pystylometry/lexical/yule.py +66 -0
- pystylometry/ngrams/__init__.py +13 -0
- pystylometry/ngrams/entropy.py +130 -0
- pystylometry/readability/__init__.py +15 -0
- pystylometry/readability/ari.py +70 -0
- pystylometry/readability/coleman_liau.py +67 -0
- pystylometry/readability/flesch.py +81 -0
- pystylometry/readability/gunning_fog.py +63 -0
- pystylometry/readability/smog.py +71 -0
- pystylometry/readability/syllables.py +54 -0
- pystylometry/syntactic/__init__.py +9 -0
- pystylometry/syntactic/pos_ratios.py +61 -0
- pystylometry/syntactic/sentence_stats.py +60 -0
- pystylometry/tokenizer.py +598 -0
- pystylometry-0.1.0.dist-info/METADATA +238 -0
- pystylometry-0.1.0.dist-info/RECORD +26 -0
- pystylometry-0.1.0.dist-info/WHEEL +4 -0
pystylometry/readability/ari.py
@@ -0,0 +1,70 @@
+"""Automated Readability Index (ARI)."""
+
+from .._types import ARIResult
+from .._utils import split_sentences, tokenize
+
+
+def compute_ari(text: str) -> ARIResult:
+    """
+    Compute Automated Readability Index (ARI).
+
+    Formula:
+        ARI = 4.71 × (characters/words) + 0.5 × (words/sentences) - 21.43
+
+    The ARI is designed to gauge the understandability of a text and produces
+    an approximate representation of the US grade level needed to comprehend the text.
+
+    Grade Level to Age mapping:
+        1-5: 5-11 years
+        6-8: 11-14 years
+        9-12: 14-18 years
+        13-14: 18-22 years
+        14+: 22+ years (college level)
+
+    References:
+        Senter, R. J., & Smith, E. A. (1967). Automated readability index.
+        AMRL-TR-6620. Aerospace Medical Research Laboratories.
+
+    Args:
+        text: Input text to analyze
+
+    Returns:
+        ARIResult with ARI score, grade level, and age range
+
+    Example:
+        >>> result = compute_ari("The quick brown fox jumps over the lazy dog.")
+        >>> print(f"ARI Score: {result.ari_score:.1f}")
+        >>> print(f"Grade Level: {result.grade_level}")
+        >>> print(f"Age Range: {result.age_range}")
+    """
+    sentences = split_sentences(text)
+    tokens = tokenize(text)
+
+    if len(sentences) == 0 or len(tokens) == 0:
+        return ARIResult(
+            ari_score=0.0,
+            grade_level=0,
+            age_range="Unknown",
+            metadata={"sentence_count": 0, "word_count": 0, "character_count": 0},
+        )
+
+    # Count characters (letters, numbers, excluding spaces and punctuation)
+    character_count = sum(1 for char in text if char.isalnum())
+
+    # TODO: Implement ARI formula
+    ari_score = 0.0  # Placeholder
+    grade_level = 0  # Placeholder
+    age_range = "Unknown"  # Placeholder
+
+    return ARIResult(
+        ari_score=ari_score,
+        grade_level=grade_level,
+        age_range=age_range,
+        metadata={
+            "sentence_count": len(sentences),
+            "word_count": len(tokens),
+            "character_count": character_count,
+            "characters_per_word": character_count / len(tokens) if tokens else 0,
+            "words_per_sentence": len(tokens) / len(sentences) if sentences else 0,
+        },
+    )
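Every readability module in this release ships its formula in the docstring but a placeholder in the body. For reference, a minimal sketch of what the ARI TODO block could compute from the counts the function already gathers; the `math.ceil` rounding and the age-bucket boundaries are assumptions inferred from the docstring mapping, not part of the package:

```python
import math

def _ari_sketch(character_count: int, word_count: int, sentence_count: int) -> tuple[float, int, str]:
    # ARI = 4.71 * (characters/words) + 0.5 * (words/sentences) - 21.43
    ari_score = 4.71 * (character_count / word_count) + 0.5 * (word_count / sentence_count) - 21.43
    grade_level = max(0, math.ceil(ari_score))  # ARI is conventionally rounded up (assumption)
    if grade_level <= 5:
        age_range = "5-11 years"
    elif grade_level <= 8:
        age_range = "11-14 years"
    elif grade_level <= 12:
        age_range = "14-18 years"
    elif grade_level <= 14:
        age_range = "18-22 years"
    else:
        age_range = "22+ years (college level)"
    return ari_score, grade_level, age_range

# "The quick brown fox jumps over the lazy dog." -> 35 characters, 9 words, 1 sentence
print(_ari_sketch(35, 9, 1))  # score ≈ 1.39, grade 2, "5-11 years"
```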
pystylometry/readability/coleman_liau.py
@@ -0,0 +1,67 @@
+"""Coleman-Liau Index."""
+
+from .._types import ColemanLiauResult
+from .._utils import split_sentences, tokenize
+
+
+def compute_coleman_liau(text: str) -> ColemanLiauResult:
+    """
+    Compute Coleman-Liau Index.
+
+    Formula:
+        CLI = 0.0588 × L - 0.296 × S - 15.8
+
+    Where:
+        L = average number of letters per 100 words
+        S = average number of sentences per 100 words
+
+    The Coleman-Liau index relies on characters rather than syllables,
+    making it easier to compute and potentially more language-agnostic.
+
+    References:
+        Coleman, M., & Liau, T. L. (1975). A computer readability formula
+        designed for machine scoring. Journal of Applied Psychology, 60(2), 283.
+
+    Args:
+        text: Input text to analyze
+
+    Returns:
+        ColemanLiauResult with CLI index and grade level
+
+    Example:
+        >>> result = compute_coleman_liau("The quick brown fox jumps over the lazy dog.")
+        >>> print(f"CLI Index: {result.cli_index:.1f}")
+        >>> print(f"Grade Level: {result.grade_level}")
+    """
+    sentences = split_sentences(text)
+    tokens = tokenize(text)
+
+    if len(sentences) == 0 or len(tokens) == 0:
+        return ColemanLiauResult(
+            cli_index=0.0,
+            grade_level=0,
+            metadata={"sentence_count": 0, "word_count": 0, "letter_count": 0},
+        )
+
+    # Count letters (excluding spaces and punctuation)
+    letter_count = sum(1 for char in text if char.isalpha())
+
+    # Calculate per 100 words
+    L = (letter_count / len(tokens)) * 100  # noqa: N806
+    S = (len(sentences) / len(tokens)) * 100  # noqa: N806
+
+    # TODO: Implement Coleman-Liau formula
+    cli_index = 0.0  # Placeholder
+    grade_level = 0  # Placeholder
+
+    return ColemanLiauResult(
+        cli_index=cli_index,
+        grade_level=grade_level,
+        metadata={
+            "sentence_count": len(sentences),
+            "word_count": len(tokens),
+            "letter_count": letter_count,
+            "letters_per_100_words": L,
+            "sentences_per_100_words": S,
+        },
+    )
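The Coleman-Liau TODO could be filled the same way. `L` and `S` are already computed above, so only the linear formula from the docstring and a rounding choice (assumed here) are missing:

```python
def _cli_sketch(L: float, S: float) -> tuple[float, int]:
    # CLI = 0.0588 * L - 0.296 * S - 15.8
    cli_index = 0.0588 * L - 0.296 * S - 15.8
    grade_level = max(0, round(cli_index))  # nearest-grade rounding is an assumption
    return cli_index, grade_level

# "The quick brown fox jumps over the lazy dog.": 35 letters, 9 words, 1 sentence
# L = 35/9*100 ≈ 388.9, S = 1/9*100 ≈ 11.1
print(_cli_sketch(35 / 9 * 100, 1 / 9 * 100))  # CLI ≈ 3.78, grade 4
```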
pystylometry/readability/flesch.py
@@ -0,0 +1,81 @@
+"""Flesch Reading Ease and Flesch-Kincaid Grade Level."""
+
+from .._types import FleschResult
+from .._utils import split_sentences, tokenize
+from .syllables import count_syllables
+
+
+def compute_flesch(text: str) -> FleschResult:
+    """
+    Compute Flesch Reading Ease and Flesch-Kincaid Grade Level.
+
+    Flesch Reading Ease:
+        Score = 206.835 - 1.015 × (words/sentences) - 84.6 × (syllables/words)
+        Higher scores = easier to read (0-100 scale)
+
+    Flesch-Kincaid Grade Level:
+        Grade = 0.39 × (words/sentences) + 11.8 × (syllables/words) - 15.59
+
+    Interpretation of Reading Ease:
+        90-100: Very Easy (5th grade)
+        80-89: Easy (6th grade)
+        70-79: Fairly Easy (7th grade)
+        60-69: Standard (8th-9th grade)
+        50-59: Fairly Difficult (10th-12th grade)
+        30-49: Difficult (College)
+        0-29: Very Difficult (College graduate)
+
+    References:
+        Flesch, R. (1948). A new readability yardstick.
+        Journal of Applied Psychology, 32(3), 221.
+
+        Kincaid, J. P., et al. (1975). Derivation of new readability formulas
+        for Navy enlisted personnel. Naval Technical Training Command.
+
+    Args:
+        text: Input text to analyze
+
+    Returns:
+        FleschResult with reading ease, grade level, and difficulty rating
+
+    Example:
+        >>> result = compute_flesch("The quick brown fox jumps over the lazy dog.")
+        >>> print(f"Reading Ease: {result.reading_ease:.1f}")
+        >>> print(f"Grade Level: {result.grade_level:.1f}")
+        >>> print(f"Difficulty: {result.difficulty}")
+    """
+    sentences = split_sentences(text)
+    tokens = tokenize(text)
+
+    if len(sentences) == 0 or len(tokens) == 0:
+        return FleschResult(
+            reading_ease=0.0,
+            grade_level=0.0,
+            difficulty="Unknown",
+            metadata={"sentence_count": 0, "word_count": 0, "syllable_count": 0},
+        )
+
+    # Count syllables
+    total_syllables = sum(count_syllables(word) for word in tokens)
+
+    # Calculate metrics
+    words_per_sentence = len(tokens) / len(sentences)
+    syllables_per_word = total_syllables / len(tokens)
+
+    # TODO: Implement Flesch formulas
+    reading_ease = 0.0  # Placeholder
+    grade_level = 0.0  # Placeholder
+    difficulty = "Unknown"  # Placeholder
+
+    return FleschResult(
+        reading_ease=reading_ease,
+        grade_level=grade_level,
+        difficulty=difficulty,
+        metadata={
+            "sentence_count": len(sentences),
+            "word_count": len(tokens),
+            "syllable_count": total_syllables,
+            "words_per_sentence": words_per_sentence,
+            "syllables_per_word": syllables_per_word,
+        },
+    )
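A sketch of the two Flesch formulas plus the difficulty bucketing from the docstring table; `words_per_sentence` and `syllables_per_word` are already computed above, and how boundary and out-of-range scores are labeled is an assumption:

```python
def _flesch_sketch(words_per_sentence: float, syllables_per_word: float) -> tuple[float, float, str]:
    # Reading Ease = 206.835 - 1.015 * (words/sentences) - 84.6 * (syllables/words)
    reading_ease = 206.835 - 1.015 * words_per_sentence - 84.6 * syllables_per_word
    # Grade = 0.39 * (words/sentences) + 11.8 * (syllables/words) - 15.59
    grade_level = 0.39 * words_per_sentence + 11.8 * syllables_per_word - 15.59
    # Difficulty buckets per the docstring table
    if reading_ease >= 90:
        difficulty = "Very Easy"
    elif reading_ease >= 80:
        difficulty = "Easy"
    elif reading_ease >= 70:
        difficulty = "Fairly Easy"
    elif reading_ease >= 60:
        difficulty = "Standard"
    elif reading_ease >= 50:
        difficulty = "Fairly Difficult"
    elif reading_ease >= 30:
        difficulty = "Difficult"
    else:
        difficulty = "Very Difficult"
    return reading_ease, grade_level, difficulty
```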
pystylometry/readability/gunning_fog.py
@@ -0,0 +1,63 @@
+"""Gunning Fog Index."""
+
+from .._types import GunningFogResult
+from .._utils import split_sentences, tokenize
+from .syllables import count_syllables
+
+
+def compute_gunning_fog(text: str) -> GunningFogResult:
+    """
+    Compute Gunning Fog Index.
+
+    Formula:
+        Fog Index = 0.4 × [(words/sentences) + 100 × (complex words/words)]
+
+    Where complex words are defined as words with 3+ syllables,
+    excluding proper nouns, compound words, and common suffixes.
+
+    The index estimates years of formal education needed to understand the text
+    on first reading.
+
+    References:
+        Gunning, R. (1952). The Technique of Clear Writing.
+        McGraw-Hill.
+
+    Args:
+        text: Input text to analyze
+
+    Returns:
+        GunningFogResult with fog index and grade level
+
+    Example:
+        >>> result = compute_gunning_fog("The quick brown fox jumps over the lazy dog.")
+        >>> print(f"Fog Index: {result.fog_index:.1f}")
+        >>> print(f"Grade Level: {result.grade_level}")
+    """
+    sentences = split_sentences(text)
+    tokens = tokenize(text)
+
+    if len(sentences) == 0 or len(tokens) == 0:
+        return GunningFogResult(
+            fog_index=0.0,
+            grade_level=0,
+            metadata={"sentence_count": 0, "word_count": 0, "complex_word_count": 0},
+        )
+
+    # Count complex words (3+ syllables)
+    # TODO: Exclude proper nouns, compound words, and -es/-ed/-ing endings
+    complex_word_count = sum(1 for word in tokens if count_syllables(word) >= 3)
+
+    # TODO: Implement Gunning Fog formula
+    fog_index = 0.0  # Placeholder
+    grade_level = 0  # Placeholder
+
+    return GunningFogResult(
+        fog_index=fog_index,
+        grade_level=grade_level,
+        metadata={
+            "sentence_count": len(sentences),
+            "word_count": len(tokens),
+            "complex_word_count": complex_word_count,
+            "complex_word_percentage": (complex_word_count / len(tokens) * 100) if tokens else 0,
+        },
+    )
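A sketch of the Gunning Fog TODO, using the `complex_word_count` the function already computes (the docstring's exclusions for proper nouns, compound words, and suffixes remain an upstream TODO, so this count over-estimates slightly):

```python
def _fog_sketch(word_count: int, sentence_count: int, complex_word_count: int) -> tuple[float, int]:
    # Fog = 0.4 * [(words/sentences) + 100 * (complex words/words)]
    fog_index = 0.4 * (word_count / sentence_count + 100.0 * complex_word_count / word_count)
    grade_level = max(0, round(fog_index))  # nearest-grade rounding is an assumption
    return fog_index, grade_level
```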
pystylometry/readability/smog.py
@@ -0,0 +1,71 @@
+"""SMOG (Simple Measure of Gobbledygook) Index."""
+
+from .._types import SMOGResult
+from .._utils import split_sentences, tokenize
+from .syllables import count_syllables
+
+
+def compute_smog(text: str) -> SMOGResult:
+    """
+    Compute SMOG (Simple Measure of Gobbledygook) Index.
+
+    Formula:
+        SMOG = 1.043 × √(polysyllables × 30/sentences) + 3.1291
+
+    Where polysyllables are words with 3 or more syllables.
+
+    The SMOG index estimates the years of education needed to understand the text.
+    It's particularly useful for healthcare materials.
+
+    References:
+        McLaughlin, G. H. (1969). SMOG grading: A new readability formula.
+        Journal of Reading, 12(8), 639-646.
+
+    Args:
+        text: Input text to analyze
+
+    Returns:
+        SMOGResult with SMOG index and grade level
+
+    Example:
+        >>> result = compute_smog("The quick brown fox jumps over the lazy dog.")
+        >>> print(f"SMOG Index: {result.smog_index:.1f}")
+        >>> print(f"Grade Level: {result.grade_level}")
+    """
+    sentences = split_sentences(text)
+    tokens = tokenize(text)
+
+    if len(sentences) < 30:
+        # SMOG requires at least 30 sentences for accuracy
+        # We'll compute anyway but note in metadata
+        pass
+
+    if len(sentences) == 0 or len(tokens) == 0:
+        return SMOGResult(
+            smog_index=0.0,
+            grade_level=0,
+            metadata={
+                "sentence_count": 0,
+                "word_count": 0,
+                "polysyllable_count": 0,
+                "warning": "Insufficient text",
+            },
+        )
+
+    # Count polysyllables (words with 3+ syllables)
+    polysyllable_count = sum(1 for word in tokens if count_syllables(word) >= 3)
+
+    # TODO: Implement SMOG formula
+    smog_index = 0.0  # Placeholder
+    grade_level = 0  # Placeholder
+
+    return SMOGResult(
+        smog_index=smog_index,
+        grade_level=grade_level,
+        metadata={
+            "sentence_count": len(sentences),
+            "word_count": len(tokens),
+            "polysyllable_count": polysyllable_count,
+            "warning": "Less than 30 sentences" if len(sentences) < 30 else None,
+        },
+    )
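A sketch of the SMOG TODO; the formula comes straight from the docstring (the 30/sentences factor normalizes to McLaughlin's 30-sentence sample, which is why the function warns below that threshold), and the rounding is an assumption:

```python
import math

def _smog_sketch(polysyllable_count: int, sentence_count: int) -> tuple[float, int]:
    # SMOG = 1.043 * sqrt(polysyllables * 30 / sentences) + 3.1291
    smog_index = 1.043 * math.sqrt(polysyllable_count * 30 / sentence_count) + 3.1291
    grade_level = max(0, round(smog_index))  # nearest-grade rounding is an assumption
    return smog_index, grade_level
```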
pystylometry/readability/syllables.py
@@ -0,0 +1,54 @@
+"""Syllable counting utilities using CMU Pronouncing Dictionary."""
+
+
+def count_syllables(word: str) -> int:
+    """
+    Count syllables in a word using CMU Pronouncing Dictionary with heuristic fallback.
+
+    Args:
+        word: The word to count syllables for
+
+    Returns:
+        Number of syllables in the word
+    """
+    # TODO: Implement with pronouncing library
+    # For now, use simple heuristic fallback
+    return _heuristic_syllable_count(word)
+
+
+def _heuristic_syllable_count(word: str) -> int:
+    """
+    Simple heuristic syllable counter for fallback.
+
+    This is a basic implementation that counts vowel groups.
+    Should be replaced with CMU dict lookup when pronouncing is available.
+
+    Args:
+        word: The word to count syllables for
+
+    Returns:
+        Estimated number of syllables
+    """
+    word = word.lower().strip()
+    if len(word) == 0:
+        return 0
+
+    vowels = "aeiouy"
+    syllable_count = 0
+    previous_was_vowel = False
+
+    for char in word:
+        is_vowel = char in vowels
+        if is_vowel and not previous_was_vowel:
+            syllable_count += 1
+        previous_was_vowel = is_vowel
+
+    # Adjust for silent 'e'
+    if word.endswith("e") and syllable_count > 1:
+        syllable_count -= 1
+
+    # Ensure at least one syllable
+    if syllable_count == 0:
+        syllable_count = 1
+
+    return syllable_count
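Despite the module docstring, 0.1.0 never consults the CMU dictionary; `count_syllables` is a pass-through to the heuristic. One way the TODO could be filled, assuming the pronouncing package (a CMU-dict wrapper that is not a declared dependency of this wheel) and reusing the module's own `_heuristic_syllable_count` as the fallback:

```python
def _cmu_syllable_count(word: str) -> int:
    """CMU-dict syllable count with the module's heuristic as fallback (a sketch)."""
    try:
        import pronouncing  # assumed optional dependency, not declared by this wheel
    except ImportError:
        return _heuristic_syllable_count(word)
    phones = pronouncing.phones_for_word(word.lower())
    if not phones:
        # Out-of-vocabulary word: fall back to the vowel-group heuristic
        return _heuristic_syllable_count(word)
    # Vowel phones in ARPAbet carry a stress digit (0/1/2); counting them
    # yields the syllable count, which pronouncing.syllable_count() does.
    return pronouncing.syllable_count(phones[0])
```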
pystylometry/syntactic/pos_ratios.py
@@ -0,0 +1,61 @@
+"""Part-of-Speech ratio analysis using spaCy."""
+
+from .._types import POSResult
+from .._utils import check_optional_dependency
+
+
+def compute_pos_ratios(text: str, model: str = "en_core_web_sm") -> POSResult:
+    """
+    Compute Part-of-Speech ratios and lexical density using spaCy.
+
+    Metrics computed:
+        - Noun ratio: nouns / total words
+        - Verb ratio: verbs / total words
+        - Adjective ratio: adjectives / total words
+        - Adverb ratio: adverbs / total words
+        - Noun-verb ratio: nouns / verbs
+        - Adjective-noun ratio: adjectives / nouns
+        - Lexical density: (nouns + verbs + adjectives + adverbs) / total words
+        - Function word ratio: (determiners + prepositions + conjunctions) / total words
+
+    References:
+        Biber, D. (1988). Variation across speech and writing.
+        Cambridge University Press.
+
+    Args:
+        text: Input text to analyze
+        model: spaCy model name (default: "en_core_web_sm")
+
+    Returns:
+        POSResult with all POS ratios and metadata
+
+    Raises:
+        ImportError: If spaCy is not installed
+
+    Example:
+        >>> result = compute_pos_ratios("The quick brown fox jumps over the lazy dog.")
+        >>> print(f"Noun ratio: {result.noun_ratio:.3f}")
+        >>> print(f"Verb ratio: {result.verb_ratio:.3f}")
+        >>> print(f"Lexical density: {result.lexical_density:.3f}")
+    """
+    check_optional_dependency("spacy", "syntactic")
+
+    # TODO: Implement spaCy-based POS analysis
+    # import spacy
+    # nlp = spacy.load(model)
+    # doc = nlp(text)
+
+    return POSResult(
+        noun_ratio=0.0,
+        verb_ratio=0.0,
+        adjective_ratio=0.0,
+        adverb_ratio=0.0,
+        noun_verb_ratio=0.0,
+        adjective_noun_ratio=0.0,
+        lexical_density=0.0,
+        function_word_ratio=0.0,
+        metadata={
+            "model": model,
+            "token_count": 0,
+        },
+    )
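The commented-out calls suggest the intended shape. A sketch of how the ratios could be derived from spaCy's coarse `token.pos_` tags; whether proper nouns (PROPN) count toward nouns, the zero-division fallbacks, and mapping "conjunctions" to CCONJ plus SCONJ are all assumptions:

```python
from collections import Counter

import spacy  # model must be installed: python -m spacy download en_core_web_sm

def _pos_ratios_sketch(text: str, model: str = "en_core_web_sm") -> dict:
    nlp = spacy.load(model)
    doc = nlp(text)
    # Count only word tokens, skipping punctuation and whitespace
    words = [t for t in doc if not t.is_punct and not t.is_space]
    counts = Counter(t.pos_ for t in words)
    total = len(words) or 1
    nouns, verbs = counts["NOUN"], counts["VERB"]
    adjs, advs = counts["ADJ"], counts["ADV"]
    return {
        "noun_ratio": nouns / total,
        "verb_ratio": verbs / total,
        "adjective_ratio": adjs / total,
        "adverb_ratio": advs / total,
        "noun_verb_ratio": nouns / verbs if verbs else 0.0,
        "adjective_noun_ratio": adjs / nouns if nouns else 0.0,
        "lexical_density": (nouns + verbs + adjs + advs) / total,
        # determiners + prepositions + conjunctions, per the docstring
        "function_word_ratio": (counts["DET"] + counts["ADP"] + counts["CCONJ"] + counts["SCONJ"]) / total,
    }
```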
pystylometry/syntactic/sentence_stats.py
@@ -0,0 +1,60 @@
+"""Sentence-level statistics using spaCy."""
+
+from .._types import SentenceStatsResult
+from .._utils import check_optional_dependency, split_sentences
+
+
+def compute_sentence_stats(text: str, model: str = "en_core_web_sm") -> SentenceStatsResult:
+    """
+    Compute sentence-level statistics using spaCy.
+
+    Metrics computed:
+        - Mean sentence length (in words)
+        - Standard deviation of sentence lengths
+        - Range of sentence lengths (max - min)
+        - Minimum sentence length
+        - Maximum sentence length
+        - Total sentence count
+
+    References:
+        Hunt, K. W. (1965). Grammatical structures written at three grade levels.
+        NCTE Research Report No. 3.
+
+    Args:
+        text: Input text to analyze
+        model: spaCy model name (default: "en_core_web_sm")
+
+    Returns:
+        SentenceStatsResult with sentence statistics and metadata
+
+    Raises:
+        ImportError: If spaCy is not installed
+
+    Example:
+        >>> result = compute_sentence_stats("The quick brown fox. It jumps over the lazy dog.")
+        >>> print(f"Mean length: {result.mean_sentence_length:.1f} words")
+        >>> print(f"Std dev: {result.sentence_length_std:.1f}")
+        >>> print(f"Sentence count: {result.sentence_count}")
+    """
+    check_optional_dependency("spacy", "syntactic")
+
+    # TODO: Implement spaCy-based sentence analysis
+    # import spacy
+    # nlp = spacy.load(model)
+    # doc = nlp(text)
+    # sentences = list(doc.sents)
+
+    # For now, use simple fallback
+    sentences = split_sentences(text)
+
+    return SentenceStatsResult(
+        mean_sentence_length=0.0,
+        sentence_length_std=0.0,
+        sentence_length_range=0,
+        min_sentence_length=0,
+        max_sentence_length=0,
+        sentence_count=len(sentences),
+        metadata={
+            "model": model,
+        },
+    )
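Only `sentence_count` is real in 0.1.0; the remaining statistics are placeholders. A sketch of the missing computation over the fallback splitter's output (e.g. `_sentence_stats_sketch(split_sentences(text))`); the whitespace split stands in for the package's own `tokenize()`, and population standard deviation (`pstdev`) is an assumption, sample standard deviation being the other common choice:

```python
from statistics import mean, pstdev

def _sentence_stats_sketch(sentences: list[str]) -> dict:
    # Sentence lengths in words; a whitespace split keeps the sketch self-contained
    lengths = [len(s.split()) for s in sentences]
    if not lengths:
        return {"mean": 0.0, "std": 0.0, "range": 0, "min": 0, "max": 0, "count": 0}
    return {
        "mean": mean(lengths),
        "std": pstdev(lengths),  # population std dev (assumption)
        "range": max(lengths) - min(lengths),
        "min": min(lengths),
        "max": max(lengths),
        "count": len(lengths),
    }
```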