pystylometry 0.1.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/__init__.py +30 -5
- pystylometry/_normalize.py +277 -0
- pystylometry/_types.py +1954 -28
- pystylometry/_utils.py +4 -0
- pystylometry/authorship/__init__.py +26 -1
- pystylometry/authorship/additional_methods.py +75 -0
- pystylometry/authorship/kilgarriff.py +347 -0
- pystylometry/character/__init__.py +15 -0
- pystylometry/character/character_metrics.py +389 -0
- pystylometry/cli.py +427 -0
- pystylometry/consistency/__init__.py +57 -0
- pystylometry/consistency/_thresholds.py +162 -0
- pystylometry/consistency/drift.py +549 -0
- pystylometry/dialect/__init__.py +65 -0
- pystylometry/dialect/_data/dialect_markers.json +1134 -0
- pystylometry/dialect/_loader.py +360 -0
- pystylometry/dialect/detector.py +533 -0
- pystylometry/lexical/__init__.py +13 -6
- pystylometry/lexical/advanced_diversity.py +680 -0
- pystylometry/lexical/function_words.py +590 -0
- pystylometry/lexical/hapax.py +310 -33
- pystylometry/lexical/mtld.py +180 -22
- pystylometry/lexical/ttr.py +149 -0
- pystylometry/lexical/word_frequency_sophistication.py +1805 -0
- pystylometry/lexical/yule.py +142 -29
- pystylometry/ngrams/__init__.py +2 -0
- pystylometry/ngrams/entropy.py +150 -49
- pystylometry/ngrams/extended_ngrams.py +235 -0
- pystylometry/prosody/__init__.py +12 -0
- pystylometry/prosody/rhythm_prosody.py +53 -0
- pystylometry/readability/__init__.py +12 -0
- pystylometry/readability/additional_formulas.py +2110 -0
- pystylometry/readability/ari.py +173 -35
- pystylometry/readability/coleman_liau.py +150 -30
- pystylometry/readability/complex_words.py +531 -0
- pystylometry/readability/flesch.py +181 -32
- pystylometry/readability/gunning_fog.py +208 -35
- pystylometry/readability/smog.py +126 -28
- pystylometry/readability/syllables.py +137 -30
- pystylometry/stylistic/__init__.py +20 -0
- pystylometry/stylistic/cohesion_coherence.py +45 -0
- pystylometry/stylistic/genre_register.py +45 -0
- pystylometry/stylistic/markers.py +131 -0
- pystylometry/stylistic/vocabulary_overlap.py +47 -0
- pystylometry/syntactic/__init__.py +4 -0
- pystylometry/syntactic/advanced_syntactic.py +494 -0
- pystylometry/syntactic/pos_ratios.py +172 -17
- pystylometry/syntactic/sentence_stats.py +105 -18
- pystylometry/syntactic/sentence_types.py +526 -0
- pystylometry/viz/__init__.py +71 -0
- pystylometry/viz/drift.py +589 -0
- pystylometry/viz/jsx/__init__.py +31 -0
- pystylometry/viz/jsx/_base.py +144 -0
- pystylometry/viz/jsx/report.py +677 -0
- pystylometry/viz/jsx/timeline.py +716 -0
- pystylometry/viz/jsx/viewer.py +1032 -0
- {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/METADATA +49 -9
- pystylometry-1.1.0.dist-info/RECORD +63 -0
- pystylometry-1.1.0.dist-info/entry_points.txt +4 -0
- pystylometry-0.1.0.dist-info/RECORD +0 -26
- {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/WHEEL +0 -0
pystylometry/readability/smog.py CHANGED

@@ -1,14 +1,62 @@
-"""SMOG (Simple Measure of Gobbledygook) Index.
+"""SMOG (Simple Measure of Gobbledygook) Index.
 
-
+This module implements the SMOG readability formula with native chunked
+analysis for stylometric fingerprinting.
+
+Related GitHub Issue:
+    #27 - Native chunked analysis with Distribution dataclass
+    https://github.com/craigtrim/pystylometry/issues/27
+"""
+
+import math
+
+from .._normalize import normalize_for_readability
+from .._types import Distribution, SMOGResult, chunk_text, make_distribution
 from .._utils import split_sentences, tokenize
 from .syllables import count_syllables
 
 
-def compute_smog(text: str) -> SMOGResult:
+def _compute_smog_single(text: str) -> tuple[float, float, dict]:
+    """Compute SMOG metrics for a single chunk of text.
+
+    Returns:
+        Tuple of (smog_index, grade_level, metadata_dict).
+        Returns (nan, nan, metadata) for empty/invalid input.
+    """
+    sentences = split_sentences(text)
+    tokens = tokenize(text)
+    word_tokens = normalize_for_readability(tokens)
+
+    if len(sentences) == 0 or len(word_tokens) == 0:
+        return (
+            float("nan"),
+            float("nan"),
+            {"sentence_count": 0, "word_count": 0, "polysyllable_count": 0},
+        )
+
+    # Count polysyllables (words with 3+ syllables)
+    polysyllable_count = sum(1 for word in word_tokens if count_syllables(word) >= 3)
+
+    # SMOG formula
+    smog_index = 1.043 * math.sqrt(polysyllable_count * 30 / len(sentences)) + 3.1291
+    grade_level = max(0, min(20, math.floor(smog_index + 0.5)))
+
+    metadata = {
+        "sentence_count": len(sentences),
+        "word_count": len(word_tokens),
+        "polysyllable_count": polysyllable_count,
+    }
+
+    return (smog_index, float(grade_level), metadata)
+
+
+def compute_smog(text: str, chunk_size: int = 1000) -> SMOGResult:
     """
     Compute SMOG (Simple Measure of Gobbledygook) Index.
 
+    This function uses native chunked analysis to capture variance and patterns
+    across the text, which is essential for stylometric fingerprinting.
+
     Formula:
         SMOG = 1.043 × √(polysyllables × 30/sentences) + 3.1291
 

@@ -17,55 +65,105 @@ def compute_smog(text: str) -> SMOGResult:
     The SMOG index estimates the years of education needed to understand the text.
     It's particularly useful for healthcare materials.
 
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
     References:
         McLaughlin, G. H. (1969). SMOG grading: A new readability formula.
         Journal of Reading, 12(8), 639-646.
 
     Args:
         text: Input text to analyze
+        chunk_size: Number of words per chunk (default: 1000).
+            The text is divided into chunks of this size, and metrics are
+            computed per-chunk.
 
     Returns:
-        SMOGResult with
+        SMOGResult with:
+        - smog_index: Mean SMOG index across chunks
+        - grade_level: Mean grade level across chunks
+        - smog_index_dist: Distribution with per-chunk values and stats
+        - grade_level_dist: Distribution with per-chunk values and stats
+        - chunk_size: The chunk size used
+        - chunk_count: Number of chunks analyzed
 
     Example:
-        >>> result = compute_smog("
-        >>>
-
+        >>> result = compute_smog("Long text here...", chunk_size=1000)
+        >>> result.smog_index  # Mean across chunks
+        12.5
+        >>> result.smog_index_dist.std  # Variance reveals fingerprint
+        1.8
     """
-
-
+    # Chunk the text
+    chunks = chunk_text(text, chunk_size)
+
+    # Compute metrics per chunk
+    smog_values = []
+    grade_values = []
+    total_sentences = 0
+    total_words = 0
+    total_polysyllables = 0
 
-
-
-
-
+    for chunk in chunks:
+        si, gl, meta = _compute_smog_single(chunk)
+        if not math.isnan(si):
+            smog_values.append(si)
+            grade_values.append(gl)
+        total_sentences += meta.get("sentence_count", 0)
+        total_words += meta.get("word_count", 0)
+        total_polysyllables += meta.get("polysyllable_count", 0)
 
-
+    # Handle empty or all-invalid chunks
+    if not smog_values:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
         return SMOGResult(
-            smog_index=
-            grade_level=
+            smog_index=float("nan"),
+            grade_level=float("nan"),
+            smog_index_dist=empty_dist,
+            grade_level_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=len(chunks),
             metadata={
+                # Backward-compatible keys
                "sentence_count": 0,
                "word_count": 0,
                "polysyllable_count": 0,
+                # New prefixed keys for consistency
+                "total_sentence_count": 0,
+                "total_word_count": 0,
+                "total_polysyllable_count": 0,
                "warning": "Insufficient text",
            },
        )
 
-    #
-
-
-    # TODO: Implement SMOG formula
-    smog_index = 0.0  # Placeholder
-    grade_level = 0  # Placeholder
+    # Build distributions
+    smog_dist = make_distribution(smog_values)
+    grade_dist = make_distribution(grade_values)
 
     return SMOGResult(
-        smog_index=
-        grade_level=
+        smog_index=smog_dist.mean,
+        grade_level=grade_dist.mean,
+        smog_index_dist=smog_dist,
+        grade_level_dist=grade_dist,
+        chunk_size=chunk_size,
+        chunk_count=len(chunks),
         metadata={
-
-            "
-            "
-            "
+            # Backward-compatible keys
+            "sentence_count": total_sentences,
+            "word_count": total_words,
+            "polysyllable_count": total_polysyllables,
+            # New prefixed keys for consistency
+            "total_sentence_count": total_sentences,
+            "total_word_count": total_words,
+            "total_polysyllable_count": total_polysyllables,
+            "warning": "Less than 30 sentences" if total_sentences < 30 else None,
         },
     )
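Usage sketch (illustrative only, not part of the diff): based on the signature and result fields shown above, the new chunked API might be called as follows; the input file name and printed values are assumptions.

    from pystylometry.readability.smog import compute_smog

    text = open("sample.txt", encoding="utf-8").read()   # hypothetical input
    result = compute_smog(text, chunk_size=1000)
    print(result.smog_index)                 # mean SMOG index across chunks
    print(result.grade_level)                # mean grade level across chunks
    print(result.smog_index_dist.std)        # per-chunk spread, the stylometric signal
    print(result.metadata["total_sentence_count"])   # new prefixed metadata key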
pystylometry/readability/syllables.py CHANGED

@@ -1,54 +1,161 @@
-"""
+"""
+Syllable counting using CMU Pronouncing Dictionary.
 
+Uses the pronouncing library which provides access to the CMU Pronouncing
+Dictionary for high-accuracy syllable counting based on phonetic transcriptions.
+"""
 
+import re
+from functools import lru_cache
+
+try:
+    import pronouncing  # type: ignore[import-untyped]
+except ImportError:
+    raise ImportError(
+        "The 'pronouncing' library is required for syllable counting. "
+        "Install it with: pip install pystylometry[readability]"
+    )
+
+
+@lru_cache(maxsize=4096)
 def count_syllables(word: str) -> int:
     """
-    Count syllables
+    Count syllables using CMU Pronouncing Dictionary.
+
+    Uses phonetic transcriptions from CMU dictionary. For words with multiple
+    pronunciations, uses the first pronunciation (typically the most common).
+    Falls back to simple vowel counting for words not in the dictionary.
 
     Args:
-        word:
+        word: Input word (handles mixed case, strips whitespace)
 
     Returns:
-
+        Syllable count (minimum 1 for non-empty input)
+
+    Example:
+        >>> count_syllables("beautiful")
+        3
+        >>> count_syllables("fire")
+        2
+        >>> count_syllables("cruel")
+        1
     """
-
-
-
+    word = word.lower().strip()
+    if not word:
+        return 0
 
+    # Strip common punctuation
+    word = word.strip(".,;:!?\"'()-")
+    if not word:
+        return 0
 
-
-    ""
-
+    # Handle contractions by removing apostrophes
+    if "'" in word:
+        word = word.replace("'", "")
 
-
-
+    # Handle hyphenated compounds
+    if "-" in word:
+        return sum(count_syllables(part) for part in word.split("-") if part)
 
-
-
+    # Get pronunciations from CMU dictionary
+    phones_list = pronouncing.phones_for_word(word)
 
-
-
+    if phones_list:
+        # Use first pronunciation (most common)
+        # Count stress markers (0, 1, 2) in phoneme representation
+        phones = phones_list[0]
+        return pronouncing.syllable_count(phones)  # type: ignore[no-any-return]
+
+    # Fallback for words not in dictionary: simple vowel counting
+    return _fallback_count(word)
+
+
+def _fallback_count(word: str) -> int:
     """
-
-    if len(word) == 0:
-        return 0
+    Simple fallback syllable counter for words not in CMU dictionary.
 
+    Uses basic vowel counting with silent-e adjustment.
+    Less accurate than CMU but handles rare/technical words.
+    """
     vowels = "aeiouy"
-
-
+    count = 0
+    prev_was_vowel = False
 
     for char in word:
         is_vowel = char in vowels
-        if is_vowel and not
-
-
+        if is_vowel and not prev_was_vowel:
+            count += 1
+        prev_was_vowel = is_vowel
 
     # Adjust for silent 'e'
-    if word.endswith("e") and
-
+    if word.endswith("e") and count > 1:
+        count -= 1
+
+    # Ensure minimum of 1
+    return max(1, count)
+
+
+def count_syllables_text(text: str) -> list[tuple[str, int]]:
+    """
+    Count syllables for all words in a text.
+
+    Args:
+        text: Input text
+
+    Returns:
+        List of (word, syllable_count) tuples
+
+    Example:
+        >>> count_syllables_text("The quick brown fox")
+        [('The', 1), ('quick', 1), ('brown', 1), ('fox', 1)]
+    """
+
+    words = re.findall(r"[a-zA-Z']+", text)
+    return [(w, count_syllables(w)) for w in words]
+
+
+def total_syllables(text: str) -> int:
+    """
+    Return total syllable count for text.
+
+    Args:
+        text: Input text
+
+    Returns:
+        Total number of syllables
+
+    Example:
+        >>> total_syllables("The quick brown fox")
+        4
+    """
+    return sum(count for _, count in count_syllables_text(text))
+
+
+def validate_accuracy(
+    test_pairs: list[tuple[str, int]],
+) -> tuple[float, list[tuple[str, int, int]]]:
+    """
+    Test accuracy against known word-syllable pairs.
+
+    Args:
+        test_pairs: List of (word, expected_syllables) tuples
+
+    Returns:
+        (accuracy_percentage, list of (word, expected, got) for failures)
+
+    Example:
+        >>> test_pairs = [("hello", 2), ("world", 1), ("beautiful", 3)]
+        >>> accuracy, failures = validate_accuracy(test_pairs)
+        >>> print(f"Accuracy: {accuracy:.1f}%")
+    """
+    failures = []
+    for word, expected in test_pairs:
+        got = count_syllables(word)
+        if got != expected:
+            failures.append((word, expected, got))
 
-
-
-    syllable_count = 1
+    if not test_pairs:
+        return 0.0, []
 
-
+    accuracy = (len(test_pairs) - len(failures)) / len(test_pairs) * 100
+    return accuracy, failures
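Usage sketch (illustrative only, derived from the docstrings above): count_syllables resolves words through the CMU dictionary and falls back to vowel counting for unknown words, while total_syllables and validate_accuracy wrap it for whole texts and test sets; the out-of-dictionary token below is an assumed example.

    from pystylometry.readability.syllables import (
        count_syllables,
        total_syllables,
        validate_accuracy,
    )

    count_syllables("beautiful")             # 3, via CMU pronunciation
    count_syllables("qzxwv")                 # not in CMU dict, vowel-count fallback
    total_syllables("The quick brown fox")   # 4
    accuracy, failures = validate_accuracy([("hello", 2), ("world", 1)])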
pystylometry/stylistic/__init__.py ADDED

@@ -0,0 +1,20 @@
+"""Stylistic analysis metrics.
+
+Related GitHub Issues:
+    #20 - Stylistic Markers
+    #21 - Vocabulary Overlap and Similarity Metrics
+    #22 - Cohesion and Coherence Metrics
+    #23 - Genre and Register Features
+"""
+
+from .cohesion_coherence import compute_cohesion_coherence
+from .genre_register import compute_genre_register
+from .markers import compute_stylistic_markers
+from .vocabulary_overlap import compute_vocabulary_overlap
+
+__all__ = [
+    "compute_stylistic_markers",
+    "compute_vocabulary_overlap",
+    "compute_cohesion_coherence",
+    "compute_genre_register",
+]
pystylometry/stylistic/cohesion_coherence.py ADDED

@@ -0,0 +1,45 @@
+"""Cohesion and coherence metrics.
+
+This module measures how well a text holds together structurally (cohesion)
+and semantically (coherence). Important for analyzing writing quality and
+authorial sophistication.
+
+Related GitHub Issue:
+    #22 - Cohesion and Coherence Metrics
+    https://github.com/craigtrim/pystylometry/issues/22
+
+References:
+    Halliday, M. A. K., & Hasan, R. (1976). Cohesion in English. Longman.
+    Graesser, A. C., McNamara, D. S., & Kulikowich, J. M. (2011). Coh-Metrix.
+"""
+
+from .._types import CohesionCoherenceResult
+
+
+def compute_cohesion_coherence(text: str, model: str = "en_core_web_sm") -> CohesionCoherenceResult:
+    """
+    Compute cohesion and coherence metrics.
+
+    Related GitHub Issue:
+        #22 - Cohesion and Coherence Metrics
+        https://github.com/craigtrim/pystylometry/issues/22
+
+    Args:
+        text: Input text to analyze
+        model: spaCy model for linguistic analysis
+
+    Returns:
+        CohesionCoherenceResult with referential cohesion, lexical cohesion,
+        connective density, and coherence scores.
+
+    Example:
+        >>> result = compute_cohesion_coherence("Multi-paragraph text...")
+        >>> print(f"Pronoun density: {result.pronoun_density:.2f}")
+        >>> print(f"Connective density: {result.connective_density:.2f}")
+    """
+    # TODO: Implement cohesion/coherence analysis
+    # GitHub Issue #22: https://github.com/craigtrim/pystylometry/issues/22
+    raise NotImplementedError(
+        "Cohesion/coherence metrics not yet implemented. "
+        "See GitHub Issue #22: https://github.com/craigtrim/pystylometry/issues/22"
+    )
pystylometry/stylistic/genre_register.py ADDED

@@ -0,0 +1,45 @@
+"""Genre and register classification features.
+
+This module extracts features that distinguish between different text types
+(academic, journalistic, fiction, legal, etc.) and formality levels.
+
+Related GitHub Issue:
+    #23 - Genre and Register Features
+    https://github.com/craigtrim/pystylometry/issues/23
+
+References:
+    Biber, D. (1988). Variation across speech and writing. Cambridge University Press.
+    Biber, D., & Conrad, S. (2009). Register, genre, and style.
+"""
+
+from .._types import GenreRegisterResult
+
+
+def compute_genre_register(text: str, model: str = "en_core_web_sm") -> GenreRegisterResult:
+    """
+    Analyze genre and register features for text classification.
+
+    Related GitHub Issue:
+        #23 - Genre and Register Features
+        https://github.com/craigtrim/pystylometry/issues/23
+
+    Args:
+        text: Input text to analyze
+        model: spaCy model for linguistic analysis
+
+    Returns:
+        GenreRegisterResult with formality scores, register classification,
+        genre predictions, and feature scores for major genres.
+
+    Example:
+        >>> result = compute_genre_register("Academic paper text...")
+        >>> print(f"Formality score: {result.formality_score:.2f}")
+        >>> print(f"Predicted genre: {result.predicted_genre}")
+        >>> print(f"Academic score: {result.academic_score:.3f}")
+    """
+    # TODO: Implement genre/register analysis
+    # GitHub Issue #23: https://github.com/craigtrim/pystylometry/issues/23
+    raise NotImplementedError(
+        "Genre/register classification not yet implemented. "
+        "See GitHub Issue #23: https://github.com/craigtrim/pystylometry/issues/23"
+    )
pystylometry/stylistic/markers.py ADDED

@@ -0,0 +1,131 @@
+"""Stylistic markers for authorship attribution.
+
+This module identifies and analyzes specific linguistic features that authors
+use consistently and often subconsciously. These markers include contraction
+preferences, intensifier usage, hedging patterns, modal auxiliaries, negation
+patterns, and punctuation style habits.
+
+Related GitHub Issue:
+    #20 - Stylistic Markers
+    https://github.com/craigtrim/pystylometry/issues/20
+
+Categories of stylistic markers:
+    - Contraction patterns (can't vs. cannot, I'm vs. I am)
+    - Intensifiers (very, really, extremely, quite)
+    - Hedges (maybe, perhaps, probably, somewhat)
+    - Modal auxiliaries (can, could, may, might, must, should, will, would)
+    - Negation patterns (not, no, never, none, neither)
+    - Punctuation style (exclamations, questions, quotes, parentheticals)
+
+References:
+    Argamon, S., & Levitan, S. (2005). Measuring the usefulness of function
+    words for authorship attribution. ACH/ALLC.
+    Pennebaker, J. W. (2011). The secret life of pronouns. Bloomsbury Press.
+    Biber, D. (1988). Variation across speech and writing. Cambridge University Press.
+"""
+
+from .._types import StylisticMarkersResult
+
+
+def compute_stylistic_markers(text: str) -> StylisticMarkersResult:
+    """
+    Analyze stylistic markers for authorship attribution.
+
+    Identifies and quantifies specific linguistic features that reveal authorial
+    style. These features are often used subconsciously and remain consistent
+    across an author's works, making them valuable for attribution.
+
+    Related GitHub Issue:
+        #20 - Stylistic Markers
+        https://github.com/craigtrim/pystylometry/issues/20
+
+    Why stylistic markers matter:
+
+    Subconscious usage:
+        - Authors don't deliberately vary these features
+        - Remain consistent even when author tries to disguise style
+        - Difficult to consciously control
+
+    Genre-independent:
+        - Used similarly across different topics
+        - More stable than content words
+        - Complement content-based features
+
+    Psychologically meaningful:
+        - Reveal personality traits (Pennebaker's research)
+        - Indicate emotional state
+        - Show cognitive patterns
+
+    Marker Categories Analyzed:
+
+    1. Contractions:
+        - Preference for contracted vs. expanded forms
+        - Examples: can't/cannot, I'm/I am, won't/will not
+        - Formality indicator (more contractions = informal)
+
+    2. Intensifiers:
+        - Words that amplify meaning
+        - Examples: very, really, extremely, quite, rather
+        - Indicate emphatic style
+
+    3. Hedges:
+        - Words that weaken or qualify statements
+        - Examples: maybe, perhaps, probably, somewhat, kind of
+        - Indicate tentative or cautious style
+
+    4. Modal Auxiliaries:
+        - Express necessity, possibility, permission
+        - Epistemic modals: may, might, could (possibility)
+        - Deontic modals: must, should, ought (obligation)
+
+    5. Negation:
+        - Patterns of negative expression
+        - not, no, never, none, neither, nowhere
+        - Frequency and type vary by author
+
+    6. Punctuation Style:
+        - Exclamation marks: Emphatic, emotional
+        - Question marks: Interactive, rhetorical
+        - Quotation marks: Dialogue, scare quotes
+        - Parentheticals: Asides, additional info
+        - Ellipses: Trailing off, suspense
+        - Dashes: Interruptions, emphasis
+        - Semicolons/colons: Sophisticated syntax
+
+    Args:
+        text: Input text to analyze. Should contain at least 200+ words for
+            reliable statistics. Shorter texts may have unstable marker ratios.
+
+    Returns:
+        StylisticMarkersResult containing extensive marker statistics.
+        See _types.py for complete field list.
+
+    Example:
+        >>> result = compute_stylistic_markers("Sample text with markers...")
+        >>> print(f"Contraction ratio: {result.contraction_ratio * 100:.1f}%")
+        Contraction ratio: 42.3%
+        >>> print(f"Intensifiers/100 words: {result.intensifier_density:.2f}")
+        Intensifiers/100 words: 3.45
+        >>> print(f"Top intensifiers: {result.top_intensifiers[:3]}")
+        Top intensifiers: [('very', 12), ('really', 8), ('quite', 5)]
+        >>> print(f"Exclamation density: {result.exclamation_density:.2f}")
+        Exclamation density: 2.10
+
+    Note:
+        - Densities are per 100 words for interpretability
+        - Contraction detection requires pattern matching
+        - Modal auxiliaries classified as epistemic or deontic
+        - Punctuation counts include all occurrences
+        - Empty text returns NaN for ratios, 0 for counts
+    """
+    # TODO: Implement stylistic marker analysis
+    # GitHub Issue #20: https://github.com/craigtrim/pystylometry/issues/20
+    #
+    # This is a comprehensive implementation with many components.
+    # Break it down into logical sections.
+    #
+    # See GitHub issue for full implementation plan and word lists.
+    raise NotImplementedError(
+        "Stylistic markers not yet implemented. "
+        "See GitHub Issue #20: https://github.com/craigtrim/pystylometry/issues/20"
+    )
pystylometry/stylistic/vocabulary_overlap.py ADDED

@@ -0,0 +1,47 @@
+"""Vocabulary overlap and similarity metrics.
+
+This module computes similarity measures between two texts based on their
+shared vocabulary. Useful for authorship verification, plagiarism detection,
+and measuring stylistic consistency.
+
+Related GitHub Issue:
+    #21 - Vocabulary Overlap and Similarity Metrics
+    https://github.com/craigtrim/pystylometry/issues/21
+
+References:
+    Jaccard, P. (1912). The distribution of the flora in the alpine zone.
+    Salton, G., & McGill, M. J. (1983). Introduction to Modern Information Retrieval.
+"""
+
+from .._types import VocabularyOverlapResult
+
+
+def compute_vocabulary_overlap(text1: str, text2: str) -> VocabularyOverlapResult:
+    """
+    Compute vocabulary overlap and similarity between two texts.
+
+    Related GitHub Issue:
+        #21 - Vocabulary Overlap and Similarity Metrics
+        https://github.com/craigtrim/pystylometry/issues/21
+
+    Args:
+        text1: First text to compare
+        text2: Second text to compare
+
+    Returns:
+        VocabularyOverlapResult with Jaccard, Dice, cosine similarities,
+        shared vocabulary statistics, and distinctive words for each text.
+
+    Example:
+        >>> result = compute_vocabulary_overlap(text1, text2)
+        >>> print(f"Jaccard similarity: {result.jaccard_similarity:.3f}")
+        Jaccard similarity: 0.456
+        >>> print(f"Shared words: {result.shared_vocab_size}")
+        Shared words: 234
+    """
+    # TODO: Implement vocabulary overlap analysis
+    # GitHub Issue #21: https://github.com/craigtrim/pystylometry/issues/21
+    raise NotImplementedError(
+        "Vocabulary overlap not yet implemented. "
+        "See GitHub Issue #21: https://github.com/craigtrim/pystylometry/issues/21"
+    )
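Note on the new stylistic package: the four entry points added in 1.1.0 are documented stubs that raise NotImplementedError (tracked in issues #20-#23). A minimal caller sketch, assuming nothing beyond what the stubs above show (the `text` variable is a placeholder):

    from pystylometry.stylistic import compute_stylistic_markers

    try:
        markers = compute_stylistic_markers(text)
    except NotImplementedError:
        markers = None   # stylistic markers are slated for a later release (issue #20)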