pystylometry-1.0.0-py3-none-any.whl → pystylometry-1.3.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/README.md +42 -0
- pystylometry/__init__.py +45 -3
- pystylometry/_types.py +1017 -259
- pystylometry/authorship/README.md +21 -0
- pystylometry/authorship/__init__.py +28 -4
- pystylometry/authorship/additional_methods.py +260 -40
- pystylometry/authorship/compression.py +175 -0
- pystylometry/authorship/kilgarriff.py +354 -0
- pystylometry/character/README.md +17 -0
- pystylometry/character/character_metrics.py +267 -179
- pystylometry/cli.py +427 -0
- pystylometry/consistency/README.md +27 -0
- pystylometry/consistency/__init__.py +57 -0
- pystylometry/consistency/_thresholds.py +162 -0
- pystylometry/consistency/drift.py +549 -0
- pystylometry/dialect/README.md +26 -0
- pystylometry/dialect/__init__.py +65 -0
- pystylometry/dialect/_data/dialect_markers.json +1134 -0
- pystylometry/dialect/_loader.py +360 -0
- pystylometry/dialect/detector.py +533 -0
- pystylometry/lexical/README.md +23 -0
- pystylometry/lexical/advanced_diversity.py +61 -22
- pystylometry/lexical/function_words.py +255 -56
- pystylometry/lexical/hapax.py +182 -52
- pystylometry/lexical/mtld.py +108 -26
- pystylometry/lexical/ttr.py +76 -10
- pystylometry/lexical/word_frequency_sophistication.py +1522 -298
- pystylometry/lexical/yule.py +136 -50
- pystylometry/ngrams/README.md +18 -0
- pystylometry/ngrams/entropy.py +150 -49
- pystylometry/ngrams/extended_ngrams.py +314 -69
- pystylometry/prosody/README.md +17 -0
- pystylometry/prosody/rhythm_prosody.py +773 -11
- pystylometry/readability/README.md +23 -0
- pystylometry/readability/additional_formulas.py +1887 -762
- pystylometry/readability/ari.py +144 -82
- pystylometry/readability/coleman_liau.py +136 -109
- pystylometry/readability/flesch.py +177 -73
- pystylometry/readability/gunning_fog.py +165 -161
- pystylometry/readability/smog.py +123 -42
- pystylometry/stylistic/README.md +20 -0
- pystylometry/stylistic/cohesion_coherence.py +669 -13
- pystylometry/stylistic/genre_register.py +1560 -17
- pystylometry/stylistic/markers.py +611 -17
- pystylometry/stylistic/vocabulary_overlap.py +354 -13
- pystylometry/syntactic/README.md +20 -0
- pystylometry/syntactic/advanced_syntactic.py +76 -14
- pystylometry/syntactic/pos_ratios.py +70 -6
- pystylometry/syntactic/sentence_stats.py +55 -12
- pystylometry/syntactic/sentence_types.py +71 -15
- pystylometry/viz/README.md +27 -0
- pystylometry/viz/__init__.py +71 -0
- pystylometry/viz/drift.py +589 -0
- pystylometry/viz/jsx/__init__.py +31 -0
- pystylometry/viz/jsx/_base.py +144 -0
- pystylometry/viz/jsx/report.py +677 -0
- pystylometry/viz/jsx/timeline.py +716 -0
- pystylometry/viz/jsx/viewer.py +1032 -0
- pystylometry-1.3.0.dist-info/METADATA +136 -0
- pystylometry-1.3.0.dist-info/RECORD +76 -0
- {pystylometry-1.0.0.dist-info → pystylometry-1.3.0.dist-info}/WHEEL +1 -1
- pystylometry-1.3.0.dist-info/entry_points.txt +4 -0
- pystylometry-1.0.0.dist-info/METADATA +0 -275
- pystylometry-1.0.0.dist-info/RECORD +0 -46
pystylometry/stylistic/cohesion_coherence.py
@@ -10,36 +10,692 @@ Related GitHub Issue:

 References:
     Halliday, M. A. K., & Hasan, R. (1976). Cohesion in English. Longman.
-    Graesser, A. C., McNamara, D. S., & Kulikowich, J. M. (2011). Coh-Metrix
+    Graesser, A. C., McNamara, D. S., & Kulikowich, J. M. (2011). Coh-Metrix:
+        Providing multilevel analyses of text characteristics. Educational
+        Researcher, 40(5), 223-234.
+    McNamara, D. S., et al. (2010). Automated evaluation of text and discourse
+        with Coh-Metrix. Cambridge University Press.
 """

+from __future__ import annotations
+
+import re
+from collections import Counter
+from typing import Any
+
 from .._types import CohesionCoherenceResult
+from .._utils import check_optional_dependency

+# ========== Connective Word Lists ==========
+# Categorized based on Halliday & Hasan (1976) and Coh-Metrix documentation

-
+ADDITIVE_CONNECTIVES: set[str] = {
+    # Addition
+    "and",
+    "also",
+    "furthermore",
+    "moreover",
+    "additionally",
+    "besides",
+    "likewise",
+    "similarly",
+    "equally",
+    "too",
+    "as well",
+    "in addition",
+    "what is more",
+    "not only",
+    "along with",
+}
+
+ADVERSATIVE_CONNECTIVES: set[str] = {
+    # Contrast/opposition
+    "but",
+    "however",
+    "nevertheless",
+    "nonetheless",
+    "yet",
+    "although",
+    "though",
+    "whereas",
+    "while",
+    "despite",
+    "in spite of",
+    "on the other hand",
+    "conversely",
+    "instead",
+    "rather",
+    "still",
+    "even so",
+    "on the contrary",
+    "by contrast",
+    "notwithstanding",
+}
+
+CAUSAL_CONNECTIVES: set[str] = {
+    # Cause and effect
+    "because",
+    "therefore",
+    "thus",
+    "hence",
+    "consequently",
+    "accordingly",
+    "so",
+    "since",
+    "as a result",
+    "for this reason",
+    "due to",
+    "owing to",
+    "thereby",
+    "wherefore",
+    "for",
+    "as",
+    "given that",
+    "in order to",
+    "so that",
+}
+
+TEMPORAL_CONNECTIVES: set[str] = {
+    # Time/sequence
+    "then",
+    "after",
+    "before",
+    "when",
+    "while",
+    "during",
+    "afterwards",
+    "meanwhile",
+    "subsequently",
+    "previously",
+    "first",
+    "second",
+    "third",
+    "finally",
+    "next",
+    "later",
+    "earlier",
+    "soon",
+    "immediately",
+    "eventually",
+    "at last",
+    "in the end",
+    "at first",
+    "at the same time",
+    "once",
+    "until",
+    "since",
+}
+
+# All connectives combined for lookup
+ALL_CONNECTIVES: set[str] = (
+    ADDITIVE_CONNECTIVES | ADVERSATIVE_CONNECTIVES | CAUSAL_CONNECTIVES | TEMPORAL_CONNECTIVES
+)
+
+# Demonstrative pronouns/determiners (for referential cohesion)
+DEMONSTRATIVES: set[str] = {"this", "that", "these", "those"}
+
+# Content word POS tags (for lexical cohesion)
+CONTENT_POS_TAGS: set[str] = {"NOUN", "PROPN", "VERB", "ADJ", "ADV"}
+
+# Pronoun POS tags
+PRONOUN_POS_TAGS: set[str] = {"PRON"}
+
+
+def _count_words(text: str) -> int:
+    """Count words in text using simple tokenization."""
+    words = re.findall(r"\b[a-zA-Z]+\b", text.lower())
+    return len(words)
+
+
+def _tokenize_simple(text: str) -> list[str]:
+    """Simple word tokenization."""
+    return re.findall(r"\b[a-zA-Z]+\b", text.lower())
+
+
+def _split_into_sentences(text: str) -> list[str]:
+    """Split text into sentences using simple heuristics."""
+    # Split on sentence-ending punctuation followed by space or end of string
+    sentences = re.split(r"(?<=[.!?])\s+", text.strip())
+    # Filter out empty sentences
+    return [s.strip() for s in sentences if s.strip()]
+
+
+def _split_into_paragraphs(text: str) -> list[str]:
+    """Split text into paragraphs based on blank lines."""
+    # Split on double newlines or multiple newlines
+    paragraphs = re.split(r"\n\s*\n", text.strip())
+    # Filter out empty paragraphs
+    return [p.strip() for p in paragraphs if p.strip()]
+
+
+def _jaccard_similarity(set1: set[str], set2: set[str]) -> float:
+    """Compute Jaccard similarity between two sets."""
+    if not set1 and not set2:
+        return 1.0  # Both empty sets are identical
+    if not set1 or not set2:
+        return 0.0
+    intersection = len(set1 & set2)
+    union = len(set1 | set2)
+    return intersection / union if union > 0 else 0.0
+
+
+def _count_connectives(tokens: list[str]) -> dict[str, int]:
+    """Count connectives by category from tokenized text."""
+    text_lower = " ".join(tokens)
+
+    counts = {
+        "additive": 0,
+        "adversative": 0,
+        "causal": 0,
+        "temporal": 0,
+    }
+
+    # Check multi-word connectives first (in the joined text)
+    multi_word_connectives = [c for c in ALL_CONNECTIVES if " " in c]
+    for connective in multi_word_connectives:
+        occurrences = text_lower.count(connective)
+        if occurrences > 0:
+            if connective in ADDITIVE_CONNECTIVES:
+                counts["additive"] += occurrences
+            elif connective in ADVERSATIVE_CONNECTIVES:
+                counts["adversative"] += occurrences
+            elif connective in CAUSAL_CONNECTIVES:
+                counts["causal"] += occurrences
+            elif connective in TEMPORAL_CONNECTIVES:
+                counts["temporal"] += occurrences
+
+    # Check single-word connectives
+    single_word_connectives = [c for c in ALL_CONNECTIVES if " " not in c]
+    for token in tokens:
+        if token in single_word_connectives:
+            if token in ADDITIVE_CONNECTIVES:
+                counts["additive"] += 1
+            elif token in ADVERSATIVE_CONNECTIVES:
+                counts["adversative"] += 1
+            elif token in CAUSAL_CONNECTIVES:
+                counts["causal"] += 1
+            elif token in TEMPORAL_CONNECTIVES:
+                counts["temporal"] += 1
+
+    return counts
+
+
+def _get_content_words_from_doc(doc: Any) -> list[str]:
+    """Extract lemmatized content words from a spaCy doc."""
+    return [
+        token.lemma_.lower() for token in doc if token.pos_ in CONTENT_POS_TAGS and token.is_alpha
+    ]
+
+
+def _compute_word_repetition(sentences: list[list[str]]) -> float:
+    """Compute word repetition ratio across sentences.
+
+    Measures how many content words appear in multiple sentences.
+    """
+    if len(sentences) < 2:
+        return 0.0
+
+    # Flatten all words
+    all_words = [w for sent in sentences for w in sent]
+    if not all_words:
+        return 0.0
+
+    # Count words appearing in more than one sentence
+    word_to_sentences: dict[str, set[int]] = {}
+    for i, sent in enumerate(sentences):
+        for word in sent:
+            if word not in word_to_sentences:
+                word_to_sentences[word] = set()
+            word_to_sentences[word].add(i)
+
+    repeated_words = sum(1 for word, sents in word_to_sentences.items() if len(sents) > 1)
+    unique_words = len(word_to_sentences)
+
+    return repeated_words / unique_words if unique_words > 0 else 0.0
+
+
+def _compute_lexical_chains(
+    sentences: list[list[str]], min_chain_length: int = 2
+) -> list[list[str]]:
+    """Compute simplified lexical chains based on word repetition.
+
+    A lexical chain is a sequence of related words spanning multiple sentences.
+    This simplified version uses exact word matches (lemmatized).
+
+    Args:
+        sentences: List of sentences, each as list of content words
+        min_chain_length: Minimum occurrences to form a chain
+
+    Returns:
+        List of lexical chains (each chain is a list of word occurrences)
+    """
+    if len(sentences) < 2:
+        return []
+
+    # Track word appearances across sentences
+    word_positions: dict[str, list[tuple[int, str]]] = {}
+    for sent_idx, sent in enumerate(sentences):
+        for word in sent:
+            if word not in word_positions:
+                word_positions[word] = []
+            word_positions[word].append((sent_idx, word))
+
+    # Words appearing in multiple sentences form chains
+    chains = []
+    for word, positions in word_positions.items():
+        # Get unique sentences this word appears in
+        unique_sentences = set(pos[0] for pos in positions)
+        if len(unique_sentences) >= min_chain_length:
+            chains.append([word] * len(positions))
+
+    return chains
+
+
+def _compute_anaphora_metrics(doc: Any) -> tuple[int, float]:
+    """Compute anaphora count and resolution ratio.
+
+    Uses heuristics to detect anaphoric references (pronouns with potential antecedents).
+
+    Returns:
+        Tuple of (anaphora_count, resolution_ratio)
+    """
+    pronouns = []
+    nouns = []
+
+    for token in doc:
+        if token.pos_ == "PRON" and token.is_alpha:
+            pronouns.append(token)
+        elif token.pos_ in ("NOUN", "PROPN") and token.is_alpha:
+            nouns.append(token)
+
+    anaphora_count = len(pronouns)
+
+    if anaphora_count == 0:
+        return 0, 1.0  # No pronouns, perfect resolution (vacuously true)
+
+    # Heuristic: pronouns that have a noun before them are "resolvable"
+    # This is a simplification - true anaphora resolution requires coreference
+    resolved = 0
+    for pron in pronouns:
+        # Check if there's a noun before this pronoun in the text
+        if any(noun.i < pron.i for noun in nouns):
+            resolved += 1
+
+    resolution_ratio = resolved / anaphora_count if anaphora_count > 0 else 1.0
+    return anaphora_count, resolution_ratio
+
+
+def _compute_adjacent_overlap(sentences: list[list[str]]) -> float:
+    """Compute mean content word overlap between adjacent sentences."""
+    if len(sentences) < 2:
+        return 0.0
+
+    overlaps = []
+    for i in range(len(sentences) - 1):
+        set1 = set(sentences[i])
+        set2 = set(sentences[i + 1])
+        overlaps.append(_jaccard_similarity(set1, set2))
+
+    return sum(overlaps) / len(overlaps) if overlaps else 0.0
+
+
+def _compute_mean_sentence_similarity(sentences: list[list[str]]) -> float:
+    """Compute mean pairwise similarity between all sentences."""
+    if len(sentences) < 2:
+        return 1.0  # Single sentence is perfectly coherent with itself
+
+    similarities = []
+    for i in range(len(sentences)):
+        for j in range(i + 1, len(sentences)):
+            set1 = set(sentences[i])
+            set2 = set(sentences[j])
+            similarities.append(_jaccard_similarity(set1, set2))
+
+    return sum(similarities) / len(similarities) if similarities else 0.0
+
+
+def _compute_paragraph_topic_consistency(paragraphs: list[list[str]]) -> float:
+    """Compute topic consistency within paragraphs.
+
+    Measures how consistent the vocabulary is within each paragraph.
+    """
+    if not paragraphs:
+        return 0.0
+
+    consistencies = []
+    for para_words in paragraphs:
+        if len(para_words) < 2:
+            continue
+        # Consistency = repetition rate within paragraph
+        word_counts = Counter(para_words)
+        total_words = len(para_words)
+        unique_words = len(word_counts)
+        if unique_words > 0:
+            # Higher repetition = more topical consistency
+            consistency = 1 - (unique_words / total_words)
+            consistencies.append(consistency)
+
+    return sum(consistencies) / len(consistencies) if consistencies else 0.0
+
+
+def _compute_discourse_structure_score(paragraphs: list[str], sentences: list[str]) -> float:
+    """Compute discourse structure quality score.
+
+    Evaluates whether the text has clear intro/body/conclusion structure.
+    This is a heuristic-based approximation.
     """
-
+    if len(paragraphs) < 2:
+        return 0.5  # Single paragraph - neutral score
+
+    if len(paragraphs) < 3:
+        return 0.6  # Two paragraphs - minimal structure
+
+    # Heuristics for good structure:
+    # 1. Multiple paragraphs (✓ if we get here)
+    # 2. First paragraph is introduction-like (shorter or similar length)
+    # 3. Last paragraph is conclusion-like
+
+    para_lengths = [len(_split_into_sentences(p)) for p in paragraphs]
+    mean_length = sum(para_lengths) / len(para_lengths)
+
+    score = 0.5  # Base score
+
+    # Reward having an intro (first paragraph not too long)
+    if para_lengths[0] <= mean_length * 1.5:
+        score += 0.15
+
+    # Reward having a conclusion (last paragraph exists and is reasonable)
+    if para_lengths[-1] <= mean_length * 1.5:
+        score += 0.15
+
+    # Reward having body paragraphs
+    if len(paragraphs) >= 3:
+        score += 0.1
+
+    # Reward reasonable paragraph count (not too fragmented)
+    if 3 <= len(paragraphs) <= 10:
+        score += 0.1
+
+    return min(score, 1.0)
+
+
+def compute_cohesion_coherence(text: str, model: str = "en_core_web_sm") -> CohesionCoherenceResult:
+    """Compute cohesion and coherence metrics for text.
+
+    This function analyzes how well a text holds together structurally (cohesion)
+    and semantically (coherence). These metrics are important for analyzing
+    writing quality, readability, and authorial sophistication.

     Related GitHub Issue:
         #22 - Cohesion and Coherence Metrics
         https://github.com/craigtrim/pystylometry/issues/22

+    Cohesion metrics:
+        - Referential cohesion: pronouns, demonstratives, anaphora
+        - Lexical cohesion: word repetition, content word overlap, lexical chains
+        - Connective density: discourse markers categorized by type
+
+    Coherence metrics:
+        - Adjacent sentence overlap
+        - Paragraph topic consistency
+        - Mean sentence similarity
+        - Discourse structure quality
+
+    References:
+        Halliday, M. A. K., & Hasan, R. (1976). Cohesion in English. Longman.
+        Graesser, A. C., McNamara, D. S., & Kulikowich, J. M. (2011). Coh-Metrix.
+
     Args:
-        text: Input text to analyze
-        model: spaCy model for linguistic analysis
+        text: Input text to analyze (multi-sentence/paragraph text recommended)
+        model: spaCy model name for linguistic analysis (default: "en_core_web_sm")

     Returns:
-        CohesionCoherenceResult with
-
+        CohesionCoherenceResult with all cohesion and coherence metrics
+
+    Raises:
+        ImportError: If spaCy is not installed

     Example:
-        >>> result = compute_cohesion_coherence(
+        >>> result = compute_cohesion_coherence('''
+        ... The cat sat on the mat. It was comfortable there.
+        ... The mat was soft and warm. The cat purred contentedly.
+        ... ''')
         >>> print(f"Pronoun density: {result.pronoun_density:.2f}")
+        >>> print(f"Adjacent overlap: {result.adjacent_sentence_overlap:.3f}")
         >>> print(f"Connective density: {result.connective_density:.2f}")
     """
-
-
-
-
-
+    check_optional_dependency("spacy", "stylistic (cohesion)")
+
+    import spacy
+
+    # Handle empty text
+    if not text or not text.strip():
+        return CohesionCoherenceResult(
+            pronoun_density=0.0,
+            demonstrative_density=0.0,
+            anaphora_count=0,
+            anaphora_resolution_ratio=1.0,
+            word_repetition_ratio=0.0,
+            synonym_density=0.0,
+            lexical_chain_count=0,
+            mean_chain_length=0.0,
+            content_word_overlap=0.0,
+            connective_density=0.0,
+            additive_connective_ratio=0.0,
+            adversative_connective_ratio=0.0,
+            causal_connective_ratio=0.0,
+            temporal_connective_ratio=0.0,
+            adjacent_sentence_overlap=0.0,
+            paragraph_topic_consistency=0.0,
+            mean_sentence_similarity=0.0,
+            semantic_coherence_score=0.0,
+            paragraph_count=0,
+            mean_paragraph_length=0.0,
+            discourse_structure_score=0.0,
+            metadata={
+                "model": model,
+                "word_count": 0,
+                "sentence_count": 0,
+                "pronoun_count": 0,
+                "demonstrative_count": 0,
+                "connective_counts": {"additive": 0, "adversative": 0, "causal": 0, "temporal": 0},
+                "lexical_chains": [],
+            },
+        )
+
+    # Load spaCy model
+    try:
+        nlp = spacy.load(model)
+    except OSError:
+        raise OSError(
+            f"spaCy model '{model}' not found. Download it with: python -m spacy download {model}"
+        )
+
+    # Process text with spaCy
+    doc = nlp(text)
+
+    # Basic counts
+    word_count = sum(1 for token in doc if token.is_alpha)
+    if word_count == 0:
+        return CohesionCoherenceResult(
+            pronoun_density=0.0,
+            demonstrative_density=0.0,
+            anaphora_count=0,
+            anaphora_resolution_ratio=1.0,
+            word_repetition_ratio=0.0,
+            synonym_density=0.0,
+            lexical_chain_count=0,
+            mean_chain_length=0.0,
+            content_word_overlap=0.0,
+            connective_density=0.0,
+            additive_connective_ratio=0.0,
+            adversative_connective_ratio=0.0,
+            causal_connective_ratio=0.0,
+            temporal_connective_ratio=0.0,
+            adjacent_sentence_overlap=0.0,
+            paragraph_topic_consistency=0.0,
+            mean_sentence_similarity=0.0,
+            semantic_coherence_score=0.0,
+            paragraph_count=0,
+            mean_paragraph_length=0.0,
+            discourse_structure_score=0.0,
+            metadata={
+                "model": model,
+                "word_count": 0,
+                "sentence_count": 0,
+                "pronoun_count": 0,
+                "demonstrative_count": 0,
+                "connective_counts": {"additive": 0, "adversative": 0, "causal": 0, "temporal": 0},
+                "lexical_chains": [],
+            },
+        )
+
+    # ========== Referential Cohesion ==========
+
+    # Count pronouns
+    pronoun_count = sum(1 for token in doc if token.pos_ == "PRON" and token.is_alpha)
+    pronoun_density = (pronoun_count / word_count) * 100
+
+    # Count demonstratives
+    demonstrative_count = sum(
+        1 for token in doc if token.text.lower() in DEMONSTRATIVES and token.is_alpha
+    )
+    demonstrative_density = (demonstrative_count / word_count) * 100
+
+    # Anaphora metrics
+    anaphora_count, anaphora_resolution_ratio = _compute_anaphora_metrics(doc)
+
+    # ========== Lexical Cohesion ==========
+
+    # Split into sentences for sentence-level analysis
+    sentences_text = _split_into_sentences(text)
+    sentence_count = len(sentences_text)
+
+    # Get content words per sentence using spaCy
+    sentences_content_words: list[list[str]] = []
+    for sent_text in sentences_text:
+        sent_doc = nlp(sent_text)
+        content_words = _get_content_words_from_doc(sent_doc)
+        sentences_content_words.append(content_words)
+
+    # Word repetition ratio
+    word_repetition_ratio = _compute_word_repetition(sentences_content_words)
+
+    # Lexical chains
+    lexical_chains = _compute_lexical_chains(sentences_content_words)
+    lexical_chain_count = len(lexical_chains)
+    mean_chain_length = (
+        sum(len(chain) for chain in lexical_chains) / lexical_chain_count
+        if lexical_chain_count > 0
+        else 0.0
+    )
+
+    # Content word overlap between adjacent sentences
+    content_word_overlap = _compute_adjacent_overlap(sentences_content_words)
+
+    # Synonym density: simplified as 0 (would require WordNet for true synonyms)
+    # This is a placeholder - full implementation would use NLTK WordNet
+    synonym_density = 0.0
+
+    # ========== Connectives ==========
+
+    tokens = _tokenize_simple(text)
+    connective_counts = _count_connectives(tokens)
+    total_connectives = sum(connective_counts.values())
+    connective_density = (total_connectives / word_count) * 100 if word_count > 0 else 0.0
+
+    # Connective ratios
+    additive_ratio = (
+        connective_counts["additive"] / total_connectives if total_connectives > 0 else 0.0
+    )
+    adversative_ratio = (
+        connective_counts["adversative"] / total_connectives if total_connectives > 0 else 0.0
+    )
+    causal_ratio = connective_counts["causal"] / total_connectives if total_connectives > 0 else 0.0
+    temporal_ratio = (
+        connective_counts["temporal"] / total_connectives if total_connectives > 0 else 0.0
+    )
+
+    # ========== Coherence Measures ==========
+
+    # Adjacent sentence overlap
+    adjacent_sentence_overlap = _compute_adjacent_overlap(sentences_content_words)
+
+    # Mean pairwise sentence similarity
+    mean_sentence_similarity = _compute_mean_sentence_similarity(sentences_content_words)
+
+    # Paragraphs
+    paragraphs = _split_into_paragraphs(text)
+    paragraph_count = len(paragraphs)
+
+    # Mean paragraph length (in sentences)
+    if paragraph_count > 0:
+        para_sentence_counts = [len(_split_into_sentences(p)) for p in paragraphs]
+        mean_paragraph_length = sum(para_sentence_counts) / paragraph_count
+    else:
+        mean_paragraph_length = 0.0
+
+    # Paragraph topic consistency
+    paragraphs_content_words = []
+    for para in paragraphs:
+        para_doc = nlp(para)
+        paragraphs_content_words.append(_get_content_words_from_doc(para_doc))
+    paragraph_topic_consistency = _compute_paragraph_topic_consistency(paragraphs_content_words)
+
+    # Discourse structure score
+    discourse_structure_score = _compute_discourse_structure_score(paragraphs, sentences_text)
+
+    # Composite semantic coherence score (0-1)
+    # Weighted combination of coherence metrics
+    semantic_coherence_score = (
+        0.3 * adjacent_sentence_overlap
+        + 0.2 * mean_sentence_similarity
+        + 0.2 * paragraph_topic_consistency
+        + 0.15 * min(connective_density / 5.0, 1.0)  # Normalize connective density
+        + 0.15 * discourse_structure_score
+    )
+    semantic_coherence_score = min(max(semantic_coherence_score, 0.0), 1.0)
+
+    return CohesionCoherenceResult(
+        # Referential cohesion
+        pronoun_density=pronoun_density,
+        demonstrative_density=demonstrative_density,
+        anaphora_count=anaphora_count,
+        anaphora_resolution_ratio=anaphora_resolution_ratio,
+        # Lexical cohesion
+        word_repetition_ratio=word_repetition_ratio,
+        synonym_density=synonym_density,
+        lexical_chain_count=lexical_chain_count,
+        mean_chain_length=mean_chain_length,
+        content_word_overlap=content_word_overlap,
+        # Connectives
+        connective_density=connective_density,
+        additive_connective_ratio=additive_ratio,
+        adversative_connective_ratio=adversative_ratio,
+        causal_connective_ratio=causal_ratio,
+        temporal_connective_ratio=temporal_ratio,
+        # Coherence
+        adjacent_sentence_overlap=adjacent_sentence_overlap,
+        paragraph_topic_consistency=paragraph_topic_consistency,
+        mean_sentence_similarity=mean_sentence_similarity,
+        semantic_coherence_score=semantic_coherence_score,
+        # Structural
+        paragraph_count=paragraph_count,
+        mean_paragraph_length=mean_paragraph_length,
+        discourse_structure_score=discourse_structure_score,
+        # Metadata
+        metadata={
+            "model": model,
+            "word_count": word_count,
+            "sentence_count": sentence_count,
+            "pronoun_count": pronoun_count,
+            "demonstrative_count": demonstrative_count,
+            "connective_counts": connective_counts,
+            "total_connectives": total_connectives,
+            "lexical_chains": [
+                {"word": chain[0] if chain else "", "length": len(chain)}
+                for chain in lexical_chains
+            ],
+            "content_words_per_sentence": [len(s) for s in sentences_content_words],
+        },
     )
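
The hunk above is the largest single addition in 1.3.0. A minimal usage sketch follows, assuming the function can be imported from the pystylometry.stylistic.cohesion_coherence module shown in the file list (the package may also re-export it elsewhere) and that spaCy plus the en_core_web_sm model are installed; the field names come from the CohesionCoherenceResult constructed in the diff.

# Usage sketch: assumed import path, requires spaCy + en_core_web_sm.
from pystylometry.stylistic.cohesion_coherence import compute_cohesion_coherence

sample = (
    "The cat sat on the mat. It was comfortable there.\n"
    "\n"
    "The mat was soft and warm. The cat purred contentedly."
)

result = compute_cohesion_coherence(sample, model="en_core_web_sm")

# Densities are per 100 words; overlap and ratio metrics fall in [0, 1].
print(f"Pronoun density:    {result.pronoun_density:.2f}")
print(f"Connective density: {result.connective_density:.2f}")
print(f"Adjacent overlap:   {result.adjacent_sentence_overlap:.3f}")
print(f"Semantic coherence: {result.semantic_coherence_score:.3f}")
print(result.metadata["connective_counts"])

Per the code above, semantic_coherence_score is a weighted composite of adjacent-sentence overlap (0.3), mean sentence similarity (0.2), paragraph topic consistency (0.2), connective density normalized by 5.0 (0.15), and the discourse-structure heuristic (0.15), clamped to the range 0 to 1.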