pystylometry: 1.0.0-py3-none-any.whl → 1.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. pystylometry/README.md +42 -0
  2. pystylometry/__init__.py +45 -3
  3. pystylometry/_types.py +1017 -259
  4. pystylometry/authorship/README.md +21 -0
  5. pystylometry/authorship/__init__.py +28 -4
  6. pystylometry/authorship/additional_methods.py +260 -40
  7. pystylometry/authorship/compression.py +175 -0
  8. pystylometry/authorship/kilgarriff.py +354 -0
  9. pystylometry/character/README.md +17 -0
  10. pystylometry/character/character_metrics.py +267 -179
  11. pystylometry/cli.py +427 -0
  12. pystylometry/consistency/README.md +27 -0
  13. pystylometry/consistency/__init__.py +57 -0
  14. pystylometry/consistency/_thresholds.py +162 -0
  15. pystylometry/consistency/drift.py +549 -0
  16. pystylometry/dialect/README.md +26 -0
  17. pystylometry/dialect/__init__.py +65 -0
  18. pystylometry/dialect/_data/dialect_markers.json +1134 -0
  19. pystylometry/dialect/_loader.py +360 -0
  20. pystylometry/dialect/detector.py +533 -0
  21. pystylometry/lexical/README.md +23 -0
  22. pystylometry/lexical/advanced_diversity.py +61 -22
  23. pystylometry/lexical/function_words.py +255 -56
  24. pystylometry/lexical/hapax.py +182 -52
  25. pystylometry/lexical/mtld.py +108 -26
  26. pystylometry/lexical/ttr.py +76 -10
  27. pystylometry/lexical/word_frequency_sophistication.py +1522 -298
  28. pystylometry/lexical/yule.py +136 -50
  29. pystylometry/ngrams/README.md +18 -0
  30. pystylometry/ngrams/entropy.py +150 -49
  31. pystylometry/ngrams/extended_ngrams.py +314 -69
  32. pystylometry/prosody/README.md +17 -0
  33. pystylometry/prosody/rhythm_prosody.py +773 -11
  34. pystylometry/readability/README.md +23 -0
  35. pystylometry/readability/additional_formulas.py +1887 -762
  36. pystylometry/readability/ari.py +144 -82
  37. pystylometry/readability/coleman_liau.py +136 -109
  38. pystylometry/readability/flesch.py +177 -73
  39. pystylometry/readability/gunning_fog.py +165 -161
  40. pystylometry/readability/smog.py +123 -42
  41. pystylometry/stylistic/README.md +20 -0
  42. pystylometry/stylistic/cohesion_coherence.py +669 -13
  43. pystylometry/stylistic/genre_register.py +1560 -17
  44. pystylometry/stylistic/markers.py +611 -17
  45. pystylometry/stylistic/vocabulary_overlap.py +354 -13
  46. pystylometry/syntactic/README.md +20 -0
  47. pystylometry/syntactic/advanced_syntactic.py +76 -14
  48. pystylometry/syntactic/pos_ratios.py +70 -6
  49. pystylometry/syntactic/sentence_stats.py +55 -12
  50. pystylometry/syntactic/sentence_types.py +71 -15
  51. pystylometry/viz/README.md +27 -0
  52. pystylometry/viz/__init__.py +71 -0
  53. pystylometry/viz/drift.py +589 -0
  54. pystylometry/viz/jsx/__init__.py +31 -0
  55. pystylometry/viz/jsx/_base.py +144 -0
  56. pystylometry/viz/jsx/report.py +677 -0
  57. pystylometry/viz/jsx/timeline.py +716 -0
  58. pystylometry/viz/jsx/viewer.py +1032 -0
  59. pystylometry-1.3.0.dist-info/METADATA +136 -0
  60. pystylometry-1.3.0.dist-info/RECORD +76 -0
  61. {pystylometry-1.0.0.dist-info → pystylometry-1.3.0.dist-info}/WHEEL +1 -1
  62. pystylometry-1.3.0.dist-info/entry_points.txt +4 -0
  63. pystylometry-1.0.0.dist-info/METADATA +0 -275
  64. pystylometry-1.0.0.dist-info/RECORD +0 -46
@@ -10,36 +10,692 @@ Related GitHub Issue:
 
  References:
  Halliday, M. A. K., & Hasan, R. (1976). Cohesion in English. Longman.
- Graesser, A. C., McNamara, D. S., & Kulikowich, J. M. (2011). Coh-Metrix.
+ Graesser, A. C., McNamara, D. S., & Kulikowich, J. M. (2011). Coh-Metrix:
+ Providing multilevel analyses of text characteristics. Educational
+ Researcher, 40(5), 223-234.
+ McNamara, D. S., et al. (2010). Automated evaluation of text and discourse
+ with Coh-Metrix. Cambridge University Press.
  """
 
+ from __future__ import annotations
+
+ import re
+ from collections import Counter
+ from typing import Any
+
  from .._types import CohesionCoherenceResult
+ from .._utils import check_optional_dependency
 
+ # ========== Connective Word Lists ==========
+ # Categorized based on Halliday & Hasan (1976) and Coh-Metrix documentation
 
- def compute_cohesion_coherence(text: str, model: str = "en_core_web_sm") -> CohesionCoherenceResult:
+ ADDITIVE_CONNECTIVES: set[str] = {
+ # Addition
+ "and",
+ "also",
+ "furthermore",
+ "moreover",
+ "additionally",
+ "besides",
+ "likewise",
+ "similarly",
+ "equally",
+ "too",
+ "as well",
+ "in addition",
+ "what is more",
+ "not only",
+ "along with",
+ }
+
+ ADVERSATIVE_CONNECTIVES: set[str] = {
+ # Contrast/opposition
+ "but",
+ "however",
+ "nevertheless",
+ "nonetheless",
+ "yet",
+ "although",
+ "though",
+ "whereas",
+ "while",
+ "despite",
+ "in spite of",
+ "on the other hand",
+ "conversely",
+ "instead",
+ "rather",
+ "still",
+ "even so",
+ "on the contrary",
+ "by contrast",
+ "notwithstanding",
+ }
+
+ CAUSAL_CONNECTIVES: set[str] = {
+ # Cause and effect
+ "because",
+ "therefore",
+ "thus",
+ "hence",
+ "consequently",
+ "accordingly",
+ "so",
+ "since",
+ "as a result",
+ "for this reason",
+ "due to",
+ "owing to",
+ "thereby",
+ "wherefore",
+ "for",
+ "as",
+ "given that",
+ "in order to",
+ "so that",
+ }
+
+ TEMPORAL_CONNECTIVES: set[str] = {
+ # Time/sequence
+ "then",
+ "after",
+ "before",
+ "when",
+ "while",
+ "during",
+ "afterwards",
+ "meanwhile",
+ "subsequently",
+ "previously",
+ "first",
+ "second",
+ "third",
+ "finally",
+ "next",
+ "later",
+ "earlier",
+ "soon",
+ "immediately",
+ "eventually",
+ "at last",
+ "in the end",
+ "at first",
+ "at the same time",
+ "once",
+ "until",
+ "since",
+ }
+
+ # All connectives combined for lookup
+ ALL_CONNECTIVES: set[str] = (
+ ADDITIVE_CONNECTIVES | ADVERSATIVE_CONNECTIVES | CAUSAL_CONNECTIVES | TEMPORAL_CONNECTIVES
+ )
+
+ # Demonstrative pronouns/determiners (for referential cohesion)
+ DEMONSTRATIVES: set[str] = {"this", "that", "these", "those"}
+
+ # Content word POS tags (for lexical cohesion)
+ CONTENT_POS_TAGS: set[str] = {"NOUN", "PROPN", "VERB", "ADJ", "ADV"}
+
+ # Pronoun POS tags
+ PRONOUN_POS_TAGS: set[str] = {"PRON"}
+
+
+ def _count_words(text: str) -> int:
+ """Count words in text using simple tokenization."""
+ words = re.findall(r"\b[a-zA-Z]+\b", text.lower())
+ return len(words)
+
+
+ def _tokenize_simple(text: str) -> list[str]:
+ """Simple word tokenization."""
+ return re.findall(r"\b[a-zA-Z]+\b", text.lower())
+
+
+ def _split_into_sentences(text: str) -> list[str]:
+ """Split text into sentences using simple heuristics."""
+ # Split on sentence-ending punctuation followed by space or end of string
+ sentences = re.split(r"(?<=[.!?])\s+", text.strip())
+ # Filter out empty sentences
+ return [s.strip() for s in sentences if s.strip()]
+
+
+ def _split_into_paragraphs(text: str) -> list[str]:
+ """Split text into paragraphs based on blank lines."""
+ # Split on double newlines or multiple newlines
+ paragraphs = re.split(r"\n\s*\n", text.strip())
+ # Filter out empty paragraphs
+ return [p.strip() for p in paragraphs if p.strip()]
+
+
+ def _jaccard_similarity(set1: set[str], set2: set[str]) -> float:
+ """Compute Jaccard similarity between two sets."""
+ if not set1 and not set2:
+ return 1.0  # Both empty sets are identical
+ if not set1 or not set2:
+ return 0.0
+ intersection = len(set1 & set2)
+ union = len(set1 | set2)
+ return intersection / union if union > 0 else 0.0
+
+
+ def _count_connectives(tokens: list[str]) -> dict[str, int]:
+ """Count connectives by category from tokenized text."""
+ text_lower = " ".join(tokens)
+
+ counts = {
+ "additive": 0,
+ "adversative": 0,
+ "causal": 0,
+ "temporal": 0,
+ }
+
+ # Check multi-word connectives first (in the joined text)
+ multi_word_connectives = [c for c in ALL_CONNECTIVES if " " in c]
+ for connective in multi_word_connectives:
+ occurrences = text_lower.count(connective)
+ if occurrences > 0:
+ if connective in ADDITIVE_CONNECTIVES:
+ counts["additive"] += occurrences
+ elif connective in ADVERSATIVE_CONNECTIVES:
+ counts["adversative"] += occurrences
+ elif connective in CAUSAL_CONNECTIVES:
+ counts["causal"] += occurrences
+ elif connective in TEMPORAL_CONNECTIVES:
+ counts["temporal"] += occurrences
+
+ # Check single-word connectives
+ single_word_connectives = [c for c in ALL_CONNECTIVES if " " not in c]
+ for token in tokens:
+ if token in single_word_connectives:
+ if token in ADDITIVE_CONNECTIVES:
+ counts["additive"] += 1
+ elif token in ADVERSATIVE_CONNECTIVES:
+ counts["adversative"] += 1
+ elif token in CAUSAL_CONNECTIVES:
+ counts["causal"] += 1
+ elif token in TEMPORAL_CONNECTIVES:
+ counts["temporal"] += 1
+
+ return counts
+
+
+ def _get_content_words_from_doc(doc: Any) -> list[str]:
+ """Extract lemmatized content words from a spaCy doc."""
+ return [
+ token.lemma_.lower() for token in doc if token.pos_ in CONTENT_POS_TAGS and token.is_alpha
+ ]
+
+
+ def _compute_word_repetition(sentences: list[list[str]]) -> float:
+ """Compute word repetition ratio across sentences.
+
+ Measures how many content words appear in multiple sentences.
+ """
+ if len(sentences) < 2:
+ return 0.0
+
+ # Flatten all words
+ all_words = [w for sent in sentences for w in sent]
+ if not all_words:
+ return 0.0
+
+ # Count words appearing in more than one sentence
+ word_to_sentences: dict[str, set[int]] = {}
+ for i, sent in enumerate(sentences):
+ for word in sent:
+ if word not in word_to_sentences:
+ word_to_sentences[word] = set()
+ word_to_sentences[word].add(i)
+
+ repeated_words = sum(1 for word, sents in word_to_sentences.items() if len(sents) > 1)
+ unique_words = len(word_to_sentences)
+
+ return repeated_words / unique_words if unique_words > 0 else 0.0
+
+
+ def _compute_lexical_chains(
+ sentences: list[list[str]], min_chain_length: int = 2
+ ) -> list[list[str]]:
+ """Compute simplified lexical chains based on word repetition.
+
+ A lexical chain is a sequence of related words spanning multiple sentences.
+ This simplified version uses exact word matches (lemmatized).
+
+ Args:
+ sentences: List of sentences, each as list of content words
+ min_chain_length: Minimum occurrences to form a chain
+
+ Returns:
+ List of lexical chains (each chain is a list of word occurrences)
+ """
+ if len(sentences) < 2:
+ return []
+
+ # Track word appearances across sentences
+ word_positions: dict[str, list[tuple[int, str]]] = {}
+ for sent_idx, sent in enumerate(sentences):
+ for word in sent:
+ if word not in word_positions:
+ word_positions[word] = []
+ word_positions[word].append((sent_idx, word))
+
+ # Words appearing in multiple sentences form chains
+ chains = []
+ for word, positions in word_positions.items():
+ # Get unique sentences this word appears in
+ unique_sentences = set(pos[0] for pos in positions)
+ if len(unique_sentences) >= min_chain_length:
+ chains.append([word] * len(positions))
+
+ return chains
+
+
+ def _compute_anaphora_metrics(doc: Any) -> tuple[int, float]:
+ """Compute anaphora count and resolution ratio.
+
+ Uses heuristics to detect anaphoric references (pronouns with potential antecedents).
+
+ Returns:
+ Tuple of (anaphora_count, resolution_ratio)
+ """
+ pronouns = []
+ nouns = []
+
+ for token in doc:
+ if token.pos_ == "PRON" and token.is_alpha:
+ pronouns.append(token)
+ elif token.pos_ in ("NOUN", "PROPN") and token.is_alpha:
+ nouns.append(token)
+
+ anaphora_count = len(pronouns)
+
+ if anaphora_count == 0:
+ return 0, 1.0  # No pronouns, perfect resolution (vacuously true)
+
+ # Heuristic: pronouns that have a noun before them are "resolvable"
+ # This is a simplification - true anaphora resolution requires coreference
+ resolved = 0
+ for pron in pronouns:
+ # Check if there's a noun before this pronoun in the text
+ if any(noun.i < pron.i for noun in nouns):
+ resolved += 1
+
+ resolution_ratio = resolved / anaphora_count if anaphora_count > 0 else 1.0
+ return anaphora_count, resolution_ratio
+
+
+ def _compute_adjacent_overlap(sentences: list[list[str]]) -> float:
+ """Compute mean content word overlap between adjacent sentences."""
+ if len(sentences) < 2:
+ return 0.0
+
+ overlaps = []
+ for i in range(len(sentences) - 1):
+ set1 = set(sentences[i])
+ set2 = set(sentences[i + 1])
+ overlaps.append(_jaccard_similarity(set1, set2))
+
+ return sum(overlaps) / len(overlaps) if overlaps else 0.0
+
+
+ def _compute_mean_sentence_similarity(sentences: list[list[str]]) -> float:
+ """Compute mean pairwise similarity between all sentences."""
+ if len(sentences) < 2:
+ return 1.0  # Single sentence is perfectly coherent with itself
+
+ similarities = []
+ for i in range(len(sentences)):
+ for j in range(i + 1, len(sentences)):
+ set1 = set(sentences[i])
+ set2 = set(sentences[j])
+ similarities.append(_jaccard_similarity(set1, set2))
+
+ return sum(similarities) / len(similarities) if similarities else 0.0
+
+
+ def _compute_paragraph_topic_consistency(paragraphs: list[list[str]]) -> float:
+ """Compute topic consistency within paragraphs.
+
+ Measures how consistent the vocabulary is within each paragraph.
+ """
+ if not paragraphs:
+ return 0.0
+
+ consistencies = []
+ for para_words in paragraphs:
+ if len(para_words) < 2:
+ continue
+ # Consistency = repetition rate within paragraph
+ word_counts = Counter(para_words)
+ total_words = len(para_words)
+ unique_words = len(word_counts)
+ if unique_words > 0:
+ # Higher repetition = more topical consistency
+ consistency = 1 - (unique_words / total_words)
+ consistencies.append(consistency)
+
+ return sum(consistencies) / len(consistencies) if consistencies else 0.0
+
+
+ def _compute_discourse_structure_score(paragraphs: list[str], sentences: list[str]) -> float:
+ """Compute discourse structure quality score.
+
+ Evaluates whether the text has clear intro/body/conclusion structure.
+ This is a heuristic-based approximation.
  """
- Compute cohesion and coherence metrics.
+ if len(paragraphs) < 2:
+ return 0.5  # Single paragraph - neutral score
+
+ if len(paragraphs) < 3:
+ return 0.6  # Two paragraphs - minimal structure
+
+ # Heuristics for good structure:
+ # 1. Multiple paragraphs (✓ if we get here)
+ # 2. First paragraph is introduction-like (shorter or similar length)
+ # 3. Last paragraph is conclusion-like
+
+ para_lengths = [len(_split_into_sentences(p)) for p in paragraphs]
+ mean_length = sum(para_lengths) / len(para_lengths)
+
+ score = 0.5  # Base score
+
+ # Reward having an intro (first paragraph not too long)
+ if para_lengths[0] <= mean_length * 1.5:
+ score += 0.15
+
+ # Reward having a conclusion (last paragraph exists and is reasonable)
+ if para_lengths[-1] <= mean_length * 1.5:
+ score += 0.15
+
+ # Reward having body paragraphs
+ if len(paragraphs) >= 3:
+ score += 0.1
+
+ # Reward reasonable paragraph count (not too fragmented)
+ if 3 <= len(paragraphs) <= 10:
+ score += 0.1
+
+ return min(score, 1.0)
+
+
+ def compute_cohesion_coherence(text: str, model: str = "en_core_web_sm") -> CohesionCoherenceResult:
+ """Compute cohesion and coherence metrics for text.
+
+ This function analyzes how well a text holds together structurally (cohesion)
+ and semantically (coherence). These metrics are important for analyzing
+ writing quality, readability, and authorial sophistication.
 
  Related GitHub Issue:
  #22 - Cohesion and Coherence Metrics
  https://github.com/craigtrim/pystylometry/issues/22
 
+ Cohesion metrics:
+ - Referential cohesion: pronouns, demonstratives, anaphora
+ - Lexical cohesion: word repetition, content word overlap, lexical chains
+ - Connective density: discourse markers categorized by type
+
+ Coherence metrics:
+ - Adjacent sentence overlap
+ - Paragraph topic consistency
+ - Mean sentence similarity
+ - Discourse structure quality
+
+ References:
+ Halliday, M. A. K., & Hasan, R. (1976). Cohesion in English. Longman.
+ Graesser, A. C., McNamara, D. S., & Kulikowich, J. M. (2011). Coh-Metrix.
+
  Args:
- text: Input text to analyze
- model: spaCy model for linguistic analysis
+ text: Input text to analyze (multi-sentence/paragraph text recommended)
+ model: spaCy model name for linguistic analysis (default: "en_core_web_sm")
 
  Returns:
- CohesionCoherenceResult with referential cohesion, lexical cohesion,
- connective density, and coherence scores.
+ CohesionCoherenceResult with all cohesion and coherence metrics
+
+ Raises:
+ ImportError: If spaCy is not installed
 
  Example:
- >>> result = compute_cohesion_coherence("Multi-paragraph text...")
+ >>> result = compute_cohesion_coherence('''
+ ... The cat sat on the mat. It was comfortable there.
+ ... The mat was soft and warm. The cat purred contentedly.
+ ... ''')
  >>> print(f"Pronoun density: {result.pronoun_density:.2f}")
+ >>> print(f"Adjacent overlap: {result.adjacent_sentence_overlap:.3f}")
  >>> print(f"Connective density: {result.connective_density:.2f}")
  """
- # TODO: Implement cohesion/coherence analysis
- # GitHub Issue #22: https://github.com/craigtrim/pystylometry/issues/22
- raise NotImplementedError(
- "Cohesion/coherence metrics not yet implemented. "
- "See GitHub Issue #22: https://github.com/craigtrim/pystylometry/issues/22"
+ check_optional_dependency("spacy", "stylistic (cohesion)")
+
+ import spacy
+
+ # Handle empty text
+ if not text or not text.strip():
+ return CohesionCoherenceResult(
+ pronoun_density=0.0,
+ demonstrative_density=0.0,
+ anaphora_count=0,
+ anaphora_resolution_ratio=1.0,
+ word_repetition_ratio=0.0,
+ synonym_density=0.0,
+ lexical_chain_count=0,
+ mean_chain_length=0.0,
+ content_word_overlap=0.0,
+ connective_density=0.0,
+ additive_connective_ratio=0.0,
+ adversative_connective_ratio=0.0,
+ causal_connective_ratio=0.0,
+ temporal_connective_ratio=0.0,
+ adjacent_sentence_overlap=0.0,
+ paragraph_topic_consistency=0.0,
+ mean_sentence_similarity=0.0,
+ semantic_coherence_score=0.0,
+ paragraph_count=0,
+ mean_paragraph_length=0.0,
+ discourse_structure_score=0.0,
+ metadata={
+ "model": model,
+ "word_count": 0,
+ "sentence_count": 0,
+ "pronoun_count": 0,
+ "demonstrative_count": 0,
+ "connective_counts": {"additive": 0, "adversative": 0, "causal": 0, "temporal": 0},
+ "lexical_chains": [],
+ },
+ )
+
+ # Load spaCy model
+ try:
+ nlp = spacy.load(model)
+ except OSError:
+ raise OSError(
+ f"spaCy model '{model}' not found. Download it with: python -m spacy download {model}"
+ )
+
+ # Process text with spaCy
+ doc = nlp(text)
+
+ # Basic counts
+ word_count = sum(1 for token in doc if token.is_alpha)
+ if word_count == 0:
+ return CohesionCoherenceResult(
+ pronoun_density=0.0,
+ demonstrative_density=0.0,
+ anaphora_count=0,
+ anaphora_resolution_ratio=1.0,
+ word_repetition_ratio=0.0,
+ synonym_density=0.0,
+ lexical_chain_count=0,
+ mean_chain_length=0.0,
+ content_word_overlap=0.0,
+ connective_density=0.0,
+ additive_connective_ratio=0.0,
+ adversative_connective_ratio=0.0,
+ causal_connective_ratio=0.0,
+ temporal_connective_ratio=0.0,
+ adjacent_sentence_overlap=0.0,
+ paragraph_topic_consistency=0.0,
+ mean_sentence_similarity=0.0,
+ semantic_coherence_score=0.0,
+ paragraph_count=0,
+ mean_paragraph_length=0.0,
+ discourse_structure_score=0.0,
+ metadata={
+ "model": model,
+ "word_count": 0,
+ "sentence_count": 0,
+ "pronoun_count": 0,
+ "demonstrative_count": 0,
+ "connective_counts": {"additive": 0, "adversative": 0, "causal": 0, "temporal": 0},
+ "lexical_chains": [],
+ },
+ )
+
+ # ========== Referential Cohesion ==========
+
+ # Count pronouns
+ pronoun_count = sum(1 for token in doc if token.pos_ == "PRON" and token.is_alpha)
+ pronoun_density = (pronoun_count / word_count) * 100
+
+ # Count demonstratives
+ demonstrative_count = sum(
+ 1 for token in doc if token.text.lower() in DEMONSTRATIVES and token.is_alpha
+ )
+ demonstrative_density = (demonstrative_count / word_count) * 100
+
+ # Anaphora metrics
+ anaphora_count, anaphora_resolution_ratio = _compute_anaphora_metrics(doc)
+
+ # ========== Lexical Cohesion ==========
+
+ # Split into sentences for sentence-level analysis
+ sentences_text = _split_into_sentences(text)
+ sentence_count = len(sentences_text)
+
+ # Get content words per sentence using spaCy
+ sentences_content_words: list[list[str]] = []
+ for sent_text in sentences_text:
+ sent_doc = nlp(sent_text)
+ content_words = _get_content_words_from_doc(sent_doc)
+ sentences_content_words.append(content_words)
+
+ # Word repetition ratio
+ word_repetition_ratio = _compute_word_repetition(sentences_content_words)
+
+ # Lexical chains
+ lexical_chains = _compute_lexical_chains(sentences_content_words)
+ lexical_chain_count = len(lexical_chains)
+ mean_chain_length = (
+ sum(len(chain) for chain in lexical_chains) / lexical_chain_count
+ if lexical_chain_count > 0
+ else 0.0
+ )
+
+ # Content word overlap between adjacent sentences
+ content_word_overlap = _compute_adjacent_overlap(sentences_content_words)
+
+ # Synonym density: simplified as 0 (would require WordNet for true synonyms)
+ # This is a placeholder - full implementation would use NLTK WordNet
+ synonym_density = 0.0
+
+ # ========== Connectives ==========
+
+ tokens = _tokenize_simple(text)
+ connective_counts = _count_connectives(tokens)
+ total_connectives = sum(connective_counts.values())
+ connective_density = (total_connectives / word_count) * 100 if word_count > 0 else 0.0
+
+ # Connective ratios
+ additive_ratio = (
+ connective_counts["additive"] / total_connectives if total_connectives > 0 else 0.0
+ )
+ adversative_ratio = (
+ connective_counts["adversative"] / total_connectives if total_connectives > 0 else 0.0
+ )
+ causal_ratio = connective_counts["causal"] / total_connectives if total_connectives > 0 else 0.0
+ temporal_ratio = (
+ connective_counts["temporal"] / total_connectives if total_connectives > 0 else 0.0
+ )
+
+ # ========== Coherence Measures ==========
+
+ # Adjacent sentence overlap
+ adjacent_sentence_overlap = _compute_adjacent_overlap(sentences_content_words)
+
+ # Mean pairwise sentence similarity
+ mean_sentence_similarity = _compute_mean_sentence_similarity(sentences_content_words)
+
+ # Paragraphs
+ paragraphs = _split_into_paragraphs(text)
+ paragraph_count = len(paragraphs)
+
+ # Mean paragraph length (in sentences)
+ if paragraph_count > 0:
+ para_sentence_counts = [len(_split_into_sentences(p)) for p in paragraphs]
+ mean_paragraph_length = sum(para_sentence_counts) / paragraph_count
+ else:
+ mean_paragraph_length = 0.0
+
+ # Paragraph topic consistency
+ paragraphs_content_words = []
+ for para in paragraphs:
+ para_doc = nlp(para)
+ paragraphs_content_words.append(_get_content_words_from_doc(para_doc))
+ paragraph_topic_consistency = _compute_paragraph_topic_consistency(paragraphs_content_words)
+
+ # Discourse structure score
+ discourse_structure_score = _compute_discourse_structure_score(paragraphs, sentences_text)
+
+ # Composite semantic coherence score (0-1)
+ # Weighted combination of coherence metrics
+ semantic_coherence_score = (
+ 0.3 * adjacent_sentence_overlap
+ + 0.2 * mean_sentence_similarity
+ + 0.2 * paragraph_topic_consistency
+ + 0.15 * min(connective_density / 5.0, 1.0)  # Normalize connective density
+ + 0.15 * discourse_structure_score
+ )
+ semantic_coherence_score = min(max(semantic_coherence_score, 0.0), 1.0)
+
+ return CohesionCoherenceResult(
+ # Referential cohesion
+ pronoun_density=pronoun_density,
+ demonstrative_density=demonstrative_density,
+ anaphora_count=anaphora_count,
+ anaphora_resolution_ratio=anaphora_resolution_ratio,
+ # Lexical cohesion
+ word_repetition_ratio=word_repetition_ratio,
+ synonym_density=synonym_density,
+ lexical_chain_count=lexical_chain_count,
+ mean_chain_length=mean_chain_length,
+ content_word_overlap=content_word_overlap,
+ # Connectives
+ connective_density=connective_density,
+ additive_connective_ratio=additive_ratio,
+ adversative_connective_ratio=adversative_ratio,
+ causal_connective_ratio=causal_ratio,
+ temporal_connective_ratio=temporal_ratio,
+ # Coherence
+ adjacent_sentence_overlap=adjacent_sentence_overlap,
+ paragraph_topic_consistency=paragraph_topic_consistency,
+ mean_sentence_similarity=mean_sentence_similarity,
+ semantic_coherence_score=semantic_coherence_score,
+ # Structural
+ paragraph_count=paragraph_count,
+ mean_paragraph_length=mean_paragraph_length,
+ discourse_structure_score=discourse_structure_score,
+ # Metadata
+ metadata={
+ "model": model,
+ "word_count": word_count,
+ "sentence_count": sentence_count,
+ "pronoun_count": pronoun_count,
+ "demonstrative_count": demonstrative_count,
+ "connective_counts": connective_counts,
+ "total_connectives": total_connectives,
+ "lexical_chains": [
+ {"word": chain[0] if chain else "", "length": len(chain)}
+ for chain in lexical_chains
+ ],
+ "content_words_per_sentence": [len(s) for s in sentences_content_words],
+ },
  )
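
For orientation, the snippet below sketches how the function added in this file could be exercised once 1.3.0 is installed. It is a usage sketch, not official documentation: the import path is inferred from the wheel's file layout (pystylometry/stylistic/cohesion_coherence.py) and may differ from the package's public API, and the printed fields are simply those constructed in the CohesionCoherenceResult above. As the diff's error handling indicates, spaCy and the model named in the `model` argument must be installed.

# Usage sketch (assumed import path, inferred from the file layout above)
from pystylometry.stylistic.cohesion_coherence import compute_cohesion_coherence

sample = (
    "The cat sat on the mat. It was comfortable there.\n\n"
    "The mat was soft and warm. The cat purred contentedly."
)

# Requires spaCy plus the named model, e.g.: python -m spacy download en_core_web_sm
result = compute_cohesion_coherence(sample, model="en_core_web_sm")

# Fields taken from the CohesionCoherenceResult construction shown in this diff
print(f"Pronoun density:    {result.pronoun_density:.2f}")
print(f"Connective density: {result.connective_density:.2f}")
print(f"Adjacent overlap:   {result.adjacent_sentence_overlap:.3f}")
print(f"Semantic coherence: {result.semantic_coherence_score:.3f}")
print("Connective counts:", result.metadata["connective_counts"])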