pystylometry 0.1.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. pystylometry/__init__.py +1 -2
  2. pystylometry/_normalize.py +277 -0
  3. pystylometry/_types.py +1224 -2
  4. pystylometry/_utils.py +4 -0
  5. pystylometry/authorship/__init__.py +4 -0
  6. pystylometry/authorship/additional_methods.py +100 -0
  7. pystylometry/character/__init__.py +15 -0
  8. pystylometry/character/character_metrics.py +301 -0
  9. pystylometry/lexical/__init__.py +13 -6
  10. pystylometry/lexical/advanced_diversity.py +641 -0
  11. pystylometry/lexical/function_words.py +391 -0
  12. pystylometry/lexical/hapax.py +154 -7
  13. pystylometry/lexical/mtld.py +83 -7
  14. pystylometry/lexical/ttr.py +83 -0
  15. pystylometry/lexical/word_frequency_sophistication.py +581 -0
  16. pystylometry/lexical/yule.py +34 -7
  17. pystylometry/ngrams/__init__.py +2 -0
  18. pystylometry/ngrams/extended_ngrams.py +235 -0
  19. pystylometry/prosody/__init__.py +12 -0
  20. pystylometry/prosody/rhythm_prosody.py +53 -0
  21. pystylometry/readability/__init__.py +12 -0
  22. pystylometry/readability/additional_formulas.py +985 -0
  23. pystylometry/readability/ari.py +93 -17
  24. pystylometry/readability/coleman_liau.py +102 -9
  25. pystylometry/readability/complex_words.py +531 -0
  26. pystylometry/readability/flesch.py +59 -14
  27. pystylometry/readability/gunning_fog.py +194 -25
  28. pystylometry/readability/smog.py +31 -14
  29. pystylometry/readability/syllables.py +137 -30
  30. pystylometry/stylistic/__init__.py +20 -0
  31. pystylometry/stylistic/cohesion_coherence.py +45 -0
  32. pystylometry/stylistic/genre_register.py +45 -0
  33. pystylometry/stylistic/markers.py +131 -0
  34. pystylometry/stylistic/vocabulary_overlap.py +47 -0
  35. pystylometry/syntactic/__init__.py +4 -0
  36. pystylometry/syntactic/advanced_syntactic.py +432 -0
  37. pystylometry/syntactic/pos_ratios.py +104 -13
  38. pystylometry/syntactic/sentence_stats.py +57 -13
  39. pystylometry/syntactic/sentence_types.py +470 -0
  40. {pystylometry-0.1.0.dist-info → pystylometry-1.0.0.dist-info}/METADATA +49 -12
  41. pystylometry-1.0.0.dist-info/RECORD +46 -0
  42. {pystylometry-0.1.0.dist-info → pystylometry-1.0.0.dist-info}/WHEEL +1 -1
  43. pystylometry-0.1.0.dist-info/RECORD +0 -26
pystylometry/_types.py CHANGED
@@ -40,6 +40,68 @@ class HapaxResult:
40
40
  metadata: dict[str, Any]
41
41
 
42
42
 
43
+ @dataclass
44
+ class LexiconCategories:
45
+ """Categorization of words by lexicon presence."""
46
+
47
+ neologisms: list[str] # Not in WordNet AND not in BNC
48
+ rare_words: list[str] # In one lexicon but not both
49
+ common_words: list[str] # In both WordNet AND BNC
50
+ neologism_ratio: float # Ratio of neologisms to total hapax
51
+ rare_word_ratio: float # Ratio of rare words to total hapax
52
+ metadata: dict[str, Any]
53
+
54
+
55
+ @dataclass
56
+ class HapaxLexiconResult:
57
+ """Result from Hapax Legomena analysis with lexicon categorization.
58
+
59
+ Extends basic hapax analysis by categorizing hapax legomena based on
60
+ presence in WordNet and British National Corpus (BNC):
61
+
62
+ - Neologisms: Words not in WordNet AND not in BNC (true novel words)
63
+ - Rare words: Words in BNC but not WordNet, or vice versa
64
+ - Common words: Words in both lexicons (just happen to appear once in text)
65
+
66
+ This categorization is valuable for stylometric analysis as it distinguishes
67
+ between vocabulary innovation (neologisms) and incidental hapax occurrence
68
+ (common words that appear once).
69
+ """
70
+
71
+ hapax_result: HapaxResult # Standard hapax metrics
72
+ lexicon_analysis: LexiconCategories # Lexicon-based categorization
73
+ metadata: dict[str, Any]
74
+
75
+
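A minimal sketch of the categorization rule described in HapaxLexiconResult, assuming the caller supplies the hapax list and the two lexicons as plain Python sets (how the package actually loads WordNet and the BNC is not shown in this diff):

def categorize_hapax(hapax_words, wordnet_words, bnc_words):
    """Split hapax legomena by lexicon presence (see LexiconCategories)."""
    neologisms, rare_words, common_words = [], [], []
    for word in hapax_words:
        in_wn, in_bnc = word in wordnet_words, word in bnc_words
        if in_wn and in_bnc:
            common_words.append(word)      # in both lexicons
        elif in_wn or in_bnc:
            rare_words.append(word)        # in exactly one lexicon
        else:
            neologisms.append(word)        # in neither: candidate neologism
    total = len(hapax_words) or 1
    return (neologisms, rare_words, common_words,
            len(neologisms) / total, len(rare_words) / total)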
76
+ @dataclass
77
+ class TTRResult:
78
+ """Result from Type-Token Ratio (TTR) analysis.
79
+
80
+ Wraps stylometry-ttr package functionality to measure vocabulary richness
81
+ through the ratio of unique words (types) to total words (tokens).
82
+
83
+ Includes multiple TTR variants for length normalization:
84
+ - Raw TTR: Direct ratio of unique to total words
85
+ - Root TTR (Guiraud's index): types / sqrt(tokens)
86
+ - Log TTR (Herdan's C): log(types) / log(tokens)
87
+ - STTR: Standardized TTR across fixed-size chunks
88
+ - Delta Std: Measures vocabulary consistency across chunks
89
+
90
+ References:
91
+ Guiraud, P. (1960). Problèmes et méthodes de la statistique linguistique.
92
+ Herdan, G. (1960). Type-token Mathematics.
93
+ """
94
+
95
+ total_words: int
96
+ unique_words: int
97
+ ttr: float # Raw TTR
98
+ root_ttr: float # Guiraud's index
99
+ log_ttr: float # Herdan's C
100
+ sttr: float # Standardized TTR
101
+ delta_std: float # Vocabulary consistency
102
+ metadata: dict[str, Any]
103
+
104
+
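A minimal sketch of the TTR variants listed in TTRResult, assuming a pre-tokenized word list of reasonable length; the 1000-token chunk size for STTR is an assumption, not something this diff specifies:

import math

def ttr_variants(tokens, chunk_size=1000):
    """Raw TTR, Root TTR (Guiraud), Log TTR (Herdan's C), and STTR."""
    n = len(tokens)
    types = len(set(tokens))
    ttr = types / n
    root_ttr = types / math.sqrt(n)            # Guiraud's index
    log_ttr = math.log(types) / math.log(n)    # Herdan's C
    chunk_ttrs = [
        len(set(tokens[i:i + chunk_size])) / chunk_size
        for i in range(0, n - chunk_size + 1, chunk_size)
    ]
    sttr = sum(chunk_ttrs) / len(chunk_ttrs) if chunk_ttrs else ttr
    return ttr, root_ttr, log_ttr, sttr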
43
105
  # ===== Readability Results =====
44
106
 
45
107
 
@@ -58,7 +120,7 @@ class SMOGResult:
58
120
  """Result from SMOG Index computation."""
59
121
 
60
122
  smog_index: float
61
- grade_level: int
123
+ grade_level: float
62
124
  metadata: dict[str, Any]
63
125
 
64
126
 
@@ -67,7 +129,7 @@ class GunningFogResult:
67
129
  """Result from Gunning Fog Index computation."""
68
130
 
69
131
  fog_index: float
70
- grade_level: int
132
+ grade_level: float
71
133
  metadata: dict[str, Any]
72
134
 
73
135
 
@@ -157,6 +219,1166 @@ class EntropyResult:
157
219
  metadata: dict[str, Any]
158
220
 
159
221
 
222
+ # ===== Character-Level Results =====
223
+ # Related to GitHub Issue #12: Character-Level Metrics
224
+ # https://github.com/craigtrim/pystylometry/issues/12
225
+
226
+
227
+ @dataclass
228
+ class CharacterMetricsResult:
229
+ """Result from character-level metrics analysis.
230
+
231
+ This dataclass holds character-level stylometric features that provide
232
+ low-level insights into writing style. Character-level metrics are
233
+ fundamental for authorship attribution and can capture distinctive
234
+ patterns in punctuation, formatting, and word construction.
235
+
236
+ Related GitHub Issue:
237
+ #12 - Character-Level Metrics
238
+ https://github.com/craigtrim/pystylometry/issues/12
239
+
240
+ Metrics included:
241
+ - Average word length (characters per word)
242
+ - Average sentence length (characters per sentence)
243
+ - Punctuation density (punctuation marks per 100 words)
244
+ - Punctuation variety (count of unique punctuation types)
245
+ - Letter frequency distribution (26-element vector for a-z)
246
+ - Vowel-to-consonant ratio
247
+ - Digit frequency (count/ratio of numeric characters)
248
+ - Uppercase ratio (uppercase letters / total letters)
249
+ - Whitespace ratio (whitespace characters / total characters)
250
+
251
+ References:
252
+ Grieve, J. (2007). Quantitative authorship attribution: An evaluation
253
+ of techniques. Literary and Linguistic Computing, 22(3), 251-270.
254
+ Stamatatos, E. (2009). A survey of modern authorship attribution methods.
255
+ JASIST, 60(3), 538-556.
256
+
257
+ Example:
258
+ >>> result = compute_character_metrics("Sample text here.")
259
+ >>> print(f"Avg word length: {result.avg_word_length:.2f} chars")
260
+ >>> print(f"Punctuation density: {result.punctuation_density:.2f}")
261
+ >>> print(f"Vowel/consonant ratio: {result.vowel_consonant_ratio:.2f}")
262
+ """
263
+
264
+ avg_word_length: float # Mean characters per word
265
+ avg_sentence_length_chars: float # Mean characters per sentence
266
+ punctuation_density: float # Punctuation marks per 100 words
267
+ punctuation_variety: int # Count of unique punctuation types used
268
+ letter_frequency: dict[str, float] # Frequency distribution for a-z
269
+ vowel_consonant_ratio: float # Ratio of vowels to consonants
270
+ digit_count: int # Total count of digit characters (0-9)
271
+ digit_ratio: float # Digits / total characters
272
+ uppercase_ratio: float # Uppercase letters / total letters
273
+ whitespace_ratio: float # Whitespace characters / total characters
274
+ metadata: dict[str, Any] # Additional info (character counts, etc.)
275
+
276
+
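A few of the CharacterMetricsResult fields computed with a naive whitespace tokenizer, as a sketch only; the package's own tokenization and edge-case handling may differ:

import string

def character_metrics_sketch(text):
    words = text.split()
    letters = [c for c in text if c.isalpha()]
    vowels = sum(1 for c in letters if c.lower() in "aeiou")
    consonants = len(letters) - vowels
    punctuation = sum(1 for c in text if c in string.punctuation)
    return {
        "avg_word_length": sum(len(w.strip(string.punctuation)) for w in words) / len(words),
        "punctuation_density": 100.0 * punctuation / len(words),   # per 100 words
        "vowel_consonant_ratio": vowels / consonants if consonants else 0.0,
        "uppercase_ratio": sum(1 for c in letters if c.isupper()) / len(letters),
        "whitespace_ratio": sum(1 for c in text if c.isspace()) / len(text),
        "digit_count": sum(1 for c in text if c.isdigit()),
    }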
277
+ # ===== Function Word Results =====
278
+ # Related to GitHub Issue #13: Function Word Analysis
279
+ # https://github.com/craigtrim/pystylometry/issues/13
280
+
281
+
282
+ @dataclass
283
+ class FunctionWordResult:
284
+ """Result from function word analysis.
285
+
286
+ Function words (determiners, prepositions, conjunctions, pronouns, auxiliary
287
+ verbs) are highly frequent, content-independent words that are often used
288
+ subconsciously. They are considered strong authorship markers because authors
289
+ use them consistently across different topics and genres.
290
+
291
+ Related GitHub Issue:
292
+ #13 - Function Word Analysis
293
+ https://github.com/craigtrim/pystylometry/issues/13
294
+
295
+ This analysis computes:
296
+ - Frequency profiles for all function word categories
297
+ - Ratios for specific grammatical categories
298
+ - Most/least frequently used function words
299
+ - Function word diversity metrics
300
+
301
+ Function word categories analyzed:
302
+ - Determiners: the, a, an, this, that, these, those, etc.
303
+ - Prepositions: in, on, at, by, for, with, from, to, etc.
304
+ - Conjunctions: and, but, or, nor, for, yet, so, etc.
305
+ - Pronouns: I, you, he, she, it, we, they, etc.
306
+ - Auxiliary verbs: be, have, do, can, will, shall, may, etc.
307
+ - Particles: up, down, out, off, over, etc.
308
+
309
+ References:
310
+ Mosteller, F., & Wallace, D. L. (1964). Inference and disputed authorship:
311
+ The Federalist. Addison-Wesley.
312
+ Burrows, J. (2002). 'Delta': A measure of stylistic difference and a guide
313
+ to likely authorship. Literary and Linguistic Computing, 17(3), 267-287.
314
+
315
+ Example:
316
+ >>> result = compute_function_words("Sample text for analysis.")
317
+ >>> print(f"Determiner ratio: {result.determiner_ratio:.3f}")
318
+ >>> print(f"Preposition ratio: {result.preposition_ratio:.3f}")
319
+ >>> print(f"Most frequent: {result.most_frequent_function_words[:5]}")
320
+ """
321
+
322
+ determiner_ratio: float # Determiners / total words
323
+ preposition_ratio: float # Prepositions / total words
324
+ conjunction_ratio: float # Conjunctions / total words
325
+ pronoun_ratio: float # Pronouns / total words
326
+ auxiliary_ratio: float # Auxiliary verbs / total words
327
+ particle_ratio: float # Particles / total words
328
+ total_function_word_ratio: float # All function words / total words
329
+ function_word_diversity: float # Unique function words / total function words
330
+ most_frequent_function_words: list[tuple[str, int]] # Top N with counts
331
+ least_frequent_function_words: list[tuple[str, int]] # Bottom N with counts
332
+ function_word_distribution: dict[str, int] # All function words with counts
333
+ metadata: dict[str, Any] # Category-specific counts, total counts, etc.
334
+
335
+
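A sketch of the per-category ratios in FunctionWordResult, using two deliberately tiny word lists for illustration; the package's lists cover all six categories and are much larger:

DETERMINERS = {"the", "a", "an", "this", "that", "these", "those"}
PREPOSITIONS = {"in", "on", "at", "by", "for", "with", "from", "to", "of"}

def function_word_ratios(tokens):
    tokens = [t.lower() for t in tokens]
    total = len(tokens) or 1
    determiners = [t for t in tokens if t in DETERMINERS]
    prepositions = [t for t in tokens if t in PREPOSITIONS]
    function_words = determiners + prepositions   # real analysis adds the other categories
    return {
        "determiner_ratio": len(determiners) / total,
        "preposition_ratio": len(prepositions) / total,
        "total_function_word_ratio": len(function_words) / total,
        "function_word_diversity": (len(set(function_words)) / len(function_words)
                                    if function_words else 0.0),
    }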
336
+ # ===== Advanced Lexical Diversity Results =====
337
+ # Related to GitHub Issue #14: Advanced Lexical Diversity Metrics
338
+ # https://github.com/craigtrim/pystylometry/issues/14
339
+
340
+
341
+ @dataclass
342
+ class VocdDResult:
343
+ """Result from voc-D computation.
344
+
345
+ voc-D is a sophisticated measure of lexical diversity that uses a mathematical
346
+ model to estimate vocabulary richness while controlling for text length.
347
+ It fits a curve to the relationship between tokens and types across multiple
348
+ random samples of the text.
349
+
350
+ Related GitHub Issue:
351
+ #14 - Advanced Lexical Diversity Metrics
352
+ https://github.com/craigtrim/pystylometry/issues/14
353
+
354
+ The D parameter represents the theoretical vocabulary size and is more
355
+ stable across different text lengths than simple TTR measures.
356
+
357
+ References:
358
+ Malvern, D., Richards, B., Chipere, N., & Durán, P. (2004).
359
+ Lexical Diversity and Language Development. Palgrave Macmillan.
360
+ McKee, G., Malvern, D., & Richards, B. (2000). Measuring vocabulary
361
+ diversity using dedicated software. Literary and Linguistic Computing,
362
+ 15(3), 323-337.
363
+
364
+ Example:
365
+ >>> result = compute_vocd_d("Long sample text for voc-D analysis...")
366
+ >>> print(f"D parameter: {result.d_parameter:.2f}")
367
+ >>> print(f"Curve fit R²: {result.curve_fit_r_squared:.3f}")
368
+ """
369
+
370
+ d_parameter: float # The D value (theoretical vocabulary size)
371
+ curve_fit_r_squared: float # Quality of curve fit (0-1)
372
+ sample_count: int # Number of random samples used
373
+ optimal_sample_size: int # Optimal token sample size used
374
+ metadata: dict[str, Any] # Sampling parameters, convergence info, etc.
375
+
376
+
377
+ @dataclass
378
+ class MATTRResult:
379
+ """Result from MATTR (Moving-Average Type-Token Ratio) computation.
380
+
381
+ MATTR computes TTR using a moving window of fixed size, which provides
382
+ a more stable measure of lexical diversity than simple TTR, especially
383
+ for longer texts. The moving window approach reduces the impact of text
384
+ length on the TTR calculation.
385
+
386
+ Related GitHub Issue:
387
+ #14 - Advanced Lexical Diversity Metrics
388
+ https://github.com/craigtrim/pystylometry/issues/14
389
+
390
+ References:
391
+ Covington, M. A., & McFall, J. D. (2010). Cutting the Gordian knot:
392
+ The moving-average type-token ratio (MATTR). Journal of Quantitative
393
+ Linguistics, 17(2), 94-100.
394
+
395
+ Example:
396
+ >>> result = compute_mattr("Sample text here...", window_size=50)
397
+ >>> print(f"MATTR score: {result.mattr_score:.3f}")
398
+ >>> print(f"Window size: {result.window_size}")
399
+ """
400
+
401
+ mattr_score: float # Average TTR across all windows
402
+ window_size: int # Size of moving window used
403
+ window_count: int # Number of windows analyzed
404
+ ttr_std_dev: float # Standard deviation of TTR across windows
405
+ min_ttr: float # Minimum TTR in any window
406
+ max_ttr: float # Maximum TTR in any window
407
+ metadata: dict[str, Any] # Window-by-window TTR values, etc.
408
+
409
+
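A sketch of the moving-window calculation behind MATTRResult, assuming pre-tokenized input; window_size=50 mirrors the docstring example above:

def mattr(tokens, window_size=50):
    """Mean TTR over every window of `window_size` consecutive tokens."""
    if len(tokens) < window_size:
        return len(set(tokens)) / len(tokens)     # fall back to raw TTR
    window_ttrs = [
        len(set(tokens[i:i + window_size])) / window_size
        for i in range(len(tokens) - window_size + 1)
    ]
    return sum(window_ttrs) / len(window_ttrs)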
410
+ @dataclass
411
+ class HDDResult:
412
+ """Result from HD-D (Hypergeometric Distribution D) computation.
413
+
414
+ HD-D is a probabilistic measure of lexical diversity based on the
415
+ hypergeometric distribution. It estimates the probability of encountering
416
+ new word types as text length increases, providing a mathematically
417
+ rigorous measure that is less sensitive to text length than TTR.
418
+
419
+ Related GitHub Issue:
420
+ #14 - Advanced Lexical Diversity Metrics
421
+ https://github.com/craigtrim/pystylometry/issues/14
422
+
423
+ References:
424
+ McCarthy, P. M., & Jarvis, S. (2010). MTLD, vocd-D, and HD-D: A validation
425
+ study of sophisticated approaches to lexical diversity assessment.
426
+ Behavior Research Methods, 42(2), 381-392.
427
+
428
+ Example:
429
+ >>> result = compute_hdd("Sample text for HD-D analysis...")
430
+ >>> print(f"HD-D score: {result.hdd_score:.3f}")
431
+ >>> print(f"Sample size: {result.sample_size}")
432
+ """
433
+
434
+ hdd_score: float # The HD-D value
435
+ sample_size: int # Sample size used for calculation
436
+ type_count: int # Number of unique types in sample
437
+ token_count: int # Number of tokens in sample
438
+ metadata: dict[str, Any] # Probability distribution info, etc.
439
+
440
+
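A sketch of the HD-D calculation described above, using scipy.stats.hypergeom and the conventional 42-token sample; both the scipy dependency and the sample size are assumptions, not something this diff confirms:

from collections import Counter
from scipy.stats import hypergeom

def hdd(tokens, sample_size=42):
    """Sum each type's probability of appearing at least once in a random
    draw of `sample_size` tokens, weighted by its contribution to that TTR."""
    n = len(tokens)
    score = 0.0
    for count in Counter(tokens).values():
        p_absent = hypergeom.pmf(0, n, count, sample_size)
        score += (1.0 - p_absent) * (1.0 / sample_size)
    return score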
441
+ @dataclass
442
+ class MSTTRResult:
443
+ """Result from MSTTR (Mean Segmental Type-Token Ratio) computation.
444
+
445
+ MSTTR divides the text into sequential segments of equal length and
446
+ computes the average TTR across all segments. This provides a length-
447
+ normalized measure of lexical diversity that is more comparable across
448
+ texts of different lengths.
449
+
450
+ Related GitHub Issue:
451
+ #14 - Advanced Lexical Diversity Metrics
452
+ https://github.com/craigtrim/pystylometry/issues/14
453
+
454
+ References:
455
+ Johnson, W. (1944). Studies in language behavior: I. A program of research.
456
+ Psychological Monographs, 56(2), 1-15.
457
+
458
+ Example:
459
+ >>> result = compute_msttr("Sample text...", segment_size=100)
460
+ >>> print(f"MSTTR score: {result.msttr_score:.3f}")
461
+ >>> print(f"Segments analyzed: {result.segment_count}")
462
+ """
463
+
464
+ msttr_score: float # Mean TTR across all segments
465
+ segment_size: int # Size of each segment
466
+ segment_count: int # Number of segments analyzed
467
+ ttr_std_dev: float # Standard deviation of TTR across segments
468
+ min_ttr: float # Minimum TTR in any segment
469
+ max_ttr: float # Maximum TTR in any segment
470
+ segment_ttrs: list[float] # TTR for each individual segment
471
+ metadata: dict[str, Any] # Segment details, remaining tokens, etc.
472
+
473
+
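A sketch of the segmental averaging behind MSTTRResult; segment_size=100 mirrors the docstring example, and the trailing partial segment is discarded (a common, but not the only, convention):

def msttr(tokens, segment_size=100):
    segments = [
        tokens[i:i + segment_size]
        for i in range(0, len(tokens) - segment_size + 1, segment_size)
    ]
    if not segments:                          # text shorter than one segment
        return len(set(tokens)) / len(tokens), []
    segment_ttrs = [len(set(seg)) / segment_size for seg in segments]
    return sum(segment_ttrs) / len(segment_ttrs), segment_ttrs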
474
+ # ===== Word Frequency Sophistication Results =====
475
+ # Related to GitHub Issue #15: Word Frequency Sophistication Metrics
476
+ # https://github.com/craigtrim/pystylometry/issues/15
477
+
478
+
479
+ @dataclass
480
+ class WordFrequencySophisticationResult:
481
+ """Result from word frequency sophistication analysis.
482
+
483
+ Word frequency sophistication metrics measure how common or rare the
484
+ vocabulary used in a text is, based on reference frequency lists from
485
+ large corpora. Authors who use less frequent (more sophisticated) words
486
+ score higher on these metrics.
487
+
488
+ Related GitHub Issue:
489
+ #15 - Word Frequency Sophistication Metrics
490
+ https://github.com/craigtrim/pystylometry/issues/15
491
+
492
+ This analysis uses reference frequency data from:
493
+ - COCA (Corpus of Contemporary American English)
494
+ - BNC (British National Corpus)
495
+ - Google N-grams
496
+ - SUBTLEXus (subtitle frequencies)
497
+
498
+ Metrics computed:
499
+ - Mean word frequency (average frequency rank)
500
+ - Median word frequency
501
+ - Rare word ratio (words beyond frequency threshold)
502
+ - Academic word ratio (from Academic Word List)
503
+ - Advanced word ratio (sophisticated vocabulary)
504
+
505
+ References:
506
+ Brysbaert, M., & New, B. (2009). Moving beyond Kučera and Francis:
507
+ A critical evaluation of current word frequency norms. Behavior
508
+ Research Methods, 41(4), 977-990.
509
+ Coxhead, A. (2000). A new academic word list. TESOL Quarterly, 34(2), 213-238.
510
+
511
+ Example:
512
+ >>> result = compute_word_frequency_sophistication("Sample text...")
513
+ >>> print(f"Mean frequency rank: {result.mean_frequency_rank:.1f}")
514
+ >>> print(f"Rare word ratio: {result.rare_word_ratio:.3f}")
515
+ >>> print(f"Academic word ratio: {result.academic_word_ratio:.3f}")
516
+ """
517
+
518
+ mean_frequency_rank: float # Average frequency rank of words
519
+ median_frequency_rank: float # Median frequency rank
520
+ rare_word_ratio: float # Words beyond frequency threshold / total
521
+ common_word_ratio: float # High-frequency words / total
522
+ academic_word_ratio: float # Academic Word List words / total
523
+ advanced_word_ratio: float # Sophisticated vocabulary / total
524
+ frequency_band_distribution: dict[str, float] # Distribution across frequency bands
525
+ rarest_words: list[tuple[str, float]] # Least frequent words with ranks
526
+ most_common_words: list[tuple[str, float]] # Most frequent words with ranks
527
+ metadata: dict[str, Any] # Corpus source, band thresholds, total words, etc.
528
+
529
+

530
+ # ===== Additional Readability Results =====
531
+ # Related to GitHub Issue #16: Additional Readability Formulas
532
+ # https://github.com/craigtrim/pystylometry/issues/16
533
+
534
+
535
+ @dataclass
536
+ class DaleChallResult:
537
+ """Result from Dale-Chall Readability Formula.
538
+
539
+ The Dale-Chall formula uses a list of 3000 familiar words that 80% of
540
+ fourth-graders understand. Words not on this list are considered "difficult."
541
+ The formula provides a grade level estimate based on sentence length and
542
+ the percentage of difficult words.
543
+
544
+ Related GitHub Issue:
545
+ #16 - Additional Readability Formulas
546
+ https://github.com/craigtrim/pystylometry/issues/16
547
+
548
+ Formula: 0.1579 * (difficult_words / total_words * 100) + 0.0496 * avg_sentence_length
549
+
550
+ If % difficult words > 5%, add 3.6365 to the raw score.
551
+
552
+ References:
553
+ Dale, E., & Chall, J. S. (1948). A formula for predicting readability.
554
+ Educational Research Bulletin, 27(1), 11-28.
555
+ Chall, J. S., & Dale, E. (1995). Readability revisited: The new Dale-Chall
556
+ readability formula. Brookline Books.
557
+
558
+ Example:
559
+ >>> result = compute_dale_chall("Sample text to analyze...")
560
+ >>> print(f"Dale-Chall score: {result.dale_chall_score:.2f}")
561
+ >>> print(f"Grade level: {result.grade_level}")
562
+ >>> print(f"Difficult word %: {result.difficult_word_ratio * 100:.1f}%")
563
+ """
564
+
565
+ dale_chall_score: float # The Dale-Chall readability score
566
+ grade_level: str # Corresponding grade level (e.g., "7-8", "College")
567
+ difficult_word_count: int # Words not on Dale-Chall list
568
+ difficult_word_ratio: float # Difficult words / total words
569
+ avg_sentence_length: float # Average words per sentence
570
+ total_words: int # Total word count
571
+ metadata: dict[str, Any] # List of difficult words, adjusted score, etc.
572
+
573
+
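The Dale-Chall arithmetic from the docstring, as a worked sketch; deciding which words count as "difficult" (the 3000-word list lookup) is left to the caller:

def dale_chall_score(total_words, total_sentences, difficult_words):
    pct_difficult = 100.0 * difficult_words / total_words
    avg_sentence_length = total_words / total_sentences
    score = 0.1579 * pct_difficult + 0.0496 * avg_sentence_length
    if pct_difficult > 5.0:
        score += 3.6365        # adjustment from the formula above
    return score

For example, 100 words in 5 sentences with 8 difficult words gives 0.1579 * 8 + 0.0496 * 20 + 3.6365, roughly 5.89.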
574
+ @dataclass
575
+ class LinsearWriteResult:
576
+ """Result from Linsear Write Formula.
577
+
578
+ The Linsear Write Formula was developed for the U.S. Air Force to calculate
579
+ the readability of technical manuals. It categorizes words as "easy" (1-2
580
+ syllables) or "hard" (3+ syllables) and uses sentence length to estimate
581
+ grade level. It's particularly effective for technical writing.
582
+
583
+ Related GitHub Issue:
584
+ #16 - Additional Readability Formulas
585
+ https://github.com/craigtrim/pystylometry/issues/16
586
+
587
+ References:
588
+ Klare, G. R. (1974-1975). Assessing readability. Reading Research Quarterly,
589
+ 10(1), 62-102.
590
+
591
+ Example:
592
+ >>> result = compute_linsear_write("Technical manual text...")
593
+ >>> print(f"Linsear Write score: {result.linsear_score:.2f}")
594
+ >>> print(f"Grade level: {result.grade_level}")
595
+ """
596
+
597
+ linsear_score: float # The Linsear Write score
598
+ grade_level: int # Corresponding U.S. grade level
599
+ easy_word_count: int # Words with 1-2 syllables
600
+ hard_word_count: int # Words with 3+ syllables
601
+ avg_sentence_length: float # Average words per sentence
602
+ metadata: dict[str, Any] # Calculation details, sentence count, etc.
603
+
604
+
605
+ @dataclass
606
+ class FryResult:
607
+ """Result from Fry Readability Graph.
608
+
609
+ The Fry Readability Graph uses average sentence length and average syllables
610
+ per word to determine reading difficulty. It plots these values on a graph
611
+ to determine the grade level. This implementation provides the numerical
612
+ coordinates and estimated grade level.
613
+
614
+ Related GitHub Issue:
615
+ #16 - Additional Readability Formulas
616
+ https://github.com/craigtrim/pystylometry/issues/16
617
+
618
+ References:
619
+ Fry, E. (1968). A readability formula that saves time. Journal of Reading,
620
+ 11(7), 513-516, 575-578.
621
+ Fry, E. (1977). Fry's readability graph: Clarifications, validity, and
622
+ extension to level 17. Journal of Reading, 21(3), 242-252.
623
+
624
+ Example:
625
+ >>> result = compute_fry("Sample educational text...")
626
+ >>> print(f"Avg sentence length: {result.avg_sentence_length:.1f}")
627
+ >>> print(f"Avg syllables/100 words: {result.avg_syllables_per_100:.1f}")
628
+ >>> print(f"Grade level: {result.grade_level}")
629
+ """
630
+
631
+ avg_sentence_length: float # Average words per sentence
632
+ avg_syllables_per_100: float # Average syllables per 100 words
633
+ grade_level: str # Estimated grade level (e.g., "5", "7", "College")
634
+ graph_zone: str # Which zone of Fry graph (for validity checking)
635
+ metadata: dict[str, Any] # Total sentences, total syllables, etc.
636
+
637
+
638
+ @dataclass
639
+ class FORCASTResult:
640
+ """Result from FORCAST Readability Formula.
641
+
642
+ FORCAST (FORmula for CASTing readability) was developed by the U.S. military
643
+ to assess readability without counting syllables. It uses only single-syllable
644
+ words as a measure, making it faster to compute than syllable-based formulas.
645
+ Particularly useful for technical and military documents.
646
+
647
+ Related GitHub Issue:
648
+ #16 - Additional Readability Formulas
649
+ https://github.com/craigtrim/pystylometry/issues/16
650
+
651
+ Formula: 20 - (N / 10), where N is the number of single-syllable words
652
+ per 150-word sample.
653
+
654
+ References:
655
+ Caylor, J. S., Sticht, T. G., Fox, L. C., & Ford, J. P. (1973).
656
+ Methodologies for determining reading requirements of military
657
+ occupational specialties. Human Resources Research Organization.
658
+
659
+ Example:
660
+ >>> result = compute_forcast("Military technical document text...")
661
+ >>> print(f"FORCAST score: {result.forcast_score:.2f}")
662
+ >>> print(f"Grade level: {result.grade_level}")
663
+ """
664
+
665
+ forcast_score: float # The FORCAST readability score
666
+ grade_level: int # Corresponding U.S. grade level
667
+ single_syllable_ratio: float # Single-syllable words / total words
668
+ single_syllable_count: int # Count of single-syllable words
669
+ total_words: int # Total word count
670
+ metadata: dict[str, Any] # Samples used, calculation details, etc.
671
+
672
+
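The FORCAST arithmetic from the docstring as a sketch, rescaling the single-syllable count to a 150-word sample when the text is longer:

def forcast_grade(single_syllable_count, total_words):
    n_per_150 = single_syllable_count * 150.0 / total_words
    return 20.0 - (n_per_150 / 10.0)

For example, 110 single-syllable words in a 150-word sample gives 20 - 11 = grade 9.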
673
+ @dataclass
674
+ class PowersSumnerKearlResult:
675
+ """Result from Powers-Sumner-Kearl Readability Formula.
676
+
677
+ The Powers-Sumner-Kearl formula is a variation of the Flesch Reading Ease
678
+ formula, recalibrated for primary grade levels (grades 1-4). It uses
679
+ average sentence length and average syllables per word, but with different
680
+ coefficients optimized for younger readers.
681
+
682
+ Related GitHub Issue:
683
+ #16 - Additional Readability Formulas
684
+ https://github.com/craigtrim/pystylometry/issues/16
685
+
686
+ Formula: 0.0778 * avg_sentence_length + 0.0455 * (syllables per 100 words) - 2.2029
687
+
688
+ References:
689
+ Powers, R. D., Sumner, W. A., & Kearl, B. E. (1958). A recalculation of
690
+ four adult readability formulas. Journal of Educational Psychology,
691
+ 49(2), 99-105.
692
+
693
+ Example:
694
+ >>> result = compute_powers_sumner_kearl("Children's book text...")
695
+ >>> print(f"PSK score: {result.psk_score:.2f}")
696
+ >>> print(f"Grade level: {result.grade_level}")
697
+ """
698
+
699
+ psk_score: float # The Powers-Sumner-Kearl score
700
+ grade_level: float # Corresponding grade level (can be decimal for primary grades)
701
+ avg_sentence_length: float # Average words per sentence
702
+ avg_syllables_per_word: float # Average syllables per word
703
+ total_sentences: int # Total sentence count
704
+ total_words: int # Total word count
705
+ total_syllables: int # Total syllable count
706
+ metadata: dict[str, Any] # Calculation details, comparison to Flesch, etc.
707
+
708
+
709
+ # ===== Advanced Syntactic Results =====
710
+ # Related to GitHub Issue #17: Advanced Syntactic Analysis
711
+ # https://github.com/craigtrim/pystylometry/issues/17
712
+
713
+
714
+ @dataclass
715
+ class AdvancedSyntacticResult:
716
+ """Result from advanced syntactic analysis using dependency parsing.
717
+
718
+ Advanced syntactic analysis uses dependency parsing to extract sophisticated
719
+ grammatical features that go beyond simple POS tagging. These features
720
+ capture sentence complexity, grammatical sophistication, and syntactic
721
+ style preferences.
722
+
723
+ Related GitHub Issue:
724
+ #17 - Advanced Syntactic Analysis
725
+ https://github.com/craigtrim/pystylometry/issues/17
726
+
727
+ Features analyzed:
728
+ - Parse tree depth (sentence structural complexity)
729
+ - T-units (minimal terminable units - independent clauses with modifiers)
730
+ - Clausal density (clauses per T-unit)
731
+ - Dependent clause ratio
732
+ - Passive voice ratio
733
+ - Subordination index
734
+ - Coordination index
735
+ - Sentence complexity score
736
+
737
+ References:
738
+ Hunt, K. W. (1965). Grammatical structures written at three grade levels.
739
+ NCTE Research Report No. 3.
740
+ Biber, D. (1988). Variation across speech and writing. Cambridge University Press.
741
+ Lu, X. (2010). Automatic analysis of syntactic complexity in second language
742
+ writing. International Journal of Corpus Linguistics, 15(4), 474-496.
743
+
744
+ Example:
745
+ >>> result = compute_advanced_syntactic("Complex sentence structures...")
746
+ >>> print(f"Parse tree depth: {result.mean_parse_tree_depth:.1f}")
747
+ >>> print(f"T-units: {result.t_unit_count}")
748
+ >>> print(f"Passive voice %: {result.passive_voice_ratio * 100:.1f}%")
749
+ """
750
+
751
+ mean_parse_tree_depth: float # Average depth of dependency parse trees
752
+ max_parse_tree_depth: int # Maximum parse tree depth in text
753
+ t_unit_count: int # Number of T-units (minimal terminable units)
754
+ mean_t_unit_length: float # Average words per T-unit
755
+ clausal_density: float # Clauses per T-unit
756
+ dependent_clause_ratio: float # Dependent clauses / total clauses
757
+ passive_voice_ratio: float # Passive constructions / total sentences
758
+ subordination_index: float # Subordinate clauses / total clauses
759
+ coordination_index: float # Coordinate clauses / total clauses
760
+ sentence_complexity_score: float # Composite complexity metric
761
+ dependency_distance: float # Mean distance between heads and dependents
762
+ left_branching_ratio: float # Left-branching structures / total
763
+ right_branching_ratio: float # Right-branching structures / total
764
+ metadata: dict[str, Any] # Parse tree details, clause counts, etc.
765
+
766
+
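A sketch of one AdvancedSyntacticResult feature, parse tree depth, computed from a spaCy dependency parse; whether the package actually uses spaCy is not shown in this diff, and the en_core_web_sm model is assumed to be installed:

import spacy

nlp = spacy.load("en_core_web_sm")   # assumed model; any parser-enabled pipeline works

def _depth(token):
    """Number of head-hops from a token up to its sentence root."""
    depth = 0
    while token.head is not token:
        token = token.head
        depth += 1
    return depth

def parse_tree_depths(text):
    """Mean and maximum dependency-tree depth per sentence."""
    doc = nlp(text)
    depths = [max(_depth(tok) for tok in sent) for sent in doc.sents]
    return sum(depths) / len(depths), max(depths)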
767
+ # ===== Sentence Type Results =====
768
+ # Related to GitHub Issue #18: Sentence Type Classification
769
+ # https://github.com/craigtrim/pystylometry/issues/18
770
+
771
+
772
+ @dataclass
773
+ class SentenceTypeResult:
774
+ """Result from sentence type classification analysis.
775
+
776
+ Sentence type classification categorizes sentences by their grammatical
777
+ structure (simple, compound, complex, compound-complex) and communicative
778
+ function (declarative, interrogative, imperative, exclamatory). Different
779
+ authors and genres show distinct patterns in sentence type distribution.
780
+
781
+ Related GitHub Issue:
782
+ #18 - Sentence Type Classification
783
+ https://github.com/craigtrim/pystylometry/issues/18
784
+
785
+ Structural types:
786
+ - Simple: One independent clause (e.g., "The cat sat.")
787
+ - Compound: Multiple independent clauses (e.g., "I came, I saw, I conquered.")
788
+ - Complex: One independent + dependent clause(s) (e.g., "When I arrived, I saw her.")
789
+ - Compound-Complex: Multiple independent + dependent (e.g., "I came when called, and I stayed.")
790
+
791
+ Functional types:
792
+ - Declarative: Statement (e.g., "The sky is blue.")
793
+ - Interrogative: Question (e.g., "Is the sky blue?")
794
+ - Imperative: Command (e.g., "Look at the sky!")
795
+ - Exclamatory: Exclamation (e.g., "What a blue sky!")
796
+
797
+ References:
798
+ Biber, D. (1988). Variation across speech and writing. Cambridge University Press.
799
+ Huddleston, R., & Pullum, G. K. (2002). The Cambridge Grammar of the English Language.
800
+
801
+ Example:
802
+ >>> result = compute_sentence_types("Mix of sentence types here...")
803
+ >>> print(f"Simple: {result.simple_ratio * 100:.1f}%")
804
+ >>> print(f"Complex: {result.complex_ratio * 100:.1f}%")
805
+ >>> print(f"Questions: {result.interrogative_ratio * 100:.1f}%")
806
+ """
807
+
808
+ # Structural type ratios (sum to 1.0)
809
+ simple_ratio: float # Simple sentences / total
810
+ compound_ratio: float # Compound sentences / total
811
+ complex_ratio: float # Complex sentences / total
812
+ compound_complex_ratio: float # Compound-complex / total
813
+
814
+ # Functional type ratios (sum to 1.0)
815
+ declarative_ratio: float # Declarative sentences / total
816
+ interrogative_ratio: float # Interrogative (questions) / total
817
+ imperative_ratio: float # Imperative (commands) / total
818
+ exclamatory_ratio: float # Exclamatory sentences / total
819
+
820
+ # Counts
821
+ simple_count: int
822
+ compound_count: int
823
+ complex_count: int
824
+ compound_complex_count: int
825
+ declarative_count: int
826
+ interrogative_count: int
827
+ imperative_count: int
828
+ exclamatory_count: int
829
+ total_sentences: int
830
+
831
+ # Diversity
832
+ structural_diversity: float # Shannon entropy of structural type distribution
833
+ functional_diversity: float # Shannon entropy of functional type distribution
834
+
835
+ metadata: dict[str, Any] # Sentence-by-sentence classifications, etc.
836
+
837
+
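A sketch of the structural_diversity and functional_diversity fields: Shannon entropy over the counts of the four types in each scheme:

import math

def shannon_entropy(counts):
    """Entropy (bits) of a sentence-type distribution, e.g.
    [simple_count, compound_count, complex_count, compound_complex_count]."""
    total = sum(counts)
    probabilities = [c / total for c in counts if c > 0]
    return -sum(p * math.log2(p) for p in probabilities)

A text using only simple sentences scores 0.0; an even split across all four structural types scores 2.0 bits.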
838
+ # ===== Extended N-gram Results =====
839
+ # Related to GitHub Issue #19: Extended N-gram Features
840
+ # https://github.com/craigtrim/pystylometry/issues/19
841
+
842
+
843
+ @dataclass
844
+ class ExtendedNgramResult:
845
+ """Result from extended n-gram analysis.
846
+
847
+ Extended n-gram analysis goes beyond basic bigram/trigram entropy to provide
848
+ comprehensive n-gram statistics including frequency distributions, most
849
+ distinctive n-grams, skipgrams, and part-of-speech n-grams. These features
850
+ are valuable for authorship attribution and style analysis.
851
+
852
+ Related GitHub Issue:
853
+ #19 - Extended N-gram Features
854
+ https://github.com/craigtrim/pystylometry/issues/19
855
+
856
+ Features computed:
857
+ - Trigram frequency distributions and top trigrams
858
+ - 4-gram frequency distributions and top 4-grams
859
+ - Skipgrams (n-grams with gaps, e.g., "the * dog")
860
+ - POS n-grams (e.g., "DET ADJ NOUN")
861
+ - Character trigrams and 4-grams
862
+ - N-gram diversity metrics
863
+ - Entropy for each n-gram order
864
+
865
+ References:
866
+ Guthrie, D., Allison, B., Liu, W., Guthrie, L., & Wilks, Y. (2006).
867
+ A closer look at skip-gram modelling. LREC.
868
+ Stamatatos, E. (2009). A survey of modern authorship attribution methods.
869
+ JASIST, 60(3), 538-556.
870
+
871
+ Example:
872
+ >>> result = compute_extended_ngrams("Sample text for n-gram analysis...")
873
+ >>> print(f"Top trigrams: {result.top_word_trigrams[:5]}")
874
+ >>> print(f"Trigram entropy: {result.word_trigram_entropy:.2f}")
875
+ """
876
+
877
+ # Word n-grams
878
+ top_word_trigrams: list[tuple[str, int]] # Most frequent word trigrams
879
+ top_word_4grams: list[tuple[str, int]] # Most frequent word 4-grams
880
+ word_trigram_count: int # Total unique word trigrams
881
+ word_4gram_count: int # Total unique word 4-grams
882
+ word_trigram_entropy: float # Shannon entropy of trigram distribution
883
+ word_4gram_entropy: float # Shannon entropy of 4-gram distribution
884
+
885
+ # Skipgrams (n-grams with gaps)
886
+ top_skipgrams_2_1: list[tuple[str, int]] # Top 2-skipgrams (gap of 1)
887
+ top_skipgrams_3_1: list[tuple[str, int]] # Top 3-skipgrams (gap of 1)
888
+ skipgram_2_1_count: int # Unique 2-skipgrams
889
+ skipgram_3_1_count: int # Unique 3-skipgrams
890
+
891
+ # POS n-grams
892
+ top_pos_trigrams: list[tuple[str, int]] # Most frequent POS trigrams
893
+ top_pos_4grams: list[tuple[str, int]] # Most frequent POS 4-grams
894
+ pos_trigram_count: int # Unique POS trigrams
895
+ pos_4gram_count: int # Unique POS 4-grams
896
+ pos_trigram_entropy: float # Shannon entropy of POS trigram distribution
897
+
898
+ # Character n-grams
899
+ top_char_trigrams: list[tuple[str, int]] # Most frequent character trigrams
900
+ top_char_4grams: list[tuple[str, int]] # Most frequent character 4-grams
901
+ char_trigram_entropy: float # Shannon entropy of char trigram distribution
902
+ char_4gram_entropy: float # Shannon entropy of char 4-gram distribution
903
+
904
+ metadata: dict[str, Any] # Full frequency distributions, parameters, etc.
905
+
906
+
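A sketch of the fixed-gap skipgrams referenced above (top_skipgrams_2_1, top_skipgrams_3_1); the exact skip convention the package uses may differ, so treat this as one plausible reading of "gap of 1":

from collections import Counter

def skipgrams(tokens, n=2, gap=1):
    """n-token skipgrams with a fixed gap between adjacent elements;
    n=2, gap=1 pairs each token with the token two positions later."""
    step = gap + 1
    span = (n - 1) * step
    grams = (
        " ".join(tokens[i + j * step] for j in range(n))
        for i in range(len(tokens) - span)
    )
    return Counter(grams)

For "the quick brown fox".split() this yields {"the brown": 1, "quick fox": 1}.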
907
+ # ===== Stylistic Markers Results =====
908
+ # Related to GitHub Issue #20: Stylistic Markers
909
+ # https://github.com/craigtrim/pystylometry/issues/20
910
+
911
+
912
+ @dataclass
913
+ class StylisticMarkersResult:
914
+ """Result from stylistic markers analysis.
915
+
916
+ Stylistic markers are specific linguistic features that authors tend to use
917
+ consistently and often subconsciously. These include contraction usage,
918
+ intensifier preferences, hedging expressions, punctuation habits, and more.
919
+ They are powerful indicators of authorial identity.
920
+
921
+ Related GitHub Issue:
922
+ #20 - Stylistic Markers
923
+ https://github.com/craigtrim/pystylometry/issues/20
924
+
925
+ Markers analyzed:
926
+ - Contraction usage (don't vs. do not, I'm vs. I am, etc.)
927
+ - Intensifiers (very, really, extremely, quite, etc.)
928
+ - Hedges (maybe, perhaps, probably, somewhat, etc.)
929
+ - Modal auxiliaries (can, could, may, might, must, should, will, would)
930
+ - Negation patterns (not, no, never, none, neither, etc.)
931
+ - Exclamation frequency
932
+ - Question frequency
933
+ - Quotation usage
934
+ - Parenthetical expressions
935
+ - Ellipses and dashes
936
+
937
+ References:
938
+ Argamon, S., & Levitan, S. (2005). Measuring the usefulness of function
939
+ words for authorship attribution. ACH/ALLC.
940
+ Pennebaker, J. W. (2011). The secret life of pronouns. Bloomsbury Press.
941
+
942
+ Example:
943
+ >>> result = compute_stylistic_markers("Sample text with various markers...")
944
+ >>> print(f"Contraction ratio: {result.contraction_ratio * 100:.1f}%")
945
+ >>> print(f"Intensifier density: {result.intensifier_density:.2f}")
946
+ >>> print(f"Hedging density: {result.hedging_density:.2f}")
947
+ """
948
+
949
+ # Contraction patterns
950
+ contraction_ratio: float # Contractions / (contractions + full forms)
951
+ contraction_count: int # Total contractions
952
+ expanded_form_count: int # Total expanded forms (e.g., "do not" vs "don't")
953
+ top_contractions: list[tuple[str, int]] # Most frequent contractions
954
+
955
+ # Intensifiers and hedges
956
+ intensifier_density: float # Intensifiers per 100 words
957
+ intensifier_count: int # Total intensifier count
958
+ top_intensifiers: list[tuple[str, int]] # Most frequent intensifiers
959
+ hedging_density: float # Hedges per 100 words
960
+ hedging_count: int # Total hedge count
961
+ top_hedges: list[tuple[str, int]] # Most frequent hedges
962
+
963
+ # Modal auxiliaries
964
+ modal_density: float # Modal auxiliaries per 100 words
965
+ modal_distribution: dict[str, int] # Count per modal (can, could, may, etc.)
966
+ epistemic_modal_ratio: float # Epistemic modals / all modals
967
+ deontic_modal_ratio: float # Deontic modals / all modals
968
+
969
+ # Negation
970
+ negation_density: float # Negation markers per 100 words
971
+ negation_count: int # Total negation markers
972
+ negation_types: dict[str, int] # not, no, never, etc. with counts
973
+
974
+ # Punctuation style
975
+ exclamation_density: float # Exclamation marks per 100 words
976
+ question_density: float # Question marks per 100 words
977
+ quotation_density: float # Quotation marks per 100 words
978
+ parenthetical_density: float # Parentheses per 100 words
979
+ ellipsis_density: float # Ellipses per 100 words
980
+ dash_density: float # Dashes (em/en) per 100 words
981
+ semicolon_density: float # Semicolons per 100 words
982
+ colon_density: float # Colons per 100 words
983
+
984
+ metadata: dict[str, Any] # Full lists, total word count, etc.
985
+
986
+
987
+ # ===== Vocabulary Overlap Results =====
988
+ # Related to GitHub Issue #21: Vocabulary Overlap and Similarity Metrics
989
+ # https://github.com/craigtrim/pystylometry/issues/21
990
+
991
+
992
+ @dataclass
993
+ class VocabularyOverlapResult:
994
+ """Result from vocabulary overlap and similarity analysis.
995
+
996
+ Vocabulary overlap metrics measure the similarity between two texts based on
997
+ their shared vocabulary. These metrics are useful for authorship verification,
998
+ plagiarism detection, and measuring stylistic consistency across texts.
999
+
1000
+ Related GitHub Issue:
1001
+ #21 - Vocabulary Overlap and Similarity Metrics
1002
+ https://github.com/craigtrim/pystylometry/issues/21
1003
+
1004
+ Metrics computed:
1005
+ - Jaccard similarity (intersection / union)
1006
+ - Dice coefficient (2 * intersection / sum of sizes)
1007
+ - Overlap coefficient (intersection / min(size1, size2))
1008
+ - Cosine similarity (using word frequency vectors)
1009
+ - Shared vocabulary size and ratio
1010
+ - Unique words in each text
1011
+ - Most distinctive words for each text
1012
+
1013
+ References:
1014
+ Jaccard, P. (1912). The distribution of the flora in the alpine zone.
1015
+ New Phytologist, 11(2), 37-50.
1016
+ Salton, G., & McGill, M. J. (1983). Introduction to Modern Information
1017
+ Retrieval. McGraw-Hill.
1018
+
1019
+ Example:
1020
+ >>> result = compute_vocabulary_overlap(text1, text2)
1021
+ >>> print(f"Jaccard similarity: {result.jaccard_similarity:.3f}")
1022
+ >>> print(f"Shared vocabulary: {result.shared_vocab_size} words")
1023
+ >>> print(f"Text1 unique: {result.text1_unique_count}")
1024
+ """
1025
+
1026
+ # Similarity scores (0-1 range)
1027
+ jaccard_similarity: float # Intersection / union
1028
+ dice_coefficient: float # 2 * intersection / (size1 + size2)
1029
+ overlap_coefficient: float # Intersection / min(size1, size2)
1030
+ cosine_similarity: float # Cosine of frequency vectors
1031
+
1032
+ # Vocabulary sizes
1033
+ text1_vocab_size: int # Unique words in text 1
1034
+ text2_vocab_size: int # Unique words in text 2
1035
+ shared_vocab_size: int # Words in both texts
1036
+ union_vocab_size: int # Words in either text
1037
+ text1_unique_count: int # Words only in text 1
1038
+ text2_unique_count: int # Words only in text 2
1039
+
1040
+ # Shared and distinctive vocabulary
1041
+ shared_words: list[str] # Words appearing in both texts
1042
+ text1_distinctive_words: list[tuple[str, float]] # Words + TF-IDF scores for text 1
1043
+ text2_distinctive_words: list[tuple[str, float]] # Words + TF-IDF scores for text 2
1044
+
1045
+ # Ratios
1046
+ text1_coverage: float # Shared / text1_vocab (how much of text1 is shared)
1047
+ text2_coverage: float # Shared / text2_vocab (how much of text2 is shared)
1048
+
1049
+ metadata: dict[str, Any] # Full vocabulary sets, frequency vectors, etc.
1050
+
1051
+
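A sketch of the four similarity scores in VocabularyOverlapResult, computed from raw token lists; the TF-IDF ranking of distinctive words is omitted:

import math
from collections import Counter

def vocabulary_overlap(tokens1, tokens2):
    vocab1, vocab2 = set(tokens1), set(tokens2)
    shared, union = vocab1 & vocab2, vocab1 | vocab2
    freq1, freq2 = Counter(tokens1), Counter(tokens2)
    dot = sum(freq1[w] * freq2[w] for w in shared)
    norm = (math.sqrt(sum(c * c for c in freq1.values()))
            * math.sqrt(sum(c * c for c in freq2.values())))
    return {
        "jaccard_similarity": len(shared) / len(union),
        "dice_coefficient": 2 * len(shared) / (len(vocab1) + len(vocab2)),
        "overlap_coefficient": len(shared) / min(len(vocab1), len(vocab2)),
        "cosine_similarity": dot / norm,
    }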
1052
+ # ===== Cohesion and Coherence Results =====
1053
+ # Related to GitHub Issue #22: Cohesion and Coherence Metrics
1054
+ # https://github.com/craigtrim/pystylometry/issues/22
1055
+
1056
+
1057
+ @dataclass
1058
+ class CohesionCoherenceResult:
1059
+ """Result from cohesion and coherence analysis.
1060
+
1061
+ Cohesion and coherence metrics measure how well a text holds together
1062
+ structurally (cohesion) and semantically (coherence). These metrics are
1063
+ important for analyzing writing quality, readability, and authorial
1064
+ sophistication.
1065
+
1066
+ Related GitHub Issue:
1067
+ #22 - Cohesion and Coherence Metrics
1068
+ https://github.com/craigtrim/pystylometry/issues/22
1069
+
1070
+ Cohesion features:
1071
+ - Referential cohesion (pronouns, demonstratives pointing back)
1072
+ - Lexical cohesion (word repetition, synonyms, semantic relatedness)
1073
+ - Connective density (discourse markers, conjunctions)
1074
+ - Anaphora resolution success rate
1075
+ - Lexical chains (sequences of semantically related words)
1076
+
1077
+ Coherence features:
1078
+ - Sentence-to-sentence semantic similarity
1079
+ - Topic consistency across paragraphs
1080
+ - Discourse structure (thesis, support, conclusion)
1081
+ - Semantic overlap between adjacent sentences
1082
+
1083
+ References:
1084
+ Halliday, M. A. K., & Hasan, R. (1976). Cohesion in English. Longman.
1085
+ Graesser, A. C., McNamara, D. S., & Kulikowich, J. M. (2011). Coh-Metrix:
1086
+ Providing multilevel analyses of text characteristics. Educational
1087
+ Researcher, 40(5), 223-234.
1088
+
1089
+ Example:
1090
+ >>> result = compute_cohesion_coherence("Multi-paragraph text...")
1091
+ >>> print(f"Pronoun density: {result.pronoun_density:.2f}")
1092
+ >>> print(f"Lexical overlap: {result.adjacent_sentence_overlap:.3f}")
1093
+ >>> print(f"Connective density: {result.connective_density:.2f}")
1094
+ """
1095
+
1096
+ # Referential cohesion
1097
+ pronoun_density: float # Pronouns per 100 words
1098
+ demonstrative_density: float # Demonstratives (this, that, these, those) per 100 words
1099
+ anaphora_count: int # Anaphoric references detected
1100
+ anaphora_resolution_ratio: float # Successfully resolved / total
1101
+
1102
+ # Lexical cohesion
1103
+ word_repetition_ratio: float # Repeated content words / total content words
1104
+ synonym_density: float # Synonym pairs per 100 words
1105
+ lexical_chain_count: int # Number of lexical chains detected
1106
+ mean_chain_length: float # Average length of lexical chains
1107
+ content_word_overlap: float # Content word overlap between sentences
1108
+
1109
+ # Connectives and discourse markers
1110
+ connective_density: float # Discourse connectives per 100 words
1111
+ additive_connective_ratio: float # "and", "also", "furthermore" / total connectives
1112
+ adversative_connective_ratio: float # "but", "however", "nevertheless" / total
1113
+ causal_connective_ratio: float # "because", "therefore", "thus" / total
1114
+ temporal_connective_ratio: float # "then", "after", "before" / total
1115
+
1116
+ # Coherence measures
1117
+ adjacent_sentence_overlap: float # Mean semantic overlap between adjacent sentences
1118
+ paragraph_topic_consistency: float # Mean topic consistency within paragraphs
1119
+ mean_sentence_similarity: float # Mean cosine similarity between all sentence pairs
1120
+ semantic_coherence_score: float # Composite coherence metric (0-1)
1121
+
1122
+ # Structural coherence
1123
+ paragraph_count: int # Number of paragraphs detected
1124
+ mean_paragraph_length: float # Mean sentences per paragraph
1125
+ discourse_structure_score: float # Quality of intro/body/conclusion structure
1126
+
1127
+ metadata: dict[str, Any] # Lexical chains, connective lists, similarity matrices, etc.
1128
+
1129
+
1130
+ # ===== Genre and Register Results =====
1131
+ # Related to GitHub Issue #23: Genre and Register Features
1132
+ # https://github.com/craigtrim/pystylometry/issues/23
1133
+
1134
+
1135
+ @dataclass
1136
+ class GenreRegisterResult:
1137
+ """Result from genre and register classification analysis.
1138
+
1139
+ Genre and register features distinguish between different types of texts
1140
+ (academic, journalistic, fiction, legal, etc.) based on linguistic patterns.
1141
+ These features can help identify the context and formality level of a text,
1142
+ and are useful for authorship attribution when combined with other metrics.
1143
+
1144
+ Related GitHub Issue:
1145
+ #23 - Genre and Register Features
1146
+ https://github.com/craigtrim/pystylometry/issues/23
1147
+
1148
+ Features analyzed:
1149
+ - Formality markers (Latinate words, nominalizations, passive voice)
1150
+ - Personal vs. impersonal style (1st/2nd person vs. 3rd person)
1151
+ - Abstract vs. concrete vocabulary
1152
+ - Technical term density
1153
+ - Narrative vs. expository markers
1154
+ - Dialogue presence and ratio
1155
+ - Register classification (frozen, formal, consultative, casual, intimate)
1156
+
1157
+ References:
1158
+ Biber, D. (1988). Variation across speech and writing. Cambridge University Press.
1159
+ Biber, D., & Conrad, S. (2009). Register, genre, and style. Cambridge
1160
+ University Press.
1161
+ Heylighen, F., & Dewaele, J. M. (1999). Formality of language: Definition,
1162
+ measurement and behavioral determinants. Internal Report, Center "Leo
1163
+ Apostel", Free University of Brussels.
1164
+
1165
+ Example:
1166
+ >>> result = compute_genre_register("Academic paper text...")
1167
+ >>> print(f"Formality score: {result.formality_score:.2f}")
1168
+ >>> print(f"Register: {result.register_classification}")
1169
+ >>> print(f"Genre prediction: {result.predicted_genre}")
1170
+ """
1171
+
1172
+ # Formality indicators
1173
+ formality_score: float # Composite formality score (0-100)
1174
+ latinate_ratio: float # Latinate words / total words
1175
+ nominalization_density: float # Nominalizations per 100 words
1176
+ passive_voice_density: float # Passive constructions per 100 words
1177
+
1178
+ # Personal vs. impersonal
1179
+ first_person_ratio: float # 1st person pronouns / total pronouns
1180
+ second_person_ratio: float # 2nd person pronouns / total pronouns
1181
+ third_person_ratio: float # 3rd person pronouns / total pronouns
1182
+ impersonal_construction_density: float # "It is...", "There are..." per 100 words
1183
+
1184
+ # Abstract vs. concrete
1185
+ abstract_noun_ratio: float # Abstract nouns / total nouns
1186
+ concrete_noun_ratio: float # Concrete nouns / total nouns
1187
+ abstractness_score: float # Composite abstractness (based on word concreteness ratings)
1188
+
1189
+ # Technical and specialized
1190
+ technical_term_density: float # Technical/specialized terms per 100 words
1191
+ jargon_density: float # Domain-specific jargon per 100 words
1192
+
1193
+ # Narrative vs. expository
1194
+ narrative_marker_density: float # Past tense, action verbs per 100 words
1195
+ expository_marker_density: float # Present tense, linking verbs per 100 words
1196
+ narrative_expository_ratio: float # Narrative / expository markers
1197
+
1198
+ # Dialogue and quotation
1199
+ dialogue_ratio: float # Dialogue / total text (estimated)
1200
+ quotation_density: float # Quotations per 100 words
1201
+
1202
+ # Classification results
1203
+ register_classification: str # frozen, formal, consultative, casual, intimate
1204
+ predicted_genre: str # academic, journalistic, fiction, legal, conversational, etc.
1205
+ genre_confidence: float # Confidence in genre prediction (0-1)
1206
+
1207
+ # Feature scores for major genres (0-1 scores for each)
1208
+ academic_score: float
1209
+ journalistic_score: float
1210
+ fiction_score: float
1211
+ legal_score: float
1212
+ conversational_score: float
1213
+
1214
+ metadata: dict[str, Any] # Feature details, word lists, classification probabilities, etc.
1215
+
1216
+
1217
+ # ===== Additional Authorship Results =====
1218
+ # Related to GitHub Issue #24: Additional Authorship Attribution Methods
1219
+ # https://github.com/craigtrim/pystylometry/issues/24
1220
+
1221
+
1222
+ @dataclass
1223
+ class KilgarriffResult:
1224
+ """Result from Kilgarriff's Chi-squared method.
1225
+
1226
+ Kilgarriff's chi-squared method compares word frequency distributions between
1227
+ texts using the chi-squared test. It's particularly effective for authorship
1228
+ attribution when comparing frequency profiles of common words.
1229
+
1230
+ Related GitHub Issue:
1231
+ #24 - Additional Authorship Attribution Methods
1232
+ https://github.com/craigtrim/pystylometry/issues/24
1233
+
1234
+ References:
1235
+ Kilgarriff, A. (2001). Comparing corpora. International Journal of Corpus
1236
+ Linguistics, 6(1), 97-133.
1237
+
1238
+ Example:
1239
+ >>> result = compute_kilgarriff(text1, text2)
1240
+ >>> print(f"Chi-squared: {result.chi_squared:.2f}")
1241
+ >>> print(f"P-value: {result.p_value:.4f}")
1242
+ """
1243
+
1244
+ chi_squared: float # Chi-squared statistic
1245
+ p_value: float # Statistical significance (p-value)
1246
+ degrees_of_freedom: int # df for chi-squared test
1247
+ feature_count: int # Number of features (words) compared
1248
+ most_distinctive_features: list[tuple[str, float]] # Words + chi-squared contributions
1249
+ metadata: dict[str, Any] # Frequency tables, expected values, etc.
1250
+
1251
+
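A sketch of a Kilgarriff-style chi-squared comparison over the most frequent words of the combined texts; the feature count and the omission of a p-value lookup are simplifications of the method described above:

from collections import Counter

def kilgarriff_chi_squared(tokens1, tokens2, n_features=500):
    f1, f2 = Counter(tokens1), Counter(tokens2)
    n1, n2 = len(tokens1), len(tokens2)
    chi_squared, contributions = 0.0, {}
    for word, combined in (f1 + f2).most_common(n_features):
        expected1 = combined * n1 / (n1 + n2)    # expected count in text 1
        expected2 = combined * n2 / (n1 + n2)    # expected count in text 2
        contribution = ((f1[word] - expected1) ** 2 / expected1
                        + (f2[word] - expected2) ** 2 / expected2)
        contributions[word] = contribution
        chi_squared += contribution
    return chi_squared, contributions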
1252
+ @dataclass
1253
+ class MinMaxResult:
1254
+ """Result from Min-Max distance method (Burrows' original method).
1255
+
1256
+ The Min-Max method normalizes feature frequencies using min-max scaling,
1257
+ then computes distance between texts. This was Burrows' original approach
1258
+ before developing Delta.
1259
+
1260
+ Related GitHub Issue:
1261
+ #24 - Additional Authorship Attribution Methods
1262
+ https://github.com/craigtrim/pystylometry/issues/24
1263
+
1264
+ References:
1265
+ Burrows, J. F. (1992). Not unless you ask nicely: The interpretative
1266
+ nexus between analysis and information. Literary and Linguistic
1267
+ Computing, 7(2), 91-109.
1268
+
1269
+ Example:
1270
+ >>> result = compute_minmax(text1, text2)
1271
+ >>> print(f"MinMax distance: {result.minmax_distance:.3f}")
1272
+ """
1273
+
1274
+ minmax_distance: float # Min-max normalized distance
1275
+ feature_count: int # Number of features used
1276
+ most_distinctive_features: list[tuple[str, float]] # Features + contributions
1277
+ metadata: dict[str, Any] # Normalized frequencies, scaling parameters, etc.
1278
+
1279
+
1280
+ @dataclass
1281
+ class JohnsBurrowsResult:
1282
+ """Result from John's Burrows' variation of Delta.
1283
+
1284
+ John Burrows has developed several variations of the Delta method over
1285
+ the years. This captures alternative formulations including Quadratic
1286
+ Delta and other distance measures.
1287
+
1288
+ Related GitHub Issue:
1289
+ #24 - Additional Authorship Attribution Methods
1290
+ https://github.com/craigtrim/pystylometry/issues/24
1291
+
1292
+ References:
1293
+ Burrows, J. (2005). Who wrote Shamela? Verifying the authorship of a
1294
+ parodic text. Literary and Linguistic Computing, 20(4), 437-450.
1295
+
1296
+ Example:
1297
+ >>> result = compute_johns_delta(text1, text2, method="quadratic")
1298
+ >>> print(f"Quadratic Delta: {result.delta_score:.3f}")
1299
+ """
1300
+
1301
+ delta_score: float # Delta distance score
1302
+ method: str # "quadratic", "weighted", "rotated", etc.
1303
+ feature_count: int # Number of MFW used
1304
+ most_distinctive_features: list[tuple[str, float]] # Features + contributions
1305
+ metadata: dict[str, Any] # Method-specific parameters, z-scores, etc.
1306
+
1307
+
1308
+ # ===== Rhythm and Prosody Results =====
1309
+ # Related to GitHub Issue #25: Rhythm and Prosody Metrics
1310
+ # https://github.com/craigtrim/pystylometry/issues/25
1311
+
1312
+
1313
+ @dataclass
1314
+ class RhythmProsodyResult:
1315
+ """Result from rhythm and prosody analysis.
1316
+
1317
+ Rhythm and prosody metrics capture the musical qualities of written language,
1318
+ including stress patterns, syllable rhythms, and phonological features. While
1319
+ these are typically studied in spoken language, written text preserves many
1320
+ rhythmic patterns that vary by author and genre.
1321
+
1322
+ Related GitHub Issue:
1323
+ #25 - Rhythm and Prosody Metrics
1324
+ https://github.com/craigtrim/pystylometry/issues/25
1325
+
1326
+ Features analyzed:
1327
+ - Syllable patterns and stress patterns
1328
+ - Rhythmic regularity (coefficient of variation of syllable counts)
1329
+ - Phonological features (alliteration, assonance)
1330
+ - Syllable complexity (consonant clusters)
1331
+ - Sentence rhythm (alternating long/short sentences)
1332
+ - Polysyllabic word ratio
1333
+
1334
+ References:
1335
+ Lea, R. B., Mulligan, E. J., & Walton, J. H. (2005). Sentence rhythm and
1336
+ text comprehension. Memory & Cognition, 33(3), 388-396.
1337
+ Louwerse, M. M., & Benesh, N. (2012). Representing spatial structure through
1338
+ maps and language: Lord of the Rings encodes the spatial structure of
1339
+ Middle Earth. Cognitive Science, 36(8), 1556-1569.
1340
+
1341
+ Example:
1342
+ >>> result = compute_rhythm_prosody("Sample text with rhythm...")
1343
+ >>> print(f"Syllables per word: {result.mean_syllables_per_word:.2f}")
1344
+ >>> print(f"Rhythmic regularity: {result.rhythmic_regularity:.3f}")
1345
+ >>> print(f"Alliteration density: {result.alliteration_density:.2f}")
1346
+ """
1347
+
1348
+ # Syllable patterns
1349
+ mean_syllables_per_word: float # Average syllables per word
1350
+ syllable_std_dev: float # Std dev of syllables per word
1351
+ polysyllabic_ratio: float # Words with 3+ syllables / total
1352
+ monosyllabic_ratio: float # Single-syllable words / total
1353
+
1354
+ # Rhythmic regularity
1355
+ rhythmic_regularity: float # 1 / CV of syllable counts (higher = more regular)
1356
+ syllable_cv: float # Coefficient of variation of syllables per word
1357
+ stress_pattern_entropy: float # Entropy of stress patterns
1358
+
1359
+ # Sentence rhythm
1360
+ sentence_length_alternation: float # Degree of long/short alternation
1361
+ sentence_rhythm_score: float # Composite rhythm score
1362
+
1363
+ # Phonological features
1364
+ alliteration_density: float # Alliterative word pairs per 100 words
1365
+ assonance_density: float # Assonant word pairs per 100 words
1366
+ consonance_density: float # Consonant word pairs per 100 words
1367
+
1368
+ # Syllable complexity
1369
+ mean_consonant_cluster_length: float # Avg consonants in clusters
1370
+ initial_cluster_ratio: float # Words starting with clusters / total
1371
+ final_cluster_ratio: float # Words ending with clusters / total
1372
+
1373
+ # Stress patterns (estimated for written text)
1374
+ iambic_ratio: float # Iambic patterns (unstressed-stressed) / total
1375
+ trochaic_ratio: float # Trochaic patterns (stressed-unstressed) / total
1376
+ dactylic_ratio: float # Dactylic patterns / total
1377
+ anapestic_ratio: float # Anapestic patterns / total
1378
+
1379
+ metadata: dict[str, Any] # Syllable counts, stress patterns, phoneme data, etc.
1380
+
1381
+
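A sketch of the rhythmic_regularity and syllable_cv fields, assuming a per-word syllable count list is already available (e.g. from the readability syllable counter elsewhere in this diff):

import statistics

def rhythmic_regularity(syllable_counts):
    """Coefficient of variation of syllables per word and its inverse."""
    mean = statistics.mean(syllable_counts)
    cv = statistics.pstdev(syllable_counts) / mean
    regularity = (1.0 / cv) if cv else float("inf")   # perfectly uniform rhythm
    return regularity, cv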
160
1382
  # ===== Unified Analysis Result =====
161
1383
 
162
1384