pystylometry 0.1.0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/__init__.py +1 -2
- pystylometry/_normalize.py +277 -0
- pystylometry/_types.py +1224 -2
- pystylometry/_utils.py +4 -0
- pystylometry/authorship/__init__.py +4 -0
- pystylometry/authorship/additional_methods.py +100 -0
- pystylometry/character/__init__.py +15 -0
- pystylometry/character/character_metrics.py +301 -0
- pystylometry/lexical/__init__.py +13 -6
- pystylometry/lexical/advanced_diversity.py +641 -0
- pystylometry/lexical/function_words.py +391 -0
- pystylometry/lexical/hapax.py +154 -7
- pystylometry/lexical/mtld.py +83 -7
- pystylometry/lexical/ttr.py +83 -0
- pystylometry/lexical/word_frequency_sophistication.py +581 -0
- pystylometry/lexical/yule.py +34 -7
- pystylometry/ngrams/__init__.py +2 -0
- pystylometry/ngrams/extended_ngrams.py +235 -0
- pystylometry/prosody/__init__.py +12 -0
- pystylometry/prosody/rhythm_prosody.py +53 -0
- pystylometry/readability/__init__.py +12 -0
- pystylometry/readability/additional_formulas.py +985 -0
- pystylometry/readability/ari.py +93 -17
- pystylometry/readability/coleman_liau.py +102 -9
- pystylometry/readability/complex_words.py +531 -0
- pystylometry/readability/flesch.py +59 -14
- pystylometry/readability/gunning_fog.py +194 -25
- pystylometry/readability/smog.py +31 -14
- pystylometry/readability/syllables.py +137 -30
- pystylometry/stylistic/__init__.py +20 -0
- pystylometry/stylistic/cohesion_coherence.py +45 -0
- pystylometry/stylistic/genre_register.py +45 -0
- pystylometry/stylistic/markers.py +131 -0
- pystylometry/stylistic/vocabulary_overlap.py +47 -0
- pystylometry/syntactic/__init__.py +4 -0
- pystylometry/syntactic/advanced_syntactic.py +432 -0
- pystylometry/syntactic/pos_ratios.py +104 -13
- pystylometry/syntactic/sentence_stats.py +57 -13
- pystylometry/syntactic/sentence_types.py +470 -0
- {pystylometry-0.1.0.dist-info → pystylometry-1.0.0.dist-info}/METADATA +49 -12
- pystylometry-1.0.0.dist-info/RECORD +46 -0
- {pystylometry-0.1.0.dist-info → pystylometry-1.0.0.dist-info}/WHEEL +1 -1
- pystylometry-0.1.0.dist-info/RECORD +0 -26
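The bulk of the 1.0.0 release is new result dataclasses added to pystylometry/_types.py (diffed below). As a rough orientation only — not code taken from the package — a result object of the new TTRResult shape could be constructed and read as in the following sketch, assuming the dataclass is imported directly from pystylometry._types (the re-export surface is not shown in this diff):

# Hypothetical usage sketch. Field names come from the 1.0.0 diff below;
# the import path and the example values are illustrative assumptions, not package code.
from pystylometry._types import TTRResult

result = TTRResult(
    total_words=1200,
    unique_words=480,
    ttr=0.40,          # raw TTR: unique / total
    root_ttr=13.86,    # Guiraud's index: types / sqrt(tokens)
    log_ttr=0.87,      # Herdan's C: log(types) / log(tokens)
    sttr=0.42,         # standardized TTR over fixed-size chunks
    delta_std=0.03,    # vocabulary consistency across chunks
    metadata={"chunk_size": 100},
)
print(f"Root TTR (Guiraud): {result.root_ttr:.2f}")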
pystylometry/_types.py
CHANGED
@@ -40,6 +40,68 @@ class HapaxResult:
     metadata: dict[str, Any]
 
 
+@dataclass
+class LexiconCategories:
+    """Categorization of words by lexicon presence."""
+
+    neologisms: list[str]  # Not in WordNet AND not in BNC
+    rare_words: list[str]  # In one lexicon but not both
+    common_words: list[str]  # In both WordNet AND BNC
+    neologism_ratio: float  # Ratio of neologisms to total hapax
+    rare_word_ratio: float  # Ratio of rare words to total hapax
+    metadata: dict[str, Any]
+
+
+@dataclass
+class HapaxLexiconResult:
+    """Result from Hapax Legomena analysis with lexicon categorization.
+
+    Extends basic hapax analysis by categorizing hapax legomena based on
+    presence in WordNet and British National Corpus (BNC):
+
+    - Neologisms: Words not in WordNet AND not in BNC (true novel words)
+    - Rare words: Words in BNC but not WordNet, or vice versa
+    - Common words: Words in both lexicons (just happen to appear once in text)
+
+    This categorization is valuable for stylometric analysis as it distinguishes
+    between vocabulary innovation (neologisms) and incidental hapax occurrence
+    (common words that appear once).
+    """
+
+    hapax_result: HapaxResult  # Standard hapax metrics
+    lexicon_analysis: LexiconCategories  # Lexicon-based categorization
+    metadata: dict[str, Any]
+
+
+@dataclass
+class TTRResult:
+    """Result from Type-Token Ratio (TTR) analysis.
+
+    Wraps stylometry-ttr package functionality to measure vocabulary richness
+    through the ratio of unique words (types) to total words (tokens).
+
+    Includes multiple TTR variants for length normalization:
+    - Raw TTR: Direct ratio of unique to total words
+    - Root TTR (Guiraud's index): types / sqrt(tokens)
+    - Log TTR (Herdan's C): log(types) / log(tokens)
+    - STTR: Standardized TTR across fixed-size chunks
+    - Delta Std: Measures vocabulary consistency across chunks
+
+    References:
+        Guiraud, P. (1960). Problèmes et méthodes de la statistique linguistique.
+        Herdan, G. (1960). Type-token Mathematics.
+    """
+
+    total_words: int
+    unique_words: int
+    ttr: float  # Raw TTR
+    root_ttr: float  # Guiraud's index
+    log_ttr: float  # Herdan's C
+    sttr: float  # Standardized TTR
+    delta_std: float  # Vocabulary consistency
+    metadata: dict[str, Any]
+
+
 # ===== Readability Results =====
 
 
@@ -58,7 +120,7 @@ class SMOGResult:
     """Result from SMOG Index computation."""
 
     smog_index: float
-    grade_level:
+    grade_level: float
     metadata: dict[str, Any]
 
 
@@ -67,7 +129,7 @@ class GunningFogResult:
     """Result from Gunning Fog Index computation."""
 
     fog_index: float
-    grade_level:
+    grade_level: float
    metadata: dict[str, Any]
 
 
@@ -157,6 +219,1166 @@ class EntropyResult:
|
|
|
157
219
|
metadata: dict[str, Any]
|
|
158
220
|
|
|
159
221
|
|
|
222
|
+
# ===== Character-Level Results =====
|
|
223
|
+
# Related to GitHub Issue #12: Character-Level Metrics
|
|
224
|
+
# https://github.com/craigtrim/pystylometry/issues/12
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
@dataclass
|
|
228
|
+
class CharacterMetricsResult:
|
|
229
|
+
"""Result from character-level metrics analysis.
|
|
230
|
+
|
|
231
|
+
This dataclass holds character-level stylometric features that provide
|
|
232
|
+
low-level insights into writing style. Character-level metrics are
|
|
233
|
+
fundamental for authorship attribution and can capture distinctive
|
|
234
|
+
patterns in punctuation, formatting, and word construction.
|
|
235
|
+
|
|
236
|
+
Related GitHub Issue:
|
|
237
|
+
#12 - Character-Level Metrics
|
|
238
|
+
https://github.com/craigtrim/pystylometry/issues/12
|
|
239
|
+
|
|
240
|
+
Metrics included:
|
|
241
|
+
- Average word length (characters per word)
|
|
242
|
+
- Average sentence length (characters per sentence)
|
|
243
|
+
- Punctuation density (punctuation marks per 100 words)
|
|
244
|
+
- Punctuation variety (count of unique punctuation types)
|
|
245
|
+
- Letter frequency distribution (26-element vector for a-z)
|
|
246
|
+
- Vowel-to-consonant ratio
|
|
247
|
+
- Digit frequency (count/ratio of numeric characters)
|
|
248
|
+
- Uppercase ratio (uppercase letters / total letters)
|
|
249
|
+
- Whitespace ratio (whitespace characters / total characters)
|
|
250
|
+
|
|
251
|
+
References:
|
|
252
|
+
Grieve, J. (2007). Quantitative authorship attribution: An evaluation
|
|
253
|
+
of techniques. Literary and Linguistic Computing, 22(3), 251-270.
|
|
254
|
+
Stamatatos, E. (2009). A survey of modern authorship attribution methods.
|
|
255
|
+
JASIST, 60(3), 538-556.
|
|
256
|
+
|
|
257
|
+
Example:
|
|
258
|
+
>>> result = compute_character_metrics("Sample text here.")
|
|
259
|
+
>>> print(f"Avg word length: {result.avg_word_length:.2f} chars")
|
|
260
|
+
>>> print(f"Punctuation density: {result.punctuation_density:.2f}")
|
|
261
|
+
>>> print(f"Vowel/consonant ratio: {result.vowel_consonant_ratio:.2f}")
|
|
262
|
+
"""
|
|
263
|
+
|
|
264
|
+
avg_word_length: float # Mean characters per word
|
|
265
|
+
avg_sentence_length_chars: float # Mean characters per sentence
|
|
266
|
+
punctuation_density: float # Punctuation marks per 100 words
|
|
267
|
+
punctuation_variety: int # Count of unique punctuation types used
|
|
268
|
+
letter_frequency: dict[str, float] # Frequency distribution for a-z
|
|
269
|
+
vowel_consonant_ratio: float # Ratio of vowels to consonants
|
|
270
|
+
digit_count: int # Total count of digit characters (0-9)
|
|
271
|
+
digit_ratio: float # Digits / total characters
|
|
272
|
+
uppercase_ratio: float # Uppercase letters / total letters
|
|
273
|
+
whitespace_ratio: float # Whitespace characters / total characters
|
|
274
|
+
metadata: dict[str, Any] # Additional info (character counts, etc.)
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
# ===== Function Word Results =====
|
|
278
|
+
# Related to GitHub Issue #13: Function Word Analysis
|
|
279
|
+
# https://github.com/craigtrim/pystylometry/issues/13
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
@dataclass
|
|
283
|
+
class FunctionWordResult:
|
|
284
|
+
"""Result from function word analysis.
|
|
285
|
+
|
|
286
|
+
Function words (determiners, prepositions, conjunctions, pronouns, auxiliary
|
|
287
|
+
verbs) are highly frequent, content-independent words that are often used
|
|
288
|
+
subconsciously. They are considered strong authorship markers because authors
|
|
289
|
+
use them consistently across different topics and genres.
|
|
290
|
+
|
|
291
|
+
Related GitHub Issue:
|
|
292
|
+
#13 - Function Word Analysis
|
|
293
|
+
https://github.com/craigtrim/pystylometry/issues/13
|
|
294
|
+
|
|
295
|
+
This analysis computes:
|
|
296
|
+
- Frequency profiles for all function word categories
|
|
297
|
+
- Ratios for specific grammatical categories
|
|
298
|
+
- Most/least frequently used function words
|
|
299
|
+
- Function word diversity metrics
|
|
300
|
+
|
|
301
|
+
Function word categories analyzed:
|
|
302
|
+
- Determiners: the, a, an, this, that, these, those, etc.
|
|
303
|
+
- Prepositions: in, on, at, by, for, with, from, to, etc.
|
|
304
|
+
- Conjunctions: and, but, or, nor, for, yet, so, etc.
|
|
305
|
+
- Pronouns: I, you, he, she, it, we, they, etc.
|
|
306
|
+
- Auxiliary verbs: be, have, do, can, will, shall, may, etc.
|
|
307
|
+
- Particles: up, down, out, off, over, etc.
|
|
308
|
+
|
|
309
|
+
References:
|
|
310
|
+
Mosteller, F., & Wallace, D. L. (1964). Inference and disputed authorship:
|
|
311
|
+
The Federalist. Addison-Wesley.
|
|
312
|
+
Burrows, J. (2002). 'Delta': A measure of stylistic difference and a guide
|
|
313
|
+
to likely authorship. Literary and Linguistic Computing, 17(3), 267-287.
|
|
314
|
+
|
|
315
|
+
Example:
|
|
316
|
+
>>> result = compute_function_words("Sample text for analysis.")
|
|
317
|
+
>>> print(f"Determiner ratio: {result.determiner_ratio:.3f}")
|
|
318
|
+
>>> print(f"Preposition ratio: {result.preposition_ratio:.3f}")
|
|
319
|
+
>>> print(f"Most frequent: {result.most_frequent_function_words[:5]}")
|
|
320
|
+
"""
|
|
321
|
+
|
|
322
|
+
determiner_ratio: float # Determiners / total words
|
|
323
|
+
preposition_ratio: float # Prepositions / total words
|
|
324
|
+
conjunction_ratio: float # Conjunctions / total words
|
|
325
|
+
pronoun_ratio: float # Pronouns / total words
|
|
326
|
+
auxiliary_ratio: float # Auxiliary verbs / total words
|
|
327
|
+
particle_ratio: float # Particles / total words
|
|
328
|
+
total_function_word_ratio: float # All function words / total words
|
|
329
|
+
function_word_diversity: float # Unique function words / total function words
|
|
330
|
+
most_frequent_function_words: list[tuple[str, int]] # Top N with counts
|
|
331
|
+
least_frequent_function_words: list[tuple[str, int]] # Bottom N with counts
|
|
332
|
+
function_word_distribution: dict[str, int] # All function words with counts
|
|
333
|
+
metadata: dict[str, Any] # Category-specific counts, total counts, etc.
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
# ===== Advanced Lexical Diversity Results =====
|
|
337
|
+
# Related to GitHub Issue #14: Advanced Lexical Diversity Metrics
|
|
338
|
+
# https://github.com/craigtrim/pystylometry/issues/14
|
|
339
|
+
|
|
340
|
+
|
|
341
|
+
@dataclass
|
|
342
|
+
class VocdDResult:
|
|
343
|
+
"""Result from voc-D computation.
|
|
344
|
+
|
|
345
|
+
voc-D is a sophisticated measure of lexical diversity that uses a mathematical
|
|
346
|
+
model to estimate vocabulary richness while controlling for text length.
|
|
347
|
+
It fits a curve to the relationship between tokens and types across multiple
|
|
348
|
+
random samples of the text.
|
|
349
|
+
|
|
350
|
+
Related GitHub Issue:
|
|
351
|
+
#14 - Advanced Lexical Diversity Metrics
|
|
352
|
+
https://github.com/craigtrim/pystylometry/issues/14
|
|
353
|
+
|
|
354
|
+
The D parameter represents the theoretical vocabulary size and is more
|
|
355
|
+
stable across different text lengths than simple TTR measures.
|
|
356
|
+
|
|
357
|
+
References:
|
|
358
|
+
Malvern, D., Richards, B., Chipere, N., & Durán, P. (2004).
|
|
359
|
+
Lexical Diversity and Language Development. Palgrave Macmillan.
|
|
360
|
+
McKee, G., Malvern, D., & Richards, B. (2000). Measuring vocabulary
|
|
361
|
+
diversity using dedicated software. Literary and Linguistic Computing,
|
|
362
|
+
15(3), 323-337.
|
|
363
|
+
|
|
364
|
+
Example:
|
|
365
|
+
>>> result = compute_vocd_d("Long sample text for voc-D analysis...")
|
|
366
|
+
>>> print(f"D parameter: {result.d_parameter:.2f}")
|
|
367
|
+
>>> print(f"Curve fit R²: {result.curve_fit_r_squared:.3f}")
|
|
368
|
+
"""
|
|
369
|
+
|
|
370
|
+
d_parameter: float # The D value (theoretical vocabulary size)
|
|
371
|
+
curve_fit_r_squared: float # Quality of curve fit (0-1)
|
|
372
|
+
sample_count: int # Number of random samples used
|
|
373
|
+
optimal_sample_size: int # Optimal token sample size used
|
|
374
|
+
metadata: dict[str, Any] # Sampling parameters, convergence info, etc.
|
|
375
|
+
|
|
376
|
+
|
|
377
|
+
@dataclass
|
|
378
|
+
class MATTRResult:
|
|
379
|
+
"""Result from MATTR (Moving-Average Type-Token Ratio) computation.
|
|
380
|
+
|
|
381
|
+
MATTR computes TTR using a moving window of fixed size, which provides
|
|
382
|
+
a more stable measure of lexical diversity than simple TTR, especially
|
|
383
|
+
for longer texts. The moving window approach reduces the impact of text
|
|
384
|
+
length on the TTR calculation.
|
|
385
|
+
|
|
386
|
+
Related GitHub Issue:
|
|
387
|
+
#14 - Advanced Lexical Diversity Metrics
|
|
388
|
+
https://github.com/craigtrim/pystylometry/issues/14
|
|
389
|
+
|
|
390
|
+
References:
|
|
391
|
+
Covington, M. A., & McFall, J. D. (2010). Cutting the Gordian knot:
|
|
392
|
+
The moving-average type-token ratio (MATTR). Journal of Quantitative
|
|
393
|
+
Linguistics, 17(2), 94-100.
|
|
394
|
+
|
|
395
|
+
Example:
|
|
396
|
+
>>> result = compute_mattr("Sample text here...", window_size=50)
|
|
397
|
+
>>> print(f"MATTR score: {result.mattr_score:.3f}")
|
|
398
|
+
>>> print(f"Window size: {result.window_size}")
|
|
399
|
+
"""
|
|
400
|
+
|
|
401
|
+
mattr_score: float # Average TTR across all windows
|
|
402
|
+
window_size: int # Size of moving window used
|
|
403
|
+
window_count: int # Number of windows analyzed
|
|
404
|
+
ttr_std_dev: float # Standard deviation of TTR across windows
|
|
405
|
+
min_ttr: float # Minimum TTR in any window
|
|
406
|
+
max_ttr: float # Maximum TTR in any window
|
|
407
|
+
metadata: dict[str, Any] # Window-by-window TTR values, etc.
|
|
408
|
+
|
|
409
|
+
|
|
410
|
+
@dataclass
|
|
411
|
+
class HDDResult:
|
|
412
|
+
"""Result from HD-D (Hypergeometric Distribution D) computation.
|
|
413
|
+
|
|
414
|
+
HD-D is a probabilistic measure of lexical diversity based on the
|
|
415
|
+
hypergeometric distribution. It estimates the probability of encountering
|
|
416
|
+
new word types as text length increases, providing a mathematically
|
|
417
|
+
rigorous measure that is less sensitive to text length than TTR.
|
|
418
|
+
|
|
419
|
+
Related GitHub Issue:
|
|
420
|
+
#14 - Advanced Lexical Diversity Metrics
|
|
421
|
+
https://github.com/craigtrim/pystylometry/issues/14
|
|
422
|
+
|
|
423
|
+
References:
|
|
424
|
+
McCarthy, P. M., & Jarvis, S. (2010). MTLD, vocd-D, and HD-D: A validation
|
|
425
|
+
study of sophisticated approaches to lexical diversity assessment.
|
|
426
|
+
Behavior Research Methods, 42(2), 381-392.
|
|
427
|
+
|
|
428
|
+
Example:
|
|
429
|
+
>>> result = compute_hdd("Sample text for HD-D analysis...")
|
|
430
|
+
>>> print(f"HD-D score: {result.hdd_score:.3f}")
|
|
431
|
+
>>> print(f"Sample size: {result.sample_size}")
|
|
432
|
+
"""
|
|
433
|
+
|
|
434
|
+
hdd_score: float # The HD-D value
|
|
435
|
+
sample_size: int # Sample size used for calculation
|
|
436
|
+
type_count: int # Number of unique types in sample
|
|
437
|
+
token_count: int # Number of tokens in sample
|
|
438
|
+
metadata: dict[str, Any] # Probability distribution info, etc.
|
|
439
|
+
|
|
440
|
+
|
|
441
|
+
@dataclass
|
|
442
|
+
class MSTTRResult:
|
|
443
|
+
"""Result from MSTTR (Mean Segmental Type-Token Ratio) computation.
|
|
444
|
+
|
|
445
|
+
MSTTR divides the text into sequential segments of equal length and
|
|
446
|
+
computes the average TTR across all segments. This provides a length-
|
|
447
|
+
normalized measure of lexical diversity that is more comparable across
|
|
448
|
+
texts of different lengths.
|
|
449
|
+
|
|
450
|
+
Related GitHub Issue:
|
|
451
|
+
#14 - Advanced Lexical Diversity Metrics
|
|
452
|
+
https://github.com/craigtrim/pystylometry/issues/14
|
|
453
|
+
|
|
454
|
+
References:
|
|
455
|
+
Johnson, W. (1944). Studies in language behavior: I. A program of research.
|
|
456
|
+
Psychological Monographs, 56(2), 1-15.
|
|
457
|
+
|
|
458
|
+
Example:
|
|
459
|
+
>>> result = compute_msttr("Sample text...", segment_size=100)
|
|
460
|
+
>>> print(f"MSTTR score: {result.msttr_score:.3f}")
|
|
461
|
+
>>> print(f"Segments analyzed: {result.segment_count}")
|
|
462
|
+
"""
|
|
463
|
+
|
|
464
|
+
msttr_score: float # Mean TTR across all segments
|
|
465
|
+
segment_size: int # Size of each segment
|
|
466
|
+
segment_count: int # Number of segments analyzed
|
|
467
|
+
ttr_std_dev: float # Standard deviation of TTR across segments
|
|
468
|
+
min_ttr: float # Minimum TTR in any segment
|
|
469
|
+
max_ttr: float # Maximum TTR in any segment
|
|
470
|
+
segment_ttrs: list[float] # TTR for each individual segment
|
|
471
|
+
metadata: dict[str, Any] # Segment details, remaining tokens, etc.
|
|
472
|
+
|
|
473
|
+
|
|
474
|
+
# ===== Word Frequency Sophistication Results =====
|
|
475
|
+
# Related to GitHub Issue #15: Word Frequency Sophistication Metrics
|
|
476
|
+
# https://github.com/craigtrim/pystylometry/issues/15
|
|
477
|
+
|
|
478
|
+
|
|
479
|
+
@dataclass
|
|
480
|
+
class WordFrequencySophisticationResult:
|
|
481
|
+
"""Result from word frequency sophistication analysis.
|
|
482
|
+
|
|
483
|
+
Word frequency sophistication metrics measure how common or rare the
|
|
484
|
+
vocabulary used in a text is, based on reference frequency lists from
|
|
485
|
+
large corpora. Authors who use less frequent (more sophisticated) words
|
|
486
|
+
score higher on these metrics.
|
|
487
|
+
|
|
488
|
+
Related GitHub Issue:
|
|
489
|
+
#15 - Word Frequency Sophistication Metrics
|
|
490
|
+
https://github.com/craigtrim/pystylometry/issues/15
|
|
491
|
+
|
|
492
|
+
This analysis uses reference frequency data from:
|
|
493
|
+
- COCA (Corpus of Contemporary American English)
|
|
494
|
+
- BNC (British National Corpus)
|
|
495
|
+
- Google N-grams
|
|
496
|
+
- SUBTLEXus (subtitle frequencies)
|
|
497
|
+
|
|
498
|
+
Metrics computed:
|
|
499
|
+
- Mean word frequency (average frequency rank)
|
|
500
|
+
- Median word frequency
|
|
501
|
+
- Rare word ratio (words beyond frequency threshold)
|
|
502
|
+
- Academic word ratio (from Academic Word List)
|
|
503
|
+
- Advanced word ratio (sophisticated vocabulary)
|
|
504
|
+
|
|
505
|
+
References:
|
|
506
|
+
Brysbaert, M., & New, B. (2009). Moving beyond Kučera and Francis:
|
|
507
|
+
A critical evaluation of current word frequency norms. Behavior
|
|
508
|
+
Research Methods, Instruments, & Computers, 41(4), 977-990.
|
|
509
|
+
Coxhead, A. (2000). A new academic word list. TESOL Quarterly, 34(2), 213-238.
|
|
510
|
+
|
|
511
|
+
Example:
|
|
512
|
+
>>> result = compute_word_frequency_sophistication("Sample text...")
|
|
513
|
+
>>> print(f"Mean frequency rank: {result.mean_frequency_rank:.1f}")
|
|
514
|
+
>>> print(f"Rare word ratio: {result.rare_word_ratio:.3f}")
|
|
515
|
+
>>> print(f"Academic word ratio: {result.academic_word_ratio:.3f}")
|
|
516
|
+
"""
|
|
517
|
+
|
|
518
|
+
mean_frequency_rank: float # Average frequency rank of words
|
|
519
|
+
median_frequency_rank: float # Median frequency rank
|
|
520
|
+
rare_word_ratio: float # Words beyond frequency threshold / total
|
|
521
|
+
common_word_ratio: float # High-frequency words / total
|
|
522
|
+
academic_word_ratio: float # Academic Word List words / total
|
|
523
|
+
advanced_word_ratio: float # Sophisticated vocabulary / total
|
|
524
|
+
frequency_band_distribution: dict[str, float] # Distribution across frequency bands
|
|
525
|
+
rarest_words: list[tuple[str, float]] # Least frequent words with ranks
|
|
526
|
+
most_common_words: list[tuple[str, float]] # Most frequent words with ranks
|
|
527
|
+
metadata: dict[str, Any] # Corpus source, band thresholds, total words, etc.
|
|
528
|
+
|
|
529
|
+
|
|
530
|
+
# ===== Additional Readability Results =====
|
|
531
|
+
# Related to GitHub Issue #16: Additional Readability Formulas
|
|
532
|
+
# https://github.com/craigtrim/pystylometry/issues/16
|
|
533
|
+
|
|
534
|
+
|
|
535
|
+
@dataclass
|
|
536
|
+
class DaleChallResult:
|
|
537
|
+
"""Result from Dale-Chall Readability Formula.
|
|
538
|
+
|
|
539
|
+
The Dale-Chall formula uses a list of 3000 familiar words that 80% of
|
|
540
|
+
fourth-graders understand. Words not on this list are considered "difficult."
|
|
541
|
+
The formula provides a grade level estimate based on sentence length and
|
|
542
|
+
the percentage of difficult words.
|
|
543
|
+
|
|
544
|
+
Related GitHub Issue:
|
|
545
|
+
#16 - Additional Readability Formulas
|
|
546
|
+
https://github.com/craigtrim/pystylometry/issues/16
|
|
547
|
+
|
|
548
|
+
Formula: 0.1579 * (difficult_words / total_words * 100) + 0.0496 * avg_sentence_length
|
|
549
|
+
|
|
550
|
+
If % difficult words > 5%, add 3.6365 to the raw score.
|
|
551
|
+
|
|
552
|
+
References:
|
|
553
|
+
Dale, E., & Chall, J. S. (1948). A formula for predicting readability.
|
|
554
|
+
Educational Research Bulletin, 27(1), 11-28.
|
|
555
|
+
Chall, J. S., & Dale, E. (1995). Readability revisited: The new Dale-Chall
|
|
556
|
+
readability formula. Brookline Books.
|
|
557
|
+
|
|
558
|
+
Example:
|
|
559
|
+
>>> result = compute_dale_chall("Sample text to analyze...")
|
|
560
|
+
>>> print(f"Dale-Chall score: {result.dale_chall_score:.2f}")
|
|
561
|
+
>>> print(f"Grade level: {result.grade_level}")
|
|
562
|
+
>>> print(f"Difficult word %: {result.difficult_word_ratio * 100:.1f}%")
|
|
563
|
+
"""
|
|
564
|
+
|
|
565
|
+
dale_chall_score: float # The Dale-Chall readability score
|
|
566
|
+
grade_level: str # Corresponding grade level (e.g., "7-8", "College")
|
|
567
|
+
difficult_word_count: int # Words not on Dale-Chall list
|
|
568
|
+
difficult_word_ratio: float # Difficult words / total words
|
|
569
|
+
avg_sentence_length: float # Average words per sentence
|
|
570
|
+
total_words: int # Total word count
|
|
571
|
+
metadata: dict[str, Any] # List of difficult words, adjusted score, etc.
|
|
572
|
+
|
|
573
|
+
|
|
574
|
+
@dataclass
|
|
575
|
+
class LinsearWriteResult:
|
|
576
|
+
"""Result from Linsear Write Formula.
|
|
577
|
+
|
|
578
|
+
The Linsear Write Formula was developed for the U.S. Air Force to calculate
|
|
579
|
+
the readability of technical manuals. It categorizes words as "easy" (1-2
|
|
580
|
+
syllables) or "hard" (3+ syllables) and uses sentence length to estimate
|
|
581
|
+
grade level. It's particularly effective for technical writing.
|
|
582
|
+
|
|
583
|
+
Related GitHub Issue:
|
|
584
|
+
#16 - Additional Readability Formulas
|
|
585
|
+
https://github.com/craigtrim/pystylometry/issues/16
|
|
586
|
+
|
|
587
|
+
References:
|
|
588
|
+
Klare, G. R. (1974-1975). Assessing readability. Reading Research Quarterly,
|
|
589
|
+
10(1), 62-102.
|
|
590
|
+
|
|
591
|
+
Example:
|
|
592
|
+
>>> result = compute_linsear_write("Technical manual text...")
|
|
593
|
+
>>> print(f"Linsear Write score: {result.linsear_score:.2f}")
|
|
594
|
+
>>> print(f"Grade level: {result.grade_level}")
|
|
595
|
+
"""
|
|
596
|
+
|
|
597
|
+
linsear_score: float # The Linsear Write score
|
|
598
|
+
grade_level: int # Corresponding U.S. grade level
|
|
599
|
+
easy_word_count: int # Words with 1-2 syllables
|
|
600
|
+
hard_word_count: int # Words with 3+ syllables
|
|
601
|
+
avg_sentence_length: float # Average words per sentence
|
|
602
|
+
metadata: dict[str, Any] # Calculation details, sentence count, etc.
|
|
603
|
+
|
|
604
|
+
|
|
605
|
+
@dataclass
|
|
606
|
+
class FryResult:
|
|
607
|
+
"""Result from Fry Readability Graph.
|
|
608
|
+
|
|
609
|
+
The Fry Readability Graph uses average sentence length and average syllables
|
|
610
|
+
per word to determine reading difficulty. It plots these values on a graph
|
|
611
|
+
to determine the grade level. This implementation provides the numerical
|
|
612
|
+
coordinates and estimated grade level.
|
|
613
|
+
|
|
614
|
+
Related GitHub Issue:
|
|
615
|
+
#16 - Additional Readability Formulas
|
|
616
|
+
https://github.com/craigtrim/pystylometry/issues/16
|
|
617
|
+
|
|
618
|
+
References:
|
|
619
|
+
Fry, E. (1968). A readability formula that saves time. Journal of Reading,
|
|
620
|
+
11(7), 513-578.
|
|
621
|
+
Fry, E. (1977). Fry's readability graph: Clarifications, validity, and
|
|
622
|
+
extension to level 17. Journal of Reading, 21(3), 242-252.
|
|
623
|
+
|
|
624
|
+
Example:
|
|
625
|
+
>>> result = compute_fry("Sample educational text...")
|
|
626
|
+
>>> print(f"Avg sentence length: {result.avg_sentence_length:.1f}")
|
|
627
|
+
>>> print(f"Avg syllables/100 words: {result.avg_syllables_per_100:.1f}")
|
|
628
|
+
>>> print(f"Grade level: {result.grade_level}")
|
|
629
|
+
"""
|
|
630
|
+
|
|
631
|
+
avg_sentence_length: float # Average words per sentence
|
|
632
|
+
avg_syllables_per_100: float # Average syllables per 100 words
|
|
633
|
+
grade_level: str # Estimated grade level (e.g., "5", "7", "College")
|
|
634
|
+
graph_zone: str # Which zone of Fry graph (for validity checking)
|
|
635
|
+
metadata: dict[str, Any] # Total sentences, total syllables, etc.
|
|
636
|
+
|
|
637
|
+
|
|
638
|
+
@dataclass
|
|
639
|
+
class FORCASTResult:
|
|
640
|
+
"""Result from FORCAST Readability Formula.
|
|
641
|
+
|
|
642
|
+
FORCAST (FORmula for CASTing readability) was developed by the U.S. military
|
|
643
|
+
to assess readability without counting syllables. It uses only single-syllable
|
|
644
|
+
words as a measure, making it faster to compute than syllable-based formulas.
|
|
645
|
+
Particularly useful for technical and military documents.
|
|
646
|
+
|
|
647
|
+
Related GitHub Issue:
|
|
648
|
+
#16 - Additional Readability Formulas
|
|
649
|
+
https://github.com/craigtrim/pystylometry/issues/16
|
|
650
|
+
|
|
651
|
+
Formula: 20 - (N / 10), where N is the number of single-syllable words
|
|
652
|
+
per 150-word sample.
|
|
653
|
+
|
|
654
|
+
References:
|
|
655
|
+
Caylor, J. S., Sticht, T. G., Fox, L. C., & Ford, J. P. (1973).
|
|
656
|
+
Methodologies for determining reading requirements of military
|
|
657
|
+
occupational specialties. Human Resources Research Organization.
|
|
658
|
+
|
|
659
|
+
Example:
|
|
660
|
+
>>> result = compute_forcast("Military technical document text...")
|
|
661
|
+
>>> print(f"FORCAST score: {result.forcast_score:.2f}")
|
|
662
|
+
>>> print(f"Grade level: {result.grade_level}")
|
|
663
|
+
"""
|
|
664
|
+
|
|
665
|
+
forcast_score: float # The FORCAST readability score
|
|
666
|
+
grade_level: int # Corresponding U.S. grade level
|
|
667
|
+
single_syllable_ratio: float # Single-syllable words / total words
|
|
668
|
+
single_syllable_count: int # Count of single-syllable words
|
|
669
|
+
total_words: int # Total word count
|
|
670
|
+
metadata: dict[str, Any] # Samples used, calculation details, etc.
|
|
671
|
+
|
|
672
|
+
|
|
673
|
+
@dataclass
|
|
674
|
+
class PowersSumnerKearlResult:
|
|
675
|
+
"""Result from Powers-Sumner-Kearl Readability Formula.
|
|
676
|
+
|
|
677
|
+
The Powers-Sumner-Kearl formula is a variation of the Flesch Reading Ease
|
|
678
|
+
formula, recalibrated for primary grade levels (grades 1-4). It uses
|
|
679
|
+
average sentence length and average syllables per word, but with different
|
|
680
|
+
coefficients optimized for younger readers.
|
|
681
|
+
|
|
682
|
+
Related GitHub Issue:
|
|
683
|
+
#16 - Additional Readability Formulas
|
|
684
|
+
https://github.com/craigtrim/pystylometry/issues/16
|
|
685
|
+
|
|
686
|
+
Formula: 0.0778 * avg_sentence_length + 0.0455 * avg_syllables_per_word - 2.2029
|
|
687
|
+
|
|
688
|
+
References:
|
|
689
|
+
Powers, R. D., Sumner, W. A., & Kearl, B. E. (1958). A recalculation of
|
|
690
|
+
four adult readability formulas. Journal of Educational Psychology,
|
|
691
|
+
49(2), 99-105.
|
|
692
|
+
|
|
693
|
+
Example:
|
|
694
|
+
>>> result = compute_powers_sumner_kearl("Children's book text...")
|
|
695
|
+
>>> print(f"PSK score: {result.psk_score:.2f}")
|
|
696
|
+
>>> print(f"Grade level: {result.grade_level}")
|
|
697
|
+
"""
|
|
698
|
+
|
|
699
|
+
psk_score: float # The Powers-Sumner-Kearl score
|
|
700
|
+
grade_level: float # Corresponding grade level (can be decimal for primary grades)
|
|
701
|
+
avg_sentence_length: float # Average words per sentence
|
|
702
|
+
avg_syllables_per_word: float # Average syllables per word
|
|
703
|
+
total_sentences: int # Total sentence count
|
|
704
|
+
total_words: int # Total word count
|
|
705
|
+
total_syllables: int # Total syllable count
|
|
706
|
+
metadata: dict[str, Any] # Calculation details, comparison to Flesch, etc.
|
|
707
|
+
|
|
708
|
+
|
|
709
|
+
# ===== Advanced Syntactic Results =====
|
|
710
|
+
# Related to GitHub Issue #17: Advanced Syntactic Analysis
|
|
711
|
+
# https://github.com/craigtrim/pystylometry/issues/17
|
|
712
|
+
|
|
713
|
+
|
|
714
|
+
@dataclass
|
|
715
|
+
class AdvancedSyntacticResult:
|
|
716
|
+
"""Result from advanced syntactic analysis using dependency parsing.
|
|
717
|
+
|
|
718
|
+
Advanced syntactic analysis uses dependency parsing to extract sophisticated
|
|
719
|
+
grammatical features that go beyond simple POS tagging. These features
|
|
720
|
+
capture sentence complexity, grammatical sophistication, and syntactic
|
|
721
|
+
style preferences.
|
|
722
|
+
|
|
723
|
+
Related GitHub Issue:
|
|
724
|
+
#17 - Advanced Syntactic Analysis
|
|
725
|
+
https://github.com/craigtrim/pystylometry/issues/17
|
|
726
|
+
|
|
727
|
+
Features analyzed:
|
|
728
|
+
- Parse tree depth (sentence structural complexity)
|
|
729
|
+
- T-units (minimal terminable units - independent clauses with modifiers)
|
|
730
|
+
- Clausal density (clauses per T-unit)
|
|
731
|
+
- Dependent clause ratio
|
|
732
|
+
- Passive voice ratio
|
|
733
|
+
- Subordination index
|
|
734
|
+
- Coordination index
|
|
735
|
+
- Sentence complexity score
|
|
736
|
+
|
|
737
|
+
References:
|
|
738
|
+
Hunt, K. W. (1965). Grammatical structures written at three grade levels.
|
|
739
|
+
NCTE Research Report No. 3.
|
|
740
|
+
Biber, D. (1988). Variation across speech and writing. Cambridge University Press.
|
|
741
|
+
Lu, X. (2010). Automatic analysis of syntactic complexity in second language
|
|
742
|
+
writing. International Journal of Corpus Linguistics, 15(4), 474-496.
|
|
743
|
+
|
|
744
|
+
Example:
|
|
745
|
+
>>> result = compute_advanced_syntactic("Complex sentence structures...")
|
|
746
|
+
>>> print(f"Parse tree depth: {result.mean_parse_tree_depth:.1f}")
|
|
747
|
+
>>> print(f"T-units: {result.t_unit_count}")
|
|
748
|
+
>>> print(f"Passive voice %: {result.passive_voice_ratio * 100:.1f}%")
|
|
749
|
+
"""
|
|
750
|
+
|
|
751
|
+
mean_parse_tree_depth: float # Average depth of dependency parse trees
|
|
752
|
+
max_parse_tree_depth: int # Maximum parse tree depth in text
|
|
753
|
+
t_unit_count: int # Number of T-units (minimal terminable units)
|
|
754
|
+
mean_t_unit_length: float # Average words per T-unit
|
|
755
|
+
clausal_density: float # Clauses per T-unit
|
|
756
|
+
dependent_clause_ratio: float # Dependent clauses / total clauses
|
|
757
|
+
passive_voice_ratio: float # Passive constructions / total sentences
|
|
758
|
+
subordination_index: float # Subordinate clauses / total clauses
|
|
759
|
+
coordination_index: float # Coordinate clauses / total clauses
|
|
760
|
+
sentence_complexity_score: float # Composite complexity metric
|
|
761
|
+
dependency_distance: float # Mean distance between heads and dependents
|
|
762
|
+
left_branching_ratio: float # Left-branching structures / total
|
|
763
|
+
right_branching_ratio: float # Right-branching structures / total
|
|
764
|
+
metadata: dict[str, Any] # Parse tree details, clause counts, etc.
|
|
765
|
+
|
|
766
|
+
|
|
767
|
+
# ===== Sentence Type Results =====
|
|
768
|
+
# Related to GitHub Issue #18: Sentence Type Classification
|
|
769
|
+
# https://github.com/craigtrim/pystylometry/issues/18
|
|
770
|
+
|
|
771
|
+
|
|
772
|
+
@dataclass
|
|
773
|
+
class SentenceTypeResult:
|
|
774
|
+
"""Result from sentence type classification analysis.
|
|
775
|
+
|
|
776
|
+
Sentence type classification categorizes sentences by their grammatical
|
|
777
|
+
structure (simple, compound, complex, compound-complex) and communicative
|
|
778
|
+
function (declarative, interrogative, imperative, exclamatory). Different
|
|
779
|
+
authors and genres show distinct patterns in sentence type distribution.
|
|
780
|
+
|
|
781
|
+
Related GitHub Issue:
|
|
782
|
+
#18 - Sentence Type Classification
|
|
783
|
+
https://github.com/craigtrim/pystylometry/issues/18
|
|
784
|
+
|
|
785
|
+
Structural types:
|
|
786
|
+
- Simple: One independent clause (e.g., "The cat sat.")
|
|
787
|
+
- Compound: Multiple independent clauses (e.g., "I came, I saw, I conquered.")
|
|
788
|
+
- Complex: One independent + dependent clause(s) (e.g., "When I arrived, I saw her.")
|
|
789
|
+
- Compound-Complex: Multiple independent + dependent (e.g., "I came when called, and I stayed.")
|
|
790
|
+
|
|
791
|
+
Functional types:
|
|
792
|
+
- Declarative: Statement (e.g., "The sky is blue.")
|
|
793
|
+
- Interrogative: Question (e.g., "Is the sky blue?")
|
|
794
|
+
- Imperative: Command (e.g., "Look at the sky!")
|
|
795
|
+
- Exclamatory: Exclamation (e.g., "What a blue sky!")
|
|
796
|
+
|
|
797
|
+
References:
|
|
798
|
+
Biber, D. (1988). Variation across speech and writing. Cambridge University Press.
|
|
799
|
+
Huddleston, R., & Pullum, G. K. (2002). The Cambridge Grammar of the English Language.
|
|
800
|
+
|
|
801
|
+
Example:
|
|
802
|
+
>>> result = compute_sentence_types("Mix of sentence types here...")
|
|
803
|
+
>>> print(f"Simple: {result.simple_ratio * 100:.1f}%")
|
|
804
|
+
>>> print(f"Complex: {result.complex_ratio * 100:.1f}%")
|
|
805
|
+
>>> print(f"Questions: {result.interrogative_ratio * 100:.1f}%")
|
|
806
|
+
"""
|
|
807
|
+
|
|
808
|
+
# Structural type ratios (sum to 1.0)
|
|
809
|
+
simple_ratio: float # Simple sentences / total
|
|
810
|
+
compound_ratio: float # Compound sentences / total
|
|
811
|
+
complex_ratio: float # Complex sentences / total
|
|
812
|
+
compound_complex_ratio: float # Compound-complex / total
|
|
813
|
+
|
|
814
|
+
# Functional type ratios (sum to 1.0)
|
|
815
|
+
declarative_ratio: float # Declarative sentences / total
|
|
816
|
+
interrogative_ratio: float # Interrogative (questions) / total
|
|
817
|
+
imperative_ratio: float # Imperative (commands) / total
|
|
818
|
+
exclamatory_ratio: float # Exclamatory sentences / total
|
|
819
|
+
|
|
820
|
+
# Counts
|
|
821
|
+
simple_count: int
|
|
822
|
+
compound_count: int
|
|
823
|
+
complex_count: int
|
|
824
|
+
compound_complex_count: int
|
|
825
|
+
declarative_count: int
|
|
826
|
+
interrogative_count: int
|
|
827
|
+
imperative_count: int
|
|
828
|
+
exclamatory_count: int
|
|
829
|
+
total_sentences: int
|
|
830
|
+
|
|
831
|
+
# Diversity
|
|
832
|
+
structural_diversity: float # Shannon entropy of structural type distribution
|
|
833
|
+
functional_diversity: float # Shannon entropy of functional type distribution
|
|
834
|
+
|
|
835
|
+
metadata: dict[str, Any] # Sentence-by-sentence classifications, etc.
|
|
836
|
+
|
|
837
|
+
|
|
838
|
+
# ===== Extended N-gram Results =====
|
|
839
|
+
# Related to GitHub Issue #19: Extended N-gram Features
|
|
840
|
+
# https://github.com/craigtrim/pystylometry/issues/19
|
|
841
|
+
|
|
842
|
+
|
|
843
|
+
@dataclass
|
|
844
|
+
class ExtendedNgramResult:
|
|
845
|
+
"""Result from extended n-gram analysis.
|
|
846
|
+
|
|
847
|
+
Extended n-gram analysis goes beyond basic bigram/trigram entropy to provide
|
|
848
|
+
comprehensive n-gram statistics including frequency distributions, most
|
|
849
|
+
distinctive n-grams, skipgrams, and part-of-speech n-grams. These features
|
|
850
|
+
are valuable for authorship attribution and style analysis.
|
|
851
|
+
|
|
852
|
+
Related GitHub Issue:
|
|
853
|
+
#19 - Extended N-gram Features
|
|
854
|
+
https://github.com/craigtrim/pystylometry/issues/19
|
|
855
|
+
|
|
856
|
+
Features computed:
|
|
857
|
+
- Trigram frequency distributions and top trigrams
|
|
858
|
+
- 4-gram frequency distributions and top 4-grams
|
|
859
|
+
- Skipgrams (n-grams with gaps, e.g., "the * dog")
|
|
860
|
+
- POS n-grams (e.g., "DET ADJ NOUN")
|
|
861
|
+
- Character trigrams and 4-grams
|
|
862
|
+
- N-gram diversity metrics
|
|
863
|
+
- Entropy for each n-gram order
|
|
864
|
+
|
|
865
|
+
References:
|
|
866
|
+
Guthrie, D., Allison, B., Liu, W., Guthrie, L., & Wilks, Y. (2006).
|
|
867
|
+
A closer look at skip-gram modelling. LREC.
|
|
868
|
+
Stamatatos, E. (2009). A survey of modern authorship attribution methods.
|
|
869
|
+
JASIST, 60(3), 538-556.
|
|
870
|
+
|
|
871
|
+
Example:
|
|
872
|
+
>>> result = compute_extended_ngrams("Sample text for n-gram analysis...")
|
|
873
|
+
>>> print(f"Top trigrams: {result.top_word_trigrams[:5]}")
|
|
874
|
+
>>> print(f"Trigram entropy: {result.word_trigram_entropy:.2f}")
|
|
875
|
+
"""
|
|
876
|
+
|
|
877
|
+
# Word n-grams
|
|
878
|
+
top_word_trigrams: list[tuple[str, int]] # Most frequent word trigrams
|
|
879
|
+
top_word_4grams: list[tuple[str, int]] # Most frequent word 4-grams
|
|
880
|
+
word_trigram_count: int # Total unique word trigrams
|
|
881
|
+
word_4gram_count: int # Total unique word 4-grams
|
|
882
|
+
word_trigram_entropy: float # Shannon entropy of trigram distribution
|
|
883
|
+
word_4gram_entropy: float # Shannon entropy of 4-gram distribution
|
|
884
|
+
|
|
885
|
+
# Skipgrams (n-grams with gaps)
|
|
886
|
+
top_skipgrams_2_1: list[tuple[str, int]] # Top 2-skipgrams (gap of 1)
|
|
887
|
+
top_skipgrams_3_1: list[tuple[str, int]] # Top 3-skipgrams (gap of 1)
|
|
888
|
+
skipgram_2_1_count: int # Unique 2-skipgrams
|
|
889
|
+
skipgram_3_1_count: int # Unique 3-skipgrams
|
|
890
|
+
|
|
891
|
+
# POS n-grams
|
|
892
|
+
top_pos_trigrams: list[tuple[str, int]] # Most frequent POS trigrams
|
|
893
|
+
top_pos_4grams: list[tuple[str, int]] # Most frequent POS 4-grams
|
|
894
|
+
pos_trigram_count: int # Unique POS trigrams
|
|
895
|
+
pos_4gram_count: int # Unique POS 4-grams
|
|
896
|
+
pos_trigram_entropy: float # Shannon entropy of POS trigram distribution
|
|
897
|
+
|
|
898
|
+
# Character n-grams
|
|
899
|
+
top_char_trigrams: list[tuple[str, int]] # Most frequent character trigrams
|
|
900
|
+
top_char_4grams: list[tuple[str, int]] # Most frequent character 4-grams
|
|
901
|
+
char_trigram_entropy: float # Shannon entropy of char trigram distribution
|
|
902
|
+
char_4gram_entropy: float # Shannon entropy of char 4-gram distribution
|
|
903
|
+
|
|
904
|
+
metadata: dict[str, Any] # Full frequency distributions, parameters, etc.
|
|
905
|
+
|
|
906
|
+
|
|
907
|
+
# ===== Stylistic Markers Results =====
|
|
908
|
+
# Related to GitHub Issue #20: Stylistic Markers
|
|
909
|
+
# https://github.com/craigtrim/pystylometry/issues/20
|
|
910
|
+
|
|
911
|
+
|
|
912
|
+
@dataclass
|
|
913
|
+
class StylisticMarkersResult:
|
|
914
|
+
"""Result from stylistic markers analysis.
|
|
915
|
+
|
|
916
|
+
Stylistic markers are specific linguistic features that authors tend to use
|
|
917
|
+
consistently and often subconsciously. These include contraction usage,
|
|
918
|
+
intensifier preferences, hedging expressions, punctuation habits, and more.
|
|
919
|
+
They are powerful indicators of authorial identity.
|
|
920
|
+
|
|
921
|
+
Related GitHub Issue:
|
|
922
|
+
#20 - Stylistic Markers
|
|
923
|
+
https://github.com/craigtrim/pystylometry/issues/20
|
|
924
|
+
|
|
925
|
+
Markers analyzed:
|
|
926
|
+
- Contraction usage (don't vs. do not, I'm vs. I am, etc.)
|
|
927
|
+
- Intensifiers (very, really, extremely, quite, etc.)
|
|
928
|
+
- Hedges (maybe, perhaps, probably, somewhat, etc.)
|
|
929
|
+
- Modal auxiliaries (can, could, may, might, must, should, will, would)
|
|
930
|
+
- Negation patterns (not, no, never, none, neither, etc.)
|
|
931
|
+
- Exclamation frequency
|
|
932
|
+
- Question frequency
|
|
933
|
+
- Quotation usage
|
|
934
|
+
- Parenthetical expressions
|
|
935
|
+
- Ellipses and dashes
|
|
936
|
+
|
|
937
|
+
References:
|
|
938
|
+
Argamon, S., & Levitan, S. (2005). Measuring the usefulness of function
|
|
939
|
+
words for authorship attribution. ACH/ALLC.
|
|
940
|
+
Pennebaker, J. W. (2011). The secret life of pronouns. Bloomsbury Press.
|
|
941
|
+
|
|
942
|
+
Example:
|
|
943
|
+
>>> result = compute_stylistic_markers("Sample text with various markers...")
|
|
944
|
+
>>> print(f"Contraction ratio: {result.contraction_ratio * 100:.1f}%")
|
|
945
|
+
>>> print(f"Intensifier density: {result.intensifier_density:.2f}")
|
|
946
|
+
>>> print(f"Hedging density: {result.hedging_density:.2f}")
|
|
947
|
+
"""
|
|
948
|
+
|
|
949
|
+
# Contraction patterns
|
|
950
|
+
contraction_ratio: float # Contractions / (contractions + full forms)
|
|
951
|
+
contraction_count: int # Total contractions
|
|
952
|
+
expanded_form_count: int # Total expanded forms (e.g., "do not" vs "don't")
|
|
953
|
+
top_contractions: list[tuple[str, int]] # Most frequent contractions
|
|
954
|
+
|
|
955
|
+
# Intensifiers and hedges
|
|
956
|
+
intensifier_density: float # Intensifiers per 100 words
|
|
957
|
+
intensifier_count: int # Total intensifier count
|
|
958
|
+
top_intensifiers: list[tuple[str, int]] # Most frequent intensifiers
|
|
959
|
+
hedging_density: float # Hedges per 100 words
|
|
960
|
+
hedging_count: int # Total hedge count
|
|
961
|
+
top_hedges: list[tuple[str, int]] # Most frequent hedges
|
|
962
|
+
|
|
963
|
+
# Modal auxiliaries
|
|
964
|
+
modal_density: float # Modal auxiliaries per 100 words
|
|
965
|
+
modal_distribution: dict[str, int] # Count per modal (can, could, may, etc.)
|
|
966
|
+
epistemic_modal_ratio: float # Epistemic modals / all modals
|
|
967
|
+
deontic_modal_ratio: float # Deontic modals / all modals
|
|
968
|
+
|
|
969
|
+
# Negation
|
|
970
|
+
negation_density: float # Negation markers per 100 words
|
|
971
|
+
negation_count: int # Total negation markers
|
|
972
|
+
negation_types: dict[str, int] # not, no, never, etc. with counts
|
|
973
|
+
|
|
974
|
+
# Punctuation style
|
|
975
|
+
exclamation_density: float # Exclamation marks per 100 words
|
|
976
|
+
question_density: float # Question marks per 100 words
|
|
977
|
+
quotation_density: float # Quotation marks per 100 words
|
|
978
|
+
parenthetical_density: float # Parentheses per 100 words
|
|
979
|
+
ellipsis_density: float # Ellipses per 100 words
|
|
980
|
+
dash_density: float # Dashes (em/en) per 100 words
|
|
981
|
+
semicolon_density: float # Semicolons per 100 words
|
|
982
|
+
colon_density: float # Colons per 100 words
|
|
983
|
+
|
|
984
|
+
metadata: dict[str, Any] # Full lists, total word count, etc.
|
|
985
|
+
|
|
986
|
+
|
|
987
|
+
# ===== Vocabulary Overlap Results =====
|
|
988
|
+
# Related to GitHub Issue #21: Vocabulary Overlap and Similarity Metrics
|
|
989
|
+
# https://github.com/craigtrim/pystylometry/issues/21
|
|
990
|
+
|
|
991
|
+
|
|
992
|
+
@dataclass
|
|
993
|
+
class VocabularyOverlapResult:
|
|
994
|
+
"""Result from vocabulary overlap and similarity analysis.
|
|
995
|
+
|
|
996
|
+
Vocabulary overlap metrics measure the similarity between two texts based on
|
|
997
|
+
their shared vocabulary. These metrics are useful for authorship verification,
|
|
998
|
+
plagiarism detection, and measuring stylistic consistency across texts.
|
|
999
|
+
|
|
1000
|
+
Related GitHub Issue:
|
|
1001
|
+
#21 - Vocabulary Overlap and Similarity Metrics
|
|
1002
|
+
https://github.com/craigtrim/pystylometry/issues/21
|
|
1003
|
+
|
|
1004
|
+
Metrics computed:
|
|
1005
|
+
- Jaccard similarity (intersection / union)
|
|
1006
|
+
- Dice coefficient (2 * intersection / sum of sizes)
|
|
1007
|
+
- Overlap coefficient (intersection / min(size1, size2))
|
|
1008
|
+
- Cosine similarity (using word frequency vectors)
|
|
1009
|
+
- Shared vocabulary size and ratio
|
|
1010
|
+
- Unique words in each text
|
|
1011
|
+
- Most distinctive words for each text
|
|
1012
|
+
|
|
1013
|
+
References:
|
|
1014
|
+
Jaccard, P. (1912). The distribution of the flora in the alpine zone.
|
|
1015
|
+
New Phytologist, 11(2), 37-50.
|
|
1016
|
+
Salton, G., & McGill, M. J. (1983). Introduction to Modern Information
|
|
1017
|
+
Retrieval. McGraw-Hill.
|
|
1018
|
+
|
|
1019
|
+
Example:
|
|
1020
|
+
>>> result = compute_vocabulary_overlap(text1, text2)
|
|
1021
|
+
>>> print(f"Jaccard similarity: {result.jaccard_similarity:.3f}")
|
|
1022
|
+
>>> print(f"Shared vocabulary: {result.shared_vocab_size} words")
|
|
1023
|
+
>>> print(f"Text1 unique: {result.text1_unique_count}")
|
|
1024
|
+
"""
|
|
1025
|
+
|
|
1026
|
+
# Similarity scores (0-1 range)
|
|
1027
|
+
jaccard_similarity: float # Intersection / union
|
|
1028
|
+
dice_coefficient: float # 2 * intersection / (size1 + size2)
|
|
1029
|
+
overlap_coefficient: float # Intersection / min(size1, size2)
|
|
1030
|
+
cosine_similarity: float # Cosine of frequency vectors
|
|
1031
|
+
|
|
1032
|
+
# Vocabulary sizes
|
|
1033
|
+
text1_vocab_size: int # Unique words in text 1
|
|
1034
|
+
text2_vocab_size: int # Unique words in text 2
|
|
1035
|
+
shared_vocab_size: int # Words in both texts
|
|
1036
|
+
union_vocab_size: int # Words in either text
|
|
1037
|
+
text1_unique_count: int # Words only in text 1
|
|
1038
|
+
text2_unique_count: int # Words only in text 2
|
|
1039
|
+
|
|
1040
|
+
# Shared and distinctive vocabulary
|
|
1041
|
+
shared_words: list[str] # Words appearing in both texts
|
|
1042
|
+
text1_distinctive_words: list[tuple[str, float]] # Words + TF-IDF scores for text 1
|
|
1043
|
+
text2_distinctive_words: list[tuple[str, float]] # Words + TF-IDF scores for text 2
|
|
1044
|
+
|
|
1045
|
+
# Ratios
|
|
1046
|
+
text1_coverage: float # Shared / text1_vocab (how much of text1 is shared)
|
|
1047
|
+
text2_coverage: float # Shared / text2_vocab (how much of text2 is shared)
|
|
1048
|
+
|
|
1049
|
+
metadata: dict[str, Any] # Full vocabulary sets, frequency vectors, etc.
|
|
1050
|
+
|
|
1051
|
+
|
|
1052
|
+
# ===== Cohesion and Coherence Results =====
|
|
1053
|
+
# Related to GitHub Issue #22: Cohesion and Coherence Metrics
|
|
1054
|
+
# https://github.com/craigtrim/pystylometry/issues/22
|
|
1055
|
+
|
|
1056
|
+
|
|
1057
|
+
@dataclass
|
|
1058
|
+
class CohesionCoherenceResult:
|
|
1059
|
+
"""Result from cohesion and coherence analysis.
|
|
1060
|
+
|
|
1061
|
+
Cohesion and coherence metrics measure how well a text holds together
|
|
1062
|
+
structurally (cohesion) and semantically (coherence). These metrics are
|
|
1063
|
+
important for analyzing writing quality, readability, and authorial
|
|
1064
|
+
sophistication.
|
|
1065
|
+
|
|
1066
|
+
Related GitHub Issue:
|
|
1067
|
+
#22 - Cohesion and Coherence Metrics
|
|
1068
|
+
https://github.com/craigtrim/pystylometry/issues/22
|
|
1069
|
+
|
|
1070
|
+
Cohesion features:
|
|
1071
|
+
- Referential cohesion (pronouns, demonstratives pointing back)
|
|
1072
|
+
- Lexical cohesion (word repetition, synonyms, semantic relatedness)
|
|
1073
|
+
- Connective density (discourse markers, conjunctions)
|
|
1074
|
+
- Anaphora resolution success rate
|
|
1075
|
+
- Lexical chains (sequences of semantically related words)
|
|
1076
|
+
|
|
1077
|
+
Coherence features:
|
|
1078
|
+
- Sentence-to-sentence semantic similarity
|
|
1079
|
+
- Topic consistency across paragraphs
|
|
1080
|
+
- Discourse structure (thesis, support, conclusion)
|
|
1081
|
+
- Semantic overlap between adjacent sentences
|
|
1082
|
+
|
|
1083
|
+
References:
|
|
1084
|
+
Halliday, M. A. K., & Hasan, R. (1976). Cohesion in English. Longman.
|
|
1085
|
+
Graesser, A. C., McNamara, D. S., & Kulikowich, J. M. (2011). Coh-Metrix:
|
|
1086
|
+
Providing multilevel analyses of text characteristics. Educational
|
|
1087
|
+
Researcher, 40(5), 223-234.
|
|
1088
|
+
|
|
1089
|
+
Example:
|
|
1090
|
+
>>> result = compute_cohesion_coherence("Multi-paragraph text...")
|
|
1091
|
+
>>> print(f"Pronoun density: {result.pronoun_density:.2f}")
|
|
1092
|
+
>>> print(f"Lexical overlap: {result.adjacent_sentence_overlap:.3f}")
|
|
1093
|
+
>>> print(f"Connective density: {result.connective_density:.2f}")
|
|
1094
|
+
"""
|
|
1095
|
+
|
|
1096
|
+
# Referential cohesion
|
|
1097
|
+
pronoun_density: float # Pronouns per 100 words
|
|
1098
|
+
demonstrative_density: float # Demonstratives (this, that, these, those) per 100 words
|
|
1099
|
+
anaphora_count: int # Anaphoric references detected
|
|
1100
|
+
anaphora_resolution_ratio: float # Successfully resolved / total
|
|
1101
|
+
|
|
1102
|
+
# Lexical cohesion
|
|
1103
|
+
word_repetition_ratio: float # Repeated content words / total content words
|
|
1104
|
+
synonym_density: float # Synonym pairs per 100 words
|
|
1105
|
+
lexical_chain_count: int # Number of lexical chains detected
|
|
1106
|
+
mean_chain_length: float # Average length of lexical chains
|
|
1107
|
+
content_word_overlap: float # Content word overlap between sentences
|
|
1108
|
+
|
|
1109
|
+
# Connectives and discourse markers
|
|
1110
|
+
connective_density: float # Discourse connectives per 100 words
|
|
1111
|
+
additive_connective_ratio: float # "and", "also", "furthermore" / total connectives
|
|
1112
|
+
adversative_connective_ratio: float # "but", "however", "nevertheless" / total
|
|
1113
|
+
causal_connective_ratio: float # "because", "therefore", "thus" / total
|
|
1114
|
+
temporal_connective_ratio: float # "then", "after", "before" / total
|
|
1115
|
+
|
|
1116
|
+
# Coherence measures
|
|
1117
|
+
adjacent_sentence_overlap: float # Mean semantic overlap between adjacent sentences
|
|
1118
|
+
paragraph_topic_consistency: float # Mean topic consistency within paragraphs
|
|
1119
|
+
mean_sentence_similarity: float # Mean cosine similarity between all sentence pairs
|
|
1120
|
+
semantic_coherence_score: float # Composite coherence metric (0-1)
|
|
1121
|
+
|
|
1122
|
+
# Structural coherence
|
|
1123
|
+
paragraph_count: int # Number of paragraphs detected
|
|
1124
|
+
mean_paragraph_length: float # Mean sentences per paragraph
|
|
1125
|
+
discourse_structure_score: float # Quality of intro/body/conclusion structure
|
|
1126
|
+
|
|
1127
|
+
metadata: dict[str, Any] # Lexical chains, connective lists, similarity matrices, etc.
|
|
1128
|
+
|
|
1129
|
+
|
|
1130
|
+
# ===== Genre and Register Results =====
|
|
1131
|
+
# Related to GitHub Issue #23: Genre and Register Features
|
|
1132
|
+
# https://github.com/craigtrim/pystylometry/issues/23
|
|
1133
|
+
|
|
1134
|
+
|
|
1135
|
+
@dataclass
|
|
1136
|
+
class GenreRegisterResult:
|
|
1137
|
+
"""Result from genre and register classification analysis.
|
|
1138
|
+
|
|
1139
|
+
Genre and register features distinguish between different types of texts
|
|
1140
|
+
(academic, journalistic, fiction, legal, etc.) based on linguistic patterns.
|
|
1141
|
+
These features can help identify the context and formality level of a text,
|
|
1142
|
+
and are useful for authorship attribution when combined with other metrics.
|
|
1143
|
+
|
|
1144
|
+
Related GitHub Issue:
|
|
1145
|
+
#23 - Genre and Register Features
|
|
1146
|
+
https://github.com/craigtrim/pystylometry/issues/23
|
|
1147
|
+
|
|
1148
|
+
Features analyzed:
|
|
1149
|
+
- Formality markers (Latinate words, nominalizations, passive voice)
|
|
1150
|
+
- Personal vs. impersonal style (1st/2nd person vs. 3rd person)
|
|
1151
|
+
- Abstract vs. concrete vocabulary
|
|
1152
|
+
- Technical term density
|
|
1153
|
+
- Narrative vs. expository markers
|
|
1154
|
+
- Dialogue presence and ratio
|
|
1155
|
+
- Register classification (frozen, formal, consultative, casual, intimate)
|
|
1156
|
+
|
|
1157
|
+
References:
|
|
1158
|
+
Biber, D. (1988). Variation across speech and writing. Cambridge University Press.
|
|
1159
|
+
Biber, D., & Conrad, S. (2009). Register, genre, and style. Cambridge
|
|
1160
|
+
University Press.
|
|
1161
|
+
Heylighen, F., & Dewaele, J. M. (1999). Formality of language: Definition,
|
|
1162
|
+
measurement and behavioral determinants. Internal Report, Center "Leo
|
|
1163
|
+
Apostel", Free University of Brussels.
|
|
1164
|
+
|
|
1165
|
+
Example:
|
|
1166
|
+
>>> result = compute_genre_register("Academic paper text...")
|
|
1167
|
+
>>> print(f"Formality score: {result.formality_score:.2f}")
|
|
1168
|
+
>>> print(f"Register: {result.register_classification}")
|
|
1169
|
+
>>> print(f"Genre prediction: {result.predicted_genre}")
|
|
1170
|
+
"""
|
|
1171
|
+
|
|
1172
|
+
# Formality indicators
|
|
1173
|
+
formality_score: float # Composite formality score (0-100)
|
|
1174
|
+
latinate_ratio: float # Latinate words / total words
|
|
1175
|
+
nominalization_density: float # Nominalizations per 100 words
|
|
1176
|
+
passive_voice_density: float # Passive constructions per 100 words
|
|
1177
|
+
|
|
1178
|
+
# Personal vs. impersonal
|
|
1179
|
+
first_person_ratio: float # 1st person pronouns / total pronouns
|
|
1180
|
+
second_person_ratio: float # 2nd person pronouns / total pronouns
|
|
1181
|
+
third_person_ratio: float # 3rd person pronouns / total pronouns
|
|
1182
|
+
impersonal_construction_density: float # "It is...", "There are..." per 100 words
|
|
1183
|
+
|
|
1184
|
+
# Abstract vs. concrete
|
|
1185
|
+
abstract_noun_ratio: float # Abstract nouns / total nouns
|
|
1186
|
+
concrete_noun_ratio: float # Concrete nouns / total nouns
|
|
1187
|
+
abstractness_score: float # Composite abstractness (based on word concreteness ratings)
|
|
1188
|
+
|
|
1189
|
+
# Technical and specialized
|
|
1190
|
+
technical_term_density: float # Technical/specialized terms per 100 words
|
|
1191
|
+
jargon_density: float # Domain-specific jargon per 100 words
|
|
1192
|
+
|
|
1193
|
+
# Narrative vs. expository
|
|
1194
|
+
narrative_marker_density: float # Past tense, action verbs per 100 words
|
|
1195
|
+
expository_marker_density: float # Present tense, linking verbs per 100 words
|
|
1196
|
+
narrative_expository_ratio: float # Narrative / expository markers
|
|
1197
|
+
|
|
1198
|
+
# Dialogue and quotation
|
|
1199
|
+
dialogue_ratio: float # Dialogue / total text (estimated)
|
|
1200
|
+
quotation_density: float # Quotations per 100 words
|
|
1201
|
+
|
|
1202
|
+
# Classification results
|
|
1203
|
+
register_classification: str # frozen, formal, consultative, casual, intimate
|
|
1204
|
+
predicted_genre: str # academic, journalistic, fiction, legal, conversational, etc.
|
|
1205
|
+
genre_confidence: float # Confidence in genre prediction (0-1)
|
|
1206
|
+
|
|
1207
|
+
# Feature scores for major genres (0-1 scores for each)
|
|
1208
|
+
academic_score: float
|
|
1209
|
+
journalistic_score: float
|
|
1210
|
+
fiction_score: float
|
|
1211
|
+
legal_score: float
|
|
1212
|
+
conversational_score: float
|
|
1213
|
+
|
|
1214
|
+
metadata: dict[str, Any] # Feature details, word lists, classification probabilities, etc.
|
|
1215
|
+
|
|
1216
|
+
|
|
1217
|
+
# ===== Additional Authorship Results =====
# Related to GitHub Issue #24: Additional Authorship Attribution Methods
# https://github.com/craigtrim/pystylometry/issues/24


@dataclass
class KilgarriffResult:
    """Result from Kilgarriff's chi-squared method.

    Kilgarriff's chi-squared method compares word frequency distributions between
    texts using the chi-squared test. It's particularly effective for authorship
    attribution when comparing frequency profiles of common words.

    Related GitHub Issue:
        #24 - Additional Authorship Attribution Methods
        https://github.com/craigtrim/pystylometry/issues/24

    References:
        Kilgarriff, A. (2001). Comparing corpora. International Journal of Corpus
            Linguistics, 6(1), 97-133.

    Example:
        >>> result = compute_kilgarriff(text1, text2)
        >>> print(f"Chi-squared: {result.chi_squared:.2f}")
        >>> print(f"P-value: {result.p_value:.4f}")
    """

    chi_squared: float  # Chi-squared statistic
    p_value: float  # Statistical significance (p-value)
    degrees_of_freedom: int  # df for chi-squared test
    feature_count: int  # Number of features (words) compared
    most_distinctive_features: list[tuple[str, float]]  # Words + chi-squared contributions
    metadata: dict[str, Any]  # Frequency tables, expected values, etc.


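A minimal sketch of the comparison described above, in the spirit of Kilgarriff (2001): observed counts of the most frequent words in the combined corpus are compared against expected counts under a shared distribution. Whitespace tokenization and the helper name are illustrative assumptions, not the package's API:

# Sketch of a chi-squared corpus comparison over shared frequent words.
# Tokenization is naively whitespace-based for illustration.
from collections import Counter

def chi_squared_comparison(text1: str, text2: str, top_n: int = 500) -> tuple[float, int]:
    c1, c2 = Counter(text1.lower().split()), Counter(text2.lower().split())
    n1, n2 = sum(c1.values()), sum(c2.values())
    vocab = [w for w, _ in (c1 + c2).most_common(top_n)]  # most frequent words overall
    chi2 = 0.0
    for w in vocab:
        total = c1[w] + c2[w]
        exp1 = total * n1 / (n1 + n2)  # expected count in text1
        exp2 = total * n2 / (n1 + n2)  # expected count in text2
        chi2 += (c1[w] - exp1) ** 2 / exp1 + (c2[w] - exp2) ** 2 / exp2
    return chi2, len(vocab) - 1  # statistic and degrees of freedom
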
@dataclass
class MinMaxResult:
    """Result from Min-Max distance method (Burrows' original method).

    The Min-Max method normalizes feature frequencies using min-max scaling,
    then computes distance between texts. This was Burrows' original approach
    before developing Delta.

    Related GitHub Issue:
        #24 - Additional Authorship Attribution Methods
        https://github.com/craigtrim/pystylometry/issues/24

    References:
        Burrows, J. F. (1992). Not unless you ask nicely: The interpretative
            nexus between analysis and information. Literary and Linguistic
            Computing, 7(2), 91-109.

    Example:
        >>> result = compute_minmax(text1, text2)
        >>> print(f"MinMax distance: {result.minmax_distance:.3f}")
    """

    minmax_distance: float  # Min-max normalized distance
    feature_count: int  # Number of features used
    most_distinctive_features: list[tuple[str, float]]  # Features + contributions
    metadata: dict[str, Any]  # Normalized frequencies, scaling parameters, etc.


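The docstring describes min-max scaling of feature frequencies followed by a distance computation. One common "min-max" formulation in stylometry is the Ruzicka-style measure sketched below; this is an assumption for illustration, not necessarily the exact scaling the package uses:

# Sketch of a min-max (Ruzicka-style) distance between two relative-frequency
# profiles: 1 - sum(min) / sum(max) over the union of features.
def minmax_distance(freqs1: dict[str, float], freqs2: dict[str, float]) -> float:
    features = set(freqs1) | set(freqs2)
    num = sum(min(freqs1.get(f, 0.0), freqs2.get(f, 0.0)) for f in features)
    den = sum(max(freqs1.get(f, 0.0), freqs2.get(f, 0.0)) for f in features)
    return 1.0 - num / den if den else 0.0  # 0 = identical profiles, 1 = disjoint
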
@dataclass
class JohnsBurrowsResult:
    """Result from John Burrows' variations of Delta.

    John Burrows has developed several variations of the Delta method over
    the years. This captures alternative formulations including Quadratic
    Delta and other distance measures.

    Related GitHub Issue:
        #24 - Additional Authorship Attribution Methods
        https://github.com/craigtrim/pystylometry/issues/24

    References:
        Burrows, J. (2005). Who wrote Shamela? Verifying the authorship of a
            parodic text. Literary and Linguistic Computing, 20(4), 437-450.

    Example:
        >>> result = compute_johns_delta(text1, text2, method="quadratic")
        >>> print(f"Quadratic Delta: {result.delta_score:.3f}")
    """

    delta_score: float  # Delta distance score
    method: str  # "quadratic", "weighted", "rotated", etc.
    feature_count: int  # Number of MFW used
    most_distinctive_features: list[tuple[str, float]]  # Features + contributions
    metadata: dict[str, Any]  # Method-specific parameters, z-scores, etc.


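Quadratic Delta is usually computed as the sum of squared differences of z-scored relative frequencies of the most frequent words (MFW). A minimal sketch under that assumption; the helper names are hypothetical and the z-scores would normally be computed against a reference corpus:

# Sketch of Quadratic Delta over z-scored MFW frequencies.
def z_scores(rel_freqs: dict[str, float], means: dict[str, float], stds: dict[str, float]) -> dict[str, float]:
    """Standardize each word's relative frequency against corpus means/stds."""
    return {w: (rel_freqs[w] - means[w]) / stds[w] for w in rel_freqs if stds.get(w)}

def quadratic_delta(z1: dict[str, float], z2: dict[str, float]) -> float:
    """Sum of squared z-score differences over the shared MFW."""
    shared = set(z1) & set(z2)
    return sum((z1[w] - z2[w]) ** 2 for w in shared)
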
# ===== Rhythm and Prosody Results =====
# Related to GitHub Issue #25: Rhythm and Prosody Metrics
# https://github.com/craigtrim/pystylometry/issues/25


@dataclass
class RhythmProsodyResult:
    """Result from rhythm and prosody analysis.

    Rhythm and prosody metrics capture the musical qualities of written language,
    including stress patterns, syllable rhythms, and phonological features. While
    these are typically studied in spoken language, written text preserves many
    rhythmic patterns that vary by author and genre.

    Related GitHub Issue:
        #25 - Rhythm and Prosody Metrics
        https://github.com/craigtrim/pystylometry/issues/25

    Features analyzed:
        - Syllable patterns and stress patterns
        - Rhythmic regularity (coefficient of variation of syllable counts)
        - Phonological features (alliteration, assonance)
        - Syllable complexity (consonant clusters)
        - Sentence rhythm (alternating long/short sentences)
        - Polysyllabic word ratio

    References:
        Lea, R. B., Mulligan, E. J., & Walton, J. H. (2005). Sentence rhythm and
            text comprehension. Memory & Cognition, 33(3), 388-396.
        Louwerse, M. M., & Benesh, N. (2012). Representing spatial structure through
            maps and language: Lord of the Rings encodes the spatial structure of
            Middle Earth. Cognitive Science, 36(8), 1556-1569.

    Example:
        >>> result = compute_rhythm_prosody("Sample text with rhythm...")
        >>> print(f"Syllables per word: {result.mean_syllables_per_word:.2f}")
        >>> print(f"Rhythmic regularity: {result.rhythmic_regularity:.3f}")
        >>> print(f"Alliteration density: {result.alliteration_density:.2f}")
    """

    # Syllable patterns
    mean_syllables_per_word: float  # Average syllables per word
    syllable_std_dev: float  # Std dev of syllables per word
    polysyllabic_ratio: float  # Words with 3+ syllables / total
    monosyllabic_ratio: float  # Single-syllable words / total

    # Rhythmic regularity
    rhythmic_regularity: float  # 1 / CV of syllable counts (higher = more regular)
    syllable_cv: float  # Coefficient of variation of syllables per word
    stress_pattern_entropy: float  # Entropy of stress patterns

    # Sentence rhythm
    sentence_length_alternation: float  # Degree of long/short alternation
    sentence_rhythm_score: float  # Composite rhythm score

    # Phonological features
    alliteration_density: float  # Alliterative word pairs per 100 words
    assonance_density: float  # Assonant word pairs per 100 words
    consonance_density: float  # Consonant word pairs per 100 words

    # Syllable complexity
    mean_consonant_cluster_length: float  # Avg consonants in clusters
    initial_cluster_ratio: float  # Words starting with clusters / total
    final_cluster_ratio: float  # Words ending with clusters / total

    # Stress patterns (estimated for written text)
    iambic_ratio: float  # Iambic patterns (unstressed-stressed) / total
    trochaic_ratio: float  # Trochaic patterns (stressed-unstressed) / total
    dactylic_ratio: float  # Dactylic patterns / total
    anapestic_ratio: float  # Anapestic patterns / total

    metadata: dict[str, Any]  # Syllable counts, stress patterns, phoneme data, etc.

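rhythmic_regularity is defined above as 1 / CV of per-word syllable counts. A minimal sketch of that computation, using a naive vowel-group heuristic for syllable counting rather than the package's syllable module:

# Sketch of rhythmic regularity = 1 / coefficient of variation of syllables per word.
import re
import statistics

def naive_syllables(word: str) -> int:
    """Rough syllable estimate: count groups of consecutive vowels (minimum 1)."""
    return max(1, len(re.findall(r"[aeiouy]+", word.lower())))

def rhythmic_regularity(words: list[str]) -> float:
    counts = [naive_syllables(w) for w in words]
    mean = statistics.mean(counts)
    stdev = statistics.pstdev(counts)
    cv = stdev / mean if mean else 0.0        # coefficient of variation
    return 1.0 / cv if cv else float("inf")   # higher = more regular rhythm

For example, rhythmic_regularity("the quick brown fox jumps over the lazy dog".split()) returns a high value because nearly every word is monosyllabic, so the syllable counts vary little.
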
# ===== Unified Analysis Result =====
