pystylometry 1.1.0__py3-none-any.whl → 1.3.1__py3-none-any.whl
- pystylometry/README.md +42 -0
- pystylometry/__init__.py +17 -1
- pystylometry/_types.py +206 -0
- pystylometry/authorship/README.md +21 -0
- pystylometry/authorship/__init__.py +9 -6
- pystylometry/authorship/additional_methods.py +262 -17
- pystylometry/authorship/compression.py +175 -0
- pystylometry/authorship/kilgarriff.py +8 -1
- pystylometry/character/README.md +17 -0
- pystylometry/consistency/README.md +27 -0
- pystylometry/dialect/README.md +26 -0
- pystylometry/lexical/README.md +23 -0
- pystylometry/lexical/__init__.py +3 -0
- pystylometry/lexical/repetition.py +506 -0
- pystylometry/ngrams/README.md +18 -0
- pystylometry/ngrams/extended_ngrams.py +314 -69
- pystylometry/prosody/README.md +17 -0
- pystylometry/prosody/rhythm_prosody.py +773 -11
- pystylometry/readability/README.md +23 -0
- pystylometry/stylistic/README.md +20 -0
- pystylometry/stylistic/cohesion_coherence.py +669 -13
- pystylometry/stylistic/genre_register.py +1560 -17
- pystylometry/stylistic/markers.py +611 -17
- pystylometry/stylistic/vocabulary_overlap.py +354 -13
- pystylometry/syntactic/README.md +20 -0
- pystylometry/viz/README.md +27 -0
- pystylometry-1.3.1.dist-info/LICENSE +21 -0
- pystylometry-1.3.1.dist-info/METADATA +79 -0
- {pystylometry-1.1.0.dist-info → pystylometry-1.3.1.dist-info}/RECORD +31 -16
- {pystylometry-1.1.0.dist-info → pystylometry-1.3.1.dist-info}/WHEEL +1 -1
- pystylometry-1.1.0.dist-info/METADATA +0 -278
- {pystylometry-1.1.0.dist-info → pystylometry-1.3.1.dist-info}/entry_points.txt +0 -0
pystylometry/README.md
ADDED
@@ -0,0 +1,42 @@
+# pystylometry
+
+[badge image]
+[badge image]
+
+Core package for stylometric analysis and authorship attribution.
+
+## Module Map
+
+| Module | Purpose | Key Functions |
+|--------|---------|---------------|
+| [`lexical/`](lexical/) | Vocabulary diversity & richness | `compute_mtld`, `compute_yule`, `compute_ttr`, `compute_hapax_ratios` |
+| [`readability/`](readability/) | Text readability scoring | `compute_flesch`, `compute_gunning_fog`, `compute_ari`, `compute_smog` |
+| [`syntactic/`](syntactic/) | Sentence & parse structure | `compute_pos_ratios`, `compute_sentence_types`, `compute_advanced_syntactic` |
+| [`authorship/`](authorship/) | Author attribution & comparison | `compute_burrows_delta`, `compute_kilgarriff`, `compute_compression_distance` |
+| [`stylistic/`](stylistic/) | Style markers & vocabulary overlap | `compute_stylistic_markers`, `compute_vocabulary_overlap`, `compute_genre_register` |
+| [`character/`](character/) | Character-level features | `compute_character_metrics` |
+| [`ngrams/`](ngrams/) | N-gram entropy & sequences | `compute_extended_ngrams`, `compute_ngram_entropy` |
+| [`dialect/`](dialect/) | Regional dialect detection | `compute_dialect` |
+| [`consistency/`](consistency/) | Intra-document drift detection | `compute_kilgarriff_drift` |
+| [`prosody/`](prosody/) | Rhythm & stress patterns | `compute_rhythm_prosody` |
+| [`viz/`](viz/) | Visualization (PNG & interactive HTML) | `plot_drift_timeline`, `export_drift_report_jsx` |
+
+## Shared Internals
+
+| File | Purpose |
+|------|---------|
+| `_types.py` | All dataclass result types (e.g. `FleschResult`, `MTLDResult`, `KilgarriffDriftResult`) |
+| `_normalize.py` | Text normalization for readability and stylometry pipelines |
+| `_utils.py` | Shared tokenization and helper functions |
+| `tokenizer.py` | Configurable tokenizer with sentence/word splitting |
+| `cli.py` | Command-line interface (`pystylometry analyze`) |
+
+## Installation Extras
+
+```
+pip install pystylometry                 # Core (lexical only)
+pip install pystylometry[readability]    # + readability
+pip install pystylometry[syntactic]      # + syntactic (requires spaCy)
+pip install pystylometry[authorship]     # + authorship attribution
+pip install pystylometry[all]            # Everything
+```
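The module map above names the `compute_*` entry points for each subpackage. As a rough orientation, a quick-start sketch is shown below; it assumes the lexical functions accept a plain string and are importable from `pystylometry.lexical`, which this diff implies but does not show directly.

```python
# Minimal sketch, not taken from the package docs: assumes compute_ttr and
# compute_mtld take a raw text string and return result dataclasses.
from pystylometry.lexical import compute_mtld, compute_ttr

text = (
    "The quick brown fox jumps over the lazy dog. "
    "The quick brown fox jumps over the lazy dog again."
)

ttr = compute_ttr(text)    # type-token ratio metrics
mtld = compute_mtld(text)  # MTLD lexical-diversity metrics

print(ttr)
print(mtld)
```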
pystylometry/__init__.py
CHANGED
@@ -63,18 +63,28 @@ try:
 except ImportError:
     _SYNTACTIC_AVAILABLE = False
 
-#
+# Prosody requires pronouncing (CMU dictionary) - same dependency as readability
+try:
+    from . import prosody  # noqa: F401 - Rhythm and prosody metrics (Issue #25)
+
+    _PROSODY_AVAILABLE = True
+except ImportError:
+    _PROSODY_AVAILABLE = False
+
+# Authorship, ngrams, dialect, consistency, and stylistic use only stdlib (no external dependencies)
 from . import (
     authorship,  # noqa: F401
     consistency,  # noqa: F401 - Style drift detection (Issue #36)
     dialect,  # noqa: F401
     ngrams,  # noqa: F401
+    stylistic,  # noqa: F401 - Vocabulary overlap and similarity (Issue #21)
 )
 
 _AUTHORSHIP_AVAILABLE = True
 _NGRAMS_AVAILABLE = True
 _DIALECT_AVAILABLE = True
 _CONSISTENCY_AVAILABLE = True
+_STYLISTIC_AVAILABLE = True
 
 
 def analyze(
@@ -206,6 +216,8 @@ def get_available_modules() -> dict[str, bool]:
         "ngrams": _NGRAMS_AVAILABLE,
         "dialect": _DIALECT_AVAILABLE,
         "consistency": _CONSISTENCY_AVAILABLE,  # Style drift detection (Issue #36)
+        "stylistic": _STYLISTIC_AVAILABLE,  # Vocabulary overlap (Issue #21)
+        "prosody": _PROSODY_AVAILABLE,  # Rhythm and prosody (Issue #25)
     }
 
 
@@ -229,3 +241,7 @@ if _DIALECT_AVAILABLE:
     __all__.append("dialect")
 if _CONSISTENCY_AVAILABLE:
     __all__.append("consistency")
+if _STYLISTIC_AVAILABLE:
+    __all__.append("stylistic")
+if _PROSODY_AVAILABLE:
+    __all__.append("prosody")
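The prosody import mirrors the guarded pattern already used for syntactic: the import sits inside `try`/`except ImportError`, and a module-level flag is surfaced through `get_available_modules()`. A short sketch of how a caller can consult that map before touching an optional subpackage (only the keys visible in this diff are assumed):

```python
import pystylometry

available = pystylometry.get_available_modules()

# "stylistic" and "prosody" are the keys added in this release.
if available.get("prosody"):
    from pystylometry import prosody  # requires the 'pronouncing' package
    # e.g. prosody.compute_rhythm_prosody(text)  (signature assumed)
else:
    print("prosody extra not installed")
```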
pystylometry/_types.py
CHANGED
@@ -370,6 +370,158 @@ class TTRResult:
     metadata: dict[str, Any]
 
 
+# ===== Repetition Detection Results =====
+# Related to GitHub Issue #28: Verbal tics detection for slop analysis
+# https://github.com/craigtrim/pystylometry/issues/28
+
+
+@dataclass
+class RepetitiveWord:
+    """A single word flagged as abnormally repetitive.
+
+    The repetition_score is the ratio of observed count to expected count
+    based on the word's frequency in the British National Corpus (BNC).
+    Higher scores indicate stronger overrepresentation.
+
+    Related GitHub Issue:
+        #28 - Verbal tics detection for slop analysis
+        https://github.com/craigtrim/pystylometry/issues/28
+
+    Attributes:
+        word: The flagged word (lowercased).
+        count: Observed count in the text.
+        expected_count: Expected count based on BNC relative frequency × text length.
+            0.0 if word not found in BNC.
+        repetition_score: count / expected_count. float('inf') if expected_count is 0.
+        bnc_bucket: BNC frequency bucket (1-100, 1=most frequent). None if not in BNC.
+        chunk_counts: Per-chunk occurrence counts (for distribution analysis).
+        distribution_entropy: Shannon entropy of the word's chunk distribution.
+            Low entropy = suspiciously even spread (model tic).
+            High entropy = clustered usage (human writing about a specific scene).
+        distribution_variance: Variance of per-chunk counts.
+    """
+
+    word: str
+    count: int
+    expected_count: float
+    repetition_score: float
+    bnc_bucket: int | None
+    chunk_counts: list[int]
+    distribution_entropy: float
+    distribution_variance: float
+
+
+@dataclass
+class RepetitiveUnigramsResult:
+    """Result from repetitive unigram detection.
+
+    Identifies content words that appear far more frequently than expected
+    based on their frequency in the British National Corpus (BNC, ~100M tokens).
+    This is a key indicator of AI-generated "slop" where models exhibit verbal
+    tics — repeating certain words with suspicious regularity.
+
+    Related GitHub Issue:
+        #28 - Verbal tics detection for slop analysis
+        https://github.com/craigtrim/pystylometry/issues/28
+
+    The slop_score provides a single aggregate metric:
+        slop_score = flagged_words_per_10k × mean_repetition_score
+
+    Where:
+        - flagged_words_per_10k = count of flagged words / (total content words / 10000)
+        - mean_repetition_score = mean repetition_score across all flagged words
+
+    Higher slop_score = more likely AI-generated verbal tics.
+
+    References:
+        British National Corpus Consortium. (2007). The British National Corpus,
+        version 3 (BNC XML Edition). http://www.natcorp.ox.ac.uk/
+
+    Example:
+        >>> result = compute_repetitive_unigrams(text)
+        >>> for w in result.repetitive_words[:5]:
+        ...     print(f"{w.word}: {w.count}x (expected {w.expected_count:.1f}, "
+        ...           f"score {w.repetition_score:.1f})")
+        shimmered: 23x (expected 0.1, score 266.2)
+        >>> result.slop_score
+        42.7
+    """
+
+    repetitive_words: list[RepetitiveWord]  # Sorted by repetition_score descending
+    total_content_words: int
+    flagged_count: int  # Number of words exceeding threshold
+    flagged_words_per_10k: float  # flagged_count / (total_content_words / 10000)
+    mean_repetition_score: float  # Mean score across flagged words
+    slop_score: float  # Aggregate: flagged_words_per_10k × mean_repetition_score
+    total_content_words_dist: Distribution
+    chunk_size: int
+    chunk_count: int
+    metadata: dict[str, Any]
+
+
+@dataclass
+class RepetitiveNgram:
+    """A single n-gram flagged as abnormally repetitive.
+
+    Content n-grams (bigrams, trigrams, etc.) should rarely repeat verbatim
+    in natural writing. N-grams that repeat beyond a length-scaled threshold
+    are flagged.
+
+    Related GitHub Issue:
+        #28 - Verbal tics detection for slop analysis
+        https://github.com/craigtrim/pystylometry/issues/28
+
+    Attributes:
+        ngram: The flagged n-gram as a tuple of words.
+        count: Observed count in the text.
+        frequency_per_10k: Occurrences per 10,000 n-grams.
+        chunk_counts: Per-chunk occurrence counts.
+        distribution_entropy: Shannon entropy of the n-gram's chunk distribution.
+        distribution_variance: Variance of per-chunk counts.
+    """
+
+    ngram: tuple[str, ...]
+    count: int
+    frequency_per_10k: float
+    chunk_counts: list[int]
+    distribution_entropy: float
+    distribution_variance: float
+
+
+@dataclass
+class RepetitiveNgramsResult:
+    """Result from repetitive n-gram detection.
+
+    Detects bigrams, trigrams, or higher-order n-grams that repeat more than
+    expected within the text. No external corpus is required — content n-grams
+    should not repeat verbatim often in natural writing.
+
+    N-grams composed entirely of function words (e.g., "of the", "in a") are
+    excluded since their repetition is expected.
+
+    Related GitHub Issue:
+        #28 - Verbal tics detection for slop analysis
+        https://github.com/craigtrim/pystylometry/issues/28
+
+    Example:
+        >>> result = compute_repetitive_ngrams(text, n=2)
+        >>> for ng in result.repetitive_ngrams[:5]:
+        ...     print(f"{' '.join(ng.ngram)}: {ng.count}x "
+        ...           f"({ng.frequency_per_10k:.1f} per 10k)")
+        uncomfortable truth: 8x (1.6 per 10k)
+    """
+
+    repetitive_ngrams: list[RepetitiveNgram]  # Sorted by count descending
+    n: int | tuple[int, ...]  # N-gram order(s) analyzed
+    total_ngrams: int
+    flagged_count: int
+    flagged_per_10k: float  # flagged_count / (total_ngrams / 10000)
+    total_ngrams_dist: Distribution
+    chunk_size: int
+    chunk_count: int
+    metadata: dict[str, Any]
+
+
 # ===== Readability Results =====
 
 
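The slop_score documented above is plain arithmetic over the flagged-word list, and distribution_entropy is the Shannon entropy of a word's per-chunk counts. An independent sketch of both calculations, not the package's own implementation:

```python
import math

def chunk_entropy(chunk_counts: list[int]) -> float:
    """Shannon entropy (in bits) of a word's distribution across text chunks."""
    total = sum(chunk_counts)
    if total == 0:
        return 0.0
    probs = [c / total for c in chunk_counts if c > 0]
    return -sum(p * math.log2(p) for p in probs)

def slop_score(flagged_scores: list[float], total_content_words: int) -> float:
    """slop_score = flagged_words_per_10k * mean_repetition_score."""
    if not flagged_scores or total_content_words == 0:
        return 0.0
    flagged_per_10k = len(flagged_scores) / (total_content_words / 10_000)
    mean_score = sum(flagged_scores) / len(flagged_scores)
    return flagged_per_10k * mean_score

# Example: 12 flagged words in a 30,000-word text, each with repetition score 8.5
print(slop_score([8.5] * 12, 30_000))  # 4 flagged per 10k * 8.5 = 34.0
print(chunk_entropy([3, 3, 3, 3]))     # 2.0 bits (perfectly even spread)
print(chunk_entropy([12, 0, 0, 0]))    # 0.0 bits (fully clustered)
```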
@@ -1517,6 +1669,7 @@ class VocabularyOverlapResult:
     - Dice coefficient (2 * intersection / sum of sizes)
     - Overlap coefficient (intersection / min(size1, size2))
     - Cosine similarity (using word frequency vectors)
+    - KL divergence (asymmetric distributional difference)
     - Shared vocabulary size and ratio
     - Unique words in each text
     - Most distinctive words for each text
@@ -1526,6 +1679,10 @@ class VocabularyOverlapResult:
         New Phytologist, 11(2), 37-50.
         Salton, G., & McGill, M. J. (1983). Introduction to Modern Information
         Retrieval. McGraw-Hill.
+        Kullback, S., & Leibler, R. A. (1951). On Information and Sufficiency.
+        Annals of Mathematical Statistics, 22(1), 79-86.
+        Manning, C. D., & Schütze, H. (1999). Foundations of Statistical NLP.
+        MIT Press.
 
     Example:
         >>> result = compute_vocabulary_overlap(text1, text2)
@@ -1539,6 +1696,7 @@ class VocabularyOverlapResult:
     dice_coefficient: float  # 2 * intersection / (size1 + size2)
     overlap_coefficient: float  # Intersection / min(size1, size2)
     cosine_similarity: float  # Cosine of frequency vectors
+    kl_divergence: float  # Kullback-Leibler divergence (asymmetric, text1 || text2)
 
     # Vocabulary sizes
     text1_vocab_size: int  # Unique words in text 1
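The new kl_divergence field is described as the asymmetric Kullback-Leibler divergence of text1's word distribution from text2's. A self-contained sketch of that formula over two token lists; the additive smoothing and log base here are assumptions, since vocabulary_overlap.py's internals are not shown in this diff:

```python
import math
from collections import Counter

def kl_divergence(tokens1: list[str], tokens2: list[str], eps: float = 1e-9) -> float:
    """D_KL(P || Q), where P is text1's word distribution and Q is text2's."""
    p_counts, q_counts = Counter(tokens1), Counter(tokens2)
    vocab = set(p_counts) | set(q_counts)
    # Additive smoothing so that words missing from one text do not divide by zero.
    p_total = sum(p_counts.values()) + eps * len(vocab)
    q_total = sum(q_counts.values()) + eps * len(vocab)
    total = 0.0
    for w in vocab:
        p = (p_counts[w] + eps) / p_total
        q = (q_counts[w] + eps) / q_total
        total += p * math.log2(p / q)
    return total

print(kl_divergence("the cat sat".split(), "the cat sat on the mat".split()))
```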
@@ -1897,6 +2055,54 @@ class JohnsBurrowsResult:
     metadata: dict[str, Any]  # Method-specific parameters, z-scores, etc.
 
 
+@dataclass
+class CompressionResult:
+    """Result from compression-based authorship attribution.
+
+    Compression-based methods use the Normalized Compression Distance (NCD) to
+    measure similarity between texts. The intuition is that if two texts are
+    similar, compressing them together will yield better compression than
+    compressing separately. This approach is language-independent and captures
+    deep statistical regularities.
+
+    Related GitHub Issue:
+        #24 - Additional Authorship Attribution Methods
+        https://github.com/craigtrim/pystylometry/issues/24
+
+    Formula:
+        NCD(x,y) = (C(xy) - min(C(x), C(y))) / max(C(x), C(y))
+
+        where C(x) is the compressed size of x, and C(xy) is the compressed
+        size of x and y concatenated.
+
+    Interpretation:
+        - NCD ≈ 0: Texts are very similar
+        - NCD ≈ 1: Texts are very different
+        - Typical same-author pairs: 0.3-0.6
+        - Typical different-author pairs: 0.6-0.9
+
+    References:
+        Cilibrasi, R., & Vitányi, P. M. (2005). Clustering by compression.
+        IEEE Transactions on Information Theory, 51(4), 1523-1545.
+
+        Benedetto, D., Caglioti, E., & Loreto, V. (2002). Language trees and
+        zipping. Physical Review Letters, 88(4), 048702.
+
+    Example:
+        >>> result = compute_compression_distance(text1, text2)
+        >>> print(f"NCD: {result.ncd:.3f}")
+        >>> if result.ncd < 0.5:
+        ...     print("Texts likely by same author")
+    """
+
+    ncd: float  # Normalized Compression Distance [0, 1+]
+    compressor: str  # Compression algorithm used (e.g., "gzip", "zlib", "bz2")
+    text1_compressed_size: int  # Compressed size of text1 alone
+    text2_compressed_size: int  # Compressed size of text2 alone
+    combined_compressed_size: int  # Compressed size of concatenated texts
+    metadata: dict[str, Any]  # Raw sizes, compression ratios, etc.
+
+
 # ===== Rhythm and Prosody Results =====
 # Related to GitHub Issue #25: Rhythm and Prosody Metrics
 # https://github.com/craigtrim/pystylometry/issues/25
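The NCD formula quoted in the docstring is easy to reproduce with a stdlib compressor. A sketch using zlib follows; the package's compression.py reports a `compressor` name such as gzip, zlib, or bz2, but its actual implementation is not shown in this diff:

```python
import zlib

def ncd(text1: str, text2: str) -> float:
    """NCD(x, y) = (C(xy) - min(C(x), C(y))) / max(C(x), C(y))."""
    x = text1.encode("utf-8")
    y = text2.encode("utf-8")
    c_x = len(zlib.compress(x, 9))
    c_y = len(zlib.compress(y, 9))
    c_xy = len(zlib.compress(x + y, 9))
    return (c_xy - min(c_x, c_y)) / max(c_x, c_y)

# Lower values suggest more similar texts; per the docstring, same-author
# pairs typically fall around 0.3-0.6 and different-author pairs 0.6-0.9.
print(round(ncd("she walked to the shimmering door",
                "she ran toward the shimmering gate"), 3))
```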
pystylometry/authorship/README.md
ADDED
@@ -0,0 +1,21 @@
+# authorship
+
+[badge image]
+[badge image]
+
+Authorship attribution methods for comparing texts and determining likely authorship.
+
+## Catalogue
+
+| File | Functions | Method |
+|------|-----------|--------|
+| `burrows_delta.py` | `compute_burrows_delta`, `compute_cosine_delta` | Classic Delta and angular distance variant |
+| `zeta.py` | `compute_zeta` | Zeta method for marker word detection |
+| `kilgarriff.py` | `compute_kilgarriff` | Chi-squared corpus comparison |
+| `additional_methods.py` | `compute_minmax`, `compute_johns_delta` | MinMax distance, Quadratic/Weighted Delta |
+| `compression.py` | `compute_compression_distance` | Normalized Compression Distance (NCD) |
+
+## See Also
+
+- [`consistency/`](../consistency/) applies `compute_kilgarriff` in sliding windows for intra-document drift detection
+- [`lexical/`](../lexical/) provides the vocabulary features many attribution methods rely on
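The CompressionResult example earlier in this diff fixes the call shape for the new method. A usage sketch built on that example (the file paths are illustrative only):

```python
from pystylometry.authorship import compute_compression_distance

# Illustrative inputs; any two text strings work.
known_text = open("author_a_letters.txt", encoding="utf-8").read()
questioned_text = open("disputed_essay.txt", encoding="utf-8").read()

result = compute_compression_distance(known_text, questioned_text)
print(f"NCD via {result.compressor}: {result.ncd:.3f}")
if result.ncd < 0.5:
    print("Texts likely by the same author")  # threshold taken from the docstring example
```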
pystylometry/authorship/__init__.py
CHANGED
@@ -2,8 +2,8 @@
 
 This module provides methods for authorship attribution - comparing texts to
 determine whether they were written by the same author. Available methods
-include classic approaches (Burrows' Delta, Zeta)
-(Kilgarriff's chi-squared).
+include classic approaches (Burrows' Delta, Zeta), statistical methods
+(Kilgarriff's chi-squared), and information-theoretic methods (NCD).
 
 Related GitHub Issues:
     #24 - Additional Authorship Attribution Methods
@@ -16,20 +16,23 @@ Available Functions:
     compute_cosine_delta: Angular distance variant of Delta
     compute_zeta: Zeta method for marker word detection
     compute_kilgarriff: Chi-squared method for corpus comparison
-    compute_minmax: Burrows' original min-max method
-    compute_johns_delta: Delta variations (
+    compute_minmax: Burrows' original min-max distance method
+    compute_johns_delta: Delta variations (quadratic, weighted)
+    compute_compression_distance: Normalized Compression Distance (NCD)
 """
 
 from .additional_methods import compute_johns_delta, compute_minmax
 from .burrows_delta import compute_burrows_delta, compute_cosine_delta
+from .compression import compute_compression_distance
 from .kilgarriff import compute_kilgarriff
 from .zeta import compute_zeta
 
 __all__ = [
     "compute_burrows_delta",
+    "compute_compression_distance",
     "compute_cosine_delta",
-    "
+    "compute_johns_delta",
     "compute_kilgarriff",
     "compute_minmax",
-    "
+    "compute_zeta",
 ]