pystylometry 1.3.1__py3-none-any.whl → 1.3.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/__init__.py +42 -3
- pystylometry/_types.py +53 -3
- pystylometry/cli.py +321 -0
- pystylometry/lexical/__init__.py +2 -1
- pystylometry/lexical/ttr.py +288 -97
- {pystylometry-1.3.1.dist-info → pystylometry-1.3.5.dist-info}/METADATA +2 -3
- {pystylometry-1.3.1.dist-info → pystylometry-1.3.5.dist-info}/RECORD +10 -10
- {pystylometry-1.3.1.dist-info → pystylometry-1.3.5.dist-info}/entry_points.txt +1 -0
- {pystylometry-1.3.1.dist-info → pystylometry-1.3.5.dist-info}/LICENSE +0 -0
- {pystylometry-1.3.1.dist-info → pystylometry-1.3.5.dist-info}/WHEEL +0 -0
pystylometry/__init__.py
CHANGED
@@ -40,14 +40,13 @@ Usage:
     print(result.pattern_confidence)
 """
 
+from . import lexical  # noqa: E402
 from ._types import AnalysisResult
+from .tokenizer import TokenizationStats, Tokenizer, TokenMetadata
 
 # Version
 __version__ = "0.1.0"
 
-# Core exports - always available
-from . import lexical
-
 # Optional exports - may raise ImportError if dependencies not installed
 try:
     from . import readability  # noqa: F401
@@ -87,6 +86,41 @@ _CONSISTENCY_AVAILABLE = True
 _STYLISTIC_AVAILABLE = True
 
 
+def tokenize(text: str, **kwargs: object) -> list[str]:
+    """Tokenize text using the stylometric tokenizer.
+
+    Convenience wrapper around Tokenizer.tokenize(). All keyword arguments
+    are forwarded to the Tokenizer constructor.
+
+    Args:
+        text: Input text to tokenize.
+        **kwargs: Options forwarded to Tokenizer (lowercase, strip_numbers,
+            expand_contractions, etc.).
+
+    Returns:
+        List of token strings.
+
+    Example:
+        >>> from pystylometry import tokenize
+        >>> tokenize("Hello, world! It's a test.")
+        ['hello', 'world', "it's", 'a', 'test']
+    """
+    return Tokenizer(**kwargs).tokenize(text)  # type: ignore[arg-type]
+
+
+def tokenize_with_metadata(text: str, **kwargs: object) -> list[TokenMetadata]:
+    """Tokenize text and return tokens with positional and type metadata.
+
+    Args:
+        text: Input text to tokenize.
+        **kwargs: Options forwarded to Tokenizer.
+
+    Returns:
+        List of TokenMetadata objects.
+    """
+    return Tokenizer(**kwargs).tokenize_with_metadata(text)  # type: ignore[arg-type]
+
+
 def analyze(
     text: str,
     lexical_metrics: bool = True,
@@ -225,6 +259,11 @@ __all__ = [
     "__version__",
     "analyze",
     "get_available_modules",
+    "tokenize",
+    "tokenize_with_metadata",
+    "Tokenizer",
+    "TokenMetadata",
+    "TokenizationStats",
    "lexical",
 ]
 
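The two wrappers above are thin conveniences over the Tokenizer class. A minimal usage sketch, assuming pystylometry 1.3.5 is installed and using only names that appear in this diff (tokenize, tokenize_with_metadata, and the TokenMetadata fields token, start, end, token_type); the sample sentence and its expected tokens come from the docstring in the hunk:

    from pystylometry import tokenize, tokenize_with_metadata

    # Plain token list (lowercased, punctuation stripped by default).
    print(tokenize("Hello, world! It's a test."))
    # ['hello', 'world', "it's", 'a', 'test']

    # Tokens with positional and type metadata; keyword options are
    # forwarded to the Tokenizer constructor (e.g. strip_numbers).
    for meta in tokenize_with_metadata("It costs 40 dollars.", strip_numbers=True):
        print(meta.token, meta.start, meta.end, meta.token_type)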
pystylometry/_types.py
CHANGED
@@ -23,7 +23,7 @@ from __future__ import annotations
 
 import statistics
 from dataclasses import dataclass
-from typing import Any
+from typing import Any, Optional
 
 # ===== Distribution and Chunking =====
 # Related to GitHub Issue #27: Native chunked analysis with Distribution dataclass
@@ -316,8 +316,8 @@ class HapaxLexiconResult:
 class TTRResult:
     """Result from Type-Token Ratio (TTR) analysis.
 
-
-
+    Measures vocabulary richness through the ratio of unique words (types)
+    to total words (tokens).
 
     All numeric metrics include both a mean value (convenient access) and
     a full distribution with per-chunk values and statistics.
@@ -370,6 +370,56 @@ class TTRResult:
     metadata: dict[str, Any]
 
 
+@dataclass
+class TTRAggregateResult:
+    """Aggregated TTR statistics for a collection of texts.
+
+    Computes group-level summary statistics (mean, std, min, max, median)
+    across multiple ``TTRResult`` objects. Useful for comparative analysis
+    across authors, genres, or time periods.
+
+    Related GitHub Issue:
+        #43 - Inline stylometry-ttr into pystylometry (remove external dependency)
+        https://github.com/craigtrim/pystylometry/issues/43
+
+    Example:
+        >>> from pystylometry.lexical import compute_ttr, TTRAggregator
+        >>> results = [compute_ttr(t) for t in texts]
+        >>> agg = TTRAggregator()
+        >>> stats = agg.aggregate(results, group_id="Austen")
+        >>> stats.ttr_mean
+        0.412
+    """
+
+    group_id: str
+    text_count: int
+    total_words: int
+
+    # Raw TTR statistics
+    ttr_mean: float
+    ttr_std: float
+    ttr_min: float
+    ttr_max: float
+    ttr_median: float
+
+    # Root TTR (Guiraud's index) statistics
+    root_ttr_mean: float
+    root_ttr_std: float
+
+    # Log TTR (Herdan's C) statistics
+    log_ttr_mean: float
+    log_ttr_std: float
+
+    # STTR statistics (None if no texts had enough words for STTR)
+    sttr_mean: Optional[float]
+    sttr_std: Optional[float]
+
+    # Delta std mean (None if no texts had delta metrics)
+    delta_std_mean: Optional[float]
+
+    metadata: dict[str, Any]
+
+
 # ===== Repetition Detection Results =====
 # Related to GitHub Issue #28: Verbal tics detection for slop analysis
 # https://github.com/craigtrim/pystylometry/issues/28
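Because the STTR and delta fields are Optional, downstream code has to guard against None before formatting them. A small sketch of consuming the dataclass declared above (the summarize helper is hypothetical; the field names are the ones in the diff):

    def summarize(stats: "TTRAggregateResult") -> str:
        # sttr_mean is None when no text in the group had enough words for STTR.
        sttr = f"{stats.sttr_mean:.3f}" if stats.sttr_mean is not None else "n/a"
        return (
            f"{stats.group_id}: {stats.text_count} texts, "
            f"TTR {stats.ttr_mean:.3f} (std {stats.ttr_std:.3f}), STTR {sttr}"
        )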
pystylometry/cli.py
CHANGED
@@ -3,6 +3,7 @@
 Usage:
     pystylometry-drift <file> [--window-size=N] [--stride=N] [--mode=MODE] [--json]
     pystylometry-drift <file> --plot [output.png]
+    pystylometry-tokenize <file> [--json] [--metadata] [--stats]
 
 Example:
     pystylometry-drift manuscript.txt
@@ -10,6 +11,9 @@ Example:
     pystylometry-drift manuscript.txt --json
     pystylometry-drift manuscript.txt --plot
     pystylometry-drift manuscript.txt --plot drift_report.png
+    pystylometry-tokenize manuscript.txt
+    pystylometry-tokenize manuscript.txt --json --metadata
+    pystylometry-tokenize manuscript.txt --stats
 """
 
 from __future__ import annotations
@@ -423,5 +427,322 @@ The generated viewer includes:
     print()
 
 
+def tokenize_cli() -> None:
+    """CLI entry point for stylometric tokenization."""
+    parser = argparse.ArgumentParser(
+        prog="pystylometry-tokenize",
+        description="Tokenize text for stylometric analysis.",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  pystylometry-tokenize manuscript.txt
+  pystylometry-tokenize manuscript.txt --json
+  pystylometry-tokenize manuscript.txt --json --metadata
+  pystylometry-tokenize manuscript.txt --stats
+  pystylometry-tokenize manuscript.txt -U --expand-contractions
+  pystylometry-tokenize manuscript.txt --min-length 3 --strip-numbers
+""",
+    )
+
+    parser.add_argument(
+        "file",
+        type=Path,
+        help="Path to text file to tokenize",
+    )
+
+    # Output mode
+    output_group = parser.add_argument_group("output")
+    output_group.add_argument(
+        "-j",
+        "--json",
+        action="store_true",
+        help="Output as JSON (list of strings, or list of objects with --metadata)",
+    )
+    output_group.add_argument(
+        "-m",
+        "--metadata",
+        action="store_true",
+        help="Include token type and position metadata (implies --json)",
+    )
+    output_group.add_argument(
+        "-s",
+        "--stats",
+        action="store_true",
+        help="Show tokenization statistics instead of tokens",
+    )
+
+    # Core behavior
+    behavior_group = parser.add_argument_group("behavior")
+    behavior_group.add_argument(
+        "-U",
+        "--no-lowercase",
+        action="store_true",
+        help="Preserve original case (default: lowercase)",
+    )
+    behavior_group.add_argument(
+        "-e",
+        "--expand-contractions",
+        action="store_true",
+        help="Expand contractions (it's -> it is)",
+    )
+    behavior_group.add_argument(
+        "-n",
+        "--strip-numbers",
+        action="store_true",
+        help="Remove numeric tokens",
+    )
+    behavior_group.add_argument(
+        "--keep-punctuation",
+        action="store_true",
+        help="Keep punctuation tokens (default: stripped)",
+    )
+
+    # Filtering
+    filter_group = parser.add_argument_group("filtering")
+    filter_group.add_argument(
+        "--min-length",
+        type=int,
+        default=1,
+        metavar="N",
+        help="Minimum token length (default: 1)",
+    )
+    filter_group.add_argument(
+        "--max-length",
+        type=int,
+        default=None,
+        metavar="N",
+        help="Maximum token length (default: unlimited)",
+    )
+    filter_group.add_argument(
+        "--preserve-urls",
+        action="store_true",
+        help="Keep URL tokens",
+    )
+    filter_group.add_argument(
+        "--preserve-emails",
+        action="store_true",
+        help="Keep email tokens",
+    )
+    filter_group.add_argument(
+        "--preserve-hashtags",
+        action="store_true",
+        help="Keep hashtag tokens",
+    )
+    filter_group.add_argument(
+        "--preserve-mentions",
+        action="store_true",
+        help="Keep @mention tokens",
+    )
+
+    # Advanced
+    advanced_group = parser.add_argument_group("advanced")
+    advanced_group.add_argument(
+        "--expand-abbreviations",
+        action="store_true",
+        help="Expand abbreviations (Dr. -> Doctor)",
+    )
+    advanced_group.add_argument(
+        "--strip-accents",
+        action="store_true",
+        help="Remove accents from characters",
+    )
+    advanced_group.add_argument(
+        "--no-clean",
+        action="store_true",
+        help="Skip text cleaning (italics, brackets, page markers)",
+    )
+    advanced_group.add_argument(
+        "--no-unicode-normalize",
+        action="store_true",
+        help="Skip unicode normalization",
+    )
+
+    args = parser.parse_args()
+
+    # --- ANSI colors ---
+    use_color = sys.stderr.isatty()
+
+    def _c(code: str, text: str) -> str:
+        return f"\033[{code}m{text}\033[0m" if use_color else text
+
+    bold = lambda t: _c("1", t)  # noqa: E731
+    dim = lambda t: _c("2", t)  # noqa: E731
+    cyan = lambda t: _c("36", t)  # noqa: E731
+    green = lambda t: _c("32", t)  # noqa: E731
+    yellow = lambda t: _c("33", t)  # noqa: E731
+
+    # --- Validate file ---
+    if not args.file.exists():
+        print(f"Error: File not found: {args.file}", file=sys.stderr)
+        sys.exit(1)
+
+    try:
+        text = args.file.read_text(encoding="utf-8")
+    except Exception as e:
+        print(f"Error reading file: {e}", file=sys.stderr)
+        sys.exit(1)
+
+    # --- Build Tokenizer kwargs ---
+    tokenizer_kwargs = {
+        "lowercase": not args.no_lowercase,
+        "min_length": args.min_length,
+        "max_length": args.max_length,
+        "strip_numbers": args.strip_numbers,
+        "strip_punctuation": not args.keep_punctuation,
+        "preserve_urls": args.preserve_urls,
+        "preserve_emails": args.preserve_emails,
+        "preserve_hashtags": args.preserve_hashtags,
+        "preserve_mentions": args.preserve_mentions,
+        "expand_contractions": args.expand_contractions,
+        "expand_abbreviations": args.expand_abbreviations,
+        "strip_accents": args.strip_accents,
+        "normalize_unicode": not args.no_unicode_normalize,
+        "clean_text": not args.no_clean,
+    }
+
+    # Collect active options for banner
+    active_opts = []
+    if args.no_lowercase:
+        active_opts.append("preserve case")
+    if args.expand_contractions:
+        active_opts.append("expand contractions")
+    if args.expand_abbreviations:
+        active_opts.append("expand abbreviations")
+    if args.strip_numbers:
+        active_opts.append("strip numbers")
+    if args.keep_punctuation:
+        active_opts.append("keep punctuation")
+    if args.strip_accents:
+        active_opts.append("strip accents")
+    if args.no_clean:
+        active_opts.append("skip cleaning")
+    if args.no_unicode_normalize:
+        active_opts.append("skip unicode normalization")
+    if args.preserve_urls:
+        active_opts.append("preserve URLs")
+    if args.preserve_emails:
+        active_opts.append("preserve emails")
+    if args.preserve_hashtags:
+        active_opts.append("preserve hashtags")
+    if args.preserve_mentions:
+        active_opts.append("preserve mentions")
+    if args.min_length > 1:
+        active_opts.append(f"min length {args.min_length}")
+    if args.max_length is not None:
+        active_opts.append(f"max length {args.max_length}")
+
+    # Determine output format
+    if args.stats:
+        output_format = "Statistics"
+    elif args.metadata:
+        output_format = "JSON (with metadata)"
+    elif args.json:
+        output_format = "JSON"
+    else:
+        output_format = "One token per line"
+
+    # --- Banner (to stderr so stdout stays pipeable) ---
+    char_count = len(text)
+    line_count = text.count("\n") + 1
+
+    banner = sys.stderr
+    print(file=banner)
+    print(f" {bold('PYSTYLOMETRY')} {dim('—')} {cyan('Stylometric Tokenizer')}", file=banner)
+    print(f" {dim('═' * 71)}", file=banner)
+    print(file=banner)
+    print(f" {bold('INPUT')}", file=banner)
+    print(f" {dim('─' * 71)}", file=banner)
+    print(f" File: {args.file}", file=banner)
+    print(f" Size: {char_count:,} characters / {line_count:,} lines", file=banner)
+    print(file=banner)
+    print(f" {bold('CONFIGURATION')}", file=banner)
+    print(f" {dim('─' * 71)}", file=banner)
+    print(f" Case: {'preserve' if args.no_lowercase else 'lowercase'}", file=banner)
+    print(
+        f" Punctuation: {'keep' if args.keep_punctuation else 'strip'}",
+        file=banner,
+    )
+    print(
+        f" Contractions: {'expand' if args.expand_contractions else 'preserve'}",
+        file=banner,
+    )
+    print(f" Numbers: {'strip' if args.strip_numbers else 'keep'}", file=banner)
+    if active_opts:
+        print(f" Active options: {', '.join(active_opts)}", file=banner)
+    print(file=banner)
+    print(f" {bold('OUTPUT')}", file=banner)
+    print(f" {dim('─' * 71)}", file=banner)
+    print(f" Format: {output_format}", file=banner)
+    print(file=banner)
+
+    # --- Tokenize ---
+    from pystylometry.tokenizer import Tokenizer
+
+    tokenizer = Tokenizer(**tokenizer_kwargs)
+
+    if args.stats:
+        stats = tokenizer.get_statistics(text)
+        print(f" {bold('RESULTS')}", file=banner)
+        print(f" {dim('─' * 71)}", file=banner)
+        print(f" Total tokens: {green(f'{stats.total_tokens:,}')}", file=banner)
+        print(f" Unique tokens: {green(f'{stats.unique_tokens:,}')}", file=banner)
+        print(f" Word tokens: {stats.word_tokens:,}", file=banner)
+        print(f" Number tokens: {stats.number_tokens:,}", file=banner)
+        print(f" Punctuation: {stats.punctuation_tokens:,}", file=banner)
+        print(f" URLs: {stats.url_tokens:,}", file=banner)
+        print(f" Emails: {stats.email_tokens:,}", file=banner)
+        print(f" Hashtags: {stats.hashtag_tokens:,}", file=banner)
+        print(f" Mentions: {stats.mention_tokens:,}", file=banner)
+        print(f" Avg length: {stats.average_token_length:.1f}", file=banner)
+        print(f" Min length: {stats.min_token_length}", file=banner)
+        print(f" Max length: {stats.max_token_length}", file=banner)
+        print(file=banner)
+
+        if args.json:
+            import dataclasses
+
+            print(json.dumps(dataclasses.asdict(stats), indent=2))
+
+    elif args.metadata or (args.json and args.metadata):
+        metadata_list = tokenizer.tokenize_with_metadata(text)
+        count = len(metadata_list)
+        print(
+            f" {yellow('Tokenizing...')} {green(f'{count:,}')} tokens extracted",
+            file=banner,
+        )
+        print(file=banner)
+        output = [
+            {
+                "token": m.token,
+                "start": m.start,
+                "end": m.end,
+                "type": m.token_type,
+            }
+            for m in metadata_list
+        ]
+        print(json.dumps(output, indent=2))
+
+    elif args.json:
+        tokens = tokenizer.tokenize(text)
+        count = len(tokens)
+        print(
+            f" {yellow('Tokenizing...')} {green(f'{count:,}')} tokens extracted",
+            file=banner,
+        )
+        print(file=banner)
+        print(json.dumps(tokens, indent=2))
+
+    else:
+        tokens = tokenizer.tokenize(text)
+        count = len(tokens)
+        print(
+            f" {yellow('Tokenizing...')} {green(f'{count:,}')} tokens extracted",
+            file=banner,
+        )
+        print(file=banner)
+        for token in tokens:
+            print(token)
+
+
 if __name__ == "__main__":
     drift_cli()
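Because the banner and progress lines go to stderr while the tokens themselves go to stdout, the command stays pipeable. For instance (illustrative invocations only, combining the flags documented above with standard shell tools):

    pystylometry-tokenize manuscript.txt --json > tokens.json
    pystylometry-tokenize manuscript.txt | sort | uniq -c | sort -rn | head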
pystylometry/lexical/__init__.py
CHANGED
@@ -6,12 +6,13 @@ from .function_words import compute_function_words
 from .hapax import compute_hapax_ratios, compute_hapax_with_lexicon_analysis
 from .mtld import compute_mtld
 from .repetition import compute_repetitive_ngrams, compute_repetitive_unigrams
-from .ttr import compute_ttr
+from .ttr import TTRAggregator, compute_ttr
 from .word_frequency_sophistication import compute_word_frequency_sophistication
 from .yule import compute_yule
 
 __all__ = [
     "compute_ttr",
+    "TTRAggregator",
     "compute_mtld",
     "compute_yule",
     "compute_hapax_ratios",
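With this re-export in place, the aggregator is importable directly from the lexical subpackage. A short sketch mirroring the docstring example elsewhere in this release (texts is a placeholder list):

    from pystylometry.lexical import TTRAggregator, compute_ttr

    texts = ["first sample text ...", "second sample text ..."]
    results = [compute_ttr(t) for t in texts]
    group = TTRAggregator().aggregate(results, group_id="demo")
    print(group.text_count, group.ttr_mean)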
pystylometry/lexical/ttr.py
CHANGED
@@ -1,149 +1,340 @@
-"""Type-Token Ratio (TTR) analysis
+"""Type-Token Ratio (TTR) analysis with native chunked computation.
 
-
-
+Computes multiple TTR variants for measuring lexical diversity (vocabulary
+richness). All metrics are computed per-chunk and wrapped in Distribution
+objects for stylometric fingerprinting.
 
-
+Previously delegated to the external ``stylometry-ttr`` package; now
+computed inline using only the Python standard library (``math`` and
+``statistics``).
+
+Related GitHub Issues:
     #27 - Native chunked analysis with Distribution dataclass
     https://github.com/craigtrim/pystylometry/issues/27
+
+    #43 - Inline stylometry-ttr into pystylometry (remove external dependency)
+    https://github.com/craigtrim/pystylometry/issues/43
+
+References:
+    Guiraud, P. (1960). Problèmes et méthodes de la statistique linguistique.
+    Herdan, G. (1960). Type-token Mathematics: A Textbook of Mathematical
+        Linguistics. Mouton.
+    Johnson, W. (1944). Studies in language behavior: I. A program of research.
+        Psychological Monographs, 56(2), 1-15.
 """
 
 from __future__ import annotations
 
-
+import math
+import statistics
+from typing import Optional
+
+from .._types import Distribution, TTRAggregateResult, TTRResult, make_distribution
+from ..tokenizer import Tokenizer
+
+# ---------------------------------------------------------------------------
+# Internal helpers
+# ---------------------------------------------------------------------------
+
+# Minimum words required before STTR computation is meaningful.
+# With fewer words we cannot form at least two full chunks, so the
+# standardised metric would be unreliable.
+_MIN_WORDS_FOR_STTR = 2000
+
+
+def _compute_chunk_ttrs(tokens: list[str], chunk_size: int) -> list[float]:
+    """Compute per-chunk raw TTR values for non-overlapping chunks.
+
+    Only full-sized chunks are included so that every TTR is measured on the
+    same token count, keeping the standardised metric unbiased.
+
+    Args:
+        tokens: Full token list.
+        chunk_size: Number of tokens per chunk.
+
+    Returns:
+        List of per-chunk TTR values (may be empty if too few tokens).
+    """
+    total = len(tokens)
+    chunk_ttrs: list[float] = []
+    for i in range(0, total - chunk_size + 1, chunk_size):
+        chunk = tokens[i : i + chunk_size]
+        chunk_ttrs.append(len(set(chunk)) / chunk_size)
+    return chunk_ttrs
+
 
+def _compute_deltas(
+    chunk_ttrs: list[float],
+) -> tuple[Optional[float], Optional[float], Optional[float], Optional[float]]:
+    """Compute delta metrics: TTR(n) - TTR(n-1) for consecutive chunks.
 
-
+    Delta metrics capture chunk-to-chunk vocabulary variability:
+    - delta_mean: average change (positive = expanding vocabulary)
+    - delta_std: volatility of change (stylometric fingerprint)
+    - delta_min: largest negative swing
+    - delta_max: largest positive swing
+
+    Args:
+        chunk_ttrs: Per-chunk TTR values (needs >= 2 values).
+
+    Returns:
+        Tuple of (delta_mean, delta_std, delta_min, delta_max).
+        All ``None`` when fewer than 2 chunks are available.
     """
-
+    if len(chunk_ttrs) < 2:
+        return None, None, None, None
+
+    deltas = [chunk_ttrs[i] - chunk_ttrs[i - 1] for i in range(1, len(chunk_ttrs))]
+    d_mean = statistics.mean(deltas)
+    d_std = statistics.stdev(deltas) if len(deltas) > 1 else 0.0
+    return d_mean, d_std, min(deltas), max(deltas)
+
+
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+
 
-
-
-
+def compute_ttr(
+    text: str,
+    text_id: str | None = None,
+    chunk_size: int = 1000,
+) -> TTRResult:
+    """Compute Type-Token Ratio (TTR) metrics for vocabulary richness.
+
+    Tokenises the input with pystylometry's ``Tokenizer`` (lowercase, words
+    only), then computes five TTR-family metrics. Each metric is computed
+    per-chunk and the full per-chunk distribution is exposed via a
+    ``Distribution`` object for stylometric fingerprinting.
 
     Metrics computed:
-
-
-
-
-
+    - **Raw TTR**: ``unique_words / total_words``
+    - **Root TTR** (Guiraud's index): ``unique_words / sqrt(total_words)``
+    - **Log TTR** (Herdan's C): ``log(unique_words) / log(total_words)``
+    - **STTR**: Mean TTR across fixed-size chunks (reduces length bias).
+      Only computed when the text has >= 2000 words.
+    - **Delta Std**: Std-dev of chunk-to-chunk TTR change (vocabulary
+      consistency). Only computed when >= 2 chunks are available.
 
-    Related GitHub
+    Related GitHub Issues:
         #27 - Native chunked analysis with Distribution dataclass
         https://github.com/craigtrim/pystylometry/issues/27
 
+        #43 - Inline stylometry-ttr into pystylometry
+        https://github.com/craigtrim/pystylometry/issues/43
+
     References:
-        Guiraud, P. (1960). Problèmes et méthodes de la statistique
+        Guiraud, P. (1960). Problèmes et méthodes de la statistique
+        linguistique.
         Herdan, G. (1960). Type-token Mathematics: A Textbook of Mathematical
             Linguistics. Mouton.
-        Johnson, W. (1944). Studies in language behavior: I. A program of
-            Psychological Monographs, 56(2), 1-15.
+        Johnson, W. (1944). Studies in language behavior: I. A program of
+            research. Psychological Monographs, 56(2), 1-15.
 
     Args:
-        text: Input text to
-        text_id: Optional identifier for the text (
-        chunk_size: Number of words per chunk
-
-            so this parameter is included for API consistency but actual chunking
-            behavior is delegated to stylometry-ttr.
+        text: Input text to analyse.
+        text_id: Optional identifier for the text (stored in metadata).
+        chunk_size: Number of words per chunk for STTR and per-chunk
+            distributions (default: 1000).
 
     Returns:
-        TTRResult with all TTR variants
-        objects for stylometric fingerprinting.
+        TTRResult with all TTR variants, Distribution objects, and metadata.
 
     Example:
        >>> result = compute_ttr("The quick brown fox jumps over the lazy dog.")
         >>> print(f"Raw TTR: {result.ttr:.3f}")
-        Raw TTR:
+        Raw TTR: 1.000
         >>> print(f"Root TTR: {result.root_ttr:.3f}")
-        Root TTR:
-        >>> print(f"STTR: {result.sttr:.3f}")
-        STTR: 1.000
+        Root TTR: 3.000
 
         >>> # With text identifier
         >>> result = compute_ttr("Sample text here.", text_id="sample-001")
         >>> print(result.metadata["text_id"])
         sample-001
     """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    root_ttr_val = ttr_result.root_ttr
-    log_ttr_val = ttr_result.log_ttr
-    sttr_val = ttr_result.sttr if ttr_result.sttr is not None else 0.0
-    delta_std_val = ttr_result.delta_std if ttr_result.delta_std is not None else 0.0
-
-    # Create single-value distributions from stylometry-ttr results
-    # The stylometry-ttr package handles its own internal chunking for STTR
-    # so we wrap the aggregate results in Distribution objects
-    ttr_dist = (
-        make_distribution([ttr_val])
-        if ttr_val is not None
-        else Distribution(
-            values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
+    # Tokenise using pystylometry's own tokenizer (lowercase, words only)
+    tokenizer = Tokenizer(lowercase=True, strip_punctuation=True)
+    tokens = tokenizer.tokenize(text)
+
+    total_words = len(tokens)
+
+    # --- empty / trivial text --------------------------------------------------
+    if total_words == 0:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
         )
-
-
-
-
-
-
+        return TTRResult(
+            total_words=0,
+            unique_words=0,
+            ttr=0.0,
+            root_ttr=0.0,
+            log_ttr=0.0,
+            sttr=0.0,
+            delta_std=0.0,
+            ttr_dist=empty_dist,
+            root_ttr_dist=empty_dist,
+            log_ttr_dist=empty_dist,
+            sttr_dist=empty_dist,
+            delta_std_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=0,
+            metadata={
+                "text_id": text_id or "",
+                "sttr_available": False,
+                "delta_std_available": False,
+            },
         )
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+    # --- global metrics --------------------------------------------------------
+    unique_words = len(set(tokens))
+    ttr_val = unique_words / total_words
+    root_ttr_val = unique_words / math.sqrt(total_words)
+    log_ttr_val = math.log(unique_words) / math.log(total_words) if total_words > 1 else 0.0
+
+    # --- per-chunk metrics -----------------------------------------------------
+    chunk_ttrs = _compute_chunk_ttrs(tokens, chunk_size)
+    chunk_count = len(chunk_ttrs)
+
+    # STTR: mean TTR across chunks (only meaningful with enough text)
+    sttr_available = total_words >= _MIN_WORDS_FOR_STTR and chunk_count >= 1
+    if sttr_available:
+        sttr_val = statistics.mean(chunk_ttrs)
+    else:
+        sttr_val = 0.0
+
+    # Delta metrics
+    delta_mean, delta_std_val, delta_min, delta_max = _compute_deltas(chunk_ttrs)
+    delta_std_available = delta_std_val is not None
+    if delta_std_val is None:
+        delta_std_val = 0.0
+
+    # --- build Distribution objects --------------------------------------------
+    # For per-chunk distributions: compute root_ttr and log_ttr per chunk as well
+    if chunk_count >= 1:
+        ttr_dist = make_distribution(chunk_ttrs)
+
+        # Root TTR per chunk: for each chunk of chunk_size tokens,
+        # root_ttr = unique / sqrt(chunk_size)
+        root_ttr_chunks = [
+            len(set(tokens[i : i + chunk_size])) / math.sqrt(chunk_size)
+            for i in range(0, total_words - chunk_size + 1, chunk_size)
+        ]
+        root_ttr_dist = make_distribution(root_ttr_chunks)
+
+        # Log TTR per chunk
+        log_ttr_chunks = []
+        for i in range(0, total_words - chunk_size + 1, chunk_size):
+            chunk = tokens[i : i + chunk_size]
+            u = len(set(chunk))
+            t = len(chunk)
+            val = math.log(u) / math.log(t) if t > 1 else 0.0
+            log_ttr_chunks.append(val)
+        log_ttr_dist = make_distribution(log_ttr_chunks)
+
+        sttr_dist = (
+            make_distribution(chunk_ttrs) if sttr_available else make_distribution([sttr_val])
        )
-
-
-        make_distribution([delta_std_val])
-        if ttr_result.delta_std is not None
-        else Distribution(
-            values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
+        delta_std_dist = (
+            make_distribution([delta_std_val]) if delta_std_available else make_distribution([0.0])
         )
-
+    else:
+        # Not enough text for any chunks — wrap globals in single-value dists
+        ttr_dist = make_distribution([ttr_val])
+        root_ttr_dist = make_distribution([root_ttr_val])
+        log_ttr_dist = make_distribution([log_ttr_val])
+        sttr_dist = make_distribution([sttr_val])
+        delta_std_dist = make_distribution([0.0])
 
-    # Convert to our TTRResult dataclass
     return TTRResult(
-        total_words=
-        unique_words=
-        ttr=ttr_val
-        root_ttr=root_ttr_val
-        log_ttr=log_ttr_val
-        sttr=sttr_val,
-        delta_std=delta_std_val,
+        total_words=total_words,
+        unique_words=unique_words,
+        ttr=round(ttr_val, 6),
+        root_ttr=round(root_ttr_val, 4),
+        log_ttr=round(log_ttr_val, 6),
+        sttr=round(sttr_val, 6),
+        delta_std=round(delta_std_val, 6),
         ttr_dist=ttr_dist,
         root_ttr_dist=root_ttr_dist,
         log_ttr_dist=log_ttr_dist,
         sttr_dist=sttr_dist,
         delta_std_dist=delta_std_dist,
         chunk_size=chunk_size,
-        chunk_count=
+        chunk_count=chunk_count if chunk_count >= 1 else 1,
         metadata={
             "text_id": text_id or "",
-            "
-            "
-            "delta_std_available": ttr_result.delta_std is not None,
+            "sttr_available": sttr_available,
+            "delta_std_available": delta_std_available,
         },
     )
+
+
+# ---------------------------------------------------------------------------
+# Aggregation
+# ---------------------------------------------------------------------------
+
+
+class TTRAggregator:
+    """Aggregate per-text TTR results into group-level statistics.
+
+    Useful for comparing vocabulary richness across authors, genres, or
+    time periods by computing summary statistics (mean, std, min, max,
+    median) over a collection of ``TTRResult`` objects.
+
+    Related GitHub Issue:
+        #43 - Inline stylometry-ttr into pystylometry
+        https://github.com/craigtrim/pystylometry/issues/43
+
+    Example:
+        >>> from pystylometry.lexical import compute_ttr, TTRAggregator
+        >>> results = [compute_ttr(t) for t in texts]
+        >>> agg = TTRAggregator()
+        >>> stats = agg.aggregate(results, group_id="Shakespeare")
+        >>> print(stats.ttr_mean)
+        0.412
+    """
+
+    def aggregate(self, results: list[TTRResult], group_id: str) -> TTRAggregateResult:
+        """Compute aggregate statistics from multiple TTR results.
+
+        Args:
+            results: List of per-text ``TTRResult`` objects.
+            group_id: Identifier for the group (e.g. author name).
+
+        Returns:
+            ``TTRAggregateResult`` with group-level statistics.
+
+        Raises:
+            ValueError: If *results* is empty.
+        """
+        if not results:
+            raise ValueError("Cannot aggregate empty results list")
+
+        ttrs = [r.ttr for r in results]
+        root_ttrs = [r.root_ttr for r in results]
+        log_ttrs = [r.log_ttr for r in results]
+        sttrs = [r.sttr for r in results if r.metadata.get("sttr_available")]
+        delta_stds = [r.delta_std for r in results if r.metadata.get("delta_std_available")]
+
+        return TTRAggregateResult(
+            group_id=group_id,
+            text_count=len(results),
+            total_words=sum(r.total_words for r in results),
+            ttr_mean=round(statistics.mean(ttrs), 6),
+            ttr_std=round(statistics.stdev(ttrs), 6) if len(ttrs) > 1 else 0.0,
+            ttr_min=round(min(ttrs), 6),
+            ttr_max=round(max(ttrs), 6),
+            ttr_median=round(statistics.median(ttrs), 6),
+            root_ttr_mean=round(statistics.mean(root_ttrs), 4),
+            root_ttr_std=round(statistics.stdev(root_ttrs), 4) if len(root_ttrs) > 1 else 0.0,
+            log_ttr_mean=round(statistics.mean(log_ttrs), 6),
+            log_ttr_std=round(statistics.stdev(log_ttrs), 6) if len(log_ttrs) > 1 else 0.0,
+            sttr_mean=round(statistics.mean(sttrs), 6) if sttrs else None,
+            sttr_std=round(statistics.stdev(sttrs), 6) if len(sttrs) > 1 else None,
+            delta_std_mean=round(statistics.mean(delta_stds), 6) if delta_stds else None,
+            metadata={"group_id": group_id, "text_count": len(results)},
+        )
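The formulas in the new module are simple enough to check by hand. A worked example, assuming the tokenizer yields the six lowercase words shown (five unique types), using only the standard library as the module itself does:

    import math

    tokens = ["the", "cat", "sat", "on", "the", "mat"]
    n, v = len(tokens), len(set(tokens))

    ttr = v / n                          # raw TTR: 5/6 ≈ 0.833
    root_ttr = v / math.sqrt(n)          # Guiraud's index: 5/sqrt(6) ≈ 2.041
    log_ttr = math.log(v) / math.log(n)  # Herdan's C ≈ 0.898

    print(round(ttr, 3), round(root_ttr, 3), round(log_ttr, 3))

With fewer than 2000 words, compute_ttr reports sttr as 0.0 and flags it via metadata["sttr_available"] = False, matching the _MIN_WORDS_FOR_STTR guard above.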
{pystylometry-1.3.1.dist-info → pystylometry-1.3.5.dist-info}/METADATA
CHANGED
@@ -1,13 +1,13 @@
 Metadata-Version: 2.1
 Name: pystylometry
-Version: 1.3.1
+Version: 1.3.5
 Summary: Comprehensive Python package for stylometric analysis
 License: MIT
 Keywords: stylometry,nlp,text-analysis,authorship,readability,lexical-diversity,readability-metrics
 Author: Craig Trim
 Author-email: craigtrim@gmail.com
 Requires-Python: >=3.9,<4.0
-Classifier: Development Status ::
+Classifier: Development Status :: 5 - Production/Stable
 Classifier: Intended Audience :: Developers
 Classifier: Intended Audience :: Science/Research
 Classifier: License :: OSI Approved :: MIT License
@@ -19,7 +19,6 @@ Classifier: Programming Language :: Python :: 3.12
 Classifier: Topic :: Scientific/Engineering :: Information Analysis
 Classifier: Topic :: Text Processing :: Linguistic
 Classifier: Typing :: Typed
-Requires-Dist: stylometry-ttr (>=1.0.3,<2.0.0)
 Project-URL: Homepage, https://github.com/craigtrim/pystylometry
 Project-URL: Issues, https://github.com/craigtrim/pystylometry/issues
 Project-URL: Repository, https://github.com/craigtrim/pystylometry
{pystylometry-1.3.1.dist-info → pystylometry-1.3.5.dist-info}/RECORD
CHANGED
@@ -1,7 +1,7 @@
 pystylometry/README.md,sha256=WFOtCAF3qtDTgGG3a_jTjNSwVgpQEXI1PKqbVBfyo1M,2366
-pystylometry/__init__.py,sha256=
+pystylometry/__init__.py,sha256=bZ8xk66Mx7gj3K_I6594DoqATIGv1FtLFSJmF6Dz1g4,10462
 pystylometry/_normalize.py,sha256=7tdfgAKg5CI2d4eoDypmFqOVByoxpwgUUZD6vyBH86A,8679
-pystylometry/_types.py,sha256=
+pystylometry/_types.py,sha256=_YCkVyvHulmKkvmjzb73dcCOWJwiJZVhkV7sJcMr4YY,83618
 pystylometry/_utils.py,sha256=CXTx4KDJ_6iiHcc2OXqOYs-izhLf_ZEmJFKdHyd7q34,5282
 pystylometry/authorship/README.md,sha256=zNXCpLj7nczPnYykJnCUw3y-kxfC9mWZmngi3nfw6us,1016
 pystylometry/authorship/__init__.py,sha256=D7m38hWi_62o1ZDSrghLCfob9YsykTht4K37wiVgHfg,1530
@@ -13,7 +13,7 @@ pystylometry/authorship/zeta.py,sha256=oOi9Y6ZPq15ILLVl6So9O9ERvzig26en6_dpQJWeo
 pystylometry/character/README.md,sha256=poQwhbI8MabVD_626CWjEL87IOX5YDGS0ZJTH1hNwEE,607
 pystylometry/character/__init__.py,sha256=CiiKJmZ10UJE8qAecavpOKyw-vGonsOew_mFH34ZOC0,371
 pystylometry/character/character_metrics.py,sha256=OCIGP_ivtwtzcifcxcbmp2R5SIKh2tKyvKcHAv64S8g,14029
-pystylometry/cli.py,sha256=
+pystylometry/cli.py,sha256=HvzBZxFSiS5AAXCb6N9Eo3QonkH-ucRFp6xDF1kJTQ0,26380
 pystylometry/consistency/README.md,sha256=HG_Rd6WRBnIz3M7J11dVDv1S2ARkMABFYrTn-VV8xRY,1058
 pystylometry/consistency/__init__.py,sha256=l7nzpS7M4yHDBbM2LGAtW0XGT2n7YjSey_1xKf45224,2181
 pystylometry/consistency/_thresholds.py,sha256=5fZwdJ_cnDy0ED7CCYs6V_zP6kIAR1p0h0NYkbZ0HRg,6381
@@ -24,13 +24,13 @@ pystylometry/dialect/_data/dialect_markers.json,sha256=DthluOA6q0rG_8IrCrFIYWh_E
 pystylometry/dialect/_loader.py,sha256=M2ATp-5754v_yX9EWvBP0r5qgNf8xlL8XadVsVb_Hco,12989
 pystylometry/dialect/detector.py,sha256=9x0ZuIfTIjsmdNSx0Ezy5AC0SAFtC4kVw11iOSBd9gQ,20147
 pystylometry/lexical/README.md,sha256=cFQ7KRZV4ubsQwIlOH3YHTbhhNl5X91Sr3zcn-3x0HI,1185
-pystylometry/lexical/__init__.py,sha256=
+pystylometry/lexical/__init__.py,sha256=ib_F-NGVydLNGT_HgaWurBT25AadTE4eNcAN1lGMKmQ,934
 pystylometry/lexical/advanced_diversity.py,sha256=rL1hlNqTnaEFcA2v4oBJlojHZMTqdvvm4jYXTFGVpYE,25664
 pystylometry/lexical/function_words.py,sha256=eel9bq_qWgWlvG0NtDiouilMt9kaFqz2rh3add2UC4U,17832
 pystylometry/lexical/hapax.py,sha256=djTqZyZIYXa3GRiPoy6TTGHPm0wCRNJ9U0Rwnf5NoDk,12173
 pystylometry/lexical/mtld.py,sha256=XpeCF8sOXZhWbaazHGuqm08mrOf_DYfkfGGAltWnyy4,7101
 pystylometry/lexical/repetition.py,sha256=A9L0oNwfnCepVkWy57kjHV47Pw4M6fZXEl25hBVdq2s,18318
-pystylometry/lexical/ttr.py,sha256=
+pystylometry/lexical/ttr.py,sha256=igS8gnvIv57zvjQPtmIgkB5Wj7jdaKSMRpJ1WvMfKtw,13091
 pystylometry/lexical/word_frequency_sophistication.py,sha256=OHOS0fBvd1Bz8zsJk-pJbWLTgImmBd-aewQnp_kq8BY,38828
 pystylometry/lexical/yule.py,sha256=NXggha8jmQCu4i-qKZpISwyJBqNpuPHyVR86BLDLgio,5192
 pystylometry/ngrams/README.md,sha256=50wyaWcLGbosLzTPR1cXdE_xAVU8jVY7fd3ReEk9KnY,802
@@ -71,8 +71,8 @@ pystylometry/viz/jsx/_base.py,sha256=nd7kEc13fUcRMom3A5jqjGyTy-djIeydq2k3oPHZIHY
 pystylometry/viz/jsx/report.py,sha256=DbbHnnNAEi5tmVg4PmiHb17vkBBXujyE4x1CfVBiOBw,25857
 pystylometry/viz/jsx/timeline.py,sha256=hor-xnBa6oVkSqN0AEZUCQFBOB-iTfHSFZHiEfeakPA,30716
 pystylometry/viz/jsx/viewer.py,sha256=3LO49d_2bRf_P-P-2oSKpKx4N8Ugo4oCLb3DtvyNxXI,43716
-pystylometry-1.3.
-pystylometry-1.3.
-pystylometry-1.3.
-pystylometry-1.3.
-pystylometry-1.3.
+pystylometry-1.3.5.dist-info/LICENSE,sha256=vou5JCLAT5nHcsUv-AkjUYAihYfN9mwPDXxV2DHyHBo,1067
+pystylometry-1.3.5.dist-info/METADATA,sha256=DEFzNf_ZQd3mulwOnFMRSyc1s30alGM6UtO-L7nloPc,4779
+pystylometry-1.3.5.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+pystylometry-1.3.5.dist-info/entry_points.txt,sha256=XsJvKgKs3LRDuzdF45JO7ZnS0wGKg1f5qsxVYSZzLp8,165
+pystylometry-1.3.5.dist-info/RECORD,,

{pystylometry-1.3.1.dist-info → pystylometry-1.3.5.dist-info}/LICENSE
File without changes

{pystylometry-1.3.1.dist-info → pystylometry-1.3.5.dist-info}/WHEEL
File without changes