pystylometry-1.3.1-py3-none-any.whl → pystylometry-1.3.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/__init__.py +42 -3
- pystylometry/_types.py +53 -3
- pystylometry/cli.py +695 -0
- pystylometry/lexical/__init__.py +4 -1
- pystylometry/lexical/bnc_frequency.py +309 -0
- pystylometry/lexical/ttr.py +288 -97
- pystylometry/viz/jsx/__init__.py +2 -0
- pystylometry/viz/jsx/bnc_frequency.py +495 -0
- {pystylometry-1.3.1.dist-info → pystylometry-1.3.6.dist-info}/METADATA +16 -3
- {pystylometry-1.3.1.dist-info → pystylometry-1.3.6.dist-info}/RECORD +13 -11
- {pystylometry-1.3.1.dist-info → pystylometry-1.3.6.dist-info}/entry_points.txt +2 -0
- {pystylometry-1.3.1.dist-info → pystylometry-1.3.6.dist-info}/LICENSE +0 -0
- {pystylometry-1.3.1.dist-info → pystylometry-1.3.6.dist-info}/WHEEL +0 -0
pystylometry/cli.py
CHANGED
@@ -3,6 +3,8 @@
 Usage:
     pystylometry-drift <file> [--window-size=N] [--stride=N] [--mode=MODE] [--json]
    pystylometry-drift <file> --plot [output.png]
+    pystylometry-tokenize <file> [--json] [--metadata] [--stats]
+    bnc --input-file <file> [--output-file <file>] [--format csv|html|json]

 Example:
     pystylometry-drift manuscript.txt
@@ -10,6 +12,12 @@ Example:
     pystylometry-drift manuscript.txt --json
     pystylometry-drift manuscript.txt --plot
     pystylometry-drift manuscript.txt --plot drift_report.png
+    pystylometry-tokenize manuscript.txt
+    pystylometry-tokenize manuscript.txt --json --metadata
+    pystylometry-tokenize manuscript.txt --stats
+    bnc --input-file manuscript.txt
+    bnc --input-file manuscript.txt --output-file report.html --format html
+    bnc -i manuscript.txt --format json
 """

 from __future__ import annotations
@@ -423,5 +431,692 @@ The generated viewer includes:
     print()


+def tokenize_cli() -> None:
+    """CLI entry point for stylometric tokenization."""
+    parser = argparse.ArgumentParser(
+        prog="pystylometry-tokenize",
+        description="Tokenize text for stylometric analysis.",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  pystylometry-tokenize manuscript.txt
+  pystylometry-tokenize manuscript.txt --json
+  pystylometry-tokenize manuscript.txt --json --metadata
+  pystylometry-tokenize manuscript.txt --stats
+  pystylometry-tokenize manuscript.txt -U --expand-contractions
+  pystylometry-tokenize manuscript.txt --min-length 3 --strip-numbers
+""",
+    )
+
+    parser.add_argument(
+        "file",
+        type=Path,
+        help="Path to text file to tokenize",
+    )
+
+    # Output mode
+    output_group = parser.add_argument_group("output")
+    output_group.add_argument(
+        "-j",
+        "--json",
+        action="store_true",
+        help="Output as JSON (list of strings, or list of objects with --metadata)",
+    )
+    output_group.add_argument(
+        "-m",
+        "--metadata",
+        action="store_true",
+        help="Include token type and position metadata (implies --json)",
+    )
+    output_group.add_argument(
+        "-s",
+        "--stats",
+        action="store_true",
+        help="Show tokenization statistics instead of tokens",
+    )
+
+    # Core behavior
+    behavior_group = parser.add_argument_group("behavior")
+    behavior_group.add_argument(
+        "-U",
+        "--no-lowercase",
+        action="store_true",
+        help="Preserve original case (default: lowercase)",
+    )
+    behavior_group.add_argument(
+        "-e",
+        "--expand-contractions",
+        action="store_true",
+        help="Expand contractions (it's -> it is)",
+    )
+    behavior_group.add_argument(
+        "-n",
+        "--strip-numbers",
+        action="store_true",
+        help="Remove numeric tokens",
+    )
+    behavior_group.add_argument(
+        "--keep-punctuation",
+        action="store_true",
+        help="Keep punctuation tokens (default: stripped)",
+    )
+
+    # Filtering
+    filter_group = parser.add_argument_group("filtering")
+    filter_group.add_argument(
+        "--min-length",
+        type=int,
+        default=1,
+        metavar="N",
+        help="Minimum token length (default: 1)",
+    )
+    filter_group.add_argument(
+        "--max-length",
+        type=int,
+        default=None,
+        metavar="N",
+        help="Maximum token length (default: unlimited)",
+    )
+    filter_group.add_argument(
+        "--preserve-urls",
+        action="store_true",
+        help="Keep URL tokens",
+    )
+    filter_group.add_argument(
+        "--preserve-emails",
+        action="store_true",
+        help="Keep email tokens",
+    )
+    filter_group.add_argument(
+        "--preserve-hashtags",
+        action="store_true",
+        help="Keep hashtag tokens",
+    )
+    filter_group.add_argument(
+        "--preserve-mentions",
+        action="store_true",
+        help="Keep @mention tokens",
+    )
+
+    # Advanced
+    advanced_group = parser.add_argument_group("advanced")
+    advanced_group.add_argument(
+        "--expand-abbreviations",
+        action="store_true",
+        help="Expand abbreviations (Dr. -> Doctor)",
+    )
+    advanced_group.add_argument(
+        "--strip-accents",
+        action="store_true",
+        help="Remove accents from characters",
+    )
+    advanced_group.add_argument(
+        "--no-clean",
+        action="store_true",
+        help="Skip text cleaning (italics, brackets, page markers)",
+    )
+    advanced_group.add_argument(
+        "--no-unicode-normalize",
+        action="store_true",
+        help="Skip unicode normalization",
+    )
+
+    args = parser.parse_args()
+
+    # --- ANSI colors ---
+    use_color = sys.stderr.isatty()
+
+    def _c(code: str, text: str) -> str:
+        return f"\033[{code}m{text}\033[0m" if use_color else text
+
+    bold = lambda t: _c("1", t)  # noqa: E731
+    dim = lambda t: _c("2", t)  # noqa: E731
+    cyan = lambda t: _c("36", t)  # noqa: E731
+    green = lambda t: _c("32", t)  # noqa: E731
+    yellow = lambda t: _c("33", t)  # noqa: E731
+
+    # --- Validate file ---
+    if not args.file.exists():
+        print(f"Error: File not found: {args.file}", file=sys.stderr)
+        sys.exit(1)
+
+    try:
+        text = args.file.read_text(encoding="utf-8")
+    except Exception as e:
+        print(f"Error reading file: {e}", file=sys.stderr)
+        sys.exit(1)
+
+    # --- Build Tokenizer kwargs ---
+    tokenizer_kwargs = {
+        "lowercase": not args.no_lowercase,
+        "min_length": args.min_length,
+        "max_length": args.max_length,
+        "strip_numbers": args.strip_numbers,
+        "strip_punctuation": not args.keep_punctuation,
+        "preserve_urls": args.preserve_urls,
+        "preserve_emails": args.preserve_emails,
+        "preserve_hashtags": args.preserve_hashtags,
+        "preserve_mentions": args.preserve_mentions,
+        "expand_contractions": args.expand_contractions,
+        "expand_abbreviations": args.expand_abbreviations,
+        "strip_accents": args.strip_accents,
+        "normalize_unicode": not args.no_unicode_normalize,
+        "clean_text": not args.no_clean,
+    }
+
+    # Collect active options for banner
+    active_opts = []
+    if args.no_lowercase:
+        active_opts.append("preserve case")
+    if args.expand_contractions:
+        active_opts.append("expand contractions")
+    if args.expand_abbreviations:
+        active_opts.append("expand abbreviations")
+    if args.strip_numbers:
+        active_opts.append("strip numbers")
+    if args.keep_punctuation:
+        active_opts.append("keep punctuation")
+    if args.strip_accents:
+        active_opts.append("strip accents")
+    if args.no_clean:
+        active_opts.append("skip cleaning")
+    if args.no_unicode_normalize:
+        active_opts.append("skip unicode normalization")
+    if args.preserve_urls:
+        active_opts.append("preserve URLs")
+    if args.preserve_emails:
+        active_opts.append("preserve emails")
+    if args.preserve_hashtags:
+        active_opts.append("preserve hashtags")
+    if args.preserve_mentions:
+        active_opts.append("preserve mentions")
+    if args.min_length > 1:
+        active_opts.append(f"min length {args.min_length}")
+    if args.max_length is not None:
+        active_opts.append(f"max length {args.max_length}")
+
+    # Determine output format
+    if args.stats:
+        output_format = "Statistics"
+    elif args.metadata:
+        output_format = "JSON (with metadata)"
+    elif args.json:
+        output_format = "JSON"
+    else:
+        output_format = "One token per line"
+
+    # --- Banner (to stderr so stdout stays pipeable) ---
+    char_count = len(text)
+    line_count = text.count("\n") + 1
+
+    banner = sys.stderr
+    print(file=banner)
+    print(f" {bold('PYSTYLOMETRY')} {dim('—')} {cyan('Stylometric Tokenizer')}", file=banner)
+    print(f" {dim('═' * 71)}", file=banner)
+    print(file=banner)
+    print(f" {bold('INPUT')}", file=banner)
+    print(f" {dim('─' * 71)}", file=banner)
+    print(f" File: {args.file}", file=banner)
+    print(f" Size: {char_count:,} characters / {line_count:,} lines", file=banner)
+    print(file=banner)
+    print(f" {bold('CONFIGURATION')}", file=banner)
+    print(f" {dim('─' * 71)}", file=banner)
+    print(f" Case: {'preserve' if args.no_lowercase else 'lowercase'}", file=banner)
+    print(
+        f" Punctuation: {'keep' if args.keep_punctuation else 'strip'}",
+        file=banner,
+    )
+    print(
+        f" Contractions: {'expand' if args.expand_contractions else 'preserve'}",
+        file=banner,
+    )
+    print(f" Numbers: {'strip' if args.strip_numbers else 'keep'}", file=banner)
+    if active_opts:
+        print(f" Active options: {', '.join(active_opts)}", file=banner)
+    print(file=banner)
+    print(f" {bold('OUTPUT')}", file=banner)
+    print(f" {dim('─' * 71)}", file=banner)
+    print(f" Format: {output_format}", file=banner)
+    print(file=banner)
+
+    # --- Tokenize ---
+    from pystylometry.tokenizer import Tokenizer
+
+    tokenizer = Tokenizer(**tokenizer_kwargs)
+
+    if args.stats:
+        stats = tokenizer.get_statistics(text)
+        print(f" {bold('RESULTS')}", file=banner)
+        print(f" {dim('─' * 71)}", file=banner)
+        print(f" Total tokens: {green(f'{stats.total_tokens:,}')}", file=banner)
+        print(f" Unique tokens: {green(f'{stats.unique_tokens:,}')}", file=banner)
+        print(f" Word tokens: {stats.word_tokens:,}", file=banner)
+        print(f" Number tokens: {stats.number_tokens:,}", file=banner)
+        print(f" Punctuation: {stats.punctuation_tokens:,}", file=banner)
+        print(f" URLs: {stats.url_tokens:,}", file=banner)
+        print(f" Emails: {stats.email_tokens:,}", file=banner)
+        print(f" Hashtags: {stats.hashtag_tokens:,}", file=banner)
+        print(f" Mentions: {stats.mention_tokens:,}", file=banner)
+        print(f" Avg length: {stats.average_token_length:.1f}", file=banner)
+        print(f" Min length: {stats.min_token_length}", file=banner)
+        print(f" Max length: {stats.max_token_length}", file=banner)
+        print(file=banner)
+
+        if args.json:
+            import dataclasses
+
+            print(json.dumps(dataclasses.asdict(stats), indent=2))
+
+    elif args.metadata or (args.json and args.metadata):
+        metadata_list = tokenizer.tokenize_with_metadata(text)
+        count = len(metadata_list)
+        print(
+            f" {yellow('Tokenizing...')} {green(f'{count:,}')} tokens extracted",
+            file=banner,
+        )
+        print(file=banner)
+        output = [
+            {
+                "token": m.token,
+                "start": m.start,
+                "end": m.end,
+                "type": m.token_type,
+            }
+            for m in metadata_list
+        ]
+        print(json.dumps(output, indent=2))
+
+    elif args.json:
+        tokens = tokenizer.tokenize(text)
+        count = len(tokens)
+        print(
+            f" {yellow('Tokenizing...')} {green(f'{count:,}')} tokens extracted",
+            file=banner,
+        )
+        print(file=banner)
+        print(json.dumps(tokens, indent=2))
+
+    else:
+        tokens = tokenizer.tokenize(text)
+        count = len(tokens)
+        print(
+            f" {yellow('Tokenizing...')} {green(f'{count:,}')} tokens extracted",
+            file=banner,
+        )
+        print(file=banner)
+        for token in tokens:
+            print(token)
+
+
+def bnc_frequency_cli() -> None:
+    """CLI entry point for BNC word frequency analysis."""
+    parser = argparse.ArgumentParser(
+        prog="bnc",
+        description="Analyze word frequencies against the British National Corpus (BNC).",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  bnc --input-file manuscript.txt
+  bnc --input-file manuscript.txt --output-file report.html
+  bnc --input-file manuscript.txt --format json
+  bnc --input-file manuscript.txt --overuse-threshold 2.0 --min-mentions 3
+  bnc --input-file manuscript.txt --no-wordnet
+
+Output:
+  Generates a report with three sections:
+  - Not in BNC: Words not found in the corpus (with WordNet status, character type)
+  - Most Underused: Words appearing less frequently than expected
+  - Most Overused: Words appearing more frequently than expected
+
+Thresholds:
+  Words with ratio > overuse-threshold are "overused"
+  Words with ratio < underuse-threshold are "underused"
+  Ratio = observed_count / expected_count (based on BNC frequencies)
+""",
+    )
+
+    parser.add_argument(
+        "--input-file",
+        "-i",
+        type=Path,
+        required=True,
+        metavar="FILE",
+        help="Path to text file to analyze",
+    )
+    parser.add_argument(
+        "--output-file",
+        "-o",
+        type=Path,
+        default=None,
+        metavar="FILE",
+        help="Output file (default: <input>_bnc_frequency.<ext> based on --format)",
+    )
+    parser.add_argument(
+        "--overuse-threshold",
+        type=float,
+        default=1.3,
+        metavar="N",
+        help="Ratio above which words are considered overused (default: 1.3)",
+    )
+    parser.add_argument(
+        "--underuse-threshold",
+        type=float,
+        default=0.8,
+        metavar="N",
+        help="Ratio below which words are considered underused (default: 0.8)",
+    )
+    parser.add_argument(
+        "--min-mentions",
+        type=int,
+        default=1,
+        metavar="N",
+        help="Minimum word occurrences to include (default: 1)",
+    )
+    parser.add_argument(
+        "--no-wordnet",
+        action="store_true",
+        help="Skip WordNet lookup for unknown words",
+    )
+    parser.add_argument(
+        "--format",
+        choices=["csv", "html", "json", "excel"],
+        default="csv",
+        help="Output format: csv (tab-delimited), html (interactive), json, excel (default: csv)",
+    )
+
+    args = parser.parse_args()
+
+    # Import rich for colored output
+    from rich.console import Console
+    from rich.panel import Panel
+    from rich.table import Table
+    from rich.text import Text
+
+    console = Console(stderr=True)
+
+    # Validate file exists
+    if not args.input_file.exists():
+        console.print(f"[red]Error:[/red] File not found: {args.input_file}")
+        sys.exit(1)
+
+    # Read file
+    try:
+        text = args.input_file.read_text(encoding="utf-8")
+    except Exception as e:
+        console.print(f"[red]Error reading file:[/red] {e}")
+        sys.exit(1)
+
+    # Determine output path (extension based on format)
+    suffix_map = {"csv": ".tsv", "html": ".html", "json": ".json", "excel": ".xlsx"}
+    if args.output_file:
+        output_path = args.output_file
+    else:
+        suffix = suffix_map[args.format]
+        output_path = args.input_file.with_name(f"{args.input_file.stem}_bnc_frequency{suffix}")
+
+    # Calculate file stats
+    token_count = len(text.split())
+    char_count = len(text)
+
+    # Print header
+    console.print()
+    header = Text()
+    header.append("PYSTYLOMETRY", style="bold cyan")
+    header.append(" — ", style="dim")
+    header.append("BNC Word Frequency Analysis", style="bold white")
+    console.print(Panel(header, border_style="cyan"))
+
+    # Input section
+    console.print()
+    console.print("[bold]INPUT[/bold]", style="cyan")
+    console.print("─" * 60, style="dim")
+    console.print(f" File: [white]{args.input_file}[/white]")
+    console.print(
+        f" Size: [green]{char_count:,}[/green] chars / [green]{token_count:,}[/green] tokens"
+    )
+
+    # Parameters section
+    console.print()
+    console.print("[bold]PARAMETERS[/bold]", style="cyan")
+    console.print("─" * 60, style="dim")
+    console.print(f" Overuse threshold: [yellow]{args.overuse_threshold}x[/yellow]")
+    console.print(f" Underuse threshold: [yellow]{args.underuse_threshold}x[/yellow]")
+    console.print(f" Min mentions: [yellow]{args.min_mentions}[/yellow]")
+    console.print(f" WordNet lookup: [yellow]{'no' if args.no_wordnet else 'yes'}[/yellow]")
+
+    # Output section
+    console.print()
+    console.print("[bold]OUTPUT[/bold]", style="cyan")
+    console.print("─" * 60, style="dim")
+    fmt_display = {
+        "csv": "Tab-delimited CSV",
+        "html": "Interactive HTML",
+        "json": "JSON",
+        "excel": "Excel (.xlsx)",
+    }
+    console.print(f" Format: [magenta]{fmt_display[args.format]}[/magenta]")
+    console.print(f" Destination: [white]{output_path}[/white]")
+
+    # Run analysis with spinner
+    console.print()
+    with console.status("[bold cyan]Running analysis...[/bold cyan]", spinner="dots"):
+        from pystylometry.lexical.bnc_frequency import compute_bnc_frequency
+
+        result = compute_bnc_frequency(
+            text,
+            overuse_threshold=args.overuse_threshold,
+            underuse_threshold=args.underuse_threshold,
+            include_wordnet=not args.no_wordnet,
+            min_mentions=args.min_mentions,
+        )
+
+    # Output results
+    if args.format == "json":
+        output = {
+            "stats": {
+                "total_tokens": result.total_tokens,
+                "unique_tokens": result.unique_tokens,
+                "overused_count": len(result.overused),
+                "underused_count": len(result.underused),
+                "not_in_bnc_count": len(result.not_in_bnc),
+            },
+            "overused": [
+                {
+                    "word": w.word,
+                    "observed": w.observed,
+                    "expected": w.expected,
+                    "ratio": w.ratio,
+                    "char_type": w.char_type,
+                }
+                for w in result.overused
+            ],
+            "underused": [
+                {
+                    "word": w.word,
+                    "observed": w.observed,
+                    "expected": w.expected,
+                    "ratio": w.ratio,
+                    "char_type": w.char_type,
+                }
+                for w in result.underused
+            ],
+            "not_in_bnc": [
+                {
+                    "word": w.word,
+                    "observed": w.observed,
+                    "in_wordnet": w.in_wordnet,
+                    "char_type": w.char_type,
+                }
+                for w in result.not_in_bnc
+            ],
+        }
+        output_path.write_text(json.dumps(output, indent=2))
+        console.print(f'[green]✓[/green] JSON saved to: [white]"{output_path}"[/white]')
+
+    elif args.format == "csv":
+        # Tab-delimited output with category column
+        lines = ["category\tword\tobserved\texpected\tratio\tin_wordnet\tchar_type"]
+
+        def fmt_wordnet(val: bool | None) -> str:
+            if val is True:
+                return "yes"
+            elif val is False:
+                return "no"
+            return ""
+
+        for w in result.overused:
+            expected = f"{w.expected:.2f}" if w.expected else ""
+            ratio = f"{w.ratio:.4f}" if w.ratio else ""
+            in_wn = fmt_wordnet(w.in_wordnet)
+            lines.append(
+                f"overused\t{w.word}\t{w.observed}\t{expected}\t{ratio}\t{in_wn}\t{w.char_type}"
+            )
+
+        for w in result.underused:
+            expected = f"{w.expected:.2f}" if w.expected else ""
+            ratio = f"{w.ratio:.4f}" if w.ratio else ""
+            in_wn = fmt_wordnet(w.in_wordnet)
+            lines.append(
+                f"underused\t{w.word}\t{w.observed}\t{expected}\t{ratio}\t{in_wn}\t{w.char_type}"
+            )
+
+        for w in result.not_in_bnc:
+            in_wn = fmt_wordnet(w.in_wordnet)
+            lines.append(f"not-in-bnc\t{w.word}\t{w.observed}\t\t\t{in_wn}\t{w.char_type}")
+
+        output_path.write_text("\n".join(lines))
+        console.print(f'[green]✓[/green] TSV saved to: [white]"{output_path}"[/white]')
+
+    elif args.format == "excel":
+        try:
+            from openpyxl import Workbook  # type: ignore[import-untyped]
+            from openpyxl.styles import Alignment, PatternFill  # type: ignore[import-untyped]
+        except ImportError:
+            console.print("[red]Error:[/red] Excel export requires openpyxl.")
+            console.print(" Install with: [yellow]pip install pystylometry[excel][/yellow]")
+            console.print(" Or for pipx: [yellow]pipx inject pystylometry openpyxl[/yellow]")
+            sys.exit(1)
+
+        wb = Workbook()
+
+        # Remove default sheet
+        wb.remove(wb.active)
+
+        # Cell style: width 15, centered, vertically centered
+        align = Alignment(horizontal="center", vertical="center")
+
+        def fmt_wordnet_excel(val: bool | None) -> str:
+            if val is True:
+                return "yes"
+            elif val is False:
+                return "no"
+            return ""
+
+        # Overused sheet (sorted by ratio, high to low)
+        ws_over = wb.create_sheet("overused")
+        ws_over.append(["word", "observed", "expected", "ratio", "in_wordnet", "char_type"])
+        for w in sorted(result.overused, key=lambda x: x.ratio or 0, reverse=True):
+            in_wn = fmt_wordnet_excel(w.in_wordnet)
+            ws_over.append([w.word, w.observed, w.expected, w.ratio, in_wn, w.char_type])
+
+        # Underused sheet (sorted by ratio, high to low)
+        ws_under = wb.create_sheet("underused")
+        ws_under.append(["word", "observed", "expected", "ratio", "in_wordnet", "char_type"])
+        for w in sorted(result.underused, key=lambda x: x.ratio or 0, reverse=True):
+            in_wn = fmt_wordnet_excel(w.in_wordnet)
+            ws_under.append([w.word, w.observed, w.expected, w.ratio, in_wn, w.char_type])
+
+        # Not in BNC sheet
+        ws_notbnc = wb.create_sheet("not-in-bnc")
+        ws_notbnc.append(["word", "observed", "in_wordnet", "char_type", "plural_form"])
+        for w in result.not_in_bnc:
+            in_wn = fmt_wordnet_excel(w.in_wordnet)
+            plural = "yes" if w.word.endswith("s") else "no"
+            ws_notbnc.append([w.word, w.observed, in_wn, w.char_type, plural])
+
+        # Apply formatting to all sheets
+        for ws in [ws_over, ws_under, ws_notbnc]:
+            for col in ws.columns:
+                col_letter = col[0].column_letter
+                # Word column (A) gets width 30, others get 15
+                ws.column_dimensions[col_letter].width = 30 if col_letter == "A" else 15
+            for row in ws.iter_rows():
+                for cell in row:
+                    cell.alignment = align
+
+        # Apply number formatting to expected (C) and ratio (D) columns
+        for ws in [ws_over, ws_under]:
+            for row in range(2, ws.max_row + 1):  # Skip header row
+                ws[f"C{row}"].number_format = "0.00"
+                ws[f"D{row}"].number_format = "0.00"
+
+        # Apply background colors to in_wordnet column
+        fill_yes = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
+        fill_no = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")
+
+        # in_wordnet is column E for overused/underused, column C for not-in-bnc
+        for ws in [ws_over, ws_under]:
+            for row in range(2, ws.max_row + 1):
+                cell = ws[f"E{row}"]
+                if cell.value == "yes":
+                    cell.fill = fill_yes
+                elif cell.value == "no":
+                    cell.fill = fill_no
+
+        for row in range(2, ws_notbnc.max_row + 1):
+            cell = ws_notbnc[f"C{row}"]
+            if cell.value == "yes":
+                cell.fill = fill_yes
+            elif cell.value == "no":
+                cell.fill = fill_no
+
+        # Apply background colors to plural_form column (E) in not-in-bnc
+        fill_plural_yes = PatternFill(
+            start_color="BDD7EE", end_color="BDD7EE", fill_type="solid"
+        )  # Light blue
+        fill_plural_no = PatternFill(
+            start_color="FCE4D6", end_color="FCE4D6", fill_type="solid"
+        )  # Light peach
+        for row in range(2, ws_notbnc.max_row + 1):
+            cell = ws_notbnc[f"E{row}"]
+            if cell.value == "yes":
+                cell.fill = fill_plural_yes
+            elif cell.value == "no":
+                cell.fill = fill_plural_no
+
+        wb.save(output_path)
+        console.print(f'[green]✓[/green] Excel saved to: [white]"{output_path}"[/white]')
+
+    else:  # html
+        from pystylometry.viz.jsx import export_bnc_frequency_jsx
+
+        export_bnc_frequency_jsx(
+            result,
+            output_file=output_path,
+            title=f"BNC Frequency Analysis: {args.input_file.name}",
+            source_file=str(args.input_file),
+        )
+
+        abs_path = output_path.resolve()
+        file_url = f"file://{abs_path}"
+        console.print(f'[green]✓[/green] HTML report saved to: [white]"{output_path}"[/white]')
+        console.print(f" Open in browser: [link={file_url}]{file_url}[/link]")
+
+    # Summary table
+    console.print()
+    table = Table(title="Summary", border_style="cyan", header_style="bold cyan")
+    table.add_column("Metric", style="white")
+    table.add_column("Count", justify="right", style="green")
+
+    table.add_row("Total tokens", f"{result.total_tokens:,}")
+    table.add_row("Unique words", f"{result.unique_tokens:,}")
+    table.add_row("Not in BNC", f"[dim]{len(result.not_in_bnc):,}[/dim]")
+    table.add_row("Underused", f"[blue]{len(result.underused):,}[/blue]")
+    table.add_row("Overused", f"[red]{len(result.overused):,}[/red]")
+
+    console.print(table)
+    console.print()
+
+
 if __name__ == "__main__":
     drift_cli()
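
For readers who want the library-level view of what the new pystylometry-tokenize command wraps, the following is a minimal sketch based only on the imports and attribute accesses visible in this diff (pystylometry.tokenizer.Tokenizer, tokenize, tokenize_with_metadata, get_statistics, and the tokenizer_kwargs dictionary above); it is not taken from the package's documentation, and the keyword names are assumed to match that dictionary.

# Sketch only: mirrors the calls made by tokenize_cli() in this diff.
# Keyword names are assumed from the tokenizer_kwargs dict shown above.
from pathlib import Path

from pystylometry.tokenizer import Tokenizer

text = Path("manuscript.txt").read_text(encoding="utf-8")

tokenizer = Tokenizer(
    lowercase=True,          # CLI default (no -U/--no-lowercase)
    strip_punctuation=True,  # CLI default (no --keep-punctuation)
    min_length=3,            # equivalent to --min-length 3
    strip_numbers=True,      # equivalent to --strip-numbers
)

tokens = tokenizer.tokenize(text)              # list of token strings
stats = tokenizer.get_statistics(text)         # object with total_tokens, unique_tokens, ...
meta = tokenizer.tokenize_with_metadata(text)  # items with .token, .start, .end, .token_type

print(stats.total_tokens, stats.unique_tokens)
print(meta[0].token, meta[0].token_type)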
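
Similarly, the new bnc command reduces to one analysis call plus an export step. The sketch below is inferred only from the calls and attribute accesses in this diff (compute_bnc_frequency, export_bnc_frequency_jsx, and the result fields the CLI serializes); parameter defaults are the CLI defaults shown above, and none of this is a confirmed public API.

# Sketch only: mirrors bnc_frequency_cli() in this diff; field and parameter
# names are assumed from the code above, not from separate documentation.
from pathlib import Path

from pystylometry.lexical.bnc_frequency import compute_bnc_frequency
from pystylometry.viz.jsx import export_bnc_frequency_jsx

text = Path("manuscript.txt").read_text(encoding="utf-8")

result = compute_bnc_frequency(
    text,
    overuse_threshold=1.3,   # CLI default
    underuse_threshold=0.8,  # CLI default
    include_wordnet=True,    # CLI default (no --no-wordnet)
    min_mentions=1,          # CLI default
)

# Fields the CLI reads: total_tokens, unique_tokens, and three word lists
# (overused, underused, not_in_bnc) with .word, .observed, .expected, .ratio,
# .char_type and .in_wordnet.
print(result.total_tokens, result.unique_tokens)
for w in sorted(result.overused, key=lambda x: x.ratio or 0, reverse=True)[:10]:
    print(w.word, w.observed, w.expected, w.ratio, w.char_type)

# Same HTML export the CLI performs for --format html.
export_bnc_frequency_jsx(
    result,
    output_file=Path("manuscript_bnc_frequency.html"),
    title="BNC Frequency Analysis: manuscript.txt",
    source_file="manuscript.txt",
)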