pystylometry 1.3.1__py3-none-any.whl → 1.3.6__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
pystylometry/cli.py CHANGED
@@ -3,6 +3,8 @@
 Usage:
     pystylometry-drift <file> [--window-size=N] [--stride=N] [--mode=MODE] [--json]
     pystylometry-drift <file> --plot [output.png]
+    pystylometry-tokenize <file> [--json] [--metadata] [--stats]
+    bnc --input-file <file> [--output-file <file>] [--format csv|html|json]
 
 Example:
     pystylometry-drift manuscript.txt
@@ -10,6 +12,12 @@ Example:
     pystylometry-drift manuscript.txt --json
     pystylometry-drift manuscript.txt --plot
     pystylometry-drift manuscript.txt --plot drift_report.png
+    pystylometry-tokenize manuscript.txt
+    pystylometry-tokenize manuscript.txt --json --metadata
+    pystylometry-tokenize manuscript.txt --stats
+    bnc --input-file manuscript.txt
+    bnc --input-file manuscript.txt --output-file report.html --format html
+    bnc -i manuscript.txt --format json
 """
 
 from __future__ import annotations
@@ -423,5 +431,692 @@ The generated viewer includes:
     print()
 
 
+def tokenize_cli() -> None:
+    """CLI entry point for stylometric tokenization."""
+    parser = argparse.ArgumentParser(
+        prog="pystylometry-tokenize",
+        description="Tokenize text for stylometric analysis.",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  pystylometry-tokenize manuscript.txt
+  pystylometry-tokenize manuscript.txt --json
+  pystylometry-tokenize manuscript.txt --json --metadata
+  pystylometry-tokenize manuscript.txt --stats
+  pystylometry-tokenize manuscript.txt -U --expand-contractions
+  pystylometry-tokenize manuscript.txt --min-length 3 --strip-numbers
+        """,
+    )
+
+    parser.add_argument(
+        "file",
+        type=Path,
+        help="Path to text file to tokenize",
+    )
+
+    # Output mode
+    output_group = parser.add_argument_group("output")
+    output_group.add_argument(
+        "-j",
+        "--json",
+        action="store_true",
+        help="Output as JSON (list of strings, or list of objects with --metadata)",
+    )
+    output_group.add_argument(
+        "-m",
+        "--metadata",
+        action="store_true",
+        help="Include token type and position metadata (implies --json)",
+    )
+    output_group.add_argument(
+        "-s",
+        "--stats",
+        action="store_true",
+        help="Show tokenization statistics instead of tokens",
+    )
+
+    # Core behavior
+    behavior_group = parser.add_argument_group("behavior")
+    behavior_group.add_argument(
+        "-U",
+        "--no-lowercase",
+        action="store_true",
+        help="Preserve original case (default: lowercase)",
+    )
+    behavior_group.add_argument(
+        "-e",
+        "--expand-contractions",
+        action="store_true",
+        help="Expand contractions (it's -> it is)",
+    )
+    behavior_group.add_argument(
+        "-n",
+        "--strip-numbers",
+        action="store_true",
+        help="Remove numeric tokens",
+    )
+    behavior_group.add_argument(
+        "--keep-punctuation",
+        action="store_true",
+        help="Keep punctuation tokens (default: stripped)",
+    )
+
+    # Filtering
+    filter_group = parser.add_argument_group("filtering")
+    filter_group.add_argument(
+        "--min-length",
+        type=int,
+        default=1,
+        metavar="N",
+        help="Minimum token length (default: 1)",
+    )
+    filter_group.add_argument(
+        "--max-length",
+        type=int,
+        default=None,
+        metavar="N",
+        help="Maximum token length (default: unlimited)",
+    )
+    filter_group.add_argument(
+        "--preserve-urls",
+        action="store_true",
+        help="Keep URL tokens",
+    )
+    filter_group.add_argument(
+        "--preserve-emails",
+        action="store_true",
+        help="Keep email tokens",
+    )
+    filter_group.add_argument(
+        "--preserve-hashtags",
+        action="store_true",
+        help="Keep hashtag tokens",
+    )
+    filter_group.add_argument(
+        "--preserve-mentions",
+        action="store_true",
+        help="Keep @mention tokens",
+    )
+
+    # Advanced
+    advanced_group = parser.add_argument_group("advanced")
+    advanced_group.add_argument(
+        "--expand-abbreviations",
+        action="store_true",
+        help="Expand abbreviations (Dr. -> Doctor)",
+    )
+    advanced_group.add_argument(
+        "--strip-accents",
+        action="store_true",
+        help="Remove accents from characters",
+    )
+    advanced_group.add_argument(
+        "--no-clean",
+        action="store_true",
+        help="Skip text cleaning (italics, brackets, page markers)",
+    )
+    advanced_group.add_argument(
+        "--no-unicode-normalize",
+        action="store_true",
+        help="Skip unicode normalization",
+    )
+
+    args = parser.parse_args()
+
+    # --- ANSI colors ---
+    use_color = sys.stderr.isatty()
+
+    def _c(code: str, text: str) -> str:
+        return f"\033[{code}m{text}\033[0m" if use_color else text
+
+    bold = lambda t: _c("1", t)  # noqa: E731
+    dim = lambda t: _c("2", t)  # noqa: E731
+    cyan = lambda t: _c("36", t)  # noqa: E731
+    green = lambda t: _c("32", t)  # noqa: E731
+    yellow = lambda t: _c("33", t)  # noqa: E731
+
+    # --- Validate file ---
+    if not args.file.exists():
+        print(f"Error: File not found: {args.file}", file=sys.stderr)
+        sys.exit(1)
+
+    try:
+        text = args.file.read_text(encoding="utf-8")
+    except Exception as e:
+        print(f"Error reading file: {e}", file=sys.stderr)
+        sys.exit(1)
+
+    # --- Build Tokenizer kwargs ---
+    tokenizer_kwargs = {
+        "lowercase": not args.no_lowercase,
+        "min_length": args.min_length,
+        "max_length": args.max_length,
+        "strip_numbers": args.strip_numbers,
+        "strip_punctuation": not args.keep_punctuation,
+        "preserve_urls": args.preserve_urls,
+        "preserve_emails": args.preserve_emails,
+        "preserve_hashtags": args.preserve_hashtags,
+        "preserve_mentions": args.preserve_mentions,
+        "expand_contractions": args.expand_contractions,
+        "expand_abbreviations": args.expand_abbreviations,
+        "strip_accents": args.strip_accents,
+        "normalize_unicode": not args.no_unicode_normalize,
+        "clean_text": not args.no_clean,
+    }
+
+    # Collect active options for banner
+    active_opts = []
+    if args.no_lowercase:
+        active_opts.append("preserve case")
+    if args.expand_contractions:
+        active_opts.append("expand contractions")
+    if args.expand_abbreviations:
+        active_opts.append("expand abbreviations")
+    if args.strip_numbers:
+        active_opts.append("strip numbers")
+    if args.keep_punctuation:
+        active_opts.append("keep punctuation")
+    if args.strip_accents:
+        active_opts.append("strip accents")
+    if args.no_clean:
+        active_opts.append("skip cleaning")
+    if args.no_unicode_normalize:
+        active_opts.append("skip unicode normalization")
+    if args.preserve_urls:
+        active_opts.append("preserve URLs")
+    if args.preserve_emails:
+        active_opts.append("preserve emails")
+    if args.preserve_hashtags:
+        active_opts.append("preserve hashtags")
+    if args.preserve_mentions:
+        active_opts.append("preserve mentions")
+    if args.min_length > 1:
+        active_opts.append(f"min length {args.min_length}")
+    if args.max_length is not None:
+        active_opts.append(f"max length {args.max_length}")
+
+    # Determine output format
+    if args.stats:
+        output_format = "Statistics"
+    elif args.metadata:
+        output_format = "JSON (with metadata)"
+    elif args.json:
+        output_format = "JSON"
+    else:
+        output_format = "One token per line"
+
+    # --- Banner (to stderr so stdout stays pipeable) ---
+    char_count = len(text)
+    line_count = text.count("\n") + 1
+
+    banner = sys.stderr
+    print(file=banner)
+    print(f" {bold('PYSTYLOMETRY')} {dim('—')} {cyan('Stylometric Tokenizer')}", file=banner)
+    print(f" {dim('═' * 71)}", file=banner)
+    print(file=banner)
+    print(f" {bold('INPUT')}", file=banner)
+    print(f" {dim('─' * 71)}", file=banner)
+    print(f" File: {args.file}", file=banner)
+    print(f" Size: {char_count:,} characters / {line_count:,} lines", file=banner)
+    print(file=banner)
+    print(f" {bold('CONFIGURATION')}", file=banner)
+    print(f" {dim('─' * 71)}", file=banner)
+    print(f" Case: {'preserve' if args.no_lowercase else 'lowercase'}", file=banner)
+    print(
+        f" Punctuation: {'keep' if args.keep_punctuation else 'strip'}",
+        file=banner,
+    )
+    print(
+        f" Contractions: {'expand' if args.expand_contractions else 'preserve'}",
+        file=banner,
+    )
+    print(f" Numbers: {'strip' if args.strip_numbers else 'keep'}", file=banner)
+    if active_opts:
+        print(f" Active options: {', '.join(active_opts)}", file=banner)
+    print(file=banner)
+    print(f" {bold('OUTPUT')}", file=banner)
+    print(f" {dim('─' * 71)}", file=banner)
+    print(f" Format: {output_format}", file=banner)
+    print(file=banner)
+
+    # --- Tokenize ---
+    from pystylometry.tokenizer import Tokenizer
+
+    tokenizer = Tokenizer(**tokenizer_kwargs)
+
+    if args.stats:
+        stats = tokenizer.get_statistics(text)
+        print(f" {bold('RESULTS')}", file=banner)
+        print(f" {dim('─' * 71)}", file=banner)
+        print(f" Total tokens: {green(f'{stats.total_tokens:,}')}", file=banner)
+        print(f" Unique tokens: {green(f'{stats.unique_tokens:,}')}", file=banner)
+        print(f" Word tokens: {stats.word_tokens:,}", file=banner)
+        print(f" Number tokens: {stats.number_tokens:,}", file=banner)
+        print(f" Punctuation: {stats.punctuation_tokens:,}", file=banner)
+        print(f" URLs: {stats.url_tokens:,}", file=banner)
+        print(f" Emails: {stats.email_tokens:,}", file=banner)
+        print(f" Hashtags: {stats.hashtag_tokens:,}", file=banner)
+        print(f" Mentions: {stats.mention_tokens:,}", file=banner)
+        print(f" Avg length: {stats.average_token_length:.1f}", file=banner)
+        print(f" Min length: {stats.min_token_length}", file=banner)
+        print(f" Max length: {stats.max_token_length}", file=banner)
+        print(file=banner)
+
+        if args.json:
+            import dataclasses
+
+            print(json.dumps(dataclasses.asdict(stats), indent=2))
+
+    elif args.metadata or (args.json and args.metadata):
+        metadata_list = tokenizer.tokenize_with_metadata(text)
+        count = len(metadata_list)
+        print(
+            f" {yellow('Tokenizing...')} {green(f'{count:,}')} tokens extracted",
+            file=banner,
+        )
+        print(file=banner)
+        output = [
+            {
+                "token": m.token,
+                "start": m.start,
+                "end": m.end,
+                "type": m.token_type,
+            }
+            for m in metadata_list
+        ]
+        print(json.dumps(output, indent=2))
+
+    elif args.json:
+        tokens = tokenizer.tokenize(text)
+        count = len(tokens)
+        print(
+            f" {yellow('Tokenizing...')} {green(f'{count:,}')} tokens extracted",
+            file=banner,
+        )
+        print(file=banner)
+        print(json.dumps(tokens, indent=2))
+
+    else:
+        tokens = tokenizer.tokenize(text)
+        count = len(tokens)
+        print(
+            f" {yellow('Tokenizing...')} {green(f'{count:,}')} tokens extracted",
+            file=banner,
+        )
+        print(file=banner)
+        for token in tokens:
+            print(token)
+
+
+def bnc_frequency_cli() -> None:
+    """CLI entry point for BNC word frequency analysis."""
+    parser = argparse.ArgumentParser(
+        prog="bnc",
+        description="Analyze word frequencies against the British National Corpus (BNC).",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  bnc --input-file manuscript.txt
+  bnc --input-file manuscript.txt --output-file report.html
+  bnc --input-file manuscript.txt --format json
+  bnc --input-file manuscript.txt --overuse-threshold 2.0 --min-mentions 3
+  bnc --input-file manuscript.txt --no-wordnet
+
+Output:
+  Generates a report with three sections:
+  - Not in BNC: Words not found in the corpus (with WordNet status, character type)
+  - Most Underused: Words appearing less frequently than expected
+  - Most Overused: Words appearing more frequently than expected
+
+Thresholds:
+  Words with ratio > overuse-threshold are "overused"
+  Words with ratio < underuse-threshold are "underused"
+  Ratio = observed_count / expected_count (based on BNC frequencies)
+        """,
+    )
+
+    parser.add_argument(
+        "--input-file",
+        "-i",
+        type=Path,
+        required=True,
+        metavar="FILE",
+        help="Path to text file to analyze",
+    )
+    parser.add_argument(
+        "--output-file",
+        "-o",
+        type=Path,
+        default=None,
+        metavar="FILE",
+        help="Output file (default: <input>_bnc_frequency.<ext> based on --format)",
+    )
+    parser.add_argument(
+        "--overuse-threshold",
+        type=float,
+        default=1.3,
+        metavar="N",
+        help="Ratio above which words are considered overused (default: 1.3)",
+    )
+    parser.add_argument(
+        "--underuse-threshold",
+        type=float,
+        default=0.8,
+        metavar="N",
+        help="Ratio below which words are considered underused (default: 0.8)",
+    )
+    parser.add_argument(
+        "--min-mentions",
+        type=int,
+        default=1,
+        metavar="N",
+        help="Minimum word occurrences to include (default: 1)",
+    )
+    parser.add_argument(
+        "--no-wordnet",
+        action="store_true",
+        help="Skip WordNet lookup for unknown words",
+    )
+    parser.add_argument(
+        "--format",
+        choices=["csv", "html", "json", "excel"],
+        default="csv",
+        help="Output format: csv (tab-delimited), html (interactive), json, excel (default: csv)",
+    )
+
+    args = parser.parse_args()
+
+    # Import rich for colored output
+    from rich.console import Console
+    from rich.panel import Panel
+    from rich.table import Table
+    from rich.text import Text
+
+    console = Console(stderr=True)
+
+    # Validate file exists
+    if not args.input_file.exists():
+        console.print(f"[red]Error:[/red] File not found: {args.input_file}")
+        sys.exit(1)
+
+    # Read file
+    try:
+        text = args.input_file.read_text(encoding="utf-8")
+    except Exception as e:
+        console.print(f"[red]Error reading file:[/red] {e}")
+        sys.exit(1)
+
+    # Determine output path (extension based on format)
+    suffix_map = {"csv": ".tsv", "html": ".html", "json": ".json", "excel": ".xlsx"}
+    if args.output_file:
+        output_path = args.output_file
+    else:
+        suffix = suffix_map[args.format]
+        output_path = args.input_file.with_name(f"{args.input_file.stem}_bnc_frequency{suffix}")
+
+    # Calculate file stats
+    token_count = len(text.split())
+    char_count = len(text)
+
+    # Print header
+    console.print()
+    header = Text()
+    header.append("PYSTYLOMETRY", style="bold cyan")
+    header.append(" — ", style="dim")
+    header.append("BNC Word Frequency Analysis", style="bold white")
+    console.print(Panel(header, border_style="cyan"))
+
+    # Input section
+    console.print()
+    console.print("[bold]INPUT[/bold]", style="cyan")
+    console.print("─" * 60, style="dim")
+    console.print(f" File: [white]{args.input_file}[/white]")
+    console.print(
+        f" Size: [green]{char_count:,}[/green] chars / [green]{token_count:,}[/green] tokens"
+    )
+
+    # Parameters section
+    console.print()
+    console.print("[bold]PARAMETERS[/bold]", style="cyan")
+    console.print("─" * 60, style="dim")
+    console.print(f" Overuse threshold: [yellow]{args.overuse_threshold}x[/yellow]")
+    console.print(f" Underuse threshold: [yellow]{args.underuse_threshold}x[/yellow]")
+    console.print(f" Min mentions: [yellow]{args.min_mentions}[/yellow]")
+    console.print(f" WordNet lookup: [yellow]{'no' if args.no_wordnet else 'yes'}[/yellow]")
+
+    # Output section
+    console.print()
+    console.print("[bold]OUTPUT[/bold]", style="cyan")
+    console.print("─" * 60, style="dim")
+    fmt_display = {
+        "csv": "Tab-delimited CSV",
+        "html": "Interactive HTML",
+        "json": "JSON",
+        "excel": "Excel (.xlsx)",
+    }
+    console.print(f" Format: [magenta]{fmt_display[args.format]}[/magenta]")
+    console.print(f" Destination: [white]{output_path}[/white]")
+
+    # Run analysis with spinner
+    console.print()
+    with console.status("[bold cyan]Running analysis...[/bold cyan]", spinner="dots"):
+        from pystylometry.lexical.bnc_frequency import compute_bnc_frequency
+
+        result = compute_bnc_frequency(
+            text,
+            overuse_threshold=args.overuse_threshold,
+            underuse_threshold=args.underuse_threshold,
+            include_wordnet=not args.no_wordnet,
+            min_mentions=args.min_mentions,
+        )
+
+    # Output results
+    if args.format == "json":
+        output = {
+            "stats": {
+                "total_tokens": result.total_tokens,
+                "unique_tokens": result.unique_tokens,
+                "overused_count": len(result.overused),
+                "underused_count": len(result.underused),
+                "not_in_bnc_count": len(result.not_in_bnc),
+            },
+            "overused": [
+                {
+                    "word": w.word,
+                    "observed": w.observed,
+                    "expected": w.expected,
+                    "ratio": w.ratio,
+                    "char_type": w.char_type,
+                }
+                for w in result.overused
+            ],
+            "underused": [
+                {
+                    "word": w.word,
+                    "observed": w.observed,
+                    "expected": w.expected,
+                    "ratio": w.ratio,
+                    "char_type": w.char_type,
+                }
+                for w in result.underused
+            ],
+            "not_in_bnc": [
+                {
+                    "word": w.word,
+                    "observed": w.observed,
+                    "in_wordnet": w.in_wordnet,
+                    "char_type": w.char_type,
+                }
+                for w in result.not_in_bnc
+            ],
+        }
+        output_path.write_text(json.dumps(output, indent=2))
+        console.print(f'[green]✓[/green] JSON saved to: [white]"{output_path}"[/white]')
+
+    elif args.format == "csv":
+        # Tab-delimited output with category column
+        lines = ["category\tword\tobserved\texpected\tratio\tin_wordnet\tchar_type"]
+
+        def fmt_wordnet(val: bool | None) -> str:
+            if val is True:
+                return "yes"
+            elif val is False:
+                return "no"
+            return ""
+
+        for w in result.overused:
+            expected = f"{w.expected:.2f}" if w.expected else ""
+            ratio = f"{w.ratio:.4f}" if w.ratio else ""
+            in_wn = fmt_wordnet(w.in_wordnet)
+            lines.append(
+                f"overused\t{w.word}\t{w.observed}\t{expected}\t{ratio}\t{in_wn}\t{w.char_type}"
+            )
+
+        for w in result.underused:
+            expected = f"{w.expected:.2f}" if w.expected else ""
+            ratio = f"{w.ratio:.4f}" if w.ratio else ""
+            in_wn = fmt_wordnet(w.in_wordnet)
+            lines.append(
+                f"underused\t{w.word}\t{w.observed}\t{expected}\t{ratio}\t{in_wn}\t{w.char_type}"
+            )
+
+        for w in result.not_in_bnc:
+            in_wn = fmt_wordnet(w.in_wordnet)
+            lines.append(f"not-in-bnc\t{w.word}\t{w.observed}\t\t\t{in_wn}\t{w.char_type}")
+
+        output_path.write_text("\n".join(lines))
+        console.print(f'[green]✓[/green] TSV saved to: [white]"{output_path}"[/white]')
+
+    elif args.format == "excel":
+        try:
+            from openpyxl import Workbook  # type: ignore[import-untyped]
+            from openpyxl.styles import Alignment, PatternFill  # type: ignore[import-untyped]
+        except ImportError:
+            console.print("[red]Error:[/red] Excel export requires openpyxl.")
+            console.print(" Install with: [yellow]pip install pystylometry[excel][/yellow]")
+            console.print(" Or for pipx: [yellow]pipx inject pystylometry openpyxl[/yellow]")
+            sys.exit(1)
+
+        wb = Workbook()
+
+        # Remove default sheet
+        wb.remove(wb.active)
+
+        # Cell style: width 15, centered, vertically centered
+        align = Alignment(horizontal="center", vertical="center")
+
+        def fmt_wordnet_excel(val: bool | None) -> str:
+            if val is True:
+                return "yes"
+            elif val is False:
+                return "no"
+            return ""
+
+        # Overused sheet (sorted by ratio, high to low)
+        ws_over = wb.create_sheet("overused")
+        ws_over.append(["word", "observed", "expected", "ratio", "in_wordnet", "char_type"])
+        for w in sorted(result.overused, key=lambda x: x.ratio or 0, reverse=True):
+            in_wn = fmt_wordnet_excel(w.in_wordnet)
+            ws_over.append([w.word, w.observed, w.expected, w.ratio, in_wn, w.char_type])
+
+        # Underused sheet (sorted by ratio, high to low)
+        ws_under = wb.create_sheet("underused")
+        ws_under.append(["word", "observed", "expected", "ratio", "in_wordnet", "char_type"])
+        for w in sorted(result.underused, key=lambda x: x.ratio or 0, reverse=True):
+            in_wn = fmt_wordnet_excel(w.in_wordnet)
+            ws_under.append([w.word, w.observed, w.expected, w.ratio, in_wn, w.char_type])
+
+        # Not in BNC sheet
+        ws_notbnc = wb.create_sheet("not-in-bnc")
+        ws_notbnc.append(["word", "observed", "in_wordnet", "char_type", "plural_form"])
+        for w in result.not_in_bnc:
+            in_wn = fmt_wordnet_excel(w.in_wordnet)
+            plural = "yes" if w.word.endswith("s") else "no"
+            ws_notbnc.append([w.word, w.observed, in_wn, w.char_type, plural])
+
+        # Apply formatting to all sheets
+        for ws in [ws_over, ws_under, ws_notbnc]:
+            for col in ws.columns:
+                col_letter = col[0].column_letter
+                # Word column (A) gets width 30, others get 15
+                ws.column_dimensions[col_letter].width = 30 if col_letter == "A" else 15
+            for row in ws.iter_rows():
+                for cell in row:
+                    cell.alignment = align
+
+        # Apply number formatting to expected (C) and ratio (D) columns
+        for ws in [ws_over, ws_under]:
+            for row in range(2, ws.max_row + 1):  # Skip header row
+                ws[f"C{row}"].number_format = "0.00"
+                ws[f"D{row}"].number_format = "0.00"
+
+        # Apply background colors to in_wordnet column
+        fill_yes = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
+        fill_no = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")
+
+        # in_wordnet is column E for overused/underused, column C for not-in-bnc
+        for ws in [ws_over, ws_under]:
+            for row in range(2, ws.max_row + 1):
+                cell = ws[f"E{row}"]
+                if cell.value == "yes":
+                    cell.fill = fill_yes
+                elif cell.value == "no":
+                    cell.fill = fill_no
+
+        for row in range(2, ws_notbnc.max_row + 1):
+            cell = ws_notbnc[f"C{row}"]
+            if cell.value == "yes":
+                cell.fill = fill_yes
+            elif cell.value == "no":
+                cell.fill = fill_no
+
+        # Apply background colors to plural_form column (E) in not-in-bnc
+        fill_plural_yes = PatternFill(
+            start_color="BDD7EE", end_color="BDD7EE", fill_type="solid"
+        )  # Light blue
+        fill_plural_no = PatternFill(
+            start_color="FCE4D6", end_color="FCE4D6", fill_type="solid"
+        )  # Light peach
+        for row in range(2, ws_notbnc.max_row + 1):
+            cell = ws_notbnc[f"E{row}"]
+            if cell.value == "yes":
+                cell.fill = fill_plural_yes
+            elif cell.value == "no":
+                cell.fill = fill_plural_no
+
+        wb.save(output_path)
+        console.print(f'[green]✓[/green] Excel saved to: [white]"{output_path}"[/white]')
+
+    else:  # html
+        from pystylometry.viz.jsx import export_bnc_frequency_jsx
+
+        export_bnc_frequency_jsx(
+            result,
+            output_file=output_path,
+            title=f"BNC Frequency Analysis: {args.input_file.name}",
+            source_file=str(args.input_file),
+        )
+
+        abs_path = output_path.resolve()
+        file_url = f"file://{abs_path}"
+        console.print(f'[green]✓[/green] HTML report saved to: [white]"{output_path}"[/white]')
+        console.print(f" Open in browser: [link={file_url}]{file_url}[/link]")
+
+    # Summary table
+    console.print()
+    table = Table(title="Summary", border_style="cyan", header_style="bold cyan")
+    table.add_column("Metric", style="white")
+    table.add_column("Count", justify="right", style="green")
+
+    table.add_row("Total tokens", f"{result.total_tokens:,}")
+    table.add_row("Unique words", f"{result.unique_tokens:,}")
+    table.add_row("Not in BNC", f"[dim]{len(result.not_in_bnc):,}[/dim]")
+    table.add_row("Underused", f"[blue]{len(result.underused):,}[/blue]")
+    table.add_row("Overused", f"[red]{len(result.overused):,}[/red]")
+
+    console.print(table)
+    console.print()
+
+
 if __name__ == "__main__":
     drift_cli()
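
For context, the new pystylometry-tokenize entry point is a thin wrapper around the package's Tokenizer class. The sketch below is inferred only from what is visible in this diff: the keyword names in tokenizer_kwargs, the tokenize(), tokenize_with_metadata(), and get_statistics() calls, and the attributes the CLI reads back. Constructor defaults and exact return types beyond that are assumptions and may differ in the released package.

    # Hypothetical programmatic use, mirroring what tokenize_cli() does above.
    from pystylometry.tokenizer import Tokenizer

    text = "Dr. Smith didn't visit https://example.com in 2021."

    # Keyword names taken from the tokenizer_kwargs dict in the CLI; values are illustrative.
    tokenizer = Tokenizer(
        lowercase=True,            # CLI default (no -U/--no-lowercase)
        strip_punctuation=True,    # CLI default (no --keep-punctuation)
        expand_contractions=True,  # equivalent of --expand-contractions
        min_length=1,
    )

    tokens = tokenizer.tokenize(text)                  # list of token strings
    metadata = tokenizer.tokenize_with_metadata(text)  # items expose .token, .start, .end, .token_type
    stats = tokenizer.get_statistics(text)             # exposes .total_tokens, .unique_tokens, ...

    print(tokens)
    print(stats.total_tokens, stats.unique_tokens)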
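
Likewise, the bnc command delegates to compute_bnc_frequency() from pystylometry.lexical.bnc_frequency. A minimal sketch of calling it directly, using only the parameters the CLI passes and the result attributes it reads back; anything else about the result object is an assumption:

    # Hypothetical direct call, mirroring bnc_frequency_cli() above.
    from pathlib import Path

    from pystylometry.lexical.bnc_frequency import compute_bnc_frequency

    text = Path("manuscript.txt").read_text(encoding="utf-8")

    result = compute_bnc_frequency(
        text,
        overuse_threshold=1.3,   # CLI default
        underuse_threshold=0.8,  # CLI default
        include_wordnet=True,    # CLI default (--no-wordnet not given)
        min_mentions=1,          # CLI default
    )

    print(result.total_tokens, result.unique_tokens)
    # Each entry exposes .word, .observed, .expected, .ratio, .char_type (and .in_wordnet).
    for w in sorted(result.overused, key=lambda x: x.ratio or 0, reverse=True)[:10]:
        print(f"{w.word}\t{w.observed}\t{w.expected}\t{w.ratio}")
    for w in result.not_in_bnc[:10]:
        print(w.word, w.in_wordnet, w.char_type)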