pystylometry-1.3.5-py3-none-any.whl → pystylometry-1.3.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pystylometry/cli.py CHANGED
@@ -4,6 +4,7 @@ Usage:
      pystylometry-drift <file> [--window-size=N] [--stride=N] [--mode=MODE] [--json]
      pystylometry-drift <file> --plot [output.png]
      pystylometry-tokenize <file> [--json] [--metadata] [--stats]
+     bnc --input-file <file> [--output-file <file>] [--format csv|html|json|excel]
 
  Example:
      pystylometry-drift manuscript.txt
@@ -14,6 +15,9 @@ Example:
      pystylometry-tokenize manuscript.txt
      pystylometry-tokenize manuscript.txt --json --metadata
      pystylometry-tokenize manuscript.txt --stats
+     bnc --input-file manuscript.txt
+     bnc --input-file manuscript.txt --output-file report.html --format html
+     bnc -i manuscript.txt --format json
  """
 
  from __future__ import annotations
@@ -744,5 +748,375 @@ Examples:
          print(token)
 
 
+ def bnc_frequency_cli() -> None:
+     """CLI entry point for BNC word frequency analysis."""
+     parser = argparse.ArgumentParser(
+         prog="bnc",
+         description="Analyze word frequencies against the British National Corpus (BNC).",
+         formatter_class=argparse.RawDescriptionHelpFormatter,
+         epilog="""
+ Examples:
+     bnc --input-file manuscript.txt
+     bnc --input-file manuscript.txt --output-file report.html
+     bnc --input-file manuscript.txt --format json
+     bnc --input-file manuscript.txt --overuse-threshold 2.0 --min-mentions 3
+     bnc --input-file manuscript.txt --no-wordnet
+
+ Output:
+     Generates a report with three sections:
+     - Not in BNC: Words not found in the corpus (with WordNet status, character type)
+     - Most Underused: Words appearing less frequently than expected
+     - Most Overused: Words appearing more frequently than expected
+
+ Thresholds:
+     Words with ratio > overuse-threshold are "overused"
+     Words with ratio < underuse-threshold are "underused"
+     Ratio = observed_count / expected_count (based on BNC frequencies)
+ """,
+     )
+
+     parser.add_argument(
+         "--input-file",
+         "-i",
+         type=Path,
+         required=True,
+         metavar="FILE",
+         help="Path to text file to analyze",
+     )
+     parser.add_argument(
+         "--output-file",
+         "-o",
+         type=Path,
+         default=None,
+         metavar="FILE",
+         help="Output file (default: <input>_bnc_frequency.<ext> based on --format)",
+     )
+     parser.add_argument(
+         "--overuse-threshold",
+         type=float,
+         default=1.3,
+         metavar="N",
+         help="Ratio above which words are considered overused (default: 1.3)",
+     )
+     parser.add_argument(
+         "--underuse-threshold",
+         type=float,
+         default=0.8,
+         metavar="N",
+         help="Ratio below which words are considered underused (default: 0.8)",
+     )
+     parser.add_argument(
+         "--min-mentions",
+         type=int,
+         default=1,
+         metavar="N",
+         help="Minimum word occurrences to include (default: 1)",
+     )
+     parser.add_argument(
+         "--no-wordnet",
+         action="store_true",
+         help="Skip WordNet lookup for unknown words",
+     )
+     parser.add_argument(
+         "--format",
+         choices=["csv", "html", "json", "excel"],
+         default="csv",
+         help="Output format: csv (tab-delimited), html (interactive), json, excel (default: csv)",
+     )
+
+     args = parser.parse_args()
+
+     # Import rich for colored output
+     from rich.console import Console
+     from rich.panel import Panel
+     from rich.table import Table
+     from rich.text import Text
+
+     console = Console(stderr=True)
+
+     # Validate file exists
+     if not args.input_file.exists():
+         console.print(f"[red]Error:[/red] File not found: {args.input_file}")
+         sys.exit(1)
+
+     # Read file
+     try:
+         text = args.input_file.read_text(encoding="utf-8")
+     except Exception as e:
+         console.print(f"[red]Error reading file:[/red] {e}")
+         sys.exit(1)
+
+     # Determine output path (extension based on format)
+     suffix_map = {"csv": ".tsv", "html": ".html", "json": ".json", "excel": ".xlsx"}
+     if args.output_file:
+         output_path = args.output_file
+     else:
+         suffix = suffix_map[args.format]
+         output_path = args.input_file.with_name(f"{args.input_file.stem}_bnc_frequency{suffix}")
+
+     # Calculate file stats
+     token_count = len(text.split())
+     char_count = len(text)
+
+     # Print header
+     console.print()
+     header = Text()
+     header.append("PYSTYLOMETRY", style="bold cyan")
+     header.append(" — ", style="dim")
+     header.append("BNC Word Frequency Analysis", style="bold white")
+     console.print(Panel(header, border_style="cyan"))
+
+     # Input section
+     console.print()
+     console.print("[bold]INPUT[/bold]", style="cyan")
+     console.print("─" * 60, style="dim")
+     console.print(f" File: [white]{args.input_file}[/white]")
+     console.print(
+         f" Size: [green]{char_count:,}[/green] chars / [green]{token_count:,}[/green] tokens"
+     )
+
+     # Parameters section
+     console.print()
+     console.print("[bold]PARAMETERS[/bold]", style="cyan")
+     console.print("─" * 60, style="dim")
+     console.print(f" Overuse threshold: [yellow]{args.overuse_threshold}x[/yellow]")
+     console.print(f" Underuse threshold: [yellow]{args.underuse_threshold}x[/yellow]")
+     console.print(f" Min mentions: [yellow]{args.min_mentions}[/yellow]")
+     console.print(f" WordNet lookup: [yellow]{'no' if args.no_wordnet else 'yes'}[/yellow]")
+
+     # Output section
+     console.print()
+     console.print("[bold]OUTPUT[/bold]", style="cyan")
+     console.print("─" * 60, style="dim")
+     fmt_display = {
+         "csv": "Tab-delimited CSV",
+         "html": "Interactive HTML",
+         "json": "JSON",
+         "excel": "Excel (.xlsx)",
+     }
+     console.print(f" Format: [magenta]{fmt_display[args.format]}[/magenta]")
+     console.print(f" Destination: [white]{output_path}[/white]")
+
+     # Run analysis with spinner
+     console.print()
+     with console.status("[bold cyan]Running analysis...[/bold cyan]", spinner="dots"):
+         from pystylometry.lexical.bnc_frequency import compute_bnc_frequency
+
+         result = compute_bnc_frequency(
+             text,
+             overuse_threshold=args.overuse_threshold,
+             underuse_threshold=args.underuse_threshold,
+             include_wordnet=not args.no_wordnet,
+             min_mentions=args.min_mentions,
+         )
+
+     # Output results
+     if args.format == "json":
+         output = {
+             "stats": {
+                 "total_tokens": result.total_tokens,
+                 "unique_tokens": result.unique_tokens,
+                 "overused_count": len(result.overused),
+                 "underused_count": len(result.underused),
+                 "not_in_bnc_count": len(result.not_in_bnc),
+             },
+             "overused": [
+                 {
+                     "word": w.word,
+                     "observed": w.observed,
+                     "expected": w.expected,
+                     "ratio": w.ratio,
+                     "char_type": w.char_type,
+                 }
+                 for w in result.overused
+             ],
+             "underused": [
+                 {
+                     "word": w.word,
+                     "observed": w.observed,
+                     "expected": w.expected,
+                     "ratio": w.ratio,
+                     "char_type": w.char_type,
+                 }
+                 for w in result.underused
+             ],
+             "not_in_bnc": [
+                 {
+                     "word": w.word,
+                     "observed": w.observed,
+                     "in_wordnet": w.in_wordnet,
+                     "char_type": w.char_type,
+                 }
+                 for w in result.not_in_bnc
+             ],
+         }
+         output_path.write_text(json.dumps(output, indent=2))
+         console.print(f'[green]✓[/green] JSON saved to: [white]"{output_path}"[/white]')
+
+     elif args.format == "csv":
+         # Tab-delimited output with category column
+         lines = ["category\tword\tobserved\texpected\tratio\tin_wordnet\tchar_type"]
+
+         def fmt_wordnet(val: bool | None) -> str:
+             if val is True:
+                 return "yes"
+             elif val is False:
+                 return "no"
+             return ""
+
+         for w in result.overused:
+             expected = f"{w.expected:.2f}" if w.expected else ""
+             ratio = f"{w.ratio:.4f}" if w.ratio else ""
+             in_wn = fmt_wordnet(w.in_wordnet)
+             lines.append(
+                 f"overused\t{w.word}\t{w.observed}\t{expected}\t{ratio}\t{in_wn}\t{w.char_type}"
+             )
+
+         for w in result.underused:
+             expected = f"{w.expected:.2f}" if w.expected else ""
+             ratio = f"{w.ratio:.4f}" if w.ratio else ""
+             in_wn = fmt_wordnet(w.in_wordnet)
+             lines.append(
+                 f"underused\t{w.word}\t{w.observed}\t{expected}\t{ratio}\t{in_wn}\t{w.char_type}"
+             )
+
+         for w in result.not_in_bnc:
+             in_wn = fmt_wordnet(w.in_wordnet)
+             lines.append(f"not-in-bnc\t{w.word}\t{w.observed}\t\t\t{in_wn}\t{w.char_type}")
+
+         output_path.write_text("\n".join(lines))
+         console.print(f'[green]✓[/green] TSV saved to: [white]"{output_path}"[/white]')
+
+     elif args.format == "excel":
+         try:
+             from openpyxl import Workbook  # type: ignore[import-untyped]
+             from openpyxl.styles import Alignment, PatternFill  # type: ignore[import-untyped]
+         except ImportError:
+             console.print("[red]Error:[/red] Excel export requires openpyxl.")
+             console.print(" Install with: [yellow]pip install pystylometry[excel][/yellow]")
+             console.print(" Or for pipx: [yellow]pipx inject pystylometry openpyxl[/yellow]")
+             sys.exit(1)
+
+         wb = Workbook()
+
+         # Remove default sheet
+         wb.remove(wb.active)
+
+         # Center cell contents horizontally and vertically (column widths are set below)
+         align = Alignment(horizontal="center", vertical="center")
+
+         def fmt_wordnet_excel(val: bool | None) -> str:
+             if val is True:
+                 return "yes"
+             elif val is False:
+                 return "no"
+             return ""
+
+         # Overused sheet (sorted by ratio, high to low)
+         ws_over = wb.create_sheet("overused")
+         ws_over.append(["word", "observed", "expected", "ratio", "in_wordnet", "char_type"])
+         for w in sorted(result.overused, key=lambda x: x.ratio or 0, reverse=True):
+             in_wn = fmt_wordnet_excel(w.in_wordnet)
+             ws_over.append([w.word, w.observed, w.expected, w.ratio, in_wn, w.char_type])
+
+         # Underused sheet (sorted by ratio, high to low)
+         ws_under = wb.create_sheet("underused")
+         ws_under.append(["word", "observed", "expected", "ratio", "in_wordnet", "char_type"])
+         for w in sorted(result.underused, key=lambda x: x.ratio or 0, reverse=True):
+             in_wn = fmt_wordnet_excel(w.in_wordnet)
+             ws_under.append([w.word, w.observed, w.expected, w.ratio, in_wn, w.char_type])
+
+         # Not in BNC sheet
+         ws_notbnc = wb.create_sheet("not-in-bnc")
+         ws_notbnc.append(["word", "observed", "in_wordnet", "char_type", "plural_form"])
+         for w in result.not_in_bnc:
+             in_wn = fmt_wordnet_excel(w.in_wordnet)
+             plural = "yes" if w.word.endswith("s") else "no"  # crude heuristic: trailing "s"
+             ws_notbnc.append([w.word, w.observed, in_wn, w.char_type, plural])
+
+         # Apply formatting to all sheets
+         for ws in [ws_over, ws_under, ws_notbnc]:
+             for col in ws.columns:
+                 col_letter = col[0].column_letter
+                 # Word column (A) gets width 30, others get 15
+                 ws.column_dimensions[col_letter].width = 30 if col_letter == "A" else 15
+             for row in ws.iter_rows():
+                 for cell in row:
+                     cell.alignment = align
+
+         # Apply number formatting to expected (C) and ratio (D) columns
+         for ws in [ws_over, ws_under]:
+             for row in range(2, ws.max_row + 1):  # Skip header row
+                 ws[f"C{row}"].number_format = "0.00"
+                 ws[f"D{row}"].number_format = "0.00"
+
+         # Apply background colors to in_wordnet column
+         fill_yes = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
+         fill_no = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")
+
+         # in_wordnet is column E for overused/underused, column C for not-in-bnc
+         for ws in [ws_over, ws_under]:
+             for row in range(2, ws.max_row + 1):
+                 cell = ws[f"E{row}"]
+                 if cell.value == "yes":
+                     cell.fill = fill_yes
+                 elif cell.value == "no":
+                     cell.fill = fill_no
+
+         for row in range(2, ws_notbnc.max_row + 1):
+             cell = ws_notbnc[f"C{row}"]
+             if cell.value == "yes":
+                 cell.fill = fill_yes
+             elif cell.value == "no":
+                 cell.fill = fill_no
+
+         # Apply background colors to plural_form column (E) in not-in-bnc
+         fill_plural_yes = PatternFill(
+             start_color="BDD7EE", end_color="BDD7EE", fill_type="solid"
+         )  # Light blue
+         fill_plural_no = PatternFill(
+             start_color="FCE4D6", end_color="FCE4D6", fill_type="solid"
+         )  # Light peach
+         for row in range(2, ws_notbnc.max_row + 1):
+             cell = ws_notbnc[f"E{row}"]
+             if cell.value == "yes":
+                 cell.fill = fill_plural_yes
+             elif cell.value == "no":
+                 cell.fill = fill_plural_no
+
+         wb.save(output_path)
+         console.print(f'[green]✓[/green] Excel saved to: [white]"{output_path}"[/white]')
+
+     else:  # html
+         from pystylometry.viz.jsx import export_bnc_frequency_jsx
+
+         export_bnc_frequency_jsx(
+             result,
+             output_file=output_path,
+             title=f"BNC Frequency Analysis: {args.input_file.name}",
+             source_file=str(args.input_file),
+         )
+
+         abs_path = output_path.resolve()
+         file_url = f"file://{abs_path}"
+         console.print(f'[green]✓[/green] HTML report saved to: [white]"{output_path}"[/white]')
+         console.print(f" Open in browser: [link={file_url}]{file_url}[/link]")
+
+     # Summary table
+     console.print()
+     table = Table(title="Summary", border_style="cyan", header_style="bold cyan")
+     table.add_column("Metric", style="white")
+     table.add_column("Count", justify="right", style="green")
+
+     table.add_row("Total tokens", f"{result.total_tokens:,}")
+     table.add_row("Unique words", f"{result.unique_tokens:,}")
+     table.add_row("Not in BNC", f"[dim]{len(result.not_in_bnc):,}[/dim]")
+     table.add_row("Underused", f"[blue]{len(result.underused):,}[/blue]")
+     table.add_row("Overused", f"[red]{len(result.overused):,}[/red]")
+
+     console.print(table)
+     console.print()
+
+
  if __name__ == "__main__":
      drift_cli()
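
The threshold arithmetic described in the CLI epilog above can be checked by hand. A minimal, self-contained sketch of the ratio logic; the relative frequency here is a made-up illustration, not a real bnc-lookup value:

# Hypothetical numbers for illustration only: suppose a word has a BNC
# relative frequency of 0.000002 and the analyzed text has 50,000 tokens.
rel_freq = 0.000002
total_tokens = 50_000
observed = 7

expected = rel_freq * total_tokens  # 0.1 expected mentions
ratio = observed / expected         # 70.0

# With the default thresholds (overuse 1.3, underuse 0.8), a ratio of 70.0
# lands well above 1.3, so the word would be reported as overused.
overuse_threshold, underuse_threshold = 1.3, 0.8
if ratio > overuse_threshold:
    print(f"overused ({ratio:.1f}x)")
elif ratio < underuse_threshold:
    print(f"underused ({ratio:.4f})")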
pystylometry/lexical/__init__.py CHANGED
@@ -2,6 +2,7 @@
 
  # Local implementations
  from .advanced_diversity import compute_hdd, compute_mattr, compute_msttr, compute_vocd_d
+ from .bnc_frequency import compute_bnc_frequency
  from .function_words import compute_function_words
  from .hapax import compute_hapax_ratios, compute_hapax_with_lexicon_analysis
  from .mtld import compute_mtld
@@ -17,6 +18,7 @@ __all__ = [
      "compute_yule",
      "compute_hapax_ratios",
      "compute_hapax_with_lexicon_analysis",
+     "compute_bnc_frequency",
      "compute_function_words",
      "compute_vocd_d",
      "compute_mattr",
pystylometry/lexical/bnc_frequency.py ADDED
@@ -0,0 +1,309 @@
+ """BNC (British National Corpus) frequency analysis for stylometric comparison.
+
+ This module computes word frequency ratios by comparing observed word frequencies
+ in a text against expected frequencies from the British National Corpus (BNC).
+ Words can be categorized as:
+ - Overused: appear more frequently than expected (ratio > 1)
+ - Underused: appear less frequently than expected (ratio < 1)
+ - Not in BNC: words that don't exist in the BNC corpus
+
+ Related GitHub Issue:
+     #TBD - BNC frequency analysis CLI
+     https://github.com/craigtrim/pystylometry/issues/TBD
+
+ References:
+     British National Corpus: http://www.natcorp.ox.ac.uk/
+     The BNC is a 100-million word collection of samples of written and spoken
+     language from a wide range of sources, designed to represent a wide
+     cross-section of British English from the late 20th century.
+ """
+
+ from __future__ import annotations
+
+ import re
+ import unicodedata
+ from collections import Counter
+ from dataclasses import dataclass
+ from typing import Literal
+
+ from .._utils import check_optional_dependency
+
+ # Unicode apostrophe variants to normalize to ASCII apostrophe (U+0027)
+ # See: https://github.com/craigtrim/pystylometry/issues/45
+ _APOSTROPHE_VARIANTS = (
+     "\u0060"  # GRAVE ACCENT
+     "\u00B4"  # ACUTE ACCENT
+     "\u2018"  # LEFT SINGLE QUOTATION MARK
+     "\u2019"  # RIGHT SINGLE QUOTATION MARK
+     "\u201B"  # SINGLE HIGH-REVERSED-9 QUOTATION MARK
+     "\u2032"  # PRIME
+     "\u2035"  # REVERSED PRIME
+     "\u02B9"  # MODIFIER LETTER PRIME
+     "\u02BC"  # MODIFIER LETTER APOSTROPHE
+     "\u02C8"  # MODIFIER LETTER VERTICAL LINE
+     "\u0313"  # COMBINING COMMA ABOVE
+     "\u0315"  # COMBINING COMMA ABOVE RIGHT
+     "\u055A"  # ARMENIAN APOSTROPHE
+     "\u05F3"  # HEBREW PUNCTUATION GERESH
+     "\u07F4"  # NKO HIGH TONE APOSTROPHE
+     "\u07F5"  # NKO LOW TONE APOSTROPHE
+     "\uFF07"  # FULLWIDTH APOSTROPHE
+     "\u1FBF"  # GREEK PSILI
+     "\u1FBD"  # GREEK KORONIS
+     "\uA78C"  # LATIN SMALL LETTER SALTILLO
+ )
+
+
+ def _normalize_apostrophes(text: str) -> str:
+     """Normalize Unicode apostrophe variants to ASCII apostrophe.
+
+     Many texts (especially ebooks, PDFs, and word processor output) use
+     typographic "smart quotes" instead of ASCII apostrophes. This function
+     normalizes all variants to the standard ASCII apostrophe (U+0027) to
+     ensure consistent BNC lookups.
+
+     Args:
+         text: Input text potentially containing apostrophe variants
+
+     Returns:
+         Text with all apostrophe variants normalized to ASCII apostrophe
+
+     Example:
+         >>> _normalize_apostrophes("don’t")  # curly apostrophe
+         "don't"  # ASCII apostrophe
+     """
+     for char in _APOSTROPHE_VARIANTS:
+         text = text.replace(char, "'")
+     return text
+
+
+ @dataclass
+ class WordAnalysis:
+     """Analysis of a single word against BNC frequency.
+
+     Attributes:
+         word: The word being analyzed (lowercase)
+         observed: Number of times the word appears in the text
+         expected: Expected count based on BNC relative frequency
+         ratio: observed / expected (None if not in BNC)
+         in_wordnet: Whether the word exists in WordNet
+         char_type: Classification of character content
+     """
+
+     word: str
+     observed: int
+     expected: float | None
+     ratio: float | None
+     in_wordnet: bool | None
+     char_type: Literal["latin", "unicode", "numeric", "mixed", "punctuation"]
+
+
+ @dataclass
+ class BNCFrequencyResult:
+     """Result of BNC frequency analysis.
+
+     Attributes:
+         overused: Words appearing more frequently than expected (ratio > threshold)
+         underused: Words appearing less frequently than expected (ratio < threshold)
+         not_in_bnc: Words not found in the BNC corpus
+         total_tokens: Total word count in the text
+         unique_tokens: Number of unique words
+         overuse_threshold: Ratio above which words are considered overused
+         underuse_threshold: Ratio below which words are considered underused
+         metadata: Additional analysis metadata
+     """
+
+     overused: list[WordAnalysis]
+     underused: list[WordAnalysis]
+     not_in_bnc: list[WordAnalysis]
+     total_tokens: int
+     unique_tokens: int
+     overuse_threshold: float
+     underuse_threshold: float
+     metadata: dict
+
+
+ def _classify_char_type(
+     word: str,
+ ) -> Literal["latin", "unicode", "numeric", "mixed", "punctuation"]:
+     """Classify the character content of a word.
+
+     Args:
+         word: Word to classify
+
+     Returns:
+         Character type classification:
+         - latin: Pure ASCII alphabetic characters (a-z, A-Z)
+         - unicode: Contains non-ASCII characters (accents, etc.)
+         - numeric: Contains only digits
+         - mixed: Contains letters and numbers or other combinations
+         - punctuation: Contains only punctuation
+     """
+     if not word:
+         return "punctuation"
+
+     has_ascii_alpha = bool(re.search(r"[a-zA-Z]", word))
+     has_unicode_alpha = any(unicodedata.category(c).startswith("L") and ord(c) > 127 for c in word)
+     has_digit = any(c.isdigit() for c in word)
+     has_punct = any(unicodedata.category(c).startswith("P") for c in word)
+
+     # Determine classification
+     if has_unicode_alpha:
+         return "unicode"
+     elif has_digit and not has_ascii_alpha:
+         return "numeric"
+     elif has_digit and has_ascii_alpha:
+         return "mixed"
+     elif has_ascii_alpha and not has_punct:
+         return "latin"
+     elif has_ascii_alpha and has_punct:
+         return "mixed"
+     elif not has_ascii_alpha and not has_digit:
+         return "punctuation"
+     else:
+         return "mixed"
+
+
+ def compute_bnc_frequency(
+     text: str,
+     overuse_threshold: float = 1.3,
+     underuse_threshold: float = 0.8,
+     include_wordnet: bool = True,
+     min_mentions: int = 1,
+ ) -> BNCFrequencyResult:
+     """Compute BNC frequency analysis for a text.
+
+     Compares observed word frequencies against expected frequencies from the
+     British National Corpus. Words are categorized as overused, underused,
+     or not in BNC based on their frequency ratios.
+
+     Args:
+         text: Input text to analyze
+         overuse_threshold: Ratio above which words are considered overused (default: 1.3)
+         underuse_threshold: Ratio below which words are considered underused (default: 0.8)
+         include_wordnet: Whether to check WordNet for unknown words (default: True)
+         min_mentions: Minimum number of mentions to include word (default: 1)
+
+     Returns:
+         BNCFrequencyResult with categorized word lists
+
+     Raises:
+         ImportError: If bnc-lookup package is not installed
+
+     Example:
+         >>> result = compute_bnc_frequency("The captain ordered the larboard watch...")
+         >>> result.overused[:3]  # Top 3 overused words
+         [WordAnalysis(word='larboard', ratio=33153.5, ...), ...]
+         >>> result.not_in_bnc[:3]  # Words not in BNC
+         [WordAnalysis(word='xyzbot', ...), ...]
+     """
+     # Check dependency
+     check_optional_dependency("bnc_lookup", "lexical")
+     from bnc_lookup import relative_frequency  # type: ignore[import-untyped]
+
+     # Optional wordnet lookup
+     wordnet_checker = None
+     if include_wordnet:
+         try:
+             from wordnet_lookup import (
+                 is_wordnet_term as _is_wordnet_term,  # type: ignore[import-untyped]
+             )
+
+             wordnet_checker = _is_wordnet_term
+         except ImportError:
+             # WordNet lookup is optional
+             pass
+
+     # Tokenize text (simple whitespace + punctuation stripping)
+     # First normalize apostrophes to ensure consistent BNC lookups (Issue #45)
+     normalized_text = _normalize_apostrophes(text)
+     raw_tokens = normalized_text.split()
+     tokens = []
+     for raw in raw_tokens:
+         # Strip leading/trailing punctuation, lowercase
+         cleaned = re.sub(r"^[^\w]+|[^\w]+$", "", raw).lower()
+         if cleaned:
+             tokens.append(cleaned)
+
+     total_tokens = len(tokens)
+
+     # Count observed frequency of each word
+     observed = Counter(tokens)
+     unique_words = list(observed.keys())
+
+     # Get BNC relative frequencies (one at a time - bnc_lookup doesn't have batch)
+     bnc_freqs = {word: relative_frequency(word) for word in unique_words}
+
+     # Analyze each word
+     overused: list[WordAnalysis] = []
+     underused: list[WordAnalysis] = []
+     not_in_bnc: list[WordAnalysis] = []
+
+     for word, obs_count in observed.items():
+         if obs_count < min_mentions:
+             continue
+
+         # Classify character type
+         char_type = _classify_char_type(word)
+
+         # Get BNC frequency
+         rel_freq = bnc_freqs.get(word)
+
+         # Check WordNet if requested
+         in_wordnet = None
+         if wordnet_checker is not None:
+             in_wordnet = wordnet_checker(word)
+
+         if rel_freq is None or rel_freq == 0:
+             # Word not in BNC
+             analysis = WordAnalysis(
+                 word=word,
+                 observed=obs_count,
+                 expected=None,
+                 ratio=None,
+                 in_wordnet=in_wordnet,
+                 char_type=char_type,
+             )
+             not_in_bnc.append(analysis)
+         else:
+             # Compute expected count and ratio
+             expected = rel_freq * total_tokens
+             ratio = obs_count / expected if expected > 0 else None
+
+             analysis = WordAnalysis(
+                 word=word,
+                 observed=obs_count,
+                 expected=expected,
+                 ratio=ratio,
+                 in_wordnet=in_wordnet,
+                 char_type=char_type,
+             )
+
+             if ratio is not None:
+                 if ratio > overuse_threshold:
+                     overused.append(analysis)
+                 elif ratio < underuse_threshold:
+                     underused.append(analysis)
+
+     # Sort by ratio (highest first for overused, lowest first for underused)
+     overused.sort(key=lambda x: x.ratio or 0, reverse=True)
+     underused.sort(key=lambda x: x.ratio or float("inf"))
+     # Sort not_in_bnc by observed count
+     not_in_bnc.sort(key=lambda x: x.observed, reverse=True)
+
+     return BNCFrequencyResult(
+         overused=overused,
+         underused=underused,
+         not_in_bnc=not_in_bnc,
+         total_tokens=total_tokens,
+         unique_tokens=len(unique_words),
+         overuse_threshold=overuse_threshold,
+         underuse_threshold=underuse_threshold,
+         metadata={
+             "include_wordnet": include_wordnet,
+             "min_mentions": min_mentions,
+             "overused_count": len(overused),
+             "underused_count": len(underused),
+             "not_in_bnc_count": len(not_in_bnc),
+         },
+     )
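
The normalization and tokenization steps above are easy to verify in isolation. A quick stdlib-only check that mirrors the module's own regex (only a subset of the apostrophe variants is included here; no bnc-lookup required):

import re

_APOSTROPHES = "\u2018\u2019\u02bc"  # a few of the variants listed above

def normalize(text: str) -> str:
    for ch in _APOSTROPHES:
        text = text.replace(ch, "'")
    return text

raw = "“Don’t!” she said… twice."
tokens = []
for tok in normalize(raw).split():
    # Same strip-punctuation-then-lowercase rule as compute_bnc_frequency
    cleaned = re.sub(r"^[^\w]+|[^\w]+$", "", tok).lower()
    if cleaned:
        tokens.append(cleaned)

print(tokens)  # ["don't", 'she', 'said', 'twice']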
pystylometry/viz/jsx/__init__.py CHANGED
@@ -20,6 +20,7 @@ Example:
      >>> export_drift_viewer("drift_analyzer.html")
  """
 
+ from .bnc_frequency import export_bnc_frequency_jsx
  from .report import export_drift_report_jsx
  from .timeline import export_drift_timeline_jsx
  from .viewer import export_drift_viewer
@@ -28,4 +29,5 @@ __all__ = [
      "export_drift_timeline_jsx",
      "export_drift_report_jsx",
      "export_drift_viewer",
+     "export_bnc_frequency_jsx",
  ]
pystylometry/viz/jsx/bnc_frequency.py ADDED
@@ -0,0 +1,495 @@
+ """Interactive HTML export for BNC frequency analysis.
+
+ This module generates a self-contained HTML report showing word frequency
+ comparisons against the British National Corpus (BNC).
+
+ The report has three sections:
+ 1. Not in BNC - Words not found in the corpus (with WordNet and character type info)
+ 2. Most Underused - Words appearing less frequently than expected
+ 3. Most Overused - Words appearing more frequently than expected
+
+ Related GitHub Issue:
+     #TBD - BNC frequency analysis CLI
+     https://github.com/craigtrim/pystylometry/issues/TBD
+ """
+
+ from __future__ import annotations
+
+ from pathlib import Path
+ from typing import TYPE_CHECKING
+
+ from ._base import CARD_STYLES, generate_html_document, write_html_file
+
+ if TYPE_CHECKING:
+     from pystylometry.lexical.bnc_frequency import BNCFrequencyResult
+
+
+ def export_bnc_frequency_jsx(
+     result: "BNCFrequencyResult",
+     output_file: str | Path,
+     title: str = "BNC Word Frequency Analysis",
+     source_file: str | None = None,
+ ) -> Path:
+     """Export BNC frequency analysis as interactive HTML.
+
+     Generates a self-contained HTML file with three sections:
+     - Not in BNC: Complete table with WordNet status and character type
+     - Most Underused: Words below the underuse threshold
+     - Most Overused: Words above the overuse threshold
+
+     Args:
+         result: BNCFrequencyResult from compute_bnc_frequency()
+         output_file: Path to write the HTML file
+         title: Page title (default: "BNC Word Frequency Analysis")
+         source_file: Optional source filename to display
+
+     Returns:
+         Path to the written HTML file
+
+     Example:
+         >>> from pystylometry.lexical.bnc_frequency import compute_bnc_frequency
+         >>> from pystylometry.viz.jsx import export_bnc_frequency_jsx
+         >>> result = compute_bnc_frequency(text)
+         >>> export_bnc_frequency_jsx(result, "frequency_report.html")
+     """
+     # Build data for the React component
+     not_in_bnc_data = [
+         {
+             "word": w.word,
+             "observed": w.observed,
+             "inWordnet": w.in_wordnet,
+             "charType": w.char_type,
+         }
+         for w in result.not_in_bnc
+     ]
+
+     underused_data = [
+         {
+             "word": w.word,
+             "observed": w.observed,
+             "expected": round(w.expected, 2) if w.expected else None,
+             "ratio": round(w.ratio, 4) if w.ratio else None,
+             "charType": w.char_type,
+         }
+         for w in result.underused
+     ]
+
+     overused_data = [
+         {
+             "word": w.word,
+             "observed": w.observed,
+             "expected": round(w.expected, 2) if w.expected else None,
+             "ratio": round(w.ratio, 1) if w.ratio else None,
+             "charType": w.char_type,
+         }
+         for w in result.overused
+     ]
+
+     config = {
+         "title": title,
+         "sourceFile": source_file,
+         "notInBnc": not_in_bnc_data,
+         "underused": underused_data,
+         "overused": overused_data,
+         "stats": {
+             "totalTokens": result.total_tokens,
+             "uniqueTokens": result.unique_tokens,
+             "notInBncCount": len(result.not_in_bnc),
+             "underusedCount": len(result.underused),
+             "overusedCount": len(result.overused),
+             "overuseThreshold": result.overuse_threshold,
+             "underuseThreshold": result.underuse_threshold,
+         },
+     }
+
+     react_component = """
+ // Color mapping for character types
+ const CHAR_TYPE_COLORS = {
+   latin: { bg: '#dcfce7', text: '#166534', label: 'Latin' },
+   unicode: { bg: '#fef3c7', text: '#92400e', label: 'Unicode' },
+   numeric: { bg: '#dbeafe', text: '#1e40af', label: 'Numeric' },
+   mixed: { bg: '#f3e8ff', text: '#6b21a8', label: 'Mixed' },
+   punctuation: { bg: '#f1f5f9', text: '#475569', label: 'Punct' },
+ };
+
+ // Tab configuration
+ const TABS = [
+   { id: 'overused', label: 'Most Overused', color: '#ef4444' },
+   { id: 'underused', label: 'Most Underused', color: '#3b82f6' },
+   { id: 'notInBnc', label: 'Not in BNC', color: '#6b7280' },
+ ];
+
+ // WordNet status badge
+ function WordnetBadge({ inWordnet }) {
+   if (inWordnet === null || inWordnet === undefined) {
+     return <span style={{ color: '#9ca3af', fontSize: '12px' }}>—</span>;
+   }
+   return inWordnet ? (
+     <span style={{
+       background: '#dcfce7',
+       color: '#166534',
+       padding: '2px 8px',
+       borderRadius: '9999px',
+       fontSize: '11px',
+       fontWeight: 500,
+     }}>Yes</span>
+   ) : (
+     <span style={{
+       background: '#fee2e2',
+       color: '#991b1b',
+       padding: '2px 8px',
+       borderRadius: '9999px',
+       fontSize: '11px',
+       fontWeight: 500,
+     }}>No</span>
+   );
+ }
+
+ // Character type badge
+ function CharTypeBadge({ charType }) {
+   const config = CHAR_TYPE_COLORS[charType] || CHAR_TYPE_COLORS.mixed;
+   return (
+     <span style={{
+       background: config.bg,
+       color: config.text,
+       padding: '2px 8px',
+       borderRadius: '9999px',
+       fontSize: '11px',
+       fontWeight: 500,
+     }}>{config.label}</span>
+   );
+ }
+
+ // Ratio display with color intensity
+ function RatioDisplay({ ratio, isOverused }) {
+   if (ratio === null || ratio === undefined) return '—';
+
+   let color, intensity;
+   if (isOverused) {
+     intensity = Math.min(Math.log2(ratio) / 6, 1);
+     const r = 239;
+     const g = Math.round(68 + (1 - intensity) * 120);
+     color = `rgb(${r}, ${g}, 68)`;
+   } else {
+     intensity = Math.min(Math.abs(Math.log2(ratio)) / 4, 1);
+     const b = 246;
+     const g = Math.round(130 + (1 - intensity) * 60);
+     color = `rgb(59, ${g}, ${b})`;
+   }
+
+   const displayValue = isOverused ? ratio.toFixed(1) + 'x' : ratio.toFixed(4);
+
+   return (
+     <span style={{
+       color: color,
+       fontWeight: 600,
+       fontFamily: 'ui-monospace, monospace',
+     }}>{displayValue}</span>
+   );
+ }
+
+ // Stats summary card
+ function StatsCard({ stats, activeTab, onTabChange }) {
+   return (
+     <div className="card" style={{ marginBottom: '24px' }}>
+       <div style={{ display: 'grid', gridTemplateColumns: 'repeat(auto-fit, minmax(120px, 1fr))', gap: '16px' }}>
+         <div>
+           <div style={{ fontSize: '11px', color: '#6b7280', marginBottom: '4px' }}>Total Tokens</div>
+           <div style={{ fontSize: '20px', fontWeight: 600 }}>{stats.totalTokens.toLocaleString()}</div>
+         </div>
+         <div>
+           <div style={{ fontSize: '11px', color: '#6b7280', marginBottom: '4px' }}>Unique Words</div>
+           <div style={{ fontSize: '20px', fontWeight: 600 }}>{stats.uniqueTokens.toLocaleString()}</div>
+         </div>
+         {TABS.map(tab => (
+           <div
+             key={tab.id}
+             onClick={() => onTabChange(tab.id)}
+             style={{
+               cursor: 'pointer',
+               padding: '8px',
+               margin: '-8px',
+               borderRadius: '8px',
+               background: activeTab === tab.id ? `${tab.color}10` : 'transparent',
+               border: activeTab === tab.id ? `2px solid ${tab.color}` : '2px solid transparent',
+               transition: 'all 0.15s',
+             }}
+           >
+             <div style={{ fontSize: '11px', color: '#6b7280', marginBottom: '4px' }}>{tab.label}</div>
+             <div style={{ fontSize: '20px', fontWeight: 600, color: tab.color }}>
+               {tab.id === 'overused' ? stats.overusedCount.toLocaleString() :
+                tab.id === 'underused' ? stats.underusedCount.toLocaleString() :
+                stats.notInBncCount.toLocaleString()}
+             </div>
+           </div>
+         ))}
+       </div>
+     </div>
+   );
+ }
+
+ // Tab bar component
+ function TabBar({ activeTab, onTabChange, stats }) {
+   return (
+     <div style={{ display: 'flex', gap: '4px', marginBottom: '16px', borderBottom: '2px solid #e2e8f0', paddingBottom: '0' }}>
+       {TABS.map(tab => {
+         const count = tab.id === 'overused' ? stats.overusedCount :
+                       tab.id === 'underused' ? stats.underusedCount :
+                       stats.notInBncCount;
+         const isActive = activeTab === tab.id;
+         return (
+           <button
+             key={tab.id}
+             onClick={() => onTabChange(tab.id)}
+             style={{
+               padding: '12px 20px',
+               border: 'none',
+               background: 'transparent',
+               cursor: 'pointer',
+               fontSize: '14px',
+               fontWeight: isActive ? 600 : 500,
+               color: isActive ? tab.color : '#6b7280',
+               borderBottom: isActive ? `3px solid ${tab.color}` : '3px solid transparent',
+               marginBottom: '-2px',
+               transition: 'all 0.15s',
+             }}
+           >
+             {tab.label}
+             <span style={{
+               marginLeft: '8px',
+               padding: '2px 8px',
+               borderRadius: '9999px',
+               fontSize: '12px',
+               background: isActive ? `${tab.color}20` : '#f1f5f9',
+               color: isActive ? tab.color : '#6b7280',
+             }}>{count.toLocaleString()}</span>
+           </button>
+         );
+       })}
+     </div>
+   );
+ }
+
+ // Data table component
+ function DataTable({ data, columns, emptyMessage, filter, onFilterChange }) {
+   const [sortKey, setSortKey] = React.useState(null);
+   const [sortDir, setSortDir] = React.useState('desc');
+
+   const filteredData = React.useMemo(() => {
+     if (!filter) return data;
+     const lowerFilter = filter.toLowerCase();
+     return data.filter(row => row.word.toLowerCase().includes(lowerFilter));
+   }, [data, filter]);
+
+   const sortedData = React.useMemo(() => {
+     if (!sortKey) return filteredData;
+     return [...filteredData].sort((a, b) => {
+       let aVal = a[sortKey];
+       let bVal = b[sortKey];
+       if (aVal === null || aVal === undefined) aVal = sortDir === 'desc' ? -Infinity : Infinity;
+       if (bVal === null || bVal === undefined) bVal = sortDir === 'desc' ? -Infinity : Infinity;
+       if (typeof aVal === 'string') {
+         return sortDir === 'desc' ? bVal.localeCompare(aVal) : aVal.localeCompare(bVal);
+       }
+       return sortDir === 'desc' ? bVal - aVal : aVal - bVal;
+     });
+   }, [filteredData, sortKey, sortDir]);
+
+   const handleSort = (key) => {
+     if (sortKey === key) {
+       setSortDir(sortDir === 'desc' ? 'asc' : 'desc');
+     } else {
+       setSortKey(key);
+       setSortDir('desc');
+     }
+   };
+
+   return (
+     <div>
+       <div style={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center', marginBottom: '12px' }}>
+         <span style={{ fontSize: '13px', color: '#6b7280' }}>{sortedData.length} words</span>
+         <input
+           type="text"
+           placeholder="Filter words..."
+           value={filter}
+           onChange={(e) => onFilterChange(e.target.value)}
+           style={{
+             padding: '8px 12px',
+             border: '1px solid #e2e8f0',
+             borderRadius: '6px',
+             fontSize: '13px',
+             width: '200px',
+           }}
+         />
+       </div>
+
+       {sortedData.length === 0 ? (
+         <div style={{ padding: '48px', textAlign: 'center', color: '#9ca3af' }}>
+           {filter ? 'No matching words' : emptyMessage}
+         </div>
+       ) : (
+         <div style={{ overflowX: 'auto', maxHeight: '600px', overflowY: 'auto' }}>
+           <table style={{ width: '100%', borderCollapse: 'collapse', fontSize: '13px' }}>
+             <thead style={{ position: 'sticky', top: 0, background: 'white' }}>
+               <tr style={{ borderBottom: '2px solid #e2e8f0' }}>
+                 {columns.map(col => (
+                   <th
+                     key={col.key}
+                     onClick={() => col.sortable !== false && handleSort(col.key)}
+                     style={{
+                       textAlign: col.align || 'left',
+                       padding: '10px 12px',
+                       fontWeight: 600,
+                       color: '#374151',
+                       cursor: col.sortable !== false ? 'pointer' : 'default',
+                       userSelect: 'none',
+                       whiteSpace: 'nowrap',
+                       background: 'white',
+                     }}
+                   >
+                     {col.label}
+                     {sortKey === col.key && (
+                       <span style={{ marginLeft: '4px' }}>{sortDir === 'desc' ? '↓' : '↑'}</span>
+                     )}
+                   </th>
+                 ))}
+               </tr>
+             </thead>
+             <tbody>
+               {sortedData.map((row, idx) => (
+                 <tr key={idx} style={{ borderBottom: '1px solid #f1f5f9' }}>
+                   {columns.map(col => (
+                     <td key={col.key} style={{ padding: '10px 12px', textAlign: col.align || 'left' }}>
+                       {col.render ? col.render(row[col.key], row) : row[col.key]}
+                     </td>
+                   ))}
+                 </tr>
+               ))}
+             </tbody>
+           </table>
+         </div>
+       )}
+     </div>
+   );
+ }
+
+ // Main component
+ function BNCFrequencyReport() {
+   const { title, sourceFile, notInBnc, underused, overused, stats } = CONFIG;
+   const [activeTab, setActiveTab] = React.useState('overused');
+   const [filter, setFilter] = React.useState('');
+
+   // Reset filter when tab changes
+   const handleTabChange = (tab) => {
+     setActiveTab(tab);
+     setFilter('');
+   };
+
+   // Column definitions
+   const notInBncColumns = [
+     { key: 'word', label: 'Word', render: (v) => <code style={{ background: '#f1f5f9', padding: '2px 6px', borderRadius: '4px' }}>{v}</code> },
+     { key: 'observed', label: 'Mentions', align: 'right' },
+     { key: 'inWordnet', label: 'In WordNet', align: 'center', render: (v) => <WordnetBadge inWordnet={v} />, sortable: false },
+     { key: 'charType', label: 'Char Type', align: 'center', render: (v) => <CharTypeBadge charType={v} />, sortable: false },
+   ];
+
+   const frequencyColumns = (isOverused) => [
+     { key: 'word', label: 'Word', render: (v) => <code style={{ background: '#f1f5f9', padding: '2px 6px', borderRadius: '4px' }}>{v}</code> },
+     { key: 'observed', label: 'Observed', align: 'right' },
+     { key: 'expected', label: 'Expected', align: 'right', render: (v) => v !== null ? v.toFixed(2) : '—' },
+     { key: 'ratio', label: 'Ratio', align: 'right', render: (v) => <RatioDisplay ratio={v} isOverused={isOverused} /> },
+     { key: 'charType', label: 'Char Type', align: 'center', render: (v) => <CharTypeBadge charType={v} />, sortable: false },
+   ];
+
+   const getTabContent = () => {
+     switch (activeTab) {
+       case 'overused':
+         return (
+           <DataTable
+             data={overused}
+             columns={frequencyColumns(true)}
+             emptyMessage="No significantly overused words"
+             filter={filter}
+             onFilterChange={setFilter}
+           />
+         );
+       case 'underused':
+         return (
+           <DataTable
+             data={underused}
+             columns={frequencyColumns(false)}
+             emptyMessage="No significantly underused words"
+             filter={filter}
+             onFilterChange={setFilter}
+           />
+         );
+       case 'notInBnc':
+         return (
+           <DataTable
+             data={notInBnc}
+             columns={notInBncColumns}
+             emptyMessage="All words found in BNC"
+             filter={filter}
+             onFilterChange={setFilter}
+           />
+         );
+     }
+   };
+
+   return (
+     <div>
+       <div style={{ marginBottom: '24px' }}>
+         <h1 style={{ margin: '0 0 8px', fontSize: '24px', fontWeight: 600 }}>{title}</h1>
+         {sourceFile && (
+           <div style={{ fontSize: '14px', color: '#6b7280' }}>
+             Source: <code style={{ background: '#f1f5f9', padding: '2px 6px', borderRadius: '4px' }}>{sourceFile}</code>
+           </div>
+         )}
+       </div>
+
+       <StatsCard stats={stats} activeTab={activeTab} onTabChange={handleTabChange} />
+
+       <div className="card">
+         <TabBar activeTab={activeTab} onTabChange={handleTabChange} stats={stats} />
+         {getTabContent()}
+       </div>
+
+       <div style={{ marginTop: '24px', padding: '16px', background: '#f8fafc', borderRadius: '8px', fontSize: '12px', color: '#6b7280' }}>
+         <strong>About this analysis:</strong> Word frequencies are compared against the British National Corpus (BNC),
+         a 100-million word collection of British English. Ratios indicate how much more (or less) frequently
+         a word appears in this text compared to typical usage. Words not in BNC may be proper nouns,
+         technical terms, neologisms, or OCR errors.
+       </div>
+     </div>
+   );
+ }
+ """
+
+     extra_styles = (
+         CARD_STYLES
+         + """
+ code {
+     font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, monospace;
+ }
+ table {
+     font-variant-numeric: tabular-nums;
+ }
+ input:focus {
+     outline: 2px solid #3b82f6;
+     outline-offset: -1px;
+ }
+ tr:hover {
+     background: #f8fafc;
+ }
+ """
+     )
+
+     html = generate_html_document(
+         title=title,
+         config=config,
+         react_component=react_component,
+         component_name="BNCFrequencyReport",
+         extra_styles=extra_styles,
+     )
+
+     return write_html_file(output_file, html)
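
Putting the two new modules together, end-to-end usage looks roughly like this. A sketch assuming the lexical extra is installed; the file names are illustrative:

from pathlib import Path

from pystylometry.lexical.bnc_frequency import compute_bnc_frequency
from pystylometry.viz.jsx import export_bnc_frequency_jsx

text = Path("manuscript.txt").read_text(encoding="utf-8")  # illustrative filename
result = compute_bnc_frequency(text)

# Writes a self-contained HTML report and returns the output Path
report = export_bnc_frequency_jsx(
    result,
    output_file="frequency_report.html",
    title="BNC Frequency Analysis: manuscript.txt",
    source_file="manuscript.txt",
)
print(report.resolve())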
pystylometry-1.3.6.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: pystylometry
- Version: 1.3.5
+ Version: 1.3.6
  Summary: Comprehensive Python package for stylometric analysis
  License: MIT
  Keywords: stylometry,nlp,text-analysis,authorship,readability,lexical-diversity,readability-metrics
@@ -19,6 +19,20 @@ Classifier: Programming Language :: Python :: 3.12
  Classifier: Topic :: Scientific/Engineering :: Information Analysis
  Classifier: Topic :: Text Processing :: Linguistic
  Classifier: Typing :: Typed
+ Provides-Extra: all
+ Provides-Extra: excel
+ Provides-Extra: lexical
+ Provides-Extra: readability
+ Provides-Extra: syntactic
+ Provides-Extra: viz
+ Requires-Dist: bnc-lookup (>=1.3.2) ; extra == "lexical" or extra == "all"
+ Requires-Dist: matplotlib (>=3.8.0,<4.0.0) ; extra == "viz" or extra == "all"
+ Requires-Dist: openpyxl (>=3.1.0,<4.0.0) ; extra == "lexical" or extra == "excel" or extra == "all"
+ Requires-Dist: pronouncing (>=0.2.0,<0.3.0) ; extra == "readability" or extra == "all"
+ Requires-Dist: rich (>=13.0,<14.0)
+ Requires-Dist: seaborn (>=0.13.0,<0.14.0) ; extra == "viz" or extra == "all"
+ Requires-Dist: spacy (>=3.8.0,<4.0.0) ; extra == "readability" or extra == "syntactic" or extra == "all"
+ Requires-Dist: wordnet-lookup ; extra == "lexical" or extra == "all"
  Project-URL: Homepage, https://github.com/craigtrim/pystylometry
  Project-URL: Issues, https://github.com/craigtrim/pystylometry/issues
  Project-URL: Repository, https://github.com/craigtrim/pystylometry
pystylometry-1.3.6.dist-info/RECORD CHANGED
@@ -13,7 +13,7 @@ pystylometry/authorship/zeta.py,sha256=oOi9Y6ZPq15ILLVl6So9O9ERvzig26en6_dpQJWeo
  pystylometry/character/README.md,sha256=poQwhbI8MabVD_626CWjEL87IOX5YDGS0ZJTH1hNwEE,607
  pystylometry/character/__init__.py,sha256=CiiKJmZ10UJE8qAecavpOKyw-vGonsOew_mFH34ZOC0,371
  pystylometry/character/character_metrics.py,sha256=OCIGP_ivtwtzcifcxcbmp2R5SIKh2tKyvKcHAv64S8g,14029
- pystylometry/cli.py,sha256=HvzBZxFSiS5AAXCb6N9Eo3QonkH-ucRFp6xDF1kJTQ0,26380
+ pystylometry/cli.py,sha256=NRKuA4oCEJPNPkeSUttZxd0ZVQSn4kh77qOTWfjsgyM,40635
  pystylometry/consistency/README.md,sha256=HG_Rd6WRBnIz3M7J11dVDv1S2ARkMABFYrTn-VV8xRY,1058
  pystylometry/consistency/__init__.py,sha256=l7nzpS7M4yHDBbM2LGAtW0XGT2n7YjSey_1xKf45224,2181
  pystylometry/consistency/_thresholds.py,sha256=5fZwdJ_cnDy0ED7CCYs6V_zP6kIAR1p0h0NYkbZ0HRg,6381
@@ -24,8 +24,9 @@ pystylometry/dialect/_data/dialect_markers.json,sha256=DthluOA6q0rG_8IrCrFIYWh_E
  pystylometry/dialect/_loader.py,sha256=M2ATp-5754v_yX9EWvBP0r5qgNf8xlL8XadVsVb_Hco,12989
  pystylometry/dialect/detector.py,sha256=9x0ZuIfTIjsmdNSx0Ezy5AC0SAFtC4kVw11iOSBd9gQ,20147
  pystylometry/lexical/README.md,sha256=cFQ7KRZV4ubsQwIlOH3YHTbhhNl5X91Sr3zcn-3x0HI,1185
- pystylometry/lexical/__init__.py,sha256=ib_F-NGVydLNGT_HgaWurBT25AadTE4eNcAN1lGMKmQ,934
+ pystylometry/lexical/__init__.py,sha256=p5vYmHSr_kUHC2Vpng8ObncLs10cdb6s3P23DPmwzoc,1012
  pystylometry/lexical/advanced_diversity.py,sha256=rL1hlNqTnaEFcA2v4oBJlojHZMTqdvvm4jYXTFGVpYE,25664
+ pystylometry/lexical/bnc_frequency.py,sha256=m_AEYY4joEwVVbzBYJm9zq2-K7Nix9MLE3l4LHomjig,10580
  pystylometry/lexical/function_words.py,sha256=eel9bq_qWgWlvG0NtDiouilMt9kaFqz2rh3add2UC4U,17832
  pystylometry/lexical/hapax.py,sha256=djTqZyZIYXa3GRiPoy6TTGHPm0wCRNJ9U0Rwnf5NoDk,12173
  pystylometry/lexical/mtld.py,sha256=XpeCF8sOXZhWbaazHGuqm08mrOf_DYfkfGGAltWnyy4,7101
@@ -66,13 +67,14 @@ pystylometry/tokenizer.py,sha256=03FEF4kKp72v-ypbtMg8u0WyVJGk3YJx6Nw3SGzyAnA,181
  pystylometry/viz/README.md,sha256=mizuBpUzWgJqjC2u9C-Lu4sVDCcTQOgGsarRSkeWPf4,1031
  pystylometry/viz/__init__.py,sha256=3kHMAcJJi8oPhTqUZIRdyf311cdyPOHWaJIUv-w0V04,2219
  pystylometry/viz/drift.py,sha256=r98gQ4s_IlrEuaouxDMyue3cTjGqj10i4IeKC01IuCo,18956
- pystylometry/viz/jsx/__init__.py,sha256=ZCgbpMPhG5PiJ92IkJRrZwrb7RodZB9MyauO0MGgbRM,1107
+ pystylometry/viz/jsx/__init__.py,sha256=_-BFtPtBhQyBiKJWGPndI-m-3SRBk1JsFombYXYc2Fk,1191
  pystylometry/viz/jsx/_base.py,sha256=nd7kEc13fUcRMom3A5jqjGyTy-djIeydq2k3oPHZIHY,3708
+ pystylometry/viz/jsx/bnc_frequency.py,sha256=U8plmMOXMgLuJPMtL5k5MecFAX-5CdnxSLX3mVAmoLY,18391
  pystylometry/viz/jsx/report.py,sha256=DbbHnnNAEi5tmVg4PmiHb17vkBBXujyE4x1CfVBiOBw,25857
  pystylometry/viz/jsx/timeline.py,sha256=hor-xnBa6oVkSqN0AEZUCQFBOB-iTfHSFZHiEfeakPA,30716
  pystylometry/viz/jsx/viewer.py,sha256=3LO49d_2bRf_P-P-2oSKpKx4N8Ugo4oCLb3DtvyNxXI,43716
- pystylometry-1.3.5.dist-info/LICENSE,sha256=vou5JCLAT5nHcsUv-AkjUYAihYfN9mwPDXxV2DHyHBo,1067
- pystylometry-1.3.5.dist-info/METADATA,sha256=DEFzNf_ZQd3mulwOnFMRSyc1s30alGM6UtO-L7nloPc,4779
- pystylometry-1.3.5.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
- pystylometry-1.3.5.dist-info/entry_points.txt,sha256=XsJvKgKs3LRDuzdF45JO7ZnS0wGKg1f5qsxVYSZzLp8,165
- pystylometry-1.3.5.dist-info/RECORD,,
+ pystylometry-1.3.6.dist-info/LICENSE,sha256=vou5JCLAT5nHcsUv-AkjUYAihYfN9mwPDXxV2DHyHBo,1067
+ pystylometry-1.3.6.dist-info/METADATA,sha256=No130TQB2VZMHPz6XD_Z37ZmyT0lC2Y785PSaDkeeZc,5545
+ pystylometry-1.3.6.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+ pystylometry-1.3.6.dist-info/entry_points.txt,sha256=Gr2keJe638qHrrJpCGZAP3AYduxxIaSCoBH4FwAJt7U,204
+ pystylometry-1.3.6.dist-info/RECORD,,
pystylometry-1.3.6.dist-info/entry_points.txt CHANGED
@@ -1,4 +1,5 @@
  [console_scripts]
+ bnc=pystylometry.cli:bnc_frequency_cli
  pystylometry-drift=pystylometry.cli:drift_cli
  pystylometry-tokenize=pystylometry.cli:tokenize_cli
  pystylometry-viewer=pystylometry.cli:viewer_cli
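
The new `bnc` console script resolves through the standard entry-point machinery. A minimal sketch of looking it up with the stdlib, assuming pystylometry 1.3.6 is installed in the current environment:

# Resolve the console script the same way an installed launcher would.
from importlib.metadata import entry_points

(ep,) = entry_points(group="console_scripts", name="bnc")
print(ep.value)       # pystylometry.cli:bnc_frequency_cli
cli_func = ep.load()  # the bnc_frequency_cli function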