pystylometry 1.3.5-py3-none-any.whl → 1.3.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/cli.py +374 -0
- pystylometry/lexical/__init__.py +2 -0
- pystylometry/lexical/bnc_frequency.py +309 -0
- pystylometry/viz/jsx/__init__.py +2 -0
- pystylometry/viz/jsx/bnc_frequency.py +495 -0
- {pystylometry-1.3.5.dist-info → pystylometry-1.3.6.dist-info}/METADATA +15 -1
- {pystylometry-1.3.5.dist-info → pystylometry-1.3.6.dist-info}/RECORD +10 -8
- {pystylometry-1.3.5.dist-info → pystylometry-1.3.6.dist-info}/entry_points.txt +1 -0
- {pystylometry-1.3.5.dist-info → pystylometry-1.3.6.dist-info}/LICENSE +0 -0
- {pystylometry-1.3.5.dist-info → pystylometry-1.3.6.dist-info}/WHEEL +0 -0
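How the new pieces fit together: the bnc console script added to cli.py calls compute_bnc_frequency() from the new pystylometry/lexical/bnc_frequency.py and, for HTML output, hands the result to export_bnc_frequency_jsx() from the new pystylometry/viz/jsx/bnc_frequency.py. A minimal sketch of that flow, using only names introduced in the diffs below (the input filename is hypothetical, and the "lexical" extra must be installed for the BNC lookup):

    # Sketch of the 1.3.6 additions end to end; sample.txt is a hypothetical input.
    from pathlib import Path

    from pystylometry.lexical import compute_bnc_frequency      # new re-export
    from pystylometry.viz.jsx import export_bnc_frequency_jsx   # new exporter

    text = Path("sample.txt").read_text(encoding="utf-8")
    result = compute_bnc_frequency(text, overuse_threshold=1.3, underuse_threshold=0.8)
    print(result.total_tokens, len(result.overused), len(result.not_in_bnc))
    export_bnc_frequency_jsx(result, "sample_bnc_frequency.html", source_file="sample.txt")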
pystylometry/cli.py CHANGED

@@ -4,6 +4,7 @@ Usage:
     pystylometry-drift <file> [--window-size=N] [--stride=N] [--mode=MODE] [--json]
     pystylometry-drift <file> --plot [output.png]
     pystylometry-tokenize <file> [--json] [--metadata] [--stats]
+    bnc --input-file <file> [--output-file <file>] [--format csv|html|json]
 
 Example:
     pystylometry-drift manuscript.txt
@@ -14,6 +15,9 @@ Example:
     pystylometry-tokenize manuscript.txt
     pystylometry-tokenize manuscript.txt --json --metadata
     pystylometry-tokenize manuscript.txt --stats
+    bnc --input-file manuscript.txt
+    bnc --input-file manuscript.txt --output-file report.html --format html
+    bnc -i manuscript.txt --format json
 """
 
 from __future__ import annotations
@@ -744,5 +748,375 @@ Examples:
         print(token)
 
 
+def bnc_frequency_cli() -> None:
+    """CLI entry point for BNC word frequency analysis."""
+    parser = argparse.ArgumentParser(
+        prog="bnc",
+        description="Analyze word frequencies against the British National Corpus (BNC).",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  bnc --input-file manuscript.txt
+  bnc --input-file manuscript.txt --output-file report.html
+  bnc --input-file manuscript.txt --format json
+  bnc --input-file manuscript.txt --overuse-threshold 2.0 --min-mentions 3
+  bnc --input-file manuscript.txt --no-wordnet
+
+Output:
+  Generates a report with three sections:
+  - Not in BNC: Words not found in the corpus (with WordNet status, character type)
+  - Most Underused: Words appearing less frequently than expected
+  - Most Overused: Words appearing more frequently than expected
+
+Thresholds:
+  Words with ratio > overuse-threshold are "overused"
+  Words with ratio < underuse-threshold are "underused"
+  Ratio = observed_count / expected_count (based on BNC frequencies)
+""",
+    )
+
+    parser.add_argument(
+        "--input-file",
+        "-i",
+        type=Path,
+        required=True,
+        metavar="FILE",
+        help="Path to text file to analyze",
+    )
+    parser.add_argument(
+        "--output-file",
+        "-o",
+        type=Path,
+        default=None,
+        metavar="FILE",
+        help="Output file (default: <input>_bnc_frequency.<ext> based on --format)",
+    )
+    parser.add_argument(
+        "--overuse-threshold",
+        type=float,
+        default=1.3,
+        metavar="N",
+        help="Ratio above which words are considered overused (default: 1.3)",
+    )
+    parser.add_argument(
+        "--underuse-threshold",
+        type=float,
+        default=0.8,
+        metavar="N",
+        help="Ratio below which words are considered underused (default: 0.8)",
+    )
+    parser.add_argument(
+        "--min-mentions",
+        type=int,
+        default=1,
+        metavar="N",
+        help="Minimum word occurrences to include (default: 1)",
+    )
+    parser.add_argument(
+        "--no-wordnet",
+        action="store_true",
+        help="Skip WordNet lookup for unknown words",
+    )
+    parser.add_argument(
+        "--format",
+        choices=["csv", "html", "json", "excel"],
+        default="csv",
+        help="Output format: csv (tab-delimited), html (interactive), json, excel (default: csv)",
+    )
+
+    args = parser.parse_args()
+
+    # Import rich for colored output
+    from rich.console import Console
+    from rich.panel import Panel
+    from rich.table import Table
+    from rich.text import Text
+
+    console = Console(stderr=True)
+
+    # Validate file exists
+    if not args.input_file.exists():
+        console.print(f"[red]Error:[/red] File not found: {args.input_file}")
+        sys.exit(1)
+
+    # Read file
+    try:
+        text = args.input_file.read_text(encoding="utf-8")
+    except Exception as e:
+        console.print(f"[red]Error reading file:[/red] {e}")
+        sys.exit(1)
+
+    # Determine output path (extension based on format)
+    suffix_map = {"csv": ".tsv", "html": ".html", "json": ".json", "excel": ".xlsx"}
+    if args.output_file:
+        output_path = args.output_file
+    else:
+        suffix = suffix_map[args.format]
+        output_path = args.input_file.with_name(f"{args.input_file.stem}_bnc_frequency{suffix}")
+
+    # Calculate file stats
+    token_count = len(text.split())
+    char_count = len(text)
+
+    # Print header
+    console.print()
+    header = Text()
+    header.append("PYSTYLOMETRY", style="bold cyan")
+    header.append(" — ", style="dim")
+    header.append("BNC Word Frequency Analysis", style="bold white")
+    console.print(Panel(header, border_style="cyan"))
+
+    # Input section
+    console.print()
+    console.print("[bold]INPUT[/bold]", style="cyan")
+    console.print("─" * 60, style="dim")
+    console.print(f"  File: [white]{args.input_file}[/white]")
+    console.print(
+        f"  Size: [green]{char_count:,}[/green] chars / [green]{token_count:,}[/green] tokens"
+    )
+
+    # Parameters section
+    console.print()
+    console.print("[bold]PARAMETERS[/bold]", style="cyan")
+    console.print("─" * 60, style="dim")
+    console.print(f"  Overuse threshold: [yellow]{args.overuse_threshold}x[/yellow]")
+    console.print(f"  Underuse threshold: [yellow]{args.underuse_threshold}x[/yellow]")
+    console.print(f"  Min mentions: [yellow]{args.min_mentions}[/yellow]")
+    console.print(f"  WordNet lookup: [yellow]{'no' if args.no_wordnet else 'yes'}[/yellow]")
+
+    # Output section
+    console.print()
+    console.print("[bold]OUTPUT[/bold]", style="cyan")
+    console.print("─" * 60, style="dim")
+    fmt_display = {
+        "csv": "Tab-delimited CSV",
+        "html": "Interactive HTML",
+        "json": "JSON",
+        "excel": "Excel (.xlsx)",
+    }
+    console.print(f"  Format: [magenta]{fmt_display[args.format]}[/magenta]")
+    console.print(f"  Destination: [white]{output_path}[/white]")
+
+    # Run analysis with spinner
+    console.print()
+    with console.status("[bold cyan]Running analysis...[/bold cyan]", spinner="dots"):
+        from pystylometry.lexical.bnc_frequency import compute_bnc_frequency
+
+        result = compute_bnc_frequency(
+            text,
+            overuse_threshold=args.overuse_threshold,
+            underuse_threshold=args.underuse_threshold,
+            include_wordnet=not args.no_wordnet,
+            min_mentions=args.min_mentions,
+        )
+
+    # Output results
+    if args.format == "json":
+        output = {
+            "stats": {
+                "total_tokens": result.total_tokens,
+                "unique_tokens": result.unique_tokens,
+                "overused_count": len(result.overused),
+                "underused_count": len(result.underused),
+                "not_in_bnc_count": len(result.not_in_bnc),
+            },
+            "overused": [
+                {
+                    "word": w.word,
+                    "observed": w.observed,
+                    "expected": w.expected,
+                    "ratio": w.ratio,
+                    "char_type": w.char_type,
+                }
+                for w in result.overused
+            ],
+            "underused": [
+                {
+                    "word": w.word,
+                    "observed": w.observed,
+                    "expected": w.expected,
+                    "ratio": w.ratio,
+                    "char_type": w.char_type,
+                }
+                for w in result.underused
+            ],
+            "not_in_bnc": [
+                {
+                    "word": w.word,
+                    "observed": w.observed,
+                    "in_wordnet": w.in_wordnet,
+                    "char_type": w.char_type,
+                }
+                for w in result.not_in_bnc
+            ],
+        }
+        output_path.write_text(json.dumps(output, indent=2))
+        console.print(f'[green]✓[/green] JSON saved to: [white]"{output_path}"[/white]')
+
+    elif args.format == "csv":
+        # Tab-delimited output with category column
+        lines = ["category\tword\tobserved\texpected\tratio\tin_wordnet\tchar_type"]
+
+        def fmt_wordnet(val: bool | None) -> str:
+            if val is True:
+                return "yes"
+            elif val is False:
+                return "no"
+            return ""
+
+        for w in result.overused:
+            expected = f"{w.expected:.2f}" if w.expected else ""
+            ratio = f"{w.ratio:.4f}" if w.ratio else ""
+            in_wn = fmt_wordnet(w.in_wordnet)
+            lines.append(
+                f"overused\t{w.word}\t{w.observed}\t{expected}\t{ratio}\t{in_wn}\t{w.char_type}"
+            )
+
+        for w in result.underused:
+            expected = f"{w.expected:.2f}" if w.expected else ""
+            ratio = f"{w.ratio:.4f}" if w.ratio else ""
+            in_wn = fmt_wordnet(w.in_wordnet)
+            lines.append(
+                f"underused\t{w.word}\t{w.observed}\t{expected}\t{ratio}\t{in_wn}\t{w.char_type}"
+            )
+
+        for w in result.not_in_bnc:
+            in_wn = fmt_wordnet(w.in_wordnet)
+            lines.append(f"not-in-bnc\t{w.word}\t{w.observed}\t\t\t{in_wn}\t{w.char_type}")
+
+        output_path.write_text("\n".join(lines))
+        console.print(f'[green]✓[/green] TSV saved to: [white]"{output_path}"[/white]')
+
+    elif args.format == "excel":
+        try:
+            from openpyxl import Workbook  # type: ignore[import-untyped]
+            from openpyxl.styles import Alignment, PatternFill  # type: ignore[import-untyped]
+        except ImportError:
+            console.print("[red]Error:[/red] Excel export requires openpyxl.")
+            console.print("  Install with: [yellow]pip install pystylometry[excel][/yellow]")
+            console.print("  Or for pipx: [yellow]pipx inject pystylometry openpyxl[/yellow]")
+            sys.exit(1)
+
+        wb = Workbook()
+
+        # Remove default sheet
+        wb.remove(wb.active)
+
+        # Cell style: width 15, centered, vertically centered
+        align = Alignment(horizontal="center", vertical="center")
+
+        def fmt_wordnet_excel(val: bool | None) -> str:
+            if val is True:
+                return "yes"
+            elif val is False:
+                return "no"
+            return ""
+
+        # Overused sheet (sorted by ratio, high to low)
+        ws_over = wb.create_sheet("overused")
+        ws_over.append(["word", "observed", "expected", "ratio", "in_wordnet", "char_type"])
+        for w in sorted(result.overused, key=lambda x: x.ratio or 0, reverse=True):
+            in_wn = fmt_wordnet_excel(w.in_wordnet)
+            ws_over.append([w.word, w.observed, w.expected, w.ratio, in_wn, w.char_type])
+
+        # Underused sheet (sorted by ratio, high to low)
+        ws_under = wb.create_sheet("underused")
+        ws_under.append(["word", "observed", "expected", "ratio", "in_wordnet", "char_type"])
+        for w in sorted(result.underused, key=lambda x: x.ratio or 0, reverse=True):
+            in_wn = fmt_wordnet_excel(w.in_wordnet)
+            ws_under.append([w.word, w.observed, w.expected, w.ratio, in_wn, w.char_type])
+
+        # Not in BNC sheet
+        ws_notbnc = wb.create_sheet("not-in-bnc")
+        ws_notbnc.append(["word", "observed", "in_wordnet", "char_type", "plural_form"])
+        for w in result.not_in_bnc:
+            in_wn = fmt_wordnet_excel(w.in_wordnet)
+            plural = "yes" if w.word.endswith("s") else "no"
+            ws_notbnc.append([w.word, w.observed, in_wn, w.char_type, plural])
+
+        # Apply formatting to all sheets
+        for ws in [ws_over, ws_under, ws_notbnc]:
+            for col in ws.columns:
+                col_letter = col[0].column_letter
+                # Word column (A) gets width 30, others get 15
+                ws.column_dimensions[col_letter].width = 30 if col_letter == "A" else 15
+            for row in ws.iter_rows():
+                for cell in row:
+                    cell.alignment = align
+
+        # Apply number formatting to expected (C) and ratio (D) columns
+        for ws in [ws_over, ws_under]:
+            for row in range(2, ws.max_row + 1):  # Skip header row
+                ws[f"C{row}"].number_format = "0.00"
+                ws[f"D{row}"].number_format = "0.00"
+
+        # Apply background colors to in_wordnet column
+        fill_yes = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
+        fill_no = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")
+
+        # in_wordnet is column E for overused/underused, column C for not-in-bnc
+        for ws in [ws_over, ws_under]:
+            for row in range(2, ws.max_row + 1):
+                cell = ws[f"E{row}"]
+                if cell.value == "yes":
+                    cell.fill = fill_yes
+                elif cell.value == "no":
+                    cell.fill = fill_no
+
+        for row in range(2, ws_notbnc.max_row + 1):
+            cell = ws_notbnc[f"C{row}"]
+            if cell.value == "yes":
+                cell.fill = fill_yes
+            elif cell.value == "no":
+                cell.fill = fill_no
+
+        # Apply background colors to plural_form column (E) in not-in-bnc
+        fill_plural_yes = PatternFill(
+            start_color="BDD7EE", end_color="BDD7EE", fill_type="solid"
+        )  # Light blue
+        fill_plural_no = PatternFill(
+            start_color="FCE4D6", end_color="FCE4D6", fill_type="solid"
+        )  # Light peach
+        for row in range(2, ws_notbnc.max_row + 1):
+            cell = ws_notbnc[f"E{row}"]
+            if cell.value == "yes":
+                cell.fill = fill_plural_yes
+            elif cell.value == "no":
+                cell.fill = fill_plural_no
+
+        wb.save(output_path)
+        console.print(f'[green]✓[/green] Excel saved to: [white]"{output_path}"[/white]')
+
+    else:  # html
+        from pystylometry.viz.jsx import export_bnc_frequency_jsx
+
+        export_bnc_frequency_jsx(
+            result,
+            output_file=output_path,
+            title=f"BNC Frequency Analysis: {args.input_file.name}",
+            source_file=str(args.input_file),
+        )
+
+        abs_path = output_path.resolve()
+        file_url = f"file://{abs_path}"
+        console.print(f'[green]✓[/green] HTML report saved to: [white]"{output_path}"[/white]')
+        console.print(f"  Open in browser: [link={file_url}]{file_url}[/link]")
+
+    # Summary table
+    console.print()
+    table = Table(title="Summary", border_style="cyan", header_style="bold cyan")
+    table.add_column("Metric", style="white")
+    table.add_column("Count", justify="right", style="green")
+
+    table.add_row("Total tokens", f"{result.total_tokens:,}")
+    table.add_row("Unique words", f"{result.unique_tokens:,}")
+    table.add_row("Not in BNC", f"[dim]{len(result.not_in_bnc):,}[/dim]")
+    table.add_row("Underused", f"[blue]{len(result.underused):,}[/blue]")
+    table.add_row("Overused", f"[red]{len(result.overused):,}[/red]")
+
+    console.print(table)
+    console.print()
+
+
 if __name__ == "__main__":
     drift_cli()
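The classification rule stated in the epilog is simple enough to check by hand. As an illustration (the relative frequency below is invented; in the package it comes from bnc_lookup), the expected count is the BNC relative frequency scaled by the document's token count, and the observed/expected ratio is compared against the two thresholds:

    # Worked example of the ratio rule from the epilog; rel_freq is invented.
    total_tokens = 50_000
    rel_freq = 2.0e-5                   # hypothetical BNC relative frequency
    observed = 4                        # occurrences in the manuscript

    expected = rel_freq * total_tokens  # 1.0
    ratio = observed / expected         # 4.0

    if ratio > 1.3:                     # --overuse-threshold default
        label = "overused"
    elif ratio < 0.8:                   # --underuse-threshold default
        label = "underused"
    else:
        label = "typical"
    print(label, ratio)                 # overused 4.0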
pystylometry/lexical/__init__.py CHANGED

@@ -2,6 +2,7 @@
 
 # Local implementations
 from .advanced_diversity import compute_hdd, compute_mattr, compute_msttr, compute_vocd_d
+from .bnc_frequency import compute_bnc_frequency
 from .function_words import compute_function_words
 from .hapax import compute_hapax_ratios, compute_hapax_with_lexicon_analysis
 from .mtld import compute_mtld
@@ -17,6 +18,7 @@ __all__ = [
     "compute_yule",
     "compute_hapax_ratios",
     "compute_hapax_with_lexicon_analysis",
+    "compute_bnc_frequency",
     "compute_function_words",
     "compute_vocd_d",
     "compute_mattr",
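The effect of the re-export is that callers no longer need the full module path; both imports below resolve to the same function (a trivial check, shown for completeness):

    # Both paths resolve to the same function after this change.
    from pystylometry.lexical import compute_bnc_frequency as via_package
    from pystylometry.lexical.bnc_frequency import compute_bnc_frequency as via_module

    assert via_package is via_module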
pystylometry/lexical/bnc_frequency.py ADDED

@@ -0,0 +1,309 @@
+"""BNC (British National Corpus) frequency analysis for stylometric comparison.
+
+This module computes word frequency ratios by comparing observed word frequencies
+in a text against expected frequencies from the British National Corpus (BNC).
+Words can be categorized as:
+- Overused: appear more frequently than expected (ratio > 1)
+- Underused: appear less frequently than expected (ratio < 1)
+- Not in BNC: words that don't exist in the BNC corpus
+
+Related GitHub Issue:
+    #TBD - BNC frequency analysis CLI
+    https://github.com/craigtrim/pystylometry/issues/TBD
+
+References:
+    British National Corpus: http://www.natcorp.ox.ac.uk/
+    The BNC is a 100-million word collection of samples of written and spoken
+    language from a wide range of sources, designed to represent a wide
+    cross-section of British English from the late 20th century.
+"""
+
+from __future__ import annotations
+
+import re
+import unicodedata
+from collections import Counter
+from dataclasses import dataclass
+from typing import Literal
+
+from .._utils import check_optional_dependency
+
+# Unicode apostrophe variants to normalize to ASCII apostrophe (U+0027)
+# See: https://github.com/craigtrim/pystylometry/issues/45
+_APOSTROPHE_VARIANTS = (
+    "\u0060"  # GRAVE ACCENT
+    "\u00B4"  # ACUTE ACCENT
+    "\u2018"  # LEFT SINGLE QUOTATION MARK
+    "\u2019"  # RIGHT SINGLE QUOTATION MARK
+    "\u201B"  # SINGLE HIGH-REVERSED-9 QUOTATION MARK
+    "\u2032"  # PRIME
+    "\u2035"  # REVERSED PRIME
+    "\u02B9"  # MODIFIER LETTER PRIME
+    "\u02BC"  # MODIFIER LETTER APOSTROPHE
+    "\u02C8"  # MODIFIER LETTER VERTICAL LINE
+    "\u0313"  # COMBINING COMMA ABOVE
+    "\u0315"  # COMBINING COMMA ABOVE RIGHT
+    "\u055A"  # ARMENIAN APOSTROPHE
+    "\u05F3"  # HEBREW PUNCTUATION GERESH
+    "\u07F4"  # NKO HIGH TONE APOSTROPHE
+    "\u07F5"  # NKO LOW TONE APOSTROPHE
+    "\uFF07"  # FULLWIDTH APOSTROPHE
+    "\u1FBF"  # GREEK PSILI
+    "\u1FBD"  # GREEK KORONIS
+    "\uA78C"  # LATIN SMALL LETTER SALTILLO
+)
+
+
+def _normalize_apostrophes(text: str) -> str:
+    """Normalize Unicode apostrophe variants to ASCII apostrophe.
+
+    Many texts (especially ebooks, PDFs, and word processor output) use
+    typographic "smart quotes" instead of ASCII apostrophes. This function
+    normalizes all variants to the standard ASCII apostrophe (U+0027) to
+    ensure consistent BNC lookups.
+
+    Args:
+        text: Input text potentially containing apostrophe variants
+
+    Returns:
+        Text with all apostrophe variants normalized to ASCII apostrophe
+
+    Example:
+        >>> _normalize_apostrophes("don’t")  # curly apostrophe
+        "don't"  # ASCII apostrophe
+    """
+    for char in _APOSTROPHE_VARIANTS:
+        text = text.replace(char, "'")
+    return text
+
+
+@dataclass
+class WordAnalysis:
+    """Analysis of a single word against BNC frequency.
+
+    Attributes:
+        word: The word being analyzed (lowercase)
+        observed: Number of times the word appears in the text
+        expected: Expected count based on BNC relative frequency
+        ratio: observed / expected (None if not in BNC)
+        in_wordnet: Whether the word exists in WordNet
+        char_type: Classification of character content
+    """
+
+    word: str
+    observed: int
+    expected: float | None
+    ratio: float | None
+    in_wordnet: bool | None
+    char_type: Literal["latin", "unicode", "numeric", "mixed", "punctuation"]
+
+
+@dataclass
+class BNCFrequencyResult:
+    """Result of BNC frequency analysis.
+
+    Attributes:
+        overused: Words appearing more frequently than expected (ratio > threshold)
+        underused: Words appearing less frequently than expected (ratio < threshold)
+        not_in_bnc: Words not found in the BNC corpus
+        total_tokens: Total word count in the text
+        unique_tokens: Number of unique words
+        overuse_threshold: Ratio above which words are considered overused
+        underuse_threshold: Ratio below which words are considered underused
+        metadata: Additional analysis metadata
+    """
+
+    overused: list[WordAnalysis]
+    underused: list[WordAnalysis]
+    not_in_bnc: list[WordAnalysis]
+    total_tokens: int
+    unique_tokens: int
+    overuse_threshold: float
+    underuse_threshold: float
+    metadata: dict
+
+
+def _classify_char_type(
+    word: str,
+) -> Literal["latin", "unicode", "numeric", "mixed", "punctuation"]:
+    """Classify the character content of a word.
+
+    Args:
+        word: Word to classify
+
+    Returns:
+        Character type classification:
+        - latin: Pure ASCII alphabetic characters (a-z, A-Z)
+        - unicode: Contains non-ASCII characters (accents, etc.)
+        - numeric: Contains only digits
+        - mixed: Contains letters and numbers or other combinations
+        - punctuation: Contains only punctuation
+    """
+    if not word:
+        return "punctuation"
+
+    has_ascii_alpha = bool(re.search(r"[a-zA-Z]", word))
+    has_unicode_alpha = any(unicodedata.category(c).startswith("L") and ord(c) > 127 for c in word)
+    has_digit = any(c.isdigit() for c in word)
+    has_punct = any(unicodedata.category(c).startswith("P") for c in word)
+
+    # Determine classification
+    if has_unicode_alpha:
+        return "unicode"
+    elif has_digit and not has_ascii_alpha:
+        return "numeric"
+    elif has_digit and has_ascii_alpha:
+        return "mixed"
+    elif has_ascii_alpha and not has_punct:
+        return "latin"
+    elif has_ascii_alpha and has_punct:
+        return "mixed"
+    elif not has_ascii_alpha and not has_digit:
+        return "punctuation"
+    else:
+        return "mixed"
+
+
+def compute_bnc_frequency(
+    text: str,
+    overuse_threshold: float = 1.3,
+    underuse_threshold: float = 0.8,
+    include_wordnet: bool = True,
+    min_mentions: int = 1,
+) -> BNCFrequencyResult:
+    """Compute BNC frequency analysis for a text.
+
+    Compares observed word frequencies against expected frequencies from the
+    British National Corpus. Words are categorized as overused, underused,
+    or not in BNC based on their frequency ratios.
+
+    Args:
+        text: Input text to analyze
+        overuse_threshold: Ratio above which words are considered overused (default: 1.3)
+        underuse_threshold: Ratio below which words are considered underused (default: 0.8)
+        include_wordnet: Whether to check WordNet for unknown words (default: True)
+        min_mentions: Minimum number of mentions to include word (default: 1)
+
+    Returns:
+        BNCFrequencyResult with categorized word lists
+
+    Raises:
+        ImportError: If bnc-lookup package is not installed
+
+    Example:
+        >>> result = compute_bnc_frequency("The captain ordered the larboard watch...")
+        >>> result.overused[:3]  # Top 3 overused words
+        [WordAnalysis(word='larboard', ratio=33153.5, ...), ...]
+        >>> result.not_in_bnc[:3]  # Words not in BNC
+        [WordAnalysis(word='xyzbot', ...), ...]
+    """
+    # Check dependency
+    check_optional_dependency("bnc_lookup", "lexical")
+    from bnc_lookup import relative_frequency  # type: ignore[import-untyped]
+
+    # Optional wordnet lookup
+    wordnet_checker = None
+    if include_wordnet:
+        try:
+            from wordnet_lookup import (
+                is_wordnet_term as _is_wordnet_term,  # type: ignore[import-untyped]
+            )
+
+            wordnet_checker = _is_wordnet_term
+        except ImportError:
+            # WordNet lookup is optional
+            pass
+
+    # Tokenize text (simple whitespace + punctuation stripping)
+    # First normalize apostrophes to ensure consistent BNC lookups (Issue #45)
+    normalized_text = _normalize_apostrophes(text)
+    raw_tokens = normalized_text.split()
+    tokens = []
+    for raw in raw_tokens:
+        # Strip leading/trailing punctuation, lowercase
+        cleaned = re.sub(r"^[^\w]+|[^\w]+$", "", raw).lower()
+        if cleaned:
+            tokens.append(cleaned)
+
+    total_tokens = len(tokens)
+
+    # Count observed frequency of each word
+    observed = Counter(tokens)
+    unique_words = list(observed.keys())
+
+    # Get BNC relative frequencies (one at a time - bnc_lookup doesn't have batch)
+    bnc_freqs = {word: relative_frequency(word) for word in unique_words}
+
+    # Analyze each word
+    overused: list[WordAnalysis] = []
+    underused: list[WordAnalysis] = []
+    not_in_bnc: list[WordAnalysis] = []
+
+    for word, obs_count in observed.items():
+        if obs_count < min_mentions:
+            continue
+
+        # Classify character type
+        char_type = _classify_char_type(word)
+
+        # Get BNC frequency
+        rel_freq = bnc_freqs.get(word)
+
+        # Check WordNet if requested
+        in_wordnet = None
+        if wordnet_checker is not None:
+            in_wordnet = wordnet_checker(word)
+
+        if rel_freq is None or rel_freq == 0:
+            # Word not in BNC
+            analysis = WordAnalysis(
+                word=word,
+                observed=obs_count,
+                expected=None,
+                ratio=None,
+                in_wordnet=in_wordnet,
+                char_type=char_type,
+            )
+            not_in_bnc.append(analysis)
+        else:
+            # Compute expected count and ratio
+            expected = rel_freq * total_tokens
+            ratio = obs_count / expected if expected > 0 else None
+
+            analysis = WordAnalysis(
+                word=word,
+                observed=obs_count,
+                expected=expected,
+                ratio=ratio,
+                in_wordnet=in_wordnet,
+                char_type=char_type,
+            )
+
+            if ratio is not None:
+                if ratio > overuse_threshold:
+                    overused.append(analysis)
+                elif ratio < underuse_threshold:
+                    underused.append(analysis)
+
+    # Sort by ratio (highest first for overused, lowest first for underused)
+    overused.sort(key=lambda x: x.ratio or 0, reverse=True)
+    underused.sort(key=lambda x: x.ratio or float("inf"))
+    # Sort not_in_bnc by observed count
+    not_in_bnc.sort(key=lambda x: x.observed, reverse=True)
+
+    return BNCFrequencyResult(
+        overused=overused,
+        underused=underused,
+        not_in_bnc=not_in_bnc,
+        total_tokens=total_tokens,
+        unique_tokens=len(unique_words),
+        overuse_threshold=overuse_threshold,
+        underuse_threshold=underuse_threshold,
+        metadata={
+            "include_wordnet": include_wordnet,
+            "min_mentions": min_mentions,
+            "overused_count": len(overused),
+            "underused_count": len(underused),
+            "not_in_bnc_count": len(not_in_bnc),
+        },
+    )
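The normalization and tokenization steps above are worth seeing on a concrete string. A hedged sketch that applies the same apostrophe replacement and punctuation stripping by hand (the sample sentence is invented; the regex matches the module's):

    # Hand-applied version of the module's normalization + tokenization steps.
    import re

    raw = "Don\u2019t \u2014 the \u2018captain\u2019 said."   # curly quotes, em dash
    ascii_text = raw.replace("\u2019", "'").replace("\u2018", "'")
    tokens = [re.sub(r"^[^\w]+|[^\w]+$", "", t).lower() for t in ascii_text.split()]
    print([t for t in tokens if t])   # ["don't", 'the', 'captain', 'said']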
pystylometry/viz/jsx/__init__.py CHANGED

@@ -20,6 +20,7 @@ Example:
     >>> export_drift_viewer("drift_analyzer.html")
 """
 
+from .bnc_frequency import export_bnc_frequency_jsx
 from .report import export_drift_report_jsx
 from .timeline import export_drift_timeline_jsx
 from .viewer import export_drift_viewer
@@ -28,4 +29,5 @@ __all__ = [
     "export_drift_timeline_jsx",
     "export_drift_report_jsx",
     "export_drift_viewer",
+    "export_bnc_frequency_jsx",
 ]
pystylometry/viz/jsx/bnc_frequency.py ADDED

@@ -0,0 +1,495 @@
+"""Interactive HTML export for BNC frequency analysis.
+
+This module generates a self-contained HTML report showing word frequency
+comparisons against the British National Corpus (BNC).
+
+The report has three sections:
+    1. Not in BNC - Words not found in the corpus (with WordNet and character type info)
+    2. Most Underused - Words appearing less frequently than expected
+    3. Most Overused - Words appearing more frequently than expected
+
+Related GitHub Issue:
+    #TBD - BNC frequency analysis CLI
+    https://github.com/craigtrim/pystylometry/issues/TBD
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+from ._base import CARD_STYLES, generate_html_document, write_html_file
+
+if TYPE_CHECKING:
+    from pystylometry.lexical.bnc_frequency import BNCFrequencyResult
+
+
+def export_bnc_frequency_jsx(
+    result: "BNCFrequencyResult",
+    output_file: str | Path,
+    title: str = "BNC Word Frequency Analysis",
+    source_file: str | None = None,
+) -> Path:
+    """Export BNC frequency analysis as interactive HTML.
+
+    Generates a self-contained HTML file with three sections:
+    - Not in BNC: Complete table with WordNet status and character type
+    - Most Underused: Words below the underuse threshold
+    - Most Overused: Words above the overuse threshold
+
+    Args:
+        result: BNCFrequencyResult from compute_bnc_frequency()
+        output_file: Path to write the HTML file
+        title: Page title (default: "BNC Word Frequency Analysis")
+        source_file: Optional source filename to display
+
+    Returns:
+        Path to the written HTML file
+
+    Example:
+        >>> from pystylometry.lexical.bnc_frequency import compute_bnc_frequency
+        >>> from pystylometry.viz.jsx import export_bnc_frequency_jsx
+        >>> result = compute_bnc_frequency(text)
+        >>> export_bnc_frequency_jsx(result, "frequency_report.html")
+    """
+    # Build data for the React component
+    not_in_bnc_data = [
+        {
+            "word": w.word,
+            "observed": w.observed,
+            "inWordnet": w.in_wordnet,
+            "charType": w.char_type,
+        }
+        for w in result.not_in_bnc
+    ]
+
+    underused_data = [
+        {
+            "word": w.word,
+            "observed": w.observed,
+            "expected": round(w.expected, 2) if w.expected else None,
+            "ratio": round(w.ratio, 4) if w.ratio else None,
+            "charType": w.char_type,
+        }
+        for w in result.underused
+    ]
+
+    overused_data = [
+        {
+            "word": w.word,
+            "observed": w.observed,
+            "expected": round(w.expected, 2) if w.expected else None,
+            "ratio": round(w.ratio, 1) if w.ratio else None,
+            "charType": w.char_type,
+        }
+        for w in result.overused
+    ]
+
+    config = {
+        "title": title,
+        "sourceFile": source_file,
+        "notInBnc": not_in_bnc_data,
+        "underused": underused_data,
+        "overused": overused_data,
+        "stats": {
+            "totalTokens": result.total_tokens,
+            "uniqueTokens": result.unique_tokens,
+            "notInBncCount": len(result.not_in_bnc),
+            "underusedCount": len(result.underused),
+            "overusedCount": len(result.overused),
+            "overuseThreshold": result.overuse_threshold,
+            "underuseThreshold": result.underuse_threshold,
+        },
+    }
+
+    react_component = """
+// Color mapping for character types
+const CHAR_TYPE_COLORS = {
+  latin: { bg: '#dcfce7', text: '#166534', label: 'Latin' },
+  unicode: { bg: '#fef3c7', text: '#92400e', label: 'Unicode' },
+  numeric: { bg: '#dbeafe', text: '#1e40af', label: 'Numeric' },
+  mixed: { bg: '#f3e8ff', text: '#6b21a8', label: 'Mixed' },
+  punctuation: { bg: '#f1f5f9', text: '#475569', label: 'Punct' },
+};
+
+// Tab configuration
+const TABS = [
+  { id: 'overused', label: 'Most Overused', color: '#ef4444' },
+  { id: 'underused', label: 'Most Underused', color: '#3b82f6' },
+  { id: 'notInBnc', label: 'Not in BNC', color: '#6b7280' },
+];
+
+// WordNet status badge
+function WordnetBadge({ inWordnet }) {
+  if (inWordnet === null || inWordnet === undefined) {
+    return <span style={{ color: '#9ca3af', fontSize: '12px' }}>—</span>;
+  }
+  return inWordnet ? (
+    <span style={{
+      background: '#dcfce7',
+      color: '#166534',
+      padding: '2px 8px',
+      borderRadius: '9999px',
+      fontSize: '11px',
+      fontWeight: 500,
+    }}>Yes</span>
+  ) : (
+    <span style={{
+      background: '#fee2e2',
+      color: '#991b1b',
+      padding: '2px 8px',
+      borderRadius: '9999px',
+      fontSize: '11px',
+      fontWeight: 500,
+    }}>No</span>
+  );
+}
+
+// Character type badge
+function CharTypeBadge({ charType }) {
+  const config = CHAR_TYPE_COLORS[charType] || CHAR_TYPE_COLORS.mixed;
+  return (
+    <span style={{
+      background: config.bg,
+      color: config.text,
+      padding: '2px 8px',
+      borderRadius: '9999px',
+      fontSize: '11px',
+      fontWeight: 500,
+    }}>{config.label}</span>
+  );
+}
+
+// Ratio display with color intensity
+function RatioDisplay({ ratio, isOverused }) {
+  if (ratio === null || ratio === undefined) return '—';
+
+  let color, intensity;
+  if (isOverused) {
+    intensity = Math.min(Math.log2(ratio) / 6, 1);
+    const r = 239;
+    const g = Math.round(68 + (1 - intensity) * 120);
+    color = `rgb(${r}, ${g}, 68)`;
+  } else {
+    intensity = Math.min(Math.abs(Math.log2(ratio)) / 4, 1);
+    const b = 246;
+    const g = Math.round(130 + (1 - intensity) * 60);
+    color = `rgb(59, ${g}, ${b})`;
+  }
+
+  const displayValue = isOverused ? ratio.toFixed(1) + 'x' : ratio.toFixed(4);
+
+  return (
+    <span style={{
+      color: color,
+      fontWeight: 600,
+      fontFamily: 'ui-monospace, monospace',
+    }}>{displayValue}</span>
+  );
+}
+
+// Stats summary card
+function StatsCard({ stats, activeTab, onTabChange }) {
+  return (
+    <div className="card" style={{ marginBottom: '24px' }}>
+      <div style={{ display: 'grid', gridTemplateColumns: 'repeat(auto-fit, minmax(120px, 1fr))', gap: '16px' }}>
+        <div>
+          <div style={{ fontSize: '11px', color: '#6b7280', marginBottom: '4px' }}>Total Tokens</div>
+          <div style={{ fontSize: '20px', fontWeight: 600 }}>{stats.totalTokens.toLocaleString()}</div>
+        </div>
+        <div>
+          <div style={{ fontSize: '11px', color: '#6b7280', marginBottom: '4px' }}>Unique Words</div>
+          <div style={{ fontSize: '20px', fontWeight: 600 }}>{stats.uniqueTokens.toLocaleString()}</div>
+        </div>
+        {TABS.map(tab => (
+          <div
+            key={tab.id}
+            onClick={() => onTabChange(tab.id)}
+            style={{
+              cursor: 'pointer',
+              padding: '8px',
+              margin: '-8px',
+              borderRadius: '8px',
+              background: activeTab === tab.id ? `${tab.color}10` : 'transparent',
+              border: activeTab === tab.id ? `2px solid ${tab.color}` : '2px solid transparent',
+              transition: 'all 0.15s',
+            }}
+          >
+            <div style={{ fontSize: '11px', color: '#6b7280', marginBottom: '4px' }}>{tab.label}</div>
+            <div style={{ fontSize: '20px', fontWeight: 600, color: tab.color }}>
+              {tab.id === 'overused' ? stats.overusedCount.toLocaleString() :
+               tab.id === 'underused' ? stats.underusedCount.toLocaleString() :
+               stats.notInBncCount.toLocaleString()}
+            </div>
+          </div>
+        ))}
+      </div>
+    </div>
+  );
+}
+
+// Tab bar component
+function TabBar({ activeTab, onTabChange, stats }) {
+  return (
+    <div style={{ display: 'flex', gap: '4px', marginBottom: '16px', borderBottom: '2px solid #e2e8f0', paddingBottom: '0' }}>
+      {TABS.map(tab => {
+        const count = tab.id === 'overused' ? stats.overusedCount :
+                      tab.id === 'underused' ? stats.underusedCount :
+                      stats.notInBncCount;
+        const isActive = activeTab === tab.id;
+        return (
+          <button
+            key={tab.id}
+            onClick={() => onTabChange(tab.id)}
+            style={{
+              padding: '12px 20px',
+              border: 'none',
+              background: 'transparent',
+              cursor: 'pointer',
+              fontSize: '14px',
+              fontWeight: isActive ? 600 : 500,
+              color: isActive ? tab.color : '#6b7280',
+              borderBottom: isActive ? `3px solid ${tab.color}` : '3px solid transparent',
+              marginBottom: '-2px',
+              transition: 'all 0.15s',
+            }}
+          >
+            {tab.label}
+            <span style={{
+              marginLeft: '8px',
+              padding: '2px 8px',
+              borderRadius: '9999px',
+              fontSize: '12px',
+              background: isActive ? `${tab.color}20` : '#f1f5f9',
+              color: isActive ? tab.color : '#6b7280',
+            }}>{count.toLocaleString()}</span>
+          </button>
+        );
+      })}
+    </div>
+  );
+}
+
+// Data table component
+function DataTable({ data, columns, emptyMessage, filter, onFilterChange }) {
+  const [sortKey, setSortKey] = React.useState(null);
+  const [sortDir, setSortDir] = React.useState('desc');
+
+  const filteredData = React.useMemo(() => {
+    if (!filter) return data;
+    const lowerFilter = filter.toLowerCase();
+    return data.filter(row => row.word.toLowerCase().includes(lowerFilter));
+  }, [data, filter]);
+
+  const sortedData = React.useMemo(() => {
+    if (!sortKey) return filteredData;
+    return [...filteredData].sort((a, b) => {
+      let aVal = a[sortKey];
+      let bVal = b[sortKey];
+      if (aVal === null || aVal === undefined) aVal = sortDir === 'desc' ? -Infinity : Infinity;
+      if (bVal === null || bVal === undefined) bVal = sortDir === 'desc' ? -Infinity : Infinity;
+      if (typeof aVal === 'string') {
+        return sortDir === 'desc' ? bVal.localeCompare(aVal) : aVal.localeCompare(bVal);
+      }
+      return sortDir === 'desc' ? bVal - aVal : aVal - bVal;
+    });
+  }, [filteredData, sortKey, sortDir]);
+
+  const handleSort = (key) => {
+    if (sortKey === key) {
+      setSortDir(sortDir === 'desc' ? 'asc' : 'desc');
+    } else {
+      setSortKey(key);
+      setSortDir('desc');
+    }
+  };
+
+  return (
+    <div>
+      <div style={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center', marginBottom: '12px' }}>
+        <span style={{ fontSize: '13px', color: '#6b7280' }}>{sortedData.length} words</span>
+        <input
+          type="text"
+          placeholder="Filter words..."
+          value={filter}
+          onChange={(e) => onFilterChange(e.target.value)}
+          style={{
+            padding: '8px 12px',
+            border: '1px solid #e2e8f0',
+            borderRadius: '6px',
+            fontSize: '13px',
+            width: '200px',
+          }}
+        />
+      </div>
+
+      {sortedData.length === 0 ? (
+        <div style={{ padding: '48px', textAlign: 'center', color: '#9ca3af' }}>
+          {filter ? 'No matching words' : emptyMessage}
+        </div>
+      ) : (
+        <div style={{ overflowX: 'auto', maxHeight: '600px', overflowY: 'auto' }}>
+          <table style={{ width: '100%', borderCollapse: 'collapse', fontSize: '13px' }}>
+            <thead style={{ position: 'sticky', top: 0, background: 'white' }}>
+              <tr style={{ borderBottom: '2px solid #e2e8f0' }}>
+                {columns.map(col => (
+                  <th
+                    key={col.key}
+                    onClick={() => col.sortable !== false && handleSort(col.key)}
+                    style={{
+                      textAlign: col.align || 'left',
+                      padding: '10px 12px',
+                      fontWeight: 600,
+                      color: '#374151',
+                      cursor: col.sortable !== false ? 'pointer' : 'default',
+                      userSelect: 'none',
+                      whiteSpace: 'nowrap',
+                      background: 'white',
+                    }}
+                  >
+                    {col.label}
+                    {sortKey === col.key && (
+                      <span style={{ marginLeft: '4px' }}>{sortDir === 'desc' ? '↓' : '↑'}</span>
+                    )}
+                  </th>
+                ))}
+              </tr>
+            </thead>
+            <tbody>
+              {sortedData.map((row, idx) => (
+                <tr key={idx} style={{ borderBottom: '1px solid #f1f5f9' }}>
+                  {columns.map(col => (
+                    <td key={col.key} style={{ padding: '10px 12px', textAlign: col.align || 'left' }}>
+                      {col.render ? col.render(row[col.key], row) : row[col.key]}
+                    </td>
+                  ))}
+                </tr>
+              ))}
+            </tbody>
+          </table>
+        </div>
+      )}
+    </div>
+  );
+}
+
+// Main component
+function BNCFrequencyReport() {
+  const { title, sourceFile, notInBnc, underused, overused, stats } = CONFIG;
+  const [activeTab, setActiveTab] = React.useState('overused');
+  const [filter, setFilter] = React.useState('');
+
+  // Reset filter when tab changes
+  const handleTabChange = (tab) => {
+    setActiveTab(tab);
+    setFilter('');
+  };
+
+  // Column definitions
+  const notInBncColumns = [
+    { key: 'word', label: 'Word', render: (v) => <code style={{ background: '#f1f5f9', padding: '2px 6px', borderRadius: '4px' }}>{v}</code> },
+    { key: 'observed', label: 'Mentions', align: 'right' },
+    { key: 'inWordnet', label: 'In WordNet', align: 'center', render: (v) => <WordnetBadge inWordnet={v} />, sortable: false },
+    { key: 'charType', label: 'Char Type', align: 'center', render: (v) => <CharTypeBadge charType={v} />, sortable: false },
+  ];
+
+  const frequencyColumns = (isOverused) => [
+    { key: 'word', label: 'Word', render: (v) => <code style={{ background: '#f1f5f9', padding: '2px 6px', borderRadius: '4px' }}>{v}</code> },
+    { key: 'observed', label: 'Observed', align: 'right' },
+    { key: 'expected', label: 'Expected', align: 'right', render: (v) => v !== null ? v.toFixed(2) : '—' },
+    { key: 'ratio', label: 'Ratio', align: 'right', render: (v) => <RatioDisplay ratio={v} isOverused={isOverused} /> },
+    { key: 'charType', label: 'Char Type', align: 'center', render: (v) => <CharTypeBadge charType={v} />, sortable: false },
+  ];
+
+  const getTabContent = () => {
+    switch (activeTab) {
+      case 'overused':
+        return (
+          <DataTable
+            data={overused}
+            columns={frequencyColumns(true)}
+            emptyMessage="No significantly overused words"
+            filter={filter}
+            onFilterChange={setFilter}
+          />
+        );
+      case 'underused':
+        return (
+          <DataTable
+            data={underused}
+            columns={frequencyColumns(false)}
+            emptyMessage="No significantly underused words"
+            filter={filter}
+            onFilterChange={setFilter}
+          />
+        );
+      case 'notInBnc':
+        return (
+          <DataTable
+            data={notInBnc}
+            columns={notInBncColumns}
+            emptyMessage="All words found in BNC"
+            filter={filter}
+            onFilterChange={setFilter}
+          />
+        );
+    }
+  };
+
+  return (
+    <div>
+      <div style={{ marginBottom: '24px' }}>
+        <h1 style={{ margin: '0 0 8px', fontSize: '24px', fontWeight: 600 }}>{title}</h1>
+        {sourceFile && (
+          <div style={{ fontSize: '14px', color: '#6b7280' }}>
+            Source: <code style={{ background: '#f1f5f9', padding: '2px 6px', borderRadius: '4px' }}>{sourceFile}</code>
+          </div>
+        )}
+      </div>
+
+      <StatsCard stats={stats} activeTab={activeTab} onTabChange={handleTabChange} />
+
+      <div className="card">
+        <TabBar activeTab={activeTab} onTabChange={handleTabChange} stats={stats} />
+        {getTabContent()}
+      </div>
+
+      <div style={{ marginTop: '24px', padding: '16px', background: '#f8fafc', borderRadius: '8px', fontSize: '12px', color: '#6b7280' }}>
+        <strong>About this analysis:</strong> Word frequencies are compared against the British National Corpus (BNC),
+        a 100-million word collection of British English. Ratios indicate how much more (or less) frequently
+        a word appears in this text compared to typical usage. Words not in BNC may be proper nouns,
+        technical terms, neologisms, or OCR errors.
+      </div>
+    </div>
+  );
+}
+"""
+
+    extra_styles = (
+        CARD_STYLES
+        + """
+code {
+  font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, monospace;
+}
+table {
+  font-variant-numeric: tabular-nums;
+}
+input:focus {
+  outline: 2px solid #3b82f6;
+  outline-offset: -1px;
+}
+tr:hover {
+  background: #f8fafc;
+}
+"""
+    )
+
+    html = generate_html_document(
+        title=title,
+        config=config,
+        react_component=react_component,
+        component_name="BNCFrequencyReport",
+        extra_styles=extra_styles,
+    )
+
+    return write_html_file(output_file, html)
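Note the key renaming: the exporter converts the snake_case WordAnalysis fields into the camelCase keys the embedded React component reads from CONFIG. A sketch of that mapping for a single overused row (the field values are invented):

    # The snake_case -> camelCase row mapping used for the "overused" table above;
    # the WordAnalysis values are invented for illustration.
    from pystylometry.lexical.bnc_frequency import WordAnalysis

    w = WordAnalysis(word="larboard", observed=12, expected=0.36,
                     ratio=33.3, in_wordnet=True, char_type="latin")
    row = {
        "word": w.word,
        "observed": w.observed,
        "expected": round(w.expected, 2) if w.expected else None,
        "ratio": round(w.ratio, 1) if w.ratio else None,
        "charType": w.char_type,
    }
    print(row)  # {'word': 'larboard', 'observed': 12, 'expected': 0.36, 'ratio': 33.3, 'charType': 'latin'}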
{pystylometry-1.3.5.dist-info → pystylometry-1.3.6.dist-info}/METADATA CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: pystylometry
-Version: 1.3.5
+Version: 1.3.6
 Summary: Comprehensive Python package for stylometric analysis
 License: MIT
 Keywords: stylometry,nlp,text-analysis,authorship,readability,lexical-diversity,readability-metrics
@@ -19,6 +19,20 @@ Classifier: Programming Language :: Python :: 3.12
 Classifier: Topic :: Scientific/Engineering :: Information Analysis
 Classifier: Topic :: Text Processing :: Linguistic
 Classifier: Typing :: Typed
+Provides-Extra: all
+Provides-Extra: excel
+Provides-Extra: lexical
+Provides-Extra: readability
+Provides-Extra: syntactic
+Provides-Extra: viz
+Requires-Dist: bnc-lookup (>=1.3.2) ; extra == "lexical" or extra == "all"
+Requires-Dist: matplotlib (>=3.8.0,<4.0.0) ; extra == "viz" or extra == "all"
+Requires-Dist: openpyxl (>=3.1.0,<4.0.0) ; extra == "lexical" or extra == "excel" or extra == "all"
+Requires-Dist: pronouncing (>=0.2.0,<0.3.0) ; extra == "readability" or extra == "all"
+Requires-Dist: rich (>=13.0,<14.0)
+Requires-Dist: seaborn (>=0.13.0,<0.14.0) ; extra == "viz" or extra == "all"
+Requires-Dist: spacy (>=3.8.0,<4.0.0) ; extra == "readability" or extra == "syntactic" or extra == "all"
+Requires-Dist: wordnet-lookup ; extra == "lexical" or extra == "all"
 Project-URL: Homepage, https://github.com/craigtrim/pystylometry
 Project-URL: Issues, https://github.com/craigtrim/pystylometry/issues
 Project-URL: Repository, https://github.com/craigtrim/pystylometry
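The new extras only gate optional imports (the CLI already falls back with an install hint when openpyxl is missing). A hedged sketch of probing for the packages behind the extras with the standard library; the extras themselves would be installed with, for example, pip install "pystylometry[lexical,excel]":

    # Probe the optional dependencies behind the new extras; find_spec is stdlib.
    from importlib.util import find_spec

    for extra, module in [("lexical", "bnc_lookup"), ("excel", "openpyxl"),
                          ("viz", "matplotlib")]:
        status = "available" if find_spec(module) else "missing"
        print(f"[{extra}] {module}: {status}")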
{pystylometry-1.3.5.dist-info → pystylometry-1.3.6.dist-info}/RECORD CHANGED

@@ -13,7 +13,7 @@ pystylometry/authorship/zeta.py,sha256=oOi9Y6ZPq15ILLVl6So9O9ERvzig26en6_dpQJWeo
 pystylometry/character/README.md,sha256=poQwhbI8MabVD_626CWjEL87IOX5YDGS0ZJTH1hNwEE,607
 pystylometry/character/__init__.py,sha256=CiiKJmZ10UJE8qAecavpOKyw-vGonsOew_mFH34ZOC0,371
 pystylometry/character/character_metrics.py,sha256=OCIGP_ivtwtzcifcxcbmp2R5SIKh2tKyvKcHAv64S8g,14029
-pystylometry/cli.py,sha256=
+pystylometry/cli.py,sha256=NRKuA4oCEJPNPkeSUttZxd0ZVQSn4kh77qOTWfjsgyM,40635
 pystylometry/consistency/README.md,sha256=HG_Rd6WRBnIz3M7J11dVDv1S2ARkMABFYrTn-VV8xRY,1058
 pystylometry/consistency/__init__.py,sha256=l7nzpS7M4yHDBbM2LGAtW0XGT2n7YjSey_1xKf45224,2181
 pystylometry/consistency/_thresholds.py,sha256=5fZwdJ_cnDy0ED7CCYs6V_zP6kIAR1p0h0NYkbZ0HRg,6381
@@ -24,8 +24,9 @@ pystylometry/dialect/_data/dialect_markers.json,sha256=DthluOA6q0rG_8IrCrFIYWh_E
 pystylometry/dialect/_loader.py,sha256=M2ATp-5754v_yX9EWvBP0r5qgNf8xlL8XadVsVb_Hco,12989
 pystylometry/dialect/detector.py,sha256=9x0ZuIfTIjsmdNSx0Ezy5AC0SAFtC4kVw11iOSBd9gQ,20147
 pystylometry/lexical/README.md,sha256=cFQ7KRZV4ubsQwIlOH3YHTbhhNl5X91Sr3zcn-3x0HI,1185
-pystylometry/lexical/__init__.py,sha256=
+pystylometry/lexical/__init__.py,sha256=p5vYmHSr_kUHC2Vpng8ObncLs10cdb6s3P23DPmwzoc,1012
 pystylometry/lexical/advanced_diversity.py,sha256=rL1hlNqTnaEFcA2v4oBJlojHZMTqdvvm4jYXTFGVpYE,25664
+pystylometry/lexical/bnc_frequency.py,sha256=m_AEYY4joEwVVbzBYJm9zq2-K7Nix9MLE3l4LHomjig,10580
 pystylometry/lexical/function_words.py,sha256=eel9bq_qWgWlvG0NtDiouilMt9kaFqz2rh3add2UC4U,17832
 pystylometry/lexical/hapax.py,sha256=djTqZyZIYXa3GRiPoy6TTGHPm0wCRNJ9U0Rwnf5NoDk,12173
 pystylometry/lexical/mtld.py,sha256=XpeCF8sOXZhWbaazHGuqm08mrOf_DYfkfGGAltWnyy4,7101
@@ -66,13 +67,14 @@ pystylometry/tokenizer.py,sha256=03FEF4kKp72v-ypbtMg8u0WyVJGk3YJx6Nw3SGzyAnA,181
 pystylometry/viz/README.md,sha256=mizuBpUzWgJqjC2u9C-Lu4sVDCcTQOgGsarRSkeWPf4,1031
 pystylometry/viz/__init__.py,sha256=3kHMAcJJi8oPhTqUZIRdyf311cdyPOHWaJIUv-w0V04,2219
 pystylometry/viz/drift.py,sha256=r98gQ4s_IlrEuaouxDMyue3cTjGqj10i4IeKC01IuCo,18956
-pystylometry/viz/jsx/__init__.py,sha256=
+pystylometry/viz/jsx/__init__.py,sha256=_-BFtPtBhQyBiKJWGPndI-m-3SRBk1JsFombYXYc2Fk,1191
 pystylometry/viz/jsx/_base.py,sha256=nd7kEc13fUcRMom3A5jqjGyTy-djIeydq2k3oPHZIHY,3708
+pystylometry/viz/jsx/bnc_frequency.py,sha256=U8plmMOXMgLuJPMtL5k5MecFAX-5CdnxSLX3mVAmoLY,18391
 pystylometry/viz/jsx/report.py,sha256=DbbHnnNAEi5tmVg4PmiHb17vkBBXujyE4x1CfVBiOBw,25857
 pystylometry/viz/jsx/timeline.py,sha256=hor-xnBa6oVkSqN0AEZUCQFBOB-iTfHSFZHiEfeakPA,30716
 pystylometry/viz/jsx/viewer.py,sha256=3LO49d_2bRf_P-P-2oSKpKx4N8Ugo4oCLb3DtvyNxXI,43716
-pystylometry-1.3.
-pystylometry-1.3.
-pystylometry-1.3.
-pystylometry-1.3.
-pystylometry-1.3.
+pystylometry-1.3.6.dist-info/LICENSE,sha256=vou5JCLAT5nHcsUv-AkjUYAihYfN9mwPDXxV2DHyHBo,1067
+pystylometry-1.3.6.dist-info/METADATA,sha256=No130TQB2VZMHPz6XD_Z37ZmyT0lC2Y785PSaDkeeZc,5545
+pystylometry-1.3.6.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+pystylometry-1.3.6.dist-info/entry_points.txt,sha256=Gr2keJe638qHrrJpCGZAP3AYduxxIaSCoBH4FwAJt7U,204
+pystylometry-1.3.6.dist-info/RECORD,,
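For reference, each RECORD row is path,sha256=<digest>,<size>, where the digest is the urlsafe-base64 SHA-256 of the file with trailing "=" padding stripped, per the wheel spec. A sketch that reproduces one row for a file on disk (the path assumes a local checkout):

    # Recompute a wheel RECORD-style row: urlsafe base64 SHA-256, no '=' padding.
    import base64
    import hashlib
    from pathlib import Path

    path = Path("pystylometry/lexical/bnc_frequency.py")   # hypothetical local path
    data = path.read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=").decode()
    print(f"{path},sha256={digest},{len(data)}")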
{pystylometry-1.3.5.dist-info → pystylometry-1.3.6.dist-info}/LICENSE: File without changes

{pystylometry-1.3.5.dist-info → pystylometry-1.3.6.dist-info}/WHEEL: File without changes