pystylometry 1.0.0__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. pystylometry/README.md +42 -0
  2. pystylometry/__init__.py +45 -3
  3. pystylometry/_types.py +1017 -259
  4. pystylometry/authorship/README.md +21 -0
  5. pystylometry/authorship/__init__.py +28 -4
  6. pystylometry/authorship/additional_methods.py +260 -40
  7. pystylometry/authorship/compression.py +175 -0
  8. pystylometry/authorship/kilgarriff.py +354 -0
  9. pystylometry/character/README.md +17 -0
  10. pystylometry/character/character_metrics.py +267 -179
  11. pystylometry/cli.py +427 -0
  12. pystylometry/consistency/README.md +27 -0
  13. pystylometry/consistency/__init__.py +57 -0
  14. pystylometry/consistency/_thresholds.py +162 -0
  15. pystylometry/consistency/drift.py +549 -0
  16. pystylometry/dialect/README.md +26 -0
  17. pystylometry/dialect/__init__.py +65 -0
  18. pystylometry/dialect/_data/dialect_markers.json +1134 -0
  19. pystylometry/dialect/_loader.py +360 -0
  20. pystylometry/dialect/detector.py +533 -0
  21. pystylometry/lexical/README.md +23 -0
  22. pystylometry/lexical/advanced_diversity.py +61 -22
  23. pystylometry/lexical/function_words.py +255 -56
  24. pystylometry/lexical/hapax.py +182 -52
  25. pystylometry/lexical/mtld.py +108 -26
  26. pystylometry/lexical/ttr.py +76 -10
  27. pystylometry/lexical/word_frequency_sophistication.py +1522 -298
  28. pystylometry/lexical/yule.py +136 -50
  29. pystylometry/ngrams/README.md +18 -0
  30. pystylometry/ngrams/entropy.py +150 -49
  31. pystylometry/ngrams/extended_ngrams.py +314 -69
  32. pystylometry/prosody/README.md +17 -0
  33. pystylometry/prosody/rhythm_prosody.py +773 -11
  34. pystylometry/readability/README.md +23 -0
  35. pystylometry/readability/additional_formulas.py +1887 -762
  36. pystylometry/readability/ari.py +144 -82
  37. pystylometry/readability/coleman_liau.py +136 -109
  38. pystylometry/readability/flesch.py +177 -73
  39. pystylometry/readability/gunning_fog.py +165 -161
  40. pystylometry/readability/smog.py +123 -42
  41. pystylometry/stylistic/README.md +20 -0
  42. pystylometry/stylistic/cohesion_coherence.py +669 -13
  43. pystylometry/stylistic/genre_register.py +1560 -17
  44. pystylometry/stylistic/markers.py +611 -17
  45. pystylometry/stylistic/vocabulary_overlap.py +354 -13
  46. pystylometry/syntactic/README.md +20 -0
  47. pystylometry/syntactic/advanced_syntactic.py +76 -14
  48. pystylometry/syntactic/pos_ratios.py +70 -6
  49. pystylometry/syntactic/sentence_stats.py +55 -12
  50. pystylometry/syntactic/sentence_types.py +71 -15
  51. pystylometry/viz/README.md +27 -0
  52. pystylometry/viz/__init__.py +71 -0
  53. pystylometry/viz/drift.py +589 -0
  54. pystylometry/viz/jsx/__init__.py +31 -0
  55. pystylometry/viz/jsx/_base.py +144 -0
  56. pystylometry/viz/jsx/report.py +677 -0
  57. pystylometry/viz/jsx/timeline.py +716 -0
  58. pystylometry/viz/jsx/viewer.py +1032 -0
  59. pystylometry-1.3.0.dist-info/METADATA +136 -0
  60. pystylometry-1.3.0.dist-info/RECORD +76 -0
  61. {pystylometry-1.0.0.dist-info → pystylometry-1.3.0.dist-info}/WHEEL +1 -1
  62. pystylometry-1.3.0.dist-info/entry_points.txt +4 -0
  63. pystylometry-1.0.0.dist-info/METADATA +0 -275
  64. pystylometry-1.0.0.dist-info/RECORD +0 -46
@@ -9,37 +9,1580 @@ Related GitHub Issue:
 
  References:
  Biber, D. (1988). Variation across speech and writing. Cambridge University Press.
- Biber, D., & Conrad, S. (2009). Register, genre, and style.
+ Biber, D., & Conrad, S. (2009). Register, genre, and style. Cambridge University Press.
+ Heylighen, F., & Dewaele, J. M. (1999). Formality of language: Definition,
+ measurement and behavioral determinants.
  """
 
+ from __future__ import annotations
+
+ import re
+ import time
+ from typing import TYPE_CHECKING
+
+ if TYPE_CHECKING:
+     from typing import Any
+
  from .._types import GenreRegisterResult
 
28
+ # =============================================================================
+ # WORD LISTS
+ # =============================================================================
+
+ # Common Latinate suffixes (from Latin/French origin)
+ # These indicate formal, academic, or technical register
+ LATINATE_SUFFIXES = frozenset(
+     [
+         "tion", "sion", "ment", "ity", "ence", "ance", "ious", "eous", "ible",
+         "able", "ive", "ative", "ure", "al", "ial", "ual", "ory", "ary", "ery",
+         "ant", "ent", "ous", "uous", "ic", "ical",
+     ]
+ )
+
64
+ # Common Germanic/Anglo-Saxon words (everyday, informal vocabulary)
+ # High-frequency words of Old English origin
+ GERMANIC_COMMON_WORDS = frozenset(
+     [
+         # Basic verbs
+         "be", "have", "do", "go", "come", "get", "make", "take", "see", "know",
+         "think", "want", "give", "use", "find", "tell", "ask", "work", "seem",
+         "feel", "try", "leave", "call", "keep", "let", "begin", "show", "hear",
+         "play", "run", "move", "live", "believe", "hold", "bring", "happen",
+         "write", "sit", "stand", "lose", "pay", "meet", "set", "learn", "lead",
+         "understand", "watch", "follow", "stop", "speak", "read", "spend",
+         "grow", "open", "walk", "win", "teach", "buy", "fall", "reach",
+         "build", "sell", "wait", "cut", "kill", "sleep", "send", "stay",
+         "rise", "drive", "drink", "break", "eat", "pull", "shake", "throw",
+         "catch", "draw", "hit", "fight", "wear", "hang", "strike", "steal",
+         "swim", "blow", "fly", "sing", "ring",
+         # Basic nouns
+         "man", "woman", "child", "day", "way", "thing", "world", "life",
+         "hand", "year", "time", "work", "night", "home", "word", "eye",
+         "head", "house", "room", "friend", "door", "side", "water", "mother",
+         "father", "name", "week", "month", "end", "heart", "mind", "body",
+         "sun", "moon", "earth", "god", "king", "land", "sea", "light",
+         "stone", "tree", "book", "town", "blood", "brother", "sister",
+         "wife", "husband", "son", "daughter", "folk",
+         # Basic adjectives
+         "good", "new", "first", "last", "long", "great", "little", "own",
+         "other", "old", "right", "big", "high", "small", "large", "young",
+         "early", "late", "whole", "true", "wrong", "strong", "dark",
+         "bright", "deep", "free", "full", "hard", "soft", "hot", "cold",
+         "warm", "cool", "wet", "dry", "clean", "dirty", "sharp", "dull",
+         "thick", "thin", "wide", "narrow", "quick", "slow", "fast", "sick",
+         "well", "dead", "alive", "rich", "poor", "sweet", "bitter", "loud",
+         # Basic adverbs and function words
+         "up", "down", "in", "out", "on", "off", "over", "under", "here",
+         "there", "now", "then", "when", "where", "how", "why", "what",
+         "who", "which", "this", "that", "these", "those", "some", "any",
+         "no", "not", "all", "both", "each", "every", "many", "much", "few",
+         "little", "more", "most", "other", "same", "such", "only", "even",
+         "also", "just", "still", "yet", "already", "always", "never",
+         "often", "sometimes", "again", "back", "away",
+     ]
+ )
+
325
+ # Latinate/formal vocabulary (Latin/French origin)
+ LATINATE_WORDS = frozenset(
+     [
+         # Academic/formal verbs
+         "obtain", "acquire", "achieve", "accomplish", "demonstrate",
+         "indicate", "establish", "determine", "examine", "analyze",
+         "evaluate", "assess", "consider", "conclude", "suggest", "propose",
+         "recommend", "require", "utilize", "employ", "implement",
+         "facilitate", "contribute", "constitute", "represent", "comprise",
+         "involve", "include", "exclude", "provide", "maintain", "sustain",
+         "retain", "contain", "attain", "pertain", "perceive", "conceive",
+         "receive", "deceive", "assume", "presume", "consume", "resume",
+         "pursue", "ensure", "assure", "observe", "preserve", "reserve",
+         "deserve", "conserve", "serve", "participate", "anticipate",
+         "communicate", "investigate", "illustrate", "concentrate",
+         "eliminate", "terminate", "dominate",
+         # Academic/formal nouns
+         "analysis", "hypothesis", "theory", "concept", "principle",
+         "phenomenon", "evidence", "conclusion", "assumption", "implication",
+         "significance", "consequence", "circumstance", "occurrence",
+         "reference", "preference", "difference", "influence", "experience",
+         "existence", "assistance", "resistance", "persistence", "instance",
+         "substance", "distance", "importance", "performance", "appearance",
+         "maintenance", "tolerance", "accordance", "abundance", "guidance",
+         "reliance", "compliance", "institution", "organization",
+         "administration", "consideration", "determination",
+         "representation", "interpretation", "implementation",
+         "contribution", "distribution", "constitution", "resolution",
+         "solution", "evolution", "revolution", "evaluation", "situation",
+         "association", "variation", "correlation", "examination",
+         "investigation", "observation",
+         # Academic/formal adjectives
+         "significant", "substantial", "considerable", "essential",
+         "fundamental", "primary", "secondary", "subsequent", "previous",
+         "initial", "final", "potential", "actual", "virtual", "crucial",
+         "critical", "vital", "specific", "particular", "general",
+         "universal", "individual", "personal", "professional",
+         "traditional", "conventional", "exceptional", "additional",
+         "sufficient", "efficient", "proficient", "deficient",
+         "magnificent", "appropriate", "adequate", "accurate", "precise",
+         "explicit", "implicit", "complex", "simple", "obvious", "apparent",
+         "evident", "prominent", "dominant", "relevant", "equivalent",
+         "frequent", "permanent",
+     ]
+ )
+
505
+ # Abstract noun suffixes (indicators of abstract concepts)
+ ABSTRACT_SUFFIXES = frozenset(
+     [
+         "ness", "ity", "ment", "tion", "sion", "ance", "ence", "dom",
+         "hood", "ship", "ism", "acy", "age", "ure", "th", "ty",
+     ]
+ )
+
527
+ # Concrete noun categories (physical, tangible things)
+ CONCRETE_CATEGORIES = frozenset(
+     [
+         # Body parts
+         "head", "hand", "eye", "face", "arm", "leg", "foot", "finger",
+         "hair", "heart", "body", "skin", "bone", "blood", "mouth", "nose",
+         "ear", "tooth",
+         # Natural objects
+         "tree", "flower", "grass", "leaf", "rock", "stone", "mountain",
+         "river", "ocean", "sea", "lake", "sun", "moon", "star", "sky",
+         "cloud", "rain", "snow", "wind", "fire", "water", "earth", "sand",
+         "wood", "metal", "gold",
+         # Man-made objects
+         "house", "building", "room", "door", "window", "wall", "floor",
+         "roof", "table", "chair", "bed", "desk", "book", "paper", "pen",
+         "phone", "car", "bus", "train", "plane", "ship", "boat", "road",
+         "street", "bridge", "clock", "watch", "key", "knife", "cup",
+         "plate", "glass", "bottle",
+         # Animals
+         "dog", "cat", "horse", "cow", "bird", "fish", "mouse", "rat",
+         "lion", "tiger", "bear", "wolf", "fox", "deer", "rabbit", "snake",
+         "frog",
+         # Food
+         "bread", "meat", "milk", "egg", "fruit", "apple", "orange", "rice",
+         "potato", "vegetable", "cheese", "butter", "sugar", "salt",
+         "coffee", "tea",
+         # Clothing
+         "shirt", "pants", "dress", "coat", "hat", "shoe", "sock", "glove",
+     ]
+ )
+
657
+ # Narrative markers (past tense, action verbs, temporal markers)
+ NARRATIVE_MARKERS = frozenset(
+     [
+         # Temporal markers
+         "suddenly", "then", "finally", "meanwhile", "afterwards",
+         "eventually", "immediately", "soon", "later", "before", "after",
+         "once", "while",
+         # Dialogue tags
+         "said", "asked", "replied", "answered", "whispered", "shouted",
+         "cried", "exclaimed", "muttered", "murmured", "declared",
+         "announced", "explained",
+         # Motion/action verbs (common in narratives)
+         "walked", "ran", "jumped", "fell", "stood", "sat", "looked",
+         "watched", "turned", "moved", "came", "went", "left", "arrived",
+         "entered", "escaped", "grabbed", "dropped", "threw", "caught",
+         "hit", "pushed", "pulled",
+     ]
+ )
+
715
+ # Expository markers (present tense, linking verbs, logical connectors)
+ EXPOSITORY_MARKERS = frozenset(
+     [
+         # Logical connectors
+         "therefore", "thus", "hence", "consequently", "furthermore",
+         "moreover", "however", "nevertheless", "although", "whereas",
+         "because", "since", "indeed", "specifically", "particularly",
+         "generally", "typically",
+         # Definitional markers
+         "defined", "means", "refers", "consists", "comprises", "includes",
+         "involves", "represents", "indicates", "suggests", "demonstrates",
+         "shows",
+         # Structural markers
+         "firstly", "secondly", "thirdly", "additionally", "similarly",
+         "conversely", "alternatively", "namely", "essentially",
+     ]
+ )
+
762
+ # Legal register markers
+ LEGAL_MARKERS = frozenset(
+     [
+         "whereas", "hereby", "herein", "hereof", "thereof", "therein",
+         "wherein", "forthwith", "notwithstanding", "pursuant", "aforesaid",
+         "heretofore", "hereafter", "henceforth", "whereby", "whereupon",
+         "inasmuch", "insofar", "shall", "aforementioned", "undersigned",
+         "plaintiff", "defendant", "jurisdiction", "statute", "provision",
+         "liability", "indemnify", "covenant", "stipulate", "terminate",
+         "constitute", "enforce", "comply",
+     ]
+ )
+
802
+ # Academic register markers
+ ACADEMIC_MARKERS = frozenset(
+     [
+         "hypothesis", "methodology", "analysis", "findings", "conclusion",
+         "research", "study", "evidence", "literature", "theory",
+         "framework", "significant", "correlation", "variable", "sample",
+         "data", "results", "furthermore", "moreover", "however",
+         "therefore", "consequently", "previous", "current", "subsequent",
+         "demonstrate", "indicate", "suggest", "examine", "investigate",
+         "analyze", "evaluate", "assess", "determine",
+     ]
+ )
+
842
+ # Journalistic register markers
+ JOURNALISTIC_MARKERS = frozenset(
+     [
+         "reported", "announced", "revealed", "disclosed", "confirmed",
+         "denied", "claimed", "alleged", "stated", "according", "sources",
+         "officials", "authorities", "spokesperson", "investigation",
+         "breaking", "developing", "exclusive", "update", "latest",
+         "controversy", "scandal", "crisis",
+     ]
+ )
+
871
+ # Conversational/informal markers
+ CONVERSATIONAL_MARKERS = frozenset(
+     [
+         "yeah", "yep", "nope", "okay", "ok", "hey", "hi", "hello", "bye",
+         "well", "like", "just", "really", "actually", "basically",
+         "literally", "totally", "gonna", "wanna", "gotta", "kinda",
+         "sorta", "dunno", "lemme", "gimme", "stuff", "thing", "things",
+         "guy", "guys", "kids", "folks", "awesome", "cool", "nice",
+         "great", "amazing", "terrible", "horrible", "crazy",
+     ]
+ )
+
917
+ # First person pronouns
+ FIRST_PERSON_PRONOUNS = frozenset(
+     ["i", "me", "my", "mine", "myself", "we", "us", "our", "ours", "ourselves"]
+ )
+
+ # Second person pronouns
+ SECOND_PERSON_PRONOUNS = frozenset(
+     ["you", "your", "yours", "yourself", "yourselves"]
+ )
+
+ # Third person pronouns
+ THIRD_PERSON_PRONOUNS = frozenset(
+     [
+         "he", "him", "his", "himself", "she", "her", "hers", "herself",
+         "it", "its", "itself", "they", "them", "their", "theirs",
+         "themselves", "one", "oneself",
+     ]
+ )
+
968
+ # Impersonal constructions (start of sentences)
+ IMPERSONAL_PATTERNS = [
+     r"\bit\s+is\b",
+     r"\bit\s+was\b",
+     r"\bit\s+has\s+been\b",
+     r"\bit\s+seems\b",
+     r"\bit\s+appears\b",
+     r"\bthere\s+is\b",
+     r"\bthere\s+are\b",
+     r"\bthere\s+was\b",
+     r"\bthere\s+were\b",
+     r"\bthere\s+has\s+been\b",
+     r"\bthere\s+have\s+been\b",
+     r"\bone\s+can\b",
+     r"\bone\s+may\b",
+     r"\bone\s+might\b",
+     r"\bone\s+should\b",
+     r"\bone\s+must\b",
+ ]
+
+
989
+ # =============================================================================
+ # HELPER FUNCTIONS
+ # =============================================================================
+
+
+ def _tokenize(text: str) -> list[str]:
+     """Tokenize text into lowercase words."""
+     return re.findall(r"\b[a-zA-Z]+\b", text.lower())
+
+
999
+ def _count_latinate_words(tokens: list[str]) -> int:
+     """Count words with Latinate characteristics.
+
+     A word is considered Latinate if it:
+     1. Is in the explicit Latinate word list, OR
+     2. Has a Latinate suffix and is longer than 6 characters
+
+     This heuristic captures formal vocabulary of Latin/French origin.
+     """
+     count = 0
+     for token in tokens:
+         if token in LATINATE_WORDS:
+             count += 1
+         elif len(token) > 6:
+             for suffix in LATINATE_SUFFIXES:
+                 if token.endswith(suffix):
+                     count += 1
+                     break
+     return count
+
+
1020
+ def _count_germanic_words(tokens: list[str]) -> int:
+     """Count words with Germanic/Anglo-Saxon characteristics.
+
+     A word is considered Germanic if it:
+     1. Is in the explicit Germanic common word list, OR
+     2. Is short (4 letters or fewer) and doesn't have a Latinate suffix
+
+     This captures everyday vocabulary of Old English origin.
+     """
+     count = 0
+     for token in tokens:
+         if token in GERMANIC_COMMON_WORDS:
+             count += 1
+         elif len(token) <= 4:
+             # Short words are typically Germanic
+             is_latinate = False
+             for suffix in LATINATE_SUFFIXES:
+                 if token.endswith(suffix):
+                     is_latinate = True
+                     break
+             if not is_latinate:
+                 count += 1
+     return count
+
+
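Editor's note: the two etymology counters above feed the Latinate ratio used by the formality score further down. A minimal usage sketch follows; the `pystylometry.stylometry.stylistic`-style import path is inferred from the file listing above (`pystylometry/stylistic/genre_register.py`), and the helpers are private, so this assumes module-internal or test use:

```python
# Sketch only: exercises the heuristics defined in this diff.
from pystylometry.stylistic.genre_register import (  # hypothetical import path
    _count_germanic_words,
    _count_latinate_words,
    _tokenize,
)

tokens = _tokenize("The committee will utilize the findings.")
# "utilize" is in LATINATE_WORDS; "the" and "will" fall through to the
# short-word Germanic rule (4 letters or fewer, no Latinate suffix).
print(_count_latinate_words(tokens))  # 1
print(_count_germanic_words(tokens))  # 3 ("the", "the", "will")
```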
1045
+ def _count_nominalizations(tokens: list[str]) -> int:
+     """Count nominalizations (verbs/adjectives turned into nouns).
+
+     Nominalizations are identified by suffixes like -tion, -ment, -ness, -ity.
+     These are characteristic of formal, academic writing.
+
+     Reference:
+         Biber, D. (1988). Nominalizations are one of the strongest markers
+         of informational, formal register.
+     """
+     count = 0
+     nominalization_suffixes = ["tion", "sion", "ment", "ness", "ity", "ance", "ence"]
+     for token in tokens:
+         if len(token) > 5:  # Avoid false positives on short words
+             for suffix in nominalization_suffixes:
+                 if token.endswith(suffix):
+                     count += 1
+                     break
+     return count
+
+
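Editor's note: nominalization density feeds both the formality score and the academic genre score below, so a hand-checked example of the suffix rule is worth having (a sketch; assumes the helper above is in scope):

```python
tokens = "the implementation of the solution required careful consideration".split()
# "implementation", "solution", and "consideration" each exceed 5 characters
# and end in "-tion"; nothing else matches, so the count is 3.
assert _count_nominalizations(tokens) == 3
```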
1066
+ def _count_abstract_nouns(tokens: list[str]) -> int:
+     """Count abstract nouns (concepts, ideas, qualities).
+
+     Abstract nouns are identified by suffixes indicating abstract
+     concepts (-ness, -ity, -ism, etc.) on tokens longer than 4 characters.
+     """
+     count = 0
+     for token in tokens:
+         if len(token) > 4:
+             for suffix in ABSTRACT_SUFFIXES:
+                 if token.endswith(suffix):
+                     count += 1
+                     break
+     return count
+
+
+ def _count_concrete_nouns(tokens: list[str]) -> int:
+     """Count concrete nouns (physical, tangible things)."""
+     return sum(1 for token in tokens if token in CONCRETE_CATEGORIES)
+
+
1088
+ def _count_pronouns(tokens: list[str]) -> dict[str, int]:
+     """Count pronouns by person (first, second, third)."""
+     first = sum(1 for t in tokens if t in FIRST_PERSON_PRONOUNS)
+     second = sum(1 for t in tokens if t in SECOND_PERSON_PRONOUNS)
+     third = sum(1 for t in tokens if t in THIRD_PERSON_PRONOUNS)
+     return {"first": first, "second": second, "third": third}
+
+
+ def _count_impersonal_constructions(text: str) -> int:
+     """Count impersonal constructions like 'It is...', 'There are...'."""
+     count = 0
+     text_lower = text.lower()
+     for pattern in IMPERSONAL_PATTERNS:
+         count += len(re.findall(pattern, text_lower))
+     return count
+
+
+ def _count_narrative_markers(tokens: list[str]) -> int:
+     """Count markers characteristic of narrative text."""
+     return sum(1 for t in tokens if t in NARRATIVE_MARKERS)
+
+
+ def _count_expository_markers(tokens: list[str]) -> int:
+     """Count markers characteristic of expository text."""
+     return sum(1 for t in tokens if t in EXPOSITORY_MARKERS)
+
+
+ def _count_register_markers(tokens: list[str]) -> dict[str, int]:
+     """Count markers for different registers."""
+     return {
+         "legal": sum(1 for t in tokens if t in LEGAL_MARKERS),
+         "academic": sum(1 for t in tokens if t in ACADEMIC_MARKERS),
+         "journalistic": sum(1 for t in tokens if t in JOURNALISTIC_MARKERS),
+         "conversational": sum(1 for t in tokens if t in CONVERSATIONAL_MARKERS),
+     }
+
+
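Editor's note: the impersonal counter is purely regex-driven, so its behavior is easy to pin down (hand-checked sketch, helper assumed in scope):

```python
# "it is", "there are", and "one must" each match one IMPERSONAL_PATTERNS entry.
n = _count_impersonal_constructions("It is clear that there are problems, and one must act.")
assert n == 3
```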
1125
+ def _estimate_dialogue_ratio(text: str) -> float:
+     """Estimate the proportion of text that is dialogue.
+
+     Dialogue is detected by quotation marks. This is a heuristic
+     that works for most fiction but may miss some edge cases.
+     """
+     # Find all quoted strings (both single and double quotes)
+     double_quoted = re.findall(r'"[^"]*"', text)
+     single_quoted = re.findall(r"'[^']*'", text)
+
+     # Calculate total dialogue character count
+     dialogue_chars = sum(len(q) for q in double_quoted)
+     dialogue_chars += sum(len(q) for q in single_quoted if len(q) > 3)  # Avoid contractions
+
+     total_chars = len(text.strip())
+     if total_chars == 0:
+         return 0.0
+
+     return min(1.0, dialogue_chars / total_chars)
+
+
+ def _count_quotations(text: str) -> int:
+     """Count number of quotation instances."""
+     double_quotes = len(re.findall(r'"[^"]*"', text))
+     # Only count single quotes that look like actual quotes (longer than contractions)
+     single_quotes = len([q for q in re.findall(r"'[^']*'", text) if len(q) > 5])
+     return double_quotes + single_quotes
+
+
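Editor's note: a quick check of the quote-based dialogue heuristic (sketch; the 0.5 ratio is hand-counted, two 6-character quoted spans out of 24 characters total):

```python
text = '"Run!" she cried. "Now!"'
assert _estimate_dialogue_ratio(text) == 0.5  # 12 quoted chars / 24 total
assert _count_quotations(text) == 2           # two double-quoted spans
```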
1154
+ def _detect_passive_voice(text: str) -> int:
+     """Detect passive voice constructions using regex patterns.
+
+     Passive voice pattern: form of "be" + past participle
+     Examples: "was written", "is considered", "were found"
+
+     This is a heuristic approach. For more accurate detection,
+     use spaCy's dependency parser (when available).
+     """
+     # Pattern: be verb + optional adverb + past participle (-ed, -en, irregular)
+     passive_patterns = [
+         r"\b(?:is|are|was|were|been|being|be)\s+(?:\w+ly\s+)?(?:\w+ed|written|taken|given|made|done|seen|known|found|told|shown|left|thought|felt|become|begun|broken|chosen|fallen|forgotten|frozen|hidden|spoken|stolen|sworn|woken)\b",
+     ]
+
+     count = 0
+     text_lower = text.lower()
+     for pattern in passive_patterns:
+         count += len(re.findall(pattern, text_lower, re.IGNORECASE))
 
- def compute_genre_register(text: str, model: str = "en_core_web_sm") -> GenreRegisterResult:
+     return count
+
+
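Editor's note: the passive pattern admits an optional "-ly" adverb between the auxiliary and the participle, so both plain and interrupted passives register (hand-checked sketch):

```python
text = "The results were found after the report was quickly written."
# Matches "were found" and "was quickly written" (adverb absorbed by (?:\w+ly\s+)?).
assert _detect_passive_voice(text) == 2
```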
1176
+ def _calculate_formality_score(
+     latinate_ratio: float,
+     nominalization_density: float,
+     passive_density: float,
+     first_person_ratio: float,
+     conversational_count: int,
+     word_count: int,
+ ) -> float:
+     """Calculate composite formality score (0-100).
+
+     Based on the Heylighen & Dewaele (1999) F-score formula, adapted
+     for the available features.
+
+     Higher scores indicate more formal text.
+     """
+     # Base score from Latinate vocabulary (0-40 points)
+     latinate_score = min(40.0, latinate_ratio * 100)
+
+     # Nominalization contribution (0-20 points)
+     # Typical academic text has 3-6 nominalizations per 100 words
+     nom_score = min(20.0, nominalization_density * 4)
+
+     # Passive voice contribution (0-15 points)
+     # Typical formal text has 1-3 passives per 100 words
+     passive_score = min(15.0, passive_density * 5)
+
+     # First person penalty (reduces formality)
+     # High first-person usage is informal
+     first_person_penalty = first_person_ratio * 15
+
+     # Conversational marker penalty
+     conv_density = (conversational_count / max(1, word_count)) * 100
+     conv_penalty = min(20.0, conv_density * 10)
+
+     # Calculate final score
+     score = latinate_score + nom_score + passive_score - first_person_penalty - conv_penalty
+
+     # Normalize to 0-100 range
+     return max(0.0, min(100.0, score))
+
+
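Editor's note: the score is piecewise linear with per-feature caps, so a worked example makes the weighting concrete (values chosen for illustration; arithmetic follows the caps above):

```python
# latinate_ratio=0.5        -> min(40, 0.5 * 100) = 40 points
# nominalization_density=4  -> min(20, 4 * 4)     = 16 points
# passive_density=2         -> min(15, 2 * 5)     = 10 points
# no first-person or conversational penalties     -> total 66.0
score = _calculate_formality_score(0.5, 4.0, 2.0, 0.0, 0, 100)
assert score == 66.0
```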
1217
+ def _classify_register(formality_score: float, features: dict[str, float]) -> str:
+     """Classify text into a register category.
+
+     Registers (from Joos, 1961):
+     - frozen: ritualized, unchanging (legal documents, prayers)
+     - formal: one-way, no feedback expected (academic papers, reports)
+     - consultative: professional discourse (business, technical)
+     - casual: relaxed, everyday speech (conversations with friends)
+     - intimate: private, personal (close relationships)
+
+     Reference:
+         Joos, M. (1961). The Five Clocks. Harcourt, Brace & World.
+     """
+     legal_markers = features.get("legal_marker_count", 0)
+
+     if formality_score >= 80 or legal_markers >= 3:
+         return "frozen"
+     elif formality_score >= 60:
+         return "formal"
+     elif formality_score >= 40:
+         return "consultative"
+     elif formality_score >= 20:
+         return "casual"
+     else:
+         return "intimate"
+
+
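Editor's note: continuing the worked example, 66 points lands in the formal band, while the legal-marker override can promote a mid-score text straight to frozen:

```python
# Thresholds: >=80 frozen, >=60 formal, >=40 consultative, >=20 casual, else intimate.
assert _classify_register(66.0, {"legal_marker_count": 0}) == "formal"
# Three or more legal markers force "frozen" regardless of the numeric score.
assert _classify_register(55.0, {"legal_marker_count": 3}) == "frozen"
```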
1244
+ def _calculate_genre_scores(
+     features: dict[str, Any],
+ ) -> dict[str, float]:
+     """Calculate scores for each genre category.
+
+     Returns dict with scores from 0.0 to 1.0 for each genre.
+     """
+     word_count = max(1, features["word_count"])
+
+     # Academic score
+     academic_score = 0.0
+     academic_score += min(0.3, features["nominalization_density"] / 10)
+     academic_score += min(0.2, features["latinate_ratio"] * 0.5)
+     academic_score += min(0.2, features["passive_density"] / 5)
+     academic_score += min(0.15, (features["academic_markers"] / word_count) * 20)
+     academic_score += min(0.15, features["impersonal_density"] / 3)
+     academic_score = min(1.0, academic_score)
+
+     # Journalistic score
+     journalistic_score = 0.0
+     journalistic_score += min(0.3, (features["journalistic_markers"] / word_count) * 30)
+     journalistic_score += min(0.2, features["quotation_density"] / 3)
+     journalistic_score += min(
+         0.2, 0.5 - abs(features["formality_score"] / 100 - 0.5)
+     )  # Middle formality
+     journalistic_score += min(0.15, features["third_person_ratio"] * 0.2)
+     journalistic_score += 0.15 if features["narrative_expository_ratio"] > 0.3 else 0.0
+     journalistic_score = min(1.0, journalistic_score)
+
+     # Fiction score
+     fiction_score = 0.0
+     fiction_score += min(0.25, features["dialogue_ratio"] * 0.5)
+     fiction_score += min(0.25, features["narrative_density"] / 5)
+     fiction_score += min(0.2, features["concrete_ratio"] * 0.3)
+     fiction_score += min(0.15, features["first_person_ratio"] * 0.2)
+     fiction_score += min(0.15, (1.0 - features["latinate_ratio"]) * 0.2)  # Less formal
+     fiction_score = min(1.0, fiction_score)
+
+     # Legal score
+     legal_score = 0.0
+     legal_score += min(0.4, (features["legal_markers"] / word_count) * 50)
+     legal_score += min(0.2, features["nominalization_density"] / 8)
+     legal_score += min(0.2, features["passive_density"] / 4)
+     legal_score += min(0.1, features["latinate_ratio"] * 0.3)
+     legal_score += 0.1 if features["formality_score"] > 70 else 0.0
+     legal_score = min(1.0, legal_score)
+
+     # Conversational score
+     conv_score = 0.0
+     conv_score += min(0.3, (features["conversational_markers"] / word_count) * 30)
+     conv_score += min(0.2, features["first_person_ratio"] * 0.3)
+     conv_score += min(0.2, features["second_person_ratio"] * 0.4)
+     conv_score += min(0.15, (1.0 - features["latinate_ratio"]) * 0.25)
+     conv_score += min(0.15, (100 - features["formality_score"]) / 100 * 0.2)
+     conv_score = min(1.0, conv_score)
+
+     return {
+         "academic": academic_score,
+         "journalistic": journalistic_score,
+         "fiction": fiction_score,
+         "legal": legal_score,
+         "conversational": conv_score,
+     }
+
+
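Editor's note: the genre scorers are additive with per-feature caps, so when every academic feature is at or above its cap the academic score saturates at 1.0 (hand-built feature dict for illustration; only the keys the scorers read are shown):

```python
features = {
    "word_count": 100, "nominalization_density": 5.0, "latinate_ratio": 0.6,
    "passive_density": 2.0, "academic_markers": 4, "impersonal_density": 1.5,
    "journalistic_markers": 0, "quotation_density": 0.0, "formality_score": 70.0,
    "third_person_ratio": 0.5, "narrative_expository_ratio": 0.1,
    "dialogue_ratio": 0.0, "narrative_density": 0.2, "concrete_ratio": 0.2,
    "first_person_ratio": 0.1, "second_person_ratio": 0.0,
    "legal_markers": 0, "conversational_markers": 0,
}
scores = _calculate_genre_scores(features)
# Academic contributions: 0.3 + 0.2 + 0.2 + 0.15 + 0.15 -> every cap is hit.
assert round(scores["academic"], 2) == 1.0
```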
1309
+ def _predict_genre(scores: dict[str, float]) -> tuple[str, float]:
+     """Predict the most likely genre and confidence."""
+     if not scores:
+         return "unknown", 0.0
+
+     best_genre = max(scores, key=scores.get)  # type: ignore[arg-type]
+     best_score = scores[best_genre]
+
+     # Calculate confidence based on margin over second-best
+     sorted_scores = sorted(scores.values(), reverse=True)
+     if len(sorted_scores) >= 2:
+         margin = sorted_scores[0] - sorted_scores[1]
+         confidence = min(1.0, best_score * (1 + margin))
+     else:
+         confidence = best_score
+
+     return best_genre, confidence
+
+
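Editor's note: confidence scales the winning score by its margin over the runner-up, so a decisive winner earns extra confidence (hand-checked sketch):

```python
genre, conf = _predict_genre({"academic": 0.6, "legal": 0.2})
# margin = 0.4, so confidence = min(1.0, 0.6 * 1.4), roughly 0.84
assert genre == "academic" and round(conf, 2) == 0.84
```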
1328
+ def _count_technical_terms(tokens: list[str], text: str) -> int:
+     """Count potential technical/specialized terms.
+
+     Heuristics for technical terms:
+     1. Capitalized words not at sentence start
+     2. Words with numbers mixed in
+     3. Acronyms (all caps, 2-5 letters)
+     4. Very long words (>12 chars) that aren't common
      """
-     Analyze genre and register features for text classification.
+     count = 0
+
+     # Count acronyms
+     acronyms = re.findall(r"\b[A-Z]{2,5}\b", text)
+     count += len(acronyms)
+
+     # Count words with numbers
+     alphanumeric = re.findall(r"\b[a-zA-Z]+\d+[a-zA-Z]*\b|\b\d+[a-zA-Z]+\b", text)
+     count += len(alphanumeric)
+
+     # Count very long words
+     for token in tokens:
+         if len(token) > 12 and token not in LATINATE_WORDS:
+             count += 1
+
+     return count
+
+
+ # =============================================================================
+ # MAIN FUNCTION
+ # =============================================================================
+
+
1360
+ def compute_genre_register(
+     text: str,
+     model: str = "en_core_web_sm",  # noqa: ARG001 - Reserved for future spaCy integration
+ ) -> GenreRegisterResult:
+     """Analyze genre and register features for text classification.
+
+     This function extracts linguistic features that distinguish between
+     different text types (academic, journalistic, fiction, legal, conversational)
+     and formality levels (frozen, formal, consultative, casual, intimate).
+
+     The analysis is based on Biber's multidimensional approach to register
+     variation, combined with Heylighen & Dewaele's formality metrics.
 
      Related GitHub Issue:
          #23 - Genre and Register Features
          https://github.com/craigtrim/pystylometry/issues/23
 
      Args:
-         text: Input text to analyze
-         model: spaCy model for linguistic analysis
+         text: Input text to analyze.
+         model: spaCy model name (reserved for future enhanced analysis).
+             Currently unused; analysis uses regex-based heuristics.
 
      Returns:
-         GenreRegisterResult with formality scores, register classification,
-         genre predictions, and feature scores for major genres.
+         GenreRegisterResult with comprehensive genre and register metrics.
+
+         Features analyzed:
+         - Formality markers (Latinate words, nominalizations, passive voice)
+         - Personal vs. impersonal style (pronoun distribution)
+         - Abstract vs. concrete vocabulary
+         - Technical term density
+         - Narrative vs. expository markers
+         - Dialogue presence and ratio
+         - Register classification (frozen to intimate)
+         - Genre prediction with confidence scores
 
      Example:
-         >>> result = compute_genre_register("Academic paper text...")
-         >>> print(f"Formality score: {result.formality_score:.2f}")
-         >>> print(f"Predicted genre: {result.predicted_genre}")
-         >>> print(f"Academic score: {result.academic_score:.3f}")
-     """
-     # TODO: Implement genre/register analysis
-     # GitHub Issue #23: https://github.com/craigtrim/pystylometry/issues/23
-     raise NotImplementedError(
-         "Genre/register classification not yet implemented. "
-         "See GitHub Issue #23: https://github.com/craigtrim/pystylometry/issues/23"
+         >>> result = compute_genre_register("The court hereby finds...")
+         >>> print(f"Formality: {result.formality_score:.1f}")
+         >>> print(f"Register: {result.register_classification}")
+         >>> print(f"Genre: {result.predicted_genre}")
+
+     References:
+         Biber, D. (1988). Variation across speech and writing.
+             Cambridge University Press.
+         Biber, D., & Conrad, S. (2009). Register, genre, and style.
+             Cambridge University Press.
+         Heylighen, F., & Dewaele, J. M. (1999). Formality of language.
+         Joos, M. (1961). The Five Clocks. Harcourt, Brace & World.
+     """
1409
+     start_time = time.time()
+
+     # Tokenize
+     tokens = _tokenize(text)
+     word_count = len(tokens)
+
+     # Handle empty text
+     if word_count == 0:
+         return GenreRegisterResult(
+             formality_score=0.0,
+             latinate_ratio=0.0,
+             nominalization_density=0.0,
+             passive_voice_density=0.0,
+             first_person_ratio=0.0,
+             second_person_ratio=0.0,
+             third_person_ratio=0.0,
+             impersonal_construction_density=0.0,
+             abstract_noun_ratio=0.0,
+             concrete_noun_ratio=0.0,
+             abstractness_score=0.0,
+             technical_term_density=0.0,
+             jargon_density=0.0,
+             narrative_marker_density=0.0,
+             expository_marker_density=0.0,
+             narrative_expository_ratio=0.0,
+             dialogue_ratio=0.0,
+             quotation_density=0.0,
+             register_classification="unknown",
+             predicted_genre="unknown",
+             genre_confidence=0.0,
+             academic_score=0.0,
+             journalistic_score=0.0,
+             fiction_score=0.0,
+             legal_score=0.0,
+             conversational_score=0.0,
+             metadata={
+                 "word_count": 0,
+                 "computation_time": time.time() - start_time,
+             },
+         )
+
+     # Count various features
+     latinate_count = _count_latinate_words(tokens)
+     germanic_count = _count_germanic_words(tokens)
+     nominalization_count = _count_nominalizations(tokens)
+     abstract_count = _count_abstract_nouns(tokens)
+     concrete_count = _count_concrete_nouns(tokens)
+     pronoun_counts = _count_pronouns(tokens)
+     impersonal_count = _count_impersonal_constructions(text)
+     narrative_count = _count_narrative_markers(tokens)
+     expository_count = _count_expository_markers(tokens)
+     register_markers = _count_register_markers(tokens)
+     passive_count = _detect_passive_voice(text)
+     dialogue_ratio = _estimate_dialogue_ratio(text)
+     quotation_count = _count_quotations(text)
+     technical_count = _count_technical_terms(tokens, text)
+
+     # Calculate ratios and densities
+     total_latinate_germanic = latinate_count + germanic_count
+     latinate_ratio = (
+         latinate_count / total_latinate_germanic if total_latinate_germanic > 0 else 0.0
+     )
+
+     nominalization_density = (nominalization_count / word_count) * 100
+     passive_density = (passive_count / word_count) * 100
+     impersonal_density = (impersonal_count / word_count) * 100
+
+     total_pronouns = sum(pronoun_counts.values())
+     first_person_ratio = pronoun_counts["first"] / total_pronouns if total_pronouns > 0 else 0.0
+     second_person_ratio = pronoun_counts["second"] / total_pronouns if total_pronouns > 0 else 0.0
+     third_person_ratio = pronoun_counts["third"] / total_pronouns if total_pronouns > 0 else 0.0
+
+     total_noun_indicators = abstract_count + concrete_count
+     abstract_ratio = abstract_count / total_noun_indicators if total_noun_indicators > 0 else 0.0
+     concrete_ratio = concrete_count / total_noun_indicators if total_noun_indicators > 0 else 0.0
+
+     # Abstractness score: weighted by ratio and density
+     abstractness_score = abstract_ratio * min(1.0, (abstract_count / word_count) * 20)
+
+     technical_density = (technical_count / word_count) * 100
+     # Jargon density approximated by academic + legal markers
+     jargon_density = ((register_markers["academic"] + register_markers["legal"]) / word_count) * 100
+
+     narrative_density = (narrative_count / word_count) * 100
+     expository_density = (expository_count / word_count) * 100
+     narrative_expository_ratio = (
+         narrative_density / expository_density if expository_density > 0 else 0.0
+     )
+
+     quotation_density = (quotation_count / word_count) * 100
+
+     # Calculate formality score
+     formality_score = _calculate_formality_score(
+         latinate_ratio=latinate_ratio,
+         nominalization_density=nominalization_density,
+         passive_density=passive_density,
+         first_person_ratio=first_person_ratio,
+         conversational_count=register_markers["conversational"],
+         word_count=word_count,
+     )
+
+     # Build features dict for genre scoring
+     features: dict[str, Any] = {
+         "word_count": word_count,
+         "latinate_ratio": latinate_ratio,
+         "nominalization_density": nominalization_density,
+         "passive_density": passive_density,
+         "formality_score": formality_score,
+         "first_person_ratio": first_person_ratio,
+         "second_person_ratio": second_person_ratio,
+         "third_person_ratio": third_person_ratio,
+         "impersonal_density": impersonal_density,
+         "abstract_ratio": abstract_ratio,
+         "concrete_ratio": concrete_ratio,
+         "narrative_density": narrative_density,
+         "expository_density": expository_density,
+         "narrative_expository_ratio": narrative_expository_ratio,
+         "dialogue_ratio": dialogue_ratio,
+         "quotation_density": quotation_density,
+         "legal_markers": register_markers["legal"],
+         "academic_markers": register_markers["academic"],
+         "journalistic_markers": register_markers["journalistic"],
+         "conversational_markers": register_markers["conversational"],
+         "legal_marker_count": register_markers["legal"],
+     }
+
+     # Classify register
+     register_classification = _classify_register(formality_score, features)
+
+     # Calculate genre scores
+     genre_scores = _calculate_genre_scores(features)
+     predicted_genre, genre_confidence = _predict_genre(genre_scores)
+
+     computation_time = time.time() - start_time
+
+     return GenreRegisterResult(
+         formality_score=formality_score,
+         latinate_ratio=latinate_ratio,
+         nominalization_density=nominalization_density,
+         passive_voice_density=passive_density,
+         first_person_ratio=first_person_ratio,
+         second_person_ratio=second_person_ratio,
+         third_person_ratio=third_person_ratio,
+         impersonal_construction_density=impersonal_density,
+         abstract_noun_ratio=abstract_ratio,
+         concrete_noun_ratio=concrete_ratio,
+         abstractness_score=abstractness_score,
+         technical_term_density=technical_density,
+         jargon_density=jargon_density,
+         narrative_marker_density=narrative_density,
+         expository_marker_density=expository_density,
+         narrative_expository_ratio=narrative_expository_ratio,
+         dialogue_ratio=dialogue_ratio,
+         quotation_density=quotation_density,
+         register_classification=register_classification,
+         predicted_genre=predicted_genre,
+         genre_confidence=genre_confidence,
+         academic_score=genre_scores["academic"],
+         journalistic_score=genre_scores["journalistic"],
+         fiction_score=genre_scores["fiction"],
+         legal_score=genre_scores["legal"],
+         conversational_score=genre_scores["conversational"],
+         metadata={
+             "word_count": word_count,
+             "latinate_word_count": latinate_count,
+             "germanic_word_count": germanic_count,
+             "nominalization_count": nominalization_count,
+             "passive_voice_count": passive_count,
+             "abstract_noun_count": abstract_count,
+             "concrete_noun_count": concrete_count,
+             "pronoun_counts": pronoun_counts,
+             "impersonal_count": impersonal_count,
+             "narrative_marker_count": narrative_count,
+             "expository_marker_count": expository_count,
+             "register_marker_counts": register_markers,
+             "technical_term_count": technical_count,
+             "quotation_count": quotation_count,
+             "computation_time": computation_time,
+         },
      )
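Editor's note: an end-to-end sketch against the new implementation (import path inferred from the file listing above; adjust if the package re-exports the function elsewhere):

```python
from pystylometry.stylistic.genre_register import compute_genre_register

result = compute_genre_register(
    "Whereas the parties agree, the undersigned shall comply with the "
    "provisions stipulated herein."
)
print(result.formality_score)          # composite 0-100 score
print(result.register_classification)  # "frozen": five LEGAL_MARKERS hits trigger the override
print(result.predicted_genre, round(result.genre_confidence, 2))
```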