pystylometry 1.0.0__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/README.md +42 -0
- pystylometry/__init__.py +45 -3
- pystylometry/_types.py +1017 -259
- pystylometry/authorship/README.md +21 -0
- pystylometry/authorship/__init__.py +28 -4
- pystylometry/authorship/additional_methods.py +260 -40
- pystylometry/authorship/compression.py +175 -0
- pystylometry/authorship/kilgarriff.py +354 -0
- pystylometry/character/README.md +17 -0
- pystylometry/character/character_metrics.py +267 -179
- pystylometry/cli.py +427 -0
- pystylometry/consistency/README.md +27 -0
- pystylometry/consistency/__init__.py +57 -0
- pystylometry/consistency/_thresholds.py +162 -0
- pystylometry/consistency/drift.py +549 -0
- pystylometry/dialect/README.md +26 -0
- pystylometry/dialect/__init__.py +65 -0
- pystylometry/dialect/_data/dialect_markers.json +1134 -0
- pystylometry/dialect/_loader.py +360 -0
- pystylometry/dialect/detector.py +533 -0
- pystylometry/lexical/README.md +23 -0
- pystylometry/lexical/advanced_diversity.py +61 -22
- pystylometry/lexical/function_words.py +255 -56
- pystylometry/lexical/hapax.py +182 -52
- pystylometry/lexical/mtld.py +108 -26
- pystylometry/lexical/ttr.py +76 -10
- pystylometry/lexical/word_frequency_sophistication.py +1522 -298
- pystylometry/lexical/yule.py +136 -50
- pystylometry/ngrams/README.md +18 -0
- pystylometry/ngrams/entropy.py +150 -49
- pystylometry/ngrams/extended_ngrams.py +314 -69
- pystylometry/prosody/README.md +17 -0
- pystylometry/prosody/rhythm_prosody.py +773 -11
- pystylometry/readability/README.md +23 -0
- pystylometry/readability/additional_formulas.py +1887 -762
- pystylometry/readability/ari.py +144 -82
- pystylometry/readability/coleman_liau.py +136 -109
- pystylometry/readability/flesch.py +177 -73
- pystylometry/readability/gunning_fog.py +165 -161
- pystylometry/readability/smog.py +123 -42
- pystylometry/stylistic/README.md +20 -0
- pystylometry/stylistic/cohesion_coherence.py +669 -13
- pystylometry/stylistic/genre_register.py +1560 -17
- pystylometry/stylistic/markers.py +611 -17
- pystylometry/stylistic/vocabulary_overlap.py +354 -13
- pystylometry/syntactic/README.md +20 -0
- pystylometry/syntactic/advanced_syntactic.py +76 -14
- pystylometry/syntactic/pos_ratios.py +70 -6
- pystylometry/syntactic/sentence_stats.py +55 -12
- pystylometry/syntactic/sentence_types.py +71 -15
- pystylometry/viz/README.md +27 -0
- pystylometry/viz/__init__.py +71 -0
- pystylometry/viz/drift.py +589 -0
- pystylometry/viz/jsx/__init__.py +31 -0
- pystylometry/viz/jsx/_base.py +144 -0
- pystylometry/viz/jsx/report.py +677 -0
- pystylometry/viz/jsx/timeline.py +716 -0
- pystylometry/viz/jsx/viewer.py +1032 -0
- pystylometry-1.3.0.dist-info/METADATA +136 -0
- pystylometry-1.3.0.dist-info/RECORD +76 -0
- {pystylometry-1.0.0.dist-info → pystylometry-1.3.0.dist-info}/WHEEL +1 -1
- pystylometry-1.3.0.dist-info/entry_points.txt +4 -0
- pystylometry-1.0.0.dist-info/METADATA +0 -275
- pystylometry-1.0.0.dist-info/RECORD +0 -46
pystylometry/lexical/advanced_diversity.py

@@ -32,7 +32,13 @@ References:
 import random
 from typing import Optional

-from .._types import
+from .._types import (
+    HDDResult,
+    MATTRResult,
+    MSTTRResult,
+    VocdDResult,
+    make_distribution,
+)


 def _tokenize_for_diversity(text: str) -> list[str]:
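The headline change in this file is the new `make_distribution` import plus the `*_dist`, `chunk_size`, and `chunk_count` fields threaded through every result constructor in the hunks below. The `_types.py` rewrite itself (+1017 lines) is not shown here, but the function_words hunk further down builds a `Distribution` with `values`, `mean`, `median`, `std`, `range`, and `iqr` fields, so a rough stand-in for the helper could look like this (a sketch under that assumption, not the package's actual implementation):

```python
# Hypothetical stand-in for pystylometry._types.make_distribution; the real
# helper lives in _types.py, which this diff does not show in full.
import statistics
from dataclasses import dataclass


@dataclass
class Distribution:
    values: list[float]
    mean: float
    median: float
    std: float
    range: float
    iqr: float


def make_distribution(values: list[float]) -> Distribution:
    if not values:
        # Mirrors the empty-text branch in compute_function_words below.
        return Distribution([], float("nan"), float("nan"), 0.0, 0.0, 0.0)
    quartiles = statistics.quantiles(values, n=4) if len(values) >= 2 else [values[0]] * 3
    return Distribution(
        values=list(values),
        mean=statistics.fmean(values),
        median=statistics.median(values),
        std=statistics.pstdev(values) if len(values) > 1 else 0.0,
        range=max(values) - min(values),
        iqr=quartiles[2] - quartiles[0],
    )
```

For single-pass metrics the library wraps exactly one value, e.g. `make_distribution([d_param])`, so mean and median equal the score and the spread statistics are zero; `chunk_size` and `chunk_count` presumably let chunked analyses (such as the new consistency/drift modules in this release) report real multi-value distributions.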
@@ -61,13 +67,13 @@ def _tokenize_for_diversity(text: str) -> list[str]:
     raw_tokens = text_lower.split()

     # Comprehensive punctuation set for stripping
-
+    punctuation_chars = set(".,!?;:'\"()[]{}/-—–…*&@#$%^~`\\|<>«»„\"\"''‚'")

     # Strip punctuation from each token
     tokens = []
     for token in raw_tokens:
         # Strip leading and trailing punctuation
-        clean_token = token.strip("".join(
+        clean_token = token.strip("".join(punctuation_chars))
         if clean_token: # Only add non-empty tokens
             tokens.append(clean_token)

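The tokenizer edit above is behaviour-preserving reflow: the punctuation set is inlined into one `str.strip` call. Lowercasing, whitespace splitting, edge-stripping, and dropping empty tokens all stay the same. A quick standalone illustration of that stripping step (abbreviated character set, not the full one from the diff):

```python
# Standalone illustration of the stripping logic in _tokenize_for_diversity.
punctuation_chars = set(".,!?;:'\"()[]{}")  # abbreviated set for the example

raw_tokens = '"Hello," she said -- twice!'.lower().split()
tokens = [t.strip("".join(punctuation_chars)) for t in raw_tokens]
tokens = [t for t in tokens if t]
print(tokens)  # ['hello', 'she', 'said', '--', 'twice']
```

Only leading and trailing characters are removed, so interior punctuation such as the apostrophe in "don't" survives.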
@@ -80,6 +86,7 @@ def compute_vocd_d(
     num_samples: int = 100,
     min_tokens: int = 100,
     random_seed: Optional[int] = None,
+    chunk_size: int = 1000,
 ) -> VocdDResult:
     """
     Compute voc-D (vocabulary D) using curve-fitting approach.
@@ -167,9 +174,7 @@ def compute_vocd_d(

     # Step 2: Validate minimum length
     if total_tokens < min_tokens:
-        raise ValueError(
-            f"Text has {total_tokens} tokens, minimum {min_tokens} required for voc-D"
-        )
+        raise ValueError(f"Text has {total_tokens} tokens, minimum {min_tokens} required for voc-D")

     # Step 3: Determine sample sizes to test
     # Test from 10 tokens up to min(100, total_tokens - 10)
@@ -212,12 +217,12 @@ def compute_vocd_d(
         numerator += ttr / (size**0.5)
         denominator += 1.0 / size

-
+    d_param = numerator / denominator if denominator > 0 else 0.0

     # Step 6: Calculate R² (goodness of fit)
     # Predicted TTR = D / sqrt(sample_size)
     y_actual = list(sample_size_to_mean_ttr.values())
-    y_predicted = [
+    y_predicted = [d_param / (size**0.5) for size in sample_sizes]

     # R² calculation
     mean_y = sum(y_actual) / len(y_actual)
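The replacement lines make the curve fit explicit. For the model TTR ≈ D/√N used here, minimizing squared error over the sampled sizes gives the closed form D = Σ(TTRᵢ/√Nᵢ) / Σ(1/Nᵢ), which is exactly what the `numerator`/`denominator` accumulation computes, and R² is then taken against the predicted curve D/√N. A self-contained rerun of that arithmetic on made-up TTR samples (illustrative values, no pystylometry imports):

```python
# Closed-form fit of TTR = D / sqrt(N), matching the accumulation in the hunk above.
sample_size_to_mean_ttr = {10: 0.93, 20: 0.88, 35: 0.83, 50: 0.79, 100: 0.72}  # made-up

numerator = sum(ttr / (size ** 0.5) for size, ttr in sample_size_to_mean_ttr.items())
denominator = sum(1.0 / size for size in sample_size_to_mean_ttr)
d_param = numerator / denominator if denominator > 0 else 0.0

# Goodness of fit against the predicted curve D / sqrt(N).
y_actual = list(sample_size_to_mean_ttr.values())
y_predicted = [d_param / (size ** 0.5) for size in sample_size_to_mean_ttr]
mean_y = sum(y_actual) / len(y_actual)
ss_res = sum((a - p) ** 2 for a, p in zip(y_actual, y_predicted))
ss_tot = sum((a - mean_y) ** 2 for a in y_actual)
r_squared = 1.0 - ss_res / ss_tot if ss_tot > 0 else 0.0
print(round(d_param, 2), round(r_squared, 3))
```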
@@ -237,17 +242,25 @@ def compute_vocd_d(
         "random_seed": random_seed,
     }

-    # Step 8:
+    # Step 8: Create distributions (single-pass analysis)
+    d_parameter_dist = make_distribution([d_param])
+    curve_fit_r_squared_dist = make_distribution([r_squared])
+
+    # Step 9: Return result
     return VocdDResult(
-        d_parameter=
+        d_parameter=d_param,
         curve_fit_r_squared=r_squared,
         sample_count=len(sample_sizes),
         optimal_sample_size=sample_size, # Input parameter
+        d_parameter_dist=d_parameter_dist,
+        curve_fit_r_squared_dist=curve_fit_r_squared_dist,
+        chunk_size=chunk_size,
+        chunk_count=1, # Single pass analysis
         metadata=metadata,
     )


-def compute_mattr(text: str, window_size: int = 50) -> MATTRResult:
+def compute_mattr(text: str, window_size: int = 50, chunk_size: int = 1000) -> MATTRResult:
     """
     Compute Moving-Average Type-Token Ratio (MATTR).

@@ -360,7 +373,13 @@ def compute_mattr(text: str, window_size: int = 50) -> MATTRResult:
         "last_window_ttr": window_ttrs[-1],
     }

-    # Step 7:
+    # Step 7: Create distributions (single-pass analysis)
+    mattr_score_dist = make_distribution([mattr_score])
+    ttr_std_dev_dist = make_distribution([ttr_std_dev])
+    min_ttr_dist = make_distribution([min_ttr])
+    max_ttr_dist = make_distribution([max_ttr])
+
+    # Step 8: Return result
     return MATTRResult(
         mattr_score=mattr_score,
         window_size=window_size,
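For context on the values being wrapped into distributions here: MATTR (Covington & McFall) slides a fixed-size window across the token stream, takes a type-token ratio per window, and averages them, which is where `window_ttrs`, `mattr_score`, and the min/max/std fields come from. A minimal standalone sketch follows (names are illustrative and the short-text fallback is an assumption; the package's handling of texts shorter than the window is not visible in this diff):

```python
# Minimal moving-average TTR, illustrating the quantities wrapped in MATTRResult.
def mattr(tokens: list[str], window_size: int = 50) -> float:
    if len(tokens) < window_size:
        # Degenerate case: fall back to plain TTR of the whole text (assumption).
        return len(set(tokens)) / len(tokens) if tokens else 0.0
    window_ttrs = [
        len(set(tokens[i : i + window_size])) / window_size
        for i in range(len(tokens) - window_size + 1)
    ]
    return sum(window_ttrs) / len(window_ttrs)


print(mattr("to be or not to be that is the question".split(), window_size=5))
```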
@@ -368,11 +387,17 @@ def compute_mattr(text: str, window_size: int = 50) -> MATTRResult:
         ttr_std_dev=ttr_std_dev,
         min_ttr=min_ttr,
         max_ttr=max_ttr,
+        mattr_score_dist=mattr_score_dist,
+        ttr_std_dev_dist=ttr_std_dev_dist,
+        min_ttr_dist=min_ttr_dist,
+        max_ttr_dist=max_ttr_dist,
+        chunk_size=chunk_size,
+        chunk_count=1, # Single pass analysis
         metadata=metadata,
     )


-def compute_hdd(text: str, sample_size: int = 42) -> HDDResult:
+def compute_hdd(text: str, sample_size: int = 42, chunk_size: int = 1000) -> HDDResult:
     """
     Compute HD-D (Hypergeometric Distribution D).

@@ -451,9 +476,7 @@ def compute_hdd(text: str, sample_size: int = 42) -> HDDResult:

     # Step 2: Validate minimum length
     if total_tokens < sample_size:
-        raise ValueError(
-            f"Text has {total_tokens} tokens, minimum {sample_size} required for HD-D"
-        )
+        raise ValueError(f"Text has {total_tokens} tokens, minimum {sample_size} required for HD-D")

     # Step 3: Build frequency distribution
     type_counts: dict[str, int] = {}
@@ -485,17 +508,23 @@ def compute_hdd(text: str, sample_size: int = 42) -> HDDResult:
         "calculation_method": "simplified",
     }

-    # Step 6:
+    # Step 6: Create distribution (single-pass analysis)
+    hdd_score_dist = make_distribution([hdd_sum])
+
+    # Step 7: Return result
     return HDDResult(
         hdd_score=hdd_sum,
         sample_size=sample_size,
         type_count=total_types,
         token_count=total_tokens,
+        hdd_score_dist=hdd_score_dist,
+        chunk_size=chunk_size,
+        chunk_count=1, # Single pass analysis
         metadata=metadata,
     )


-def compute_msttr(text: str, segment_size: int = 100) -> MSTTRResult:
+def compute_msttr(text: str, segment_size: int = 100, chunk_size: int = 1000) -> MSTTRResult:
     """
     Compute Mean Segmental Type-Token Ratio (MSTTR).

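HD-D (McCarthy & Jarvis) scores each word type by the probability that it appears at least once in a random draw of `sample_size` tokens and sums those contributions scaled by 1/sample_size. The metadata above labels this implementation "simplified", so the package's exact formula is not visible in this hunk; for comparison, a textbook-style version looks like the sketch below (illustrative only, not the package's code):

```python
# Textbook-style HD-D: sum over types of P(type appears in a random sample) / sample_size.
from math import comb


def hdd(tokens: list[str], sample_size: int = 42) -> float:
    n = len(tokens)
    counts: dict[str, int] = {}
    for t in tokens:
        counts[t] = counts.get(t, 0) + 1
    score = 0.0
    for freq in counts.values():
        # Hypergeometric probability of drawing zero copies of this type.
        p_zero = comb(n - freq, sample_size) / comb(n, sample_size)
        score += (1.0 - p_zero) / sample_size
    return score


print(round(hdd(("the cat sat on the mat and the dog sat too " * 5).split()), 3))
```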
@@ -604,9 +633,7 @@ def compute_msttr(text: str, segment_size: int = 100) -> MSTTRResult:

     # Step 6: Calculate statistics
     # Standard deviation
-    variance = sum((ttr - msttr_score) ** 2 for ttr in segment_ttrs) / len(
-        segment_ttrs
-    )
+    variance = sum((ttr - msttr_score) ** 2 for ttr in segment_ttrs) / len(segment_ttrs)
     ttr_std_dev = variance**0.5

     # Min and max
@@ -628,7 +655,13 @@ def compute_msttr(text: str, segment_size: int = 100) -> MSTTRResult:
         "last_segment_ttr": segment_ttrs[-1],
     }

-    # Step 9:
+    # Step 9: Create distributions (single-pass analysis)
+    msttr_score_dist = make_distribution([msttr_score])
+    ttr_std_dev_dist = make_distribution([ttr_std_dev])
+    min_ttr_dist = make_distribution([min_ttr])
+    max_ttr_dist = make_distribution([max_ttr])
+
+    # Step 10: Return result
     return MSTTRResult(
         msttr_score=msttr_score,
         segment_size=segment_size,
@@ -637,5 +670,11 @@ def compute_msttr(text: str, segment_size: int = 100) -> MSTTRResult:
         min_ttr=min_ttr,
         max_ttr=max_ttr,
         segment_ttrs=segment_ttrs,
+        msttr_score_dist=msttr_score_dist,
+        ttr_std_dev_dist=ttr_std_dev_dist,
+        min_ttr_dist=min_ttr_dist,
+        max_ttr_dist=max_ttr_dist,
+        chunk_size=chunk_size,
+        chunk_count=1, # Single pass analysis
         metadata=metadata,
     )
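MSTTR, the last of the four measures touched in this file, splits the token stream into consecutive non-overlapping segments of `segment_size` tokens, computes a TTR per segment, and averages them; `segment_ttrs` and the std/min/max fields (and the variance line reflowed in the earlier hunk) all derive from that list. A compact standalone sketch (dropping the partial tail segment is an assumption, not confirmed by this diff):

```python
# Mean Segmental TTR over non-overlapping segments; partial tail segment discarded.
def msttr(tokens: list[str], segment_size: int = 100) -> float:
    segments = [
        tokens[i : i + segment_size]
        for i in range(0, len(tokens) - segment_size + 1, segment_size)
    ]
    if not segments:
        return 0.0
    segment_ttrs = [len(set(seg)) / len(seg) for seg in segments]
    return sum(segment_ttrs) / len(segment_ttrs)


words = ("some sample words repeated to make a toy corpus " * 30).split()
print(round(msttr(words, segment_size=100), 3))
```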
pystylometry/lexical/function_words.py

@@ -32,8 +32,7 @@ References:
     words for authorship attribution. ACH/ALLC.
 """

-from .._types import FunctionWordResult
-
+from .._types import Distribution, FunctionWordResult, make_distribution

 # Function word lists for English
 # GitHub Issue #13: https://github.com/craigtrim/pystylometry/issues/13
@@ -42,78 +41,249 @@ from .._types import FunctionWordResult

 # Determiners (articles, demonstratives, possessives, quantifiers)
 DETERMINERS = {
-    "the",
-    "
-    "
-    "
-    "
-    "
+    "the",
+    "a",
+    "an", # Articles
+    "this",
+    "that",
+    "these",
+    "those", # Demonstratives
+    "my",
+    "your",
+    "his",
+    "her",
+    "its",
+    "our",
+    "their", # Possessive determiners
+    "some",
+    "any",
+    "no",
+    "every",
+    "each",
+    "either",
+    "neither", # Quantifiers
+    "much",
+    "many",
+    "more",
+    "most",
+    "few",
+    "fewer",
+    "less",
+    "least",
+    "all",
+    "both",
+    "half",
+    "several",
+    "enough",
 }

 # Prepositions (locative, temporal, other)
 PREPOSITIONS = {
-    "in",
-    "
-    "
-    "
-    "
-    "
-    "
-    "
+    "in",
+    "on",
+    "at",
+    "by",
+    "for",
+    "with",
+    "from",
+    "to",
+    "of",
+    "about",
+    "above",
+    "across",
+    "after",
+    "against",
+    "along",
+    "among",
+    "around",
+    "as",
+    "before",
+    "behind",
+    "below",
+    "beneath",
+    "beside",
+    "between",
+    "beyond",
+    "but",
+    "concerning",
+    "considering",
+    "despite",
+    "down",
+    "during",
+    "except",
+    "inside",
+    "into",
+    "like",
+    "near",
+    "off",
+    "onto",
+    "out",
+    "outside",
+    "over",
+    "past",
+    "regarding",
+    "since",
+    "through",
+    "throughout",
+    "till",
+    "toward",
+    "under",
+    "underneath",
+    "until",
+    "up",
+    "upon",
+    "via",
+    "within",
+    "without",
 }

 # Conjunctions (coordinating, subordinating, correlative)
 CONJUNCTIONS = {
     # Coordinating
-    "and",
+    "and",
+    "but",
+    "or",
+    "nor",
+    "for",
+    "yet",
+    "so",
     # Subordinating
-    "although",
-    "
-    "
+    "although",
+    "because",
+    "since",
+    "unless",
+    "while",
+    "if",
+    "when",
+    "where",
+    "after",
+    "before",
+    "once",
+    "until",
+    "as",
+    "though",
+    "even",
+    "whereas",
+    "wherever",
+    "whenever",
     # Correlative components
-    "either",
+    "either",
+    "neither",
+    "both",
+    "whether",
 }

 # Pronouns (personal, possessive, reflexive, demonstrative, relative, indefinite)
 PRONOUNS = {
     # Personal (subject)
-    "i",
+    "i",
+    "you",
+    "he",
+    "she",
+    "it",
+    "we",
+    "they",
     # Personal (object)
-    "me",
+    "me",
+    "him",
+    "her",
+    "us",
+    "them",
     # Possessive
-    "mine",
+    "mine",
+    "yours",
+    "his",
+    "hers",
+    "its",
+    "ours",
+    "theirs",
     # Reflexive
-    "myself",
-    "
+    "myself",
+    "yourself",
+    "himself",
+    "herself",
+    "itself",
+    "ourselves",
+    "yourselves",
+    "themselves",
     # Demonstrative
-    "this",
+    "this",
+    "that",
+    "these",
+    "those",
     # Relative
-    "who",
+    "who",
+    "whom",
+    "whose",
+    "which",
+    "that",
     # Indefinite
-    "anybody",
-    "
-    "
+    "anybody",
+    "anyone",
+    "anything",
+    "everybody",
+    "everyone",
+    "everything",
+    "nobody",
+    "no one",
+    "nothing",
+    "somebody",
+    "someone",
+    "something",
+    "one",
 }

 # Auxiliary verbs (modal, primary)
 AUXILIARIES = {
     # Modals
-    "can",
-    "
+    "can",
+    "could",
+    "may",
+    "might",
+    "must",
+    "shall",
+    "should",
+    "will",
+    "would",
+    "ought",
     # Primary auxiliaries (be, have, do)
-    "am",
-    "
-    "
+    "am",
+    "is",
+    "are",
+    "was",
+    "were",
+    "be",
+    "being",
+    "been",
+    "have",
+    "has",
+    "had",
+    "having",
+    "do",
+    "does",
+    "did",
+    "doing",
 }

 # Particles (often used with phrasal verbs)
 PARTICLES = {
-    "up",
-    "
+    "up",
+    "down",
+    "out",
+    "off",
+    "over",
+    "in",
+    "away",
+    "back",
+    "on",
+    "along",
+    "forth",
+    "apart",
+    "aside",
 }


-def compute_function_words(text: str) -> FunctionWordResult:
+def compute_function_words(text: str, chunk_size: int = 1000) -> FunctionWordResult:
     """
     Compute function word frequency profiles for authorship analysis.

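These expanded word lists feed the per-category ratios computed later in the function (Steps 5 and 6 in the hunks below). As the docstring note carried in this diff says, categories are counted independently, so a token like "her" raises both the determiner and the pronoun ratio. A small sketch of that counting pattern with abbreviated stand-in sets (hypothetical helper, not the package's API):

```python
# Illustrative per-category counting over a token list; the sets here are
# abbreviated stand-ins for the DETERMINERS, PRONOUNS, ... constants above.
CATEGORIES = {
    "determiner": {"the", "a", "an", "this", "that", "her"},  # abbreviated
    "pronoun": {"i", "you", "she", "her", "it", "they"},      # abbreviated
}


def category_ratios(tokens: list[str]) -> dict[str, float]:
    total = len(tokens)
    return {
        name: (sum(1 for t in tokens if t in words) / total) if total else 0.0
        for name, words in CATEGORIES.items()
    }


# "her" counts toward both categories, matching the overlap behaviour noted in the docstring.
print(category_ratios("she gave her a book about her cat".split()))
```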
@@ -180,18 +350,21 @@ def compute_function_words(text: str) -> FunctionWordResult:
         determiner and pronoun) - each category is counted independently
     """
     # Step 1: Create union set of all function words (for total ratio calculation)
-
-        DETERMINERS
-        | PREPOSITIONS
-        | CONJUNCTIONS
-        | PRONOUNS
-        | AUXILIARIES
-        | PARTICLES
+    all_function_words = (
+        DETERMINERS | PREPOSITIONS | CONJUNCTIONS | PRONOUNS | AUXILIARIES | PARTICLES
     )

     # Step 2: Tokenize text (lowercase, split on whitespace, strip punctuation)
     if not text or not text.strip():
         # Handle empty text edge case
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
         return FunctionWordResult(
             determiner_ratio=0.0,
             preposition_ratio=0.0,
@@ -204,6 +377,16 @@ def compute_function_words(text: str) -> FunctionWordResult:
             most_frequent_function_words=[],
             least_frequent_function_words=[],
             function_word_distribution={},
+            determiner_ratio_dist=empty_dist,
+            preposition_ratio_dist=empty_dist,
+            conjunction_ratio_dist=empty_dist,
+            pronoun_ratio_dist=empty_dist,
+            auxiliary_ratio_dist=empty_dist,
+            particle_ratio_dist=empty_dist,
+            total_function_word_ratio_dist=empty_dist,
+            function_word_diversity_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=0,
             metadata={
                 "total_word_count": 0,
                 "total_function_word_count": 0,
@@ -232,15 +415,13 @@ def compute_function_words(text: str) -> FunctionWordResult:
     raw_tokens = text_lower.split()

     # Comprehensive punctuation set for stripping
-
-        ".,!?;:'\"()[]{}/-—–…*&@#$%^~`\\|<>«»„""''‚'"
-    )
+    punctuation_chars = set(".,!?;:'\"()[]{}/-—–…*&@#$%^~`\\|<>«»„''‚'")

     # Strip punctuation from each token
     tokens = []
     for token in raw_tokens:
         # Strip leading and trailing punctuation
-        clean_token = token.strip("".join(
+        clean_token = token.strip("".join(punctuation_chars))
         if clean_token: # Only add non-empty tokens
             tokens.append(clean_token)

@@ -272,7 +453,7 @@ def compute_function_words(text: str) -> FunctionWordResult:
     # Step 5: Build distribution (count each function word only once per token)
     function_word_counts: dict[str, int] = {}
     for token in tokens:
-        if token in
+        if token in all_function_words:
             function_word_counts[token] = function_word_counts.get(token, 0) + 1

     # Step 6: Calculate ratios
@@ -306,9 +487,7 @@ def compute_function_words(text: str) -> FunctionWordResult:
     # Step 8: Find most/least frequent function words
     if function_word_counts:
         # Sort by count descending
-        sorted_by_count = sorted(
-            function_word_counts.items(), key=lambda x: x[1], reverse=True
-        )
+        sorted_by_count = sorted(function_word_counts.items(), key=lambda x: x[1], reverse=True)

         # Top 10 most frequent
         most_frequent = sorted_by_count[:10]
@@ -353,7 +532,17 @@ def compute_function_words(text: str) -> FunctionWordResult:

     overlapping_words.sort()

-    # Step 11:
+    # Step 11: Create single-value distributions (analysis is done on full text)
+    determiner_ratio_dist = make_distribution([determiner_ratio])
+    preposition_ratio_dist = make_distribution([preposition_ratio])
+    conjunction_ratio_dist = make_distribution([conjunction_ratio])
+    pronoun_ratio_dist = make_distribution([pronoun_ratio])
+    auxiliary_ratio_dist = make_distribution([auxiliary_ratio])
+    particle_ratio_dist = make_distribution([particle_ratio])
+    total_function_word_ratio_dist = make_distribution([total_function_word_ratio])
+    function_word_diversity_dist = make_distribution([function_word_diversity])
+
+    # Step 12: Build metadata
     metadata = {
         "total_word_count": total_words,
         "total_function_word_count": total_function_word_count,
@@ -374,7 +563,7 @@ def compute_function_words(text: str) -> FunctionWordResult:
         "overlapping_word_categories": overlapping_word_categories,
     }

-    # Step
+    # Step 13: Return result
     return FunctionWordResult(
         determiner_ratio=determiner_ratio,
         preposition_ratio=preposition_ratio,
@@ -387,5 +576,15 @@ def compute_function_words(text: str) -> FunctionWordResult:
         most_frequent_function_words=most_frequent,
         least_frequent_function_words=least_frequent,
         function_word_distribution=function_word_counts,
+        determiner_ratio_dist=determiner_ratio_dist,
+        preposition_ratio_dist=preposition_ratio_dist,
+        conjunction_ratio_dist=conjunction_ratio_dist,
+        pronoun_ratio_dist=pronoun_ratio_dist,
+        auxiliary_ratio_dist=auxiliary_ratio_dist,
+        particle_ratio_dist=particle_ratio_dist,
+        total_function_word_ratio_dist=total_function_word_ratio_dist,
+        function_word_diversity_dist=function_word_diversity_dist,
+        chunk_size=chunk_size,
+        chunk_count=1, # Single pass analysis
         metadata=metadata,
     )