pystylometry 0.1.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- pystylometry/__init__.py +30 -5
- pystylometry/_normalize.py +277 -0
- pystylometry/_types.py +1954 -28
- pystylometry/_utils.py +4 -0
- pystylometry/authorship/__init__.py +26 -1
- pystylometry/authorship/additional_methods.py +75 -0
- pystylometry/authorship/kilgarriff.py +347 -0
- pystylometry/character/__init__.py +15 -0
- pystylometry/character/character_metrics.py +389 -0
- pystylometry/cli.py +427 -0
- pystylometry/consistency/__init__.py +57 -0
- pystylometry/consistency/_thresholds.py +162 -0
- pystylometry/consistency/drift.py +549 -0
- pystylometry/dialect/__init__.py +65 -0
- pystylometry/dialect/_data/dialect_markers.json +1134 -0
- pystylometry/dialect/_loader.py +360 -0
- pystylometry/dialect/detector.py +533 -0
- pystylometry/lexical/__init__.py +13 -6
- pystylometry/lexical/advanced_diversity.py +680 -0
- pystylometry/lexical/function_words.py +590 -0
- pystylometry/lexical/hapax.py +310 -33
- pystylometry/lexical/mtld.py +180 -22
- pystylometry/lexical/ttr.py +149 -0
- pystylometry/lexical/word_frequency_sophistication.py +1805 -0
- pystylometry/lexical/yule.py +142 -29
- pystylometry/ngrams/__init__.py +2 -0
- pystylometry/ngrams/entropy.py +150 -49
- pystylometry/ngrams/extended_ngrams.py +235 -0
- pystylometry/prosody/__init__.py +12 -0
- pystylometry/prosody/rhythm_prosody.py +53 -0
- pystylometry/readability/__init__.py +12 -0
- pystylometry/readability/additional_formulas.py +2110 -0
- pystylometry/readability/ari.py +173 -35
- pystylometry/readability/coleman_liau.py +150 -30
- pystylometry/readability/complex_words.py +531 -0
- pystylometry/readability/flesch.py +181 -32
- pystylometry/readability/gunning_fog.py +208 -35
- pystylometry/readability/smog.py +126 -28
- pystylometry/readability/syllables.py +137 -30
- pystylometry/stylistic/__init__.py +20 -0
- pystylometry/stylistic/cohesion_coherence.py +45 -0
- pystylometry/stylistic/genre_register.py +45 -0
- pystylometry/stylistic/markers.py +131 -0
- pystylometry/stylistic/vocabulary_overlap.py +47 -0
- pystylometry/syntactic/__init__.py +4 -0
- pystylometry/syntactic/advanced_syntactic.py +494 -0
- pystylometry/syntactic/pos_ratios.py +172 -17
- pystylometry/syntactic/sentence_stats.py +105 -18
- pystylometry/syntactic/sentence_types.py +526 -0
- pystylometry/viz/__init__.py +71 -0
- pystylometry/viz/drift.py +589 -0
- pystylometry/viz/jsx/__init__.py +31 -0
- pystylometry/viz/jsx/_base.py +144 -0
- pystylometry/viz/jsx/report.py +677 -0
- pystylometry/viz/jsx/timeline.py +716 -0
- pystylometry/viz/jsx/viewer.py +1032 -0
- {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/METADATA +49 -9
- pystylometry-1.1.0.dist-info/RECORD +63 -0
- pystylometry-1.1.0.dist-info/entry_points.txt +4 -0
- pystylometry-0.1.0.dist-info/RECORD +0 -26
- {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/WHEEL +0 -0
pystylometry/lexical/function_words.py (new file)
@@ -0,0 +1,590 @@
"""Function word analysis for authorship attribution.

Function words (determiners, prepositions, conjunctions, pronouns, auxiliary
verbs) are highly frequent, content-independent words that authors use
subconsciously and consistently across different topics. This makes them
powerful markers for authorship attribution.

Related GitHub Issue:
    #13 - Function Word Analysis
    https://github.com/craigtrim/pystylometry/issues/13

Features implemented:
    - Frequency profiles for all function word categories
    - Ratios for specific grammatical categories
    - Most/least frequently used function words
    - Function word diversity metrics

Function word categories:
    - Determiners: the, a, an, this, that, these, those, my, your, etc.
    - Prepositions: in, on, at, by, for, with, from, to, of, etc.
    - Conjunctions: and, but, or, nor, for, yet, so, because, although, etc.
    - Pronouns: I, you, he, she, it, we, they, me, him, her, us, them, etc.
    - Auxiliary verbs: be, have, do, can, will, shall, may, must, etc.
    - Particles: up, down, out, off, over, away, back, etc.

References:
    Mosteller, F., & Wallace, D. L. (1964). Inference and disputed authorship:
        The Federalist. Addison-Wesley.
    Burrows, J. (2002). 'Delta': A measure of stylistic difference and a guide
        to likely authorship. Literary and Linguistic Computing, 17(3), 267-287.
    Argamon, S., & Levitan, S. (2005). Measuring the usefulness of function
        words for authorship attribution. ACH/ALLC.
"""

from .._types import Distribution, FunctionWordResult, make_distribution

# Function word lists for English
# GitHub Issue #13: https://github.com/craigtrim/pystylometry/issues/13
# These lists should be comprehensive and cover all major function word categories.
# Consider loading from external resource files for easier maintenance.

# Determiners (articles, demonstratives, possessives, quantifiers)
DETERMINERS = {
    "the", "a", "an",  # Articles
    "this", "that", "these", "those",  # Demonstratives
    "my", "your", "his", "her", "its", "our", "their",  # Possessive determiners
    "some", "any", "no", "every", "each", "either", "neither",  # Quantifiers
    "much", "many", "more", "most", "few", "fewer", "less", "least",
    "all", "both", "half", "several", "enough",
}

# Prepositions (locative, temporal, other)
PREPOSITIONS = {
    "in", "on", "at", "by", "for", "with", "from", "to", "of",
    "about", "above", "across", "after", "against", "along", "among",
    "around", "as", "before", "behind", "below", "beneath", "beside",
    "between", "beyond", "but", "concerning", "considering", "despite",
    "down", "during", "except", "inside", "into", "like", "near", "off",
    "onto", "out", "outside", "over", "past", "regarding", "since",
    "through", "throughout", "till", "toward", "under", "underneath",
    "until", "up", "upon", "via", "within", "without",
}

# Conjunctions (coordinating, subordinating, correlative)
CONJUNCTIONS = {
    # Coordinating
    "and", "but", "or", "nor", "for", "yet", "so",
    # Subordinating
    "although", "because", "since", "unless", "while", "if", "when",
    "where", "after", "before", "once", "until", "as", "though", "even",
    "whereas", "wherever", "whenever",
    # Correlative components
    "either", "neither", "both", "whether",
}

# Pronouns (personal, possessive, reflexive, demonstrative, relative, indefinite)
PRONOUNS = {
    # Personal (subject)
    "i", "you", "he", "she", "it", "we", "they",
    # Personal (object)
    "me", "him", "her", "us", "them",
    # Possessive
    "mine", "yours", "his", "hers", "its", "ours", "theirs",
    # Reflexive
    "myself", "yourself", "himself", "herself", "itself",
    "ourselves", "yourselves", "themselves",
    # Demonstrative
    "this", "that", "these", "those",
    # Relative
    "who", "whom", "whose", "which", "that",
    # Indefinite
    "anybody", "anyone", "anything", "everybody", "everyone", "everything",
    "nobody", "no one", "nothing", "somebody", "someone", "something", "one",
}

# Auxiliary verbs (modal, primary)
AUXILIARIES = {
    # Modals
    "can", "could", "may", "might", "must", "shall", "should", "will",
    "would", "ought",
    # Primary auxiliaries (be, have, do)
    "am", "is", "are", "was", "were", "be", "being", "been",
    "have", "has", "had", "having", "do", "does", "did", "doing",
}

# Particles (often used with phrasal verbs)
PARTICLES = {
    "up", "down", "out", "off", "over", "in", "away", "back",
    "on", "along", "forth", "apart", "aside",
}


def compute_function_words(text: str, chunk_size: int = 1000) -> FunctionWordResult:
    """
    Compute function word frequency profiles for authorship analysis.

    Function words are closed-class words (determiners, prepositions,
    conjunctions, pronouns, auxiliaries) that authors use largely
    subconsciously and consistently. Their frequency patterns are
    powerful authorship markers because they're independent of topic.

    Related GitHub Issue:
        #13 - Function Word Analysis
        https://github.com/craigtrim/pystylometry/issues/13

    Why function words matter for authorship:
        1. Topic-independent: Used consistently across different subjects
        2. Subconscious usage: Authors don't deliberately vary their use
        3. High frequency: Appear often enough for reliable statistics
        4. Stable over time: Authors' function word patterns remain consistent
        5. Discriminative power: Different authors show distinct patterns

    Classic example: Mosteller & Wallace (1964) used function word
    frequencies to resolve the disputed authorship of the Federalist Papers,
    distinguishing between Hamilton and Madison based on their use of
    "while" vs. "whilst", "upon" vs. "on", etc.

    Args:
        text: Input text to analyze. Should be at least a few hundred words
            for reliable statistics. Function word analysis works best with
            longer texts (1000+ words) where frequency patterns stabilize.
        chunk_size: Stored on the result for API consistency with other
            metrics; the analysis itself is performed in a single pass over
            the full text, so each distribution holds a single value.

    Returns:
        FunctionWordResult containing:
            - Ratios for each function word category (per total words)
            - Total function word ratio
            - Function word diversity (unique / total function words)
            - Most/least frequent function words with counts
            - Full distribution of all function words used
            - Metadata with category-specific counts

    Example:
        >>> result = compute_function_words("Sample text for analysis...")
        >>> print(f"Determiner ratio: {result.determiner_ratio:.3f}")
        Determiner ratio: 0.156
        >>> print(f"Preposition ratio: {result.preposition_ratio:.3f}")
        Preposition ratio: 0.112
        >>> print(f"Total function words: {result.total_function_word_ratio:.3f}")
        Total function words: 0.487
        >>> print(f"Most frequent: {result.most_frequent_function_words[:3]}")
        Most frequent: [('the', 45), ('of', 32), ('to', 28)]

        >>> # Authorship comparison example
        >>> text1 = "Text by author 1..."
        >>> text2 = "Text by author 2..."
        >>> r1 = compute_function_words(text1)
        >>> r2 = compute_function_words(text2)
        >>> # Compare determiner ratios, preposition preferences, etc.

    Note:
        - Case-insensitive matching (all text lowercased for matching)
        - Tokenization by whitespace and punctuation
        - Words must match exactly (no stemming or lemmatization)
        - Multi-word function words like "no one" are handled as separate tokens
        - Empty or very short texts may have unreliable ratios
        - Some words appear in multiple categories (e.g., "that" is both
          determiner and pronoun) - each category is counted independently
    """
    # Step 1: Create union set of all function words (for total ratio calculation)
    all_function_words = (
        DETERMINERS | PREPOSITIONS | CONJUNCTIONS | PRONOUNS | AUXILIARIES | PARTICLES
    )

    # Step 2: Tokenize text (lowercase, split on whitespace, strip punctuation)
    if not text or not text.strip():
        # Handle empty text edge case
        empty_dist = Distribution(
            values=[],
            mean=float("nan"),
            median=float("nan"),
            std=0.0,
            range=0.0,
            iqr=0.0,
        )
        return FunctionWordResult(
            determiner_ratio=0.0,
            preposition_ratio=0.0,
            conjunction_ratio=0.0,
            pronoun_ratio=0.0,
            auxiliary_ratio=0.0,
            particle_ratio=0.0,
            total_function_word_ratio=0.0,
            function_word_diversity=0.0,
            most_frequent_function_words=[],
            least_frequent_function_words=[],
            function_word_distribution={},
            determiner_ratio_dist=empty_dist,
            preposition_ratio_dist=empty_dist,
            conjunction_ratio_dist=empty_dist,
            pronoun_ratio_dist=empty_dist,
            auxiliary_ratio_dist=empty_dist,
            particle_ratio_dist=empty_dist,
            total_function_word_ratio_dist=empty_dist,
            function_word_diversity_dist=empty_dist,
            chunk_size=chunk_size,
            chunk_count=0,
            metadata={
                "total_word_count": 0,
                "total_function_word_count": 0,
                "unique_function_word_count": 0,
                "determiner_count": 0,
                "preposition_count": 0,
                "conjunction_count": 0,
                "pronoun_count": 0,
                "auxiliary_count": 0,
                "particle_count": 0,
                "determiner_list": [],
                "preposition_list": [],
                "conjunction_list": [],
                "pronoun_list": [],
                "auxiliary_list": [],
                "particle_list": [],
                "overlapping_words": [],
                "overlapping_word_categories": {},
            },
        )

    # Lowercase entire text
    text_lower = text.lower()

    # Split on whitespace
    raw_tokens = text_lower.split()

    # Comprehensive punctuation set for stripping
    punctuation_chars = set(".,!?;:'\"()[]{}/-—–…*&@#$%^~`\\|<>«»„''‚'")

    # Strip punctuation from each token
    tokens = []
    for token in raw_tokens:
        # Strip leading and trailing punctuation
        clean_token = token.strip("".join(punctuation_chars))
        if clean_token:  # Only add non-empty tokens
            tokens.append(clean_token)

    total_words = len(tokens)

    # Step 3: Initialize counters for each category
    determiner_count = 0
    preposition_count = 0
    conjunction_count = 0
    pronoun_count = 0
    auxiliary_count = 0
    particle_count = 0

    # Step 4: Count tokens in each category (overlapping allowed)
    for token in tokens:
        if token in DETERMINERS:
            determiner_count += 1
        if token in PREPOSITIONS:
            preposition_count += 1
        if token in CONJUNCTIONS:
            conjunction_count += 1
        if token in PRONOUNS:
            pronoun_count += 1
        if token in AUXILIARIES:
            auxiliary_count += 1
        if token in PARTICLES:
            particle_count += 1

    # Step 5: Build distribution (count each function word only once per token)
    function_word_counts: dict[str, int] = {}
    for token in tokens:
        if token in all_function_words:
            function_word_counts[token] = function_word_counts.get(token, 0) + 1

    # Step 6: Calculate ratios
    if total_words > 0:
        determiner_ratio = determiner_count / total_words
        preposition_ratio = preposition_count / total_words
        conjunction_ratio = conjunction_count / total_words
        pronoun_ratio = pronoun_count / total_words
        auxiliary_ratio = auxiliary_count / total_words
        particle_ratio = particle_count / total_words

        total_function_word_count = sum(function_word_counts.values())
        total_function_word_ratio = total_function_word_count / total_words
    else:
        determiner_ratio = 0.0
        preposition_ratio = 0.0
        conjunction_ratio = 0.0
        pronoun_ratio = 0.0
        auxiliary_ratio = 0.0
        particle_ratio = 0.0
        total_function_word_count = 0
        total_function_word_ratio = 0.0

    # Step 7: Calculate diversity
    unique_function_word_count = len(function_word_counts)
    if total_function_word_count > 0:
        function_word_diversity = unique_function_word_count / total_function_word_count
    else:
        function_word_diversity = 0.0

    # Step 8: Find most/least frequent function words
    if function_word_counts:
        # Sort by count descending
        sorted_by_count = sorted(function_word_counts.items(), key=lambda x: x[1], reverse=True)

        # Top 10 most frequent
        most_frequent = sorted_by_count[:10]

        # Bottom 10 least frequent (reverse to get ascending order)
        least_frequent = sorted_by_count[-10:]
        least_frequent.reverse()
    else:
        most_frequent = []
        least_frequent = []

    # Step 9: Build category word lists (sorted)
    determiner_list = sorted([w for w in function_word_counts if w in DETERMINERS])
    preposition_list = sorted([w for w in function_word_counts if w in PREPOSITIONS])
    conjunction_list = sorted([w for w in function_word_counts if w in CONJUNCTIONS])
    pronoun_list = sorted([w for w in function_word_counts if w in PRONOUNS])
    auxiliary_list = sorted([w for w in function_word_counts if w in AUXILIARIES])
    particle_list = sorted([w for w in function_word_counts if w in PARTICLES])

    # Step 10: Find overlapping words (words in multiple categories)
    overlapping_words = []
    overlapping_word_categories: dict[str, list[str]] = {}

    for word in function_word_counts:
        categories = []
        if word in DETERMINERS:
            categories.append("determiner")
        if word in PREPOSITIONS:
            categories.append("preposition")
        if word in CONJUNCTIONS:
            categories.append("conjunction")
        if word in PRONOUNS:
            categories.append("pronoun")
        if word in AUXILIARIES:
            categories.append("auxiliary")
        if word in PARTICLES:
            categories.append("particle")

        if len(categories) > 1:
            overlapping_words.append(word)
            overlapping_word_categories[word] = categories

    overlapping_words.sort()

    # Step 11: Create single-value distributions (analysis is done on full text)
    determiner_ratio_dist = make_distribution([determiner_ratio])
    preposition_ratio_dist = make_distribution([preposition_ratio])
    conjunction_ratio_dist = make_distribution([conjunction_ratio])
    pronoun_ratio_dist = make_distribution([pronoun_ratio])
    auxiliary_ratio_dist = make_distribution([auxiliary_ratio])
    particle_ratio_dist = make_distribution([particle_ratio])
    total_function_word_ratio_dist = make_distribution([total_function_word_ratio])
    function_word_diversity_dist = make_distribution([function_word_diversity])

    # Step 12: Build metadata
    metadata = {
        "total_word_count": total_words,
        "total_function_word_count": total_function_word_count,
        "unique_function_word_count": unique_function_word_count,
        "determiner_count": determiner_count,
        "preposition_count": preposition_count,
        "conjunction_count": conjunction_count,
        "pronoun_count": pronoun_count,
        "auxiliary_count": auxiliary_count,
        "particle_count": particle_count,
        "determiner_list": determiner_list,
        "preposition_list": preposition_list,
        "conjunction_list": conjunction_list,
        "pronoun_list": pronoun_list,
        "auxiliary_list": auxiliary_list,
        "particle_list": particle_list,
        "overlapping_words": overlapping_words,
        "overlapping_word_categories": overlapping_word_categories,
    }

    # Step 13: Return result
    return FunctionWordResult(
        determiner_ratio=determiner_ratio,
        preposition_ratio=preposition_ratio,
        conjunction_ratio=conjunction_ratio,
        pronoun_ratio=pronoun_ratio,
        auxiliary_ratio=auxiliary_ratio,
        particle_ratio=particle_ratio,
        total_function_word_ratio=total_function_word_ratio,
        function_word_diversity=function_word_diversity,
        most_frequent_function_words=most_frequent,
        least_frequent_function_words=least_frequent,
        function_word_distribution=function_word_counts,
        determiner_ratio_dist=determiner_ratio_dist,
        preposition_ratio_dist=preposition_ratio_dist,
        conjunction_ratio_dist=conjunction_ratio_dist,
        pronoun_ratio_dist=pronoun_ratio_dist,
        auxiliary_ratio_dist=auxiliary_ratio_dist,
        particle_ratio_dist=particle_ratio_dist,
        total_function_word_ratio_dist=total_function_word_ratio_dist,
        function_word_diversity_dist=function_word_diversity_dist,
        chunk_size=chunk_size,
        chunk_count=1,  # Single pass analysis
        metadata=metadata,
    )
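
For orientation, a minimal sketch of how the new module might be used to compare the function-word profiles of two texts. Only compute_function_words and the result attributes shown in the diff above come from the package; the sample strings and the L1-distance comparison are illustrative assumptions, not part of its API.

from pystylometry.lexical.function_words import compute_function_words

# Hypothetical samples; real comparisons want 1000+ words per text.
text_a = "The cat sat on the mat, and the dog lay by the door."
text_b = "Upon the hill, whilst shadows fell, we waited for them in silence."

r_a = compute_function_words(text_a)
r_b = compute_function_words(text_b)

# Compare the six per-category ratios with a simple L1 (Manhattan) distance.
# This heuristic is an assumption for illustration; Burrows' Delta (cited in
# the module docstring) would instead z-score each feature against a
# reference corpus before summing.
categories = [
    "determiner_ratio", "preposition_ratio", "conjunction_ratio",
    "pronoun_ratio", "auxiliary_ratio", "particle_ratio",
]
distance = sum(abs(getattr(r_a, c) - getattr(r_b, c)) for c in categories)
print(f"Function-word profile distance: {distance:.4f}")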