pystylometry 0.1.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (61)
  1. pystylometry/__init__.py +30 -5
  2. pystylometry/_normalize.py +277 -0
  3. pystylometry/_types.py +1954 -28
  4. pystylometry/_utils.py +4 -0
  5. pystylometry/authorship/__init__.py +26 -1
  6. pystylometry/authorship/additional_methods.py +75 -0
  7. pystylometry/authorship/kilgarriff.py +347 -0
  8. pystylometry/character/__init__.py +15 -0
  9. pystylometry/character/character_metrics.py +389 -0
  10. pystylometry/cli.py +427 -0
  11. pystylometry/consistency/__init__.py +57 -0
  12. pystylometry/consistency/_thresholds.py +162 -0
  13. pystylometry/consistency/drift.py +549 -0
  14. pystylometry/dialect/__init__.py +65 -0
  15. pystylometry/dialect/_data/dialect_markers.json +1134 -0
  16. pystylometry/dialect/_loader.py +360 -0
  17. pystylometry/dialect/detector.py +533 -0
  18. pystylometry/lexical/__init__.py +13 -6
  19. pystylometry/lexical/advanced_diversity.py +680 -0
  20. pystylometry/lexical/function_words.py +590 -0
  21. pystylometry/lexical/hapax.py +310 -33
  22. pystylometry/lexical/mtld.py +180 -22
  23. pystylometry/lexical/ttr.py +149 -0
  24. pystylometry/lexical/word_frequency_sophistication.py +1805 -0
  25. pystylometry/lexical/yule.py +142 -29
  26. pystylometry/ngrams/__init__.py +2 -0
  27. pystylometry/ngrams/entropy.py +150 -49
  28. pystylometry/ngrams/extended_ngrams.py +235 -0
  29. pystylometry/prosody/__init__.py +12 -0
  30. pystylometry/prosody/rhythm_prosody.py +53 -0
  31. pystylometry/readability/__init__.py +12 -0
  32. pystylometry/readability/additional_formulas.py +2110 -0
  33. pystylometry/readability/ari.py +173 -35
  34. pystylometry/readability/coleman_liau.py +150 -30
  35. pystylometry/readability/complex_words.py +531 -0
  36. pystylometry/readability/flesch.py +181 -32
  37. pystylometry/readability/gunning_fog.py +208 -35
  38. pystylometry/readability/smog.py +126 -28
  39. pystylometry/readability/syllables.py +137 -30
  40. pystylometry/stylistic/__init__.py +20 -0
  41. pystylometry/stylistic/cohesion_coherence.py +45 -0
  42. pystylometry/stylistic/genre_register.py +45 -0
  43. pystylometry/stylistic/markers.py +131 -0
  44. pystylometry/stylistic/vocabulary_overlap.py +47 -0
  45. pystylometry/syntactic/__init__.py +4 -0
  46. pystylometry/syntactic/advanced_syntactic.py +494 -0
  47. pystylometry/syntactic/pos_ratios.py +172 -17
  48. pystylometry/syntactic/sentence_stats.py +105 -18
  49. pystylometry/syntactic/sentence_types.py +526 -0
  50. pystylometry/viz/__init__.py +71 -0
  51. pystylometry/viz/drift.py +589 -0
  52. pystylometry/viz/jsx/__init__.py +31 -0
  53. pystylometry/viz/jsx/_base.py +144 -0
  54. pystylometry/viz/jsx/report.py +677 -0
  55. pystylometry/viz/jsx/timeline.py +716 -0
  56. pystylometry/viz/jsx/viewer.py +1032 -0
  57. {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/METADATA +49 -9
  58. pystylometry-1.1.0.dist-info/RECORD +63 -0
  59. pystylometry-1.1.0.dist-info/entry_points.txt +4 -0
  60. pystylometry-0.1.0.dist-info/RECORD +0 -26
  61. {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/WHEEL +0 -0
pystylometry/lexical/function_words.py (new file)
@@ -0,0 +1,590 @@
+ """Function word analysis for authorship attribution.
+ 
+ Function words (determiners, prepositions, conjunctions, pronouns, auxiliary
+ verbs) are highly frequent, content-independent words that authors use
+ subconsciously and consistently across different topics. This makes them
+ powerful markers for authorship attribution.
+ 
+ Related GitHub Issue:
+     #13 - Function Word Analysis
+     https://github.com/craigtrim/pystylometry/issues/13
+ 
+ Features implemented:
+     - Frequency profiles for all function word categories
+     - Ratios for specific grammatical categories
+     - Most/least frequently used function words
+     - Function word diversity metrics
+ 
+ Function word categories:
+     - Determiners: the, a, an, this, that, these, those, my, your, etc.
+     - Prepositions: in, on, at, by, for, with, from, to, of, etc.
+     - Conjunctions: and, but, or, nor, for, yet, so, because, although, etc.
+     - Pronouns: I, you, he, she, it, we, they, me, him, her, us, them, etc.
+     - Auxiliary verbs: be, have, do, can, will, shall, may, must, etc.
+     - Particles: up, down, out, off, over, away, back, etc.
+ 
+ References:
+     Mosteller, F., & Wallace, D. L. (1964). Inference and disputed authorship:
+         The Federalist. Addison-Wesley.
+     Burrows, J. (2002). 'Delta': A measure of stylistic difference and a guide
+         to likely authorship. Literary and Linguistic Computing, 17(3), 267-287.
+     Argamon, S., & Levitan, S. (2005). Measuring the usefulness of function
+         words for authorship attribution. ACH/ALLC.
+ """
+ 
+ from .._types import Distribution, FunctionWordResult, make_distribution
+ 
+ # Function word lists for English
+ # GitHub Issue #13: https://github.com/craigtrim/pystylometry/issues/13
+ # These lists should be comprehensive and cover all major function word categories.
+ # Consider loading from external resource files for easier maintenance.
+ 
+ # Determiners (articles, demonstratives, possessives, quantifiers)
+ DETERMINERS = {
+     "the",
+     "a",
+     "an",  # Articles
+     "this",
+     "that",
+     "these",
+     "those",  # Demonstratives
+     "my",
+     "your",
+     "his",
+     "her",
+     "its",
+     "our",
+     "their",  # Possessive determiners
+     "some",
+     "any",
+     "no",
+     "every",
+     "each",
+     "either",
+     "neither",  # Quantifiers
+     "much",
+     "many",
+     "more",
+     "most",
+     "few",
+     "fewer",
+     "less",
+     "least",
+     "all",
+     "both",
+     "half",
+     "several",
+     "enough",
+ }
+ 
+ # Prepositions (locative, temporal, other)
+ PREPOSITIONS = {
+     "in",
+     "on",
+     "at",
+     "by",
+     "for",
+     "with",
+     "from",
+     "to",
+     "of",
+     "about",
+     "above",
+     "across",
+     "after",
+     "against",
+     "along",
+     "among",
+     "around",
+     "as",
+     "before",
+     "behind",
+     "below",
+     "beneath",
+     "beside",
+     "between",
+     "beyond",
+     "but",
+     "concerning",
+     "considering",
+     "despite",
+     "down",
+     "during",
+     "except",
+     "inside",
+     "into",
+     "like",
+     "near",
+     "off",
+     "onto",
+     "out",
+     "outside",
+     "over",
+     "past",
+     "regarding",
+     "since",
+     "through",
+     "throughout",
+     "till",
+     "toward",
+     "under",
+     "underneath",
+     "until",
+     "up",
+     "upon",
+     "via",
+     "within",
+     "without",
+ }
+ 
+ # Conjunctions (coordinating, subordinating, correlative)
+ CONJUNCTIONS = {
+     # Coordinating
+     "and",
+     "but",
+     "or",
+     "nor",
+     "for",
+     "yet",
+     "so",
+     # Subordinating
+     "although",
+     "because",
+     "since",
+     "unless",
+     "while",
+     "if",
+     "when",
+     "where",
+     "after",
+     "before",
+     "once",
+     "until",
+     "as",
+     "though",
+     "even",
+     "whereas",
+     "wherever",
+     "whenever",
+     # Correlative components
+     "either",
+     "neither",
+     "both",
+     "whether",
+ }
+ 
+ # Pronouns (personal, possessive, reflexive, demonstrative, relative, indefinite)
+ PRONOUNS = {
+     # Personal (subject)
+     "i",
+     "you",
+     "he",
+     "she",
+     "it",
+     "we",
+     "they",
+     # Personal (object)
+     "me",
+     "him",
+     "her",
+     "us",
+     "them",
+     # Possessive
+     "mine",
+     "yours",
+     "his",
+     "hers",
+     "its",
+     "ours",
+     "theirs",
+     # Reflexive
+     "myself",
+     "yourself",
+     "himself",
+     "herself",
+     "itself",
+     "ourselves",
+     "yourselves",
+     "themselves",
+     # Demonstrative
+     "this",
+     "that",
+     "these",
+     "those",
+     # Relative
+     "who",
+     "whom",
+     "whose",
+     "which",
+     "that",
+     # Indefinite
+     "anybody",
+     "anyone",
+     "anything",
+     "everybody",
+     "everyone",
+     "everything",
+     "nobody",
+     "no one",
+     "nothing",
+     "somebody",
+     "someone",
+     "something",
+     "one",
+ }
+ 
+ # Auxiliary verbs (modal, primary)
+ AUXILIARIES = {
+     # Modals
+     "can",
+     "could",
+     "may",
+     "might",
+     "must",
+     "shall",
+     "should",
+     "will",
+     "would",
+     "ought",
+     # Primary auxiliaries (be, have, do)
+     "am",
+     "is",
+     "are",
+     "was",
+     "were",
+     "be",
+     "being",
+     "been",
+     "have",
+     "has",
+     "had",
+     "having",
+     "do",
+     "does",
+     "did",
+     "doing",
+ }
+ 
+ # Particles (often used with phrasal verbs)
+ PARTICLES = {
+     "up",
+     "down",
+     "out",
+     "off",
+     "over",
+     "in",
+     "away",
+     "back",
+     "on",
+     "along",
+     "forth",
+     "apart",
+     "aside",
+ }
+ 
+ 
+ def compute_function_words(text: str, chunk_size: int = 1000) -> FunctionWordResult:
+     """
+     Compute function word frequency profiles for authorship analysis.
+ 
+     Function words are closed-class words (determiners, prepositions,
+     conjunctions, pronouns, auxiliaries) that authors use largely
+     subconsciously and consistently. Their frequency patterns are
+     powerful authorship markers because they're independent of topic.
+ 
+     Related GitHub Issue:
+         #13 - Function Word Analysis
+         https://github.com/craigtrim/pystylometry/issues/13
+ 
+     Why function words matter for authorship:
+         1. Topic-independent: Used consistently across different subjects
+         2. Subconscious usage: Authors don't deliberately vary their use
+         3. High frequency: Appear often enough for reliable statistics
+         4. Stable over time: Authors' function word patterns remain consistent
+         5. Discriminative power: Different authors show distinct patterns
+ 
+     Classic example: Mosteller & Wallace (1964) used function word
+     frequencies to resolve the disputed authorship of the Federalist Papers,
+     distinguishing between Hamilton and Madison based on their use of
+     "while" vs. "whilst", "upon" vs. "on", etc.
+ 
+     Args:
+         text: Input text to analyze. Should be at least a few hundred words
+             for reliable statistics. Function word analysis works best with
+             longer texts (1000+ words) where frequency patterns stabilize.
+         chunk_size: Recorded in the returned result for API consistency with
+             other metrics; this implementation analyzes the full text in a
+             single pass rather than chunking it.
+ 
+     Returns:
+         FunctionWordResult containing:
+             - Ratios for each function word category (per total words)
+             - Total function word ratio
+             - Function word diversity (unique / total function words)
+             - Most/least frequent function words with counts
+             - Full distribution of all function words used
+             - Metadata with category-specific counts
+ 
+     Example:
+         >>> result = compute_function_words("Sample text for analysis...")
+         >>> print(f"Determiner ratio: {result.determiner_ratio:.3f}")
+         Determiner ratio: 0.156
+         >>> print(f"Preposition ratio: {result.preposition_ratio:.3f}")
+         Preposition ratio: 0.112
+         >>> print(f"Total function words: {result.total_function_word_ratio:.3f}")
+         Total function words: 0.487
+         >>> print(f"Most frequent: {result.most_frequent_function_words[:3]}")
+         Most frequent: [('the', 45), ('of', 32), ('to', 28)]
+ 
+         >>> # Authorship comparison example
+         >>> text1 = "Text by author 1..."
+         >>> text2 = "Text by author 2..."
+         >>> r1 = compute_function_words(text1)
+         >>> r2 = compute_function_words(text2)
+         >>> # Compare determiner ratios, preposition preferences, etc.
+ 
+     Note:
+         - Case-insensitive matching (all text is lowercased before matching)
+         - Tokenization splits on whitespace, then strips leading/trailing punctuation
+         - Words must match exactly (no stemming or lemmatization)
+         - Multi-word function words like "no one" are tokenized as separate
+           words and therefore never match as a unit
+         - Empty or very short texts may have unreliable ratios
+         - Some words appear in multiple categories (e.g., "that" is both
+           determiner and pronoun) - each category is counted independently
+     """
+     # Step 1: Create union set of all function words (for total ratio calculation)
+     all_function_words = (
+         DETERMINERS | PREPOSITIONS | CONJUNCTIONS | PRONOUNS | AUXILIARIES | PARTICLES
+     )
+ 
+     # Step 2: Tokenize text (lowercase, split on whitespace, strip punctuation)
+     if not text or not text.strip():
+         # Handle empty text edge case
+         empty_dist = Distribution(
+             values=[],
+             mean=float("nan"),
+             median=float("nan"),
+             std=0.0,
+             range=0.0,
+             iqr=0.0,
+         )
+         return FunctionWordResult(
+             determiner_ratio=0.0,
+             preposition_ratio=0.0,
+             conjunction_ratio=0.0,
+             pronoun_ratio=0.0,
+             auxiliary_ratio=0.0,
+             particle_ratio=0.0,
+             total_function_word_ratio=0.0,
+             function_word_diversity=0.0,
+             most_frequent_function_words=[],
+             least_frequent_function_words=[],
+             function_word_distribution={},
+             determiner_ratio_dist=empty_dist,
+             preposition_ratio_dist=empty_dist,
+             conjunction_ratio_dist=empty_dist,
+             pronoun_ratio_dist=empty_dist,
+             auxiliary_ratio_dist=empty_dist,
+             particle_ratio_dist=empty_dist,
+             total_function_word_ratio_dist=empty_dist,
+             function_word_diversity_dist=empty_dist,
+             chunk_size=chunk_size,
+             chunk_count=0,
+             metadata={
+                 "total_word_count": 0,
+                 "total_function_word_count": 0,
+                 "unique_function_word_count": 0,
+                 "determiner_count": 0,
+                 "preposition_count": 0,
+                 "conjunction_count": 0,
+                 "pronoun_count": 0,
+                 "auxiliary_count": 0,
+                 "particle_count": 0,
+                 "determiner_list": [],
+                 "preposition_list": [],
+                 "conjunction_list": [],
+                 "pronoun_list": [],
+                 "auxiliary_list": [],
+                 "particle_list": [],
+                 "overlapping_words": [],
+                 "overlapping_word_categories": {},
+             },
+         )
+ 
+     # Lowercase entire text
+     text_lower = text.lower()
+ 
+     # Split on whitespace
+     raw_tokens = text_lower.split()
+ 
+     # Comprehensive punctuation set for stripping
+     punctuation_chars = set(".,!?;:'\"()[]{}/-—–…*&@#$%^~`\\|<>«»„''‚'")
+ 
+     # Strip punctuation from each token
+     tokens = []
+     for token in raw_tokens:
+         # Strip leading and trailing punctuation
+         clean_token = token.strip("".join(punctuation_chars))
+         if clean_token:  # Only add non-empty tokens
+             tokens.append(clean_token)
+ 
+     total_words = len(tokens)
+ 
+     # Step 3: Initialize counters for each category
+     determiner_count = 0
+     preposition_count = 0
+     conjunction_count = 0
+     pronoun_count = 0
+     auxiliary_count = 0
+     particle_count = 0
+ 
+     # Step 4: Count tokens in each category (overlapping allowed)
+     for token in tokens:
+         if token in DETERMINERS:
+             determiner_count += 1
+         if token in PREPOSITIONS:
+             preposition_count += 1
+         if token in CONJUNCTIONS:
+             conjunction_count += 1
+         if token in PRONOUNS:
+             pronoun_count += 1
+         if token in AUXILIARIES:
+             auxiliary_count += 1
+         if token in PARTICLES:
+             particle_count += 1
+ 
+     # Step 5: Build frequency distribution over all function words (each token
+     # increments its word's count once, even if the word belongs to several categories)
+     function_word_counts: dict[str, int] = {}
+     for token in tokens:
+         if token in all_function_words:
+             function_word_counts[token] = function_word_counts.get(token, 0) + 1
+ 
+     # Step 6: Calculate ratios
+     if total_words > 0:
+         determiner_ratio = determiner_count / total_words
+         preposition_ratio = preposition_count / total_words
+         conjunction_ratio = conjunction_count / total_words
+         pronoun_ratio = pronoun_count / total_words
+         auxiliary_ratio = auxiliary_count / total_words
+         particle_ratio = particle_count / total_words
+ 
+         total_function_word_count = sum(function_word_counts.values())
+         total_function_word_ratio = total_function_word_count / total_words
+     else:
+         determiner_ratio = 0.0
+         preposition_ratio = 0.0
+         conjunction_ratio = 0.0
+         pronoun_ratio = 0.0
+         auxiliary_ratio = 0.0
+         particle_ratio = 0.0
+         total_function_word_count = 0
+         total_function_word_ratio = 0.0
+ 
+     # Step 7: Calculate diversity
+     unique_function_word_count = len(function_word_counts)
+     if total_function_word_count > 0:
+         function_word_diversity = unique_function_word_count / total_function_word_count
+     else:
+         function_word_diversity = 0.0
+ 
+     # Step 8: Find most/least frequent function words
+     if function_word_counts:
+         # Sort by count descending
+         sorted_by_count = sorted(function_word_counts.items(), key=lambda x: x[1], reverse=True)
+ 
+         # Top 10 most frequent
+         most_frequent = sorted_by_count[:10]
+ 
+         # Bottom 10 least frequent (reversed into ascending order)
+         least_frequent = sorted_by_count[-10:]
+         least_frequent.reverse()
+     else:
+         most_frequent = []
+         least_frequent = []
+ 
+     # Step 9: Build category word lists (sorted)
+     determiner_list = sorted([w for w in function_word_counts if w in DETERMINERS])
+     preposition_list = sorted([w for w in function_word_counts if w in PREPOSITIONS])
+     conjunction_list = sorted([w for w in function_word_counts if w in CONJUNCTIONS])
+     pronoun_list = sorted([w for w in function_word_counts if w in PRONOUNS])
+     auxiliary_list = sorted([w for w in function_word_counts if w in AUXILIARIES])
+     particle_list = sorted([w for w in function_word_counts if w in PARTICLES])
+ 
+     # Step 10: Find overlapping words (words in multiple categories)
+     overlapping_words = []
+     overlapping_word_categories: dict[str, list[str]] = {}
+ 
+     for word in function_word_counts:
+         categories = []
+         if word in DETERMINERS:
+             categories.append("determiner")
+         if word in PREPOSITIONS:
+             categories.append("preposition")
+         if word in CONJUNCTIONS:
+             categories.append("conjunction")
+         if word in PRONOUNS:
+             categories.append("pronoun")
+         if word in AUXILIARIES:
+             categories.append("auxiliary")
+         if word in PARTICLES:
+             categories.append("particle")
+ 
+         if len(categories) > 1:
+             overlapping_words.append(word)
+             overlapping_word_categories[word] = categories
+ 
+     overlapping_words.sort()
+ 
+     # Step 11: Create single-value distributions (analysis is done on full text)
+     determiner_ratio_dist = make_distribution([determiner_ratio])
+     preposition_ratio_dist = make_distribution([preposition_ratio])
+     conjunction_ratio_dist = make_distribution([conjunction_ratio])
+     pronoun_ratio_dist = make_distribution([pronoun_ratio])
+     auxiliary_ratio_dist = make_distribution([auxiliary_ratio])
+     particle_ratio_dist = make_distribution([particle_ratio])
+     total_function_word_ratio_dist = make_distribution([total_function_word_ratio])
+     function_word_diversity_dist = make_distribution([function_word_diversity])
+ 
+     # Step 12: Build metadata
+     metadata = {
+         "total_word_count": total_words,
+         "total_function_word_count": total_function_word_count,
+         "unique_function_word_count": unique_function_word_count,
+         "determiner_count": determiner_count,
+         "preposition_count": preposition_count,
+         "conjunction_count": conjunction_count,
+         "pronoun_count": pronoun_count,
+         "auxiliary_count": auxiliary_count,
+         "particle_count": particle_count,
+         "determiner_list": determiner_list,
+         "preposition_list": preposition_list,
+         "conjunction_list": conjunction_list,
+         "pronoun_list": pronoun_list,
+         "auxiliary_list": auxiliary_list,
+         "particle_list": particle_list,
+         "overlapping_words": overlapping_words,
+         "overlapping_word_categories": overlapping_word_categories,
+     }
+ 
+     # Step 13: Return result
+     return FunctionWordResult(
+         determiner_ratio=determiner_ratio,
+         preposition_ratio=preposition_ratio,
+         conjunction_ratio=conjunction_ratio,
+         pronoun_ratio=pronoun_ratio,
+         auxiliary_ratio=auxiliary_ratio,
+         particle_ratio=particle_ratio,
+         total_function_word_ratio=total_function_word_ratio,
+         function_word_diversity=function_word_diversity,
+         most_frequent_function_words=most_frequent,
+         least_frequent_function_words=least_frequent,
+         function_word_distribution=function_word_counts,
+         determiner_ratio_dist=determiner_ratio_dist,
+         preposition_ratio_dist=preposition_ratio_dist,
+         conjunction_ratio_dist=conjunction_ratio_dist,
+         pronoun_ratio_dist=pronoun_ratio_dist,
+         auxiliary_ratio_dist=auxiliary_ratio_dist,
+         particle_ratio_dist=particle_ratio_dist,
+         total_function_word_ratio_dist=total_function_word_ratio_dist,
+         function_word_diversity_dist=function_word_diversity_dist,
+         chunk_size=chunk_size,
+         chunk_count=1,  # Single pass analysis
+         metadata=metadata,
+     )
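
Usage sketch (editor's note, not part of the released diff): the snippet below shows how the compute_function_words API added in 1.1.0 might be used to compare the function-word profiles of two texts. The import path and the result fields come from the file shown above; the absolute-difference distance over the six category ratios and the sample texts are illustrative assumptions, not the package's own authorship metric (attribution methods live under pystylometry/authorship/).

    # Sketch assuming pystylometry 1.1.0 is installed; profile_distance is an
    # ad-hoc illustration, not a pystylometry API.
    from pystylometry.lexical.function_words import compute_function_words

    # The six per-category ratio fields of FunctionWordResult, per the diff above.
    RATIO_FIELDS = (
        "determiner_ratio",
        "preposition_ratio",
        "conjunction_ratio",
        "pronoun_ratio",
        "auxiliary_ratio",
        "particle_ratio",
    )

    def profile_distance(text_a: str, text_b: str) -> float:
        """Sum of absolute differences across the six category ratios.

        A crude stand-in for a real attribution measure such as Burrows'
        Delta: smaller values suggest more similar function-word usage.
        """
        result_a = compute_function_words(text_a)
        result_b = compute_function_words(text_b)
        return sum(
            abs(getattr(result_a, field) - getattr(result_b, field))
            for field in RATIO_FIELDS
        )

    if __name__ == "__main__":
        a = "The cat sat on the mat, and then it looked at me."
        b = "Whilst walking upon the hill, she thought of home."
        print(f"Function-word profile distance: {profile_distance(a, b):.4f}")

In practice both texts would need to be much longer (the docstring suggests 1000+ words) before the ratios stabilize enough for such a comparison to mean anything.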