pystylometry-1.0.0-py3-none-any.whl → pystylometry-1.3.0-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- pystylometry/README.md +42 -0
- pystylometry/__init__.py +45 -3
- pystylometry/_types.py +1017 -259
- pystylometry/authorship/README.md +21 -0
- pystylometry/authorship/__init__.py +28 -4
- pystylometry/authorship/additional_methods.py +260 -40
- pystylometry/authorship/compression.py +175 -0
- pystylometry/authorship/kilgarriff.py +354 -0
- pystylometry/character/README.md +17 -0
- pystylometry/character/character_metrics.py +267 -179
- pystylometry/cli.py +427 -0
- pystylometry/consistency/README.md +27 -0
- pystylometry/consistency/__init__.py +57 -0
- pystylometry/consistency/_thresholds.py +162 -0
- pystylometry/consistency/drift.py +549 -0
- pystylometry/dialect/README.md +26 -0
- pystylometry/dialect/__init__.py +65 -0
- pystylometry/dialect/_data/dialect_markers.json +1134 -0
- pystylometry/dialect/_loader.py +360 -0
- pystylometry/dialect/detector.py +533 -0
- pystylometry/lexical/README.md +23 -0
- pystylometry/lexical/advanced_diversity.py +61 -22
- pystylometry/lexical/function_words.py +255 -56
- pystylometry/lexical/hapax.py +182 -52
- pystylometry/lexical/mtld.py +108 -26
- pystylometry/lexical/ttr.py +76 -10
- pystylometry/lexical/word_frequency_sophistication.py +1522 -298
- pystylometry/lexical/yule.py +136 -50
- pystylometry/ngrams/README.md +18 -0
- pystylometry/ngrams/entropy.py +150 -49
- pystylometry/ngrams/extended_ngrams.py +314 -69
- pystylometry/prosody/README.md +17 -0
- pystylometry/prosody/rhythm_prosody.py +773 -11
- pystylometry/readability/README.md +23 -0
- pystylometry/readability/additional_formulas.py +1887 -762
- pystylometry/readability/ari.py +144 -82
- pystylometry/readability/coleman_liau.py +136 -109
- pystylometry/readability/flesch.py +177 -73
- pystylometry/readability/gunning_fog.py +165 -161
- pystylometry/readability/smog.py +123 -42
- pystylometry/stylistic/README.md +20 -0
- pystylometry/stylistic/cohesion_coherence.py +669 -13
- pystylometry/stylistic/genre_register.py +1560 -17
- pystylometry/stylistic/markers.py +611 -17
- pystylometry/stylistic/vocabulary_overlap.py +354 -13
- pystylometry/syntactic/README.md +20 -0
- pystylometry/syntactic/advanced_syntactic.py +76 -14
- pystylometry/syntactic/pos_ratios.py +70 -6
- pystylometry/syntactic/sentence_stats.py +55 -12
- pystylometry/syntactic/sentence_types.py +71 -15
- pystylometry/viz/README.md +27 -0
- pystylometry/viz/__init__.py +71 -0
- pystylometry/viz/drift.py +589 -0
- pystylometry/viz/jsx/__init__.py +31 -0
- pystylometry/viz/jsx/_base.py +144 -0
- pystylometry/viz/jsx/report.py +677 -0
- pystylometry/viz/jsx/timeline.py +716 -0
- pystylometry/viz/jsx/viewer.py +1032 -0
- pystylometry-1.3.0.dist-info/METADATA +136 -0
- pystylometry-1.3.0.dist-info/RECORD +76 -0
- {pystylometry-1.0.0.dist-info → pystylometry-1.3.0.dist-info}/WHEEL +1 -1
- pystylometry-1.3.0.dist-info/entry_points.txt +4 -0
- pystylometry-1.0.0.dist-info/METADATA +0 -275
- pystylometry-1.0.0.dist-info/RECORD +0 -46
--- pystylometry-1.0.0/pystylometry/stylistic/markers.py
+++ pystylometry-1.3.0/pystylometry/stylistic/markers.py
@@ -24,8 +24,445 @@ References:
     Biber, D. (1988). Variation across speech and writing. Cambridge University Press.
 """
 
+from __future__ import annotations
+
+import re
+from collections import Counter
+from typing import Any
+
 from .._types import StylisticMarkersResult
 
+# =============================================================================
+# CONTRACTION PATTERNS
+# =============================================================================
+# Map contractions to their expanded forms for detection and ratio calculation
+# Related GitHub Issue #20: https://github.com/craigtrim/pystylometry/issues/20
+
+CONTRACTIONS: dict[str, str] = {
+    # Negative contractions
+    "aren't": "are not",
+    "can't": "cannot",
+    "couldn't": "could not",
+    "didn't": "did not",
+    "doesn't": "does not",
+    "don't": "do not",
+    "hadn't": "had not",
+    "hasn't": "has not",
+    "haven't": "have not",
+    "isn't": "is not",
+    "mightn't": "might not",
+    "mustn't": "must not",
+    "needn't": "need not",
+    "shan't": "shall not",
+    "shouldn't": "should not",
+    "wasn't": "was not",
+    "weren't": "were not",
+    "won't": "will not",
+    "wouldn't": "would not",
+    # Pronoun contractions
+    "i'm": "i am",
+    "i've": "i have",
+    "i'll": "i will",
+    "i'd": "i would",
+    "you're": "you are",
+    "you've": "you have",
+    "you'll": "you will",
+    "you'd": "you would",
+    "he's": "he is",
+    "he'll": "he will",
+    "he'd": "he would",
+    "she's": "she is",
+    "she'll": "she will",
+    "she'd": "she would",
+    "it's": "it is",
+    "it'll": "it will",
+    "it'd": "it would",
+    "we're": "we are",
+    "we've": "we have",
+    "we'll": "we will",
+    "we'd": "we would",
+    "they're": "they are",
+    "they've": "they have",
+    "they'll": "they will",
+    "they'd": "they would",
+    "that's": "that is",
+    "that'll": "that will",
+    "that'd": "that would",
+    "who's": "who is",
+    "who'll": "who will",
+    "who'd": "who would",
+    "what's": "what is",
+    "what'll": "what will",
+    "what'd": "what would",
+    "where's": "where is",
+    "where'll": "where will",
+    "where'd": "where would",
+    "when's": "when is",
+    "when'll": "when will",
+    "when'd": "when would",
+    "why's": "why is",
+    "why'll": "why will",
+    "why'd": "why would",
+    "how's": "how is",
+    "how'll": "how will",
+    "how'd": "how would",
+    "there's": "there is",
+    "there'll": "there will",
+    "there'd": "there would",
+    "here's": "here is",
+    # Other contractions
+    "let's": "let us",
+    "ain't": "am not",
+    "'twas": "it was",
+    "'tis": "it is",
+}
+
+# Build expanded form patterns for detection
+# These patterns match the expanded forms that could have been contracted
+EXPANDED_FORM_PATTERNS: list[tuple[re.Pattern[str], str]] = [
+    (re.compile(r"\b(are)\s+(not)\b", re.IGNORECASE), "aren't"),
+    (re.compile(r"\b(can)\s*(?:not|n't)\b", re.IGNORECASE), "can't"),
+    (re.compile(r"\b(could)\s+(not)\b", re.IGNORECASE), "couldn't"),
+    (re.compile(r"\b(did)\s+(not)\b", re.IGNORECASE), "didn't"),
+    (re.compile(r"\b(does)\s+(not)\b", re.IGNORECASE), "doesn't"),
+    (re.compile(r"\b(do)\s+(not)\b", re.IGNORECASE), "don't"),
+    (re.compile(r"\b(had)\s+(not)\b", re.IGNORECASE), "hadn't"),
+    (re.compile(r"\b(has)\s+(not)\b", re.IGNORECASE), "hasn't"),
+    (re.compile(r"\b(have)\s+(not)\b", re.IGNORECASE), "haven't"),
+    (re.compile(r"\b(is)\s+(not)\b", re.IGNORECASE), "isn't"),
+    (re.compile(r"\b(might)\s+(not)\b", re.IGNORECASE), "mightn't"),
+    (re.compile(r"\b(must)\s+(not)\b", re.IGNORECASE), "mustn't"),
+    (re.compile(r"\b(need)\s+(not)\b", re.IGNORECASE), "needn't"),
+    (re.compile(r"\b(shall)\s+(not)\b", re.IGNORECASE), "shan't"),
+    (re.compile(r"\b(should)\s+(not)\b", re.IGNORECASE), "shouldn't"),
+    (re.compile(r"\b(was)\s+(not)\b", re.IGNORECASE), "wasn't"),
+    (re.compile(r"\b(were)\s+(not)\b", re.IGNORECASE), "weren't"),
+    (re.compile(r"\b(will)\s+(not)\b", re.IGNORECASE), "won't"),
+    (re.compile(r"\b(would)\s+(not)\b", re.IGNORECASE), "wouldn't"),
+    (re.compile(r"\b(i)\s+(am)\b", re.IGNORECASE), "i'm"),
+    (re.compile(r"\b(i)\s+(have)\b", re.IGNORECASE), "i've"),
+    (re.compile(r"\b(i)\s+(will)\b", re.IGNORECASE), "i'll"),
+    (re.compile(r"\b(i)\s+(would)\b", re.IGNORECASE), "i'd"),
+    (re.compile(r"\b(you)\s+(are)\b", re.IGNORECASE), "you're"),
+    (re.compile(r"\b(you)\s+(have)\b", re.IGNORECASE), "you've"),
+    (re.compile(r"\b(you)\s+(will)\b", re.IGNORECASE), "you'll"),
+    (re.compile(r"\b(you)\s+(would)\b", re.IGNORECASE), "you'd"),
+    (re.compile(r"\b(he)\s+(is)\b", re.IGNORECASE), "he's"),
+    (re.compile(r"\b(he)\s+(will)\b", re.IGNORECASE), "he'll"),
+    (re.compile(r"\b(he)\s+(would)\b", re.IGNORECASE), "he'd"),
+    (re.compile(r"\b(she)\s+(is)\b", re.IGNORECASE), "she's"),
+    (re.compile(r"\b(she)\s+(will)\b", re.IGNORECASE), "she'll"),
+    (re.compile(r"\b(she)\s+(would)\b", re.IGNORECASE), "she'd"),
+    (re.compile(r"\b(it)\s+(is)\b", re.IGNORECASE), "it's"),
+    (re.compile(r"\b(it)\s+(will)\b", re.IGNORECASE), "it'll"),
+    (re.compile(r"\b(it)\s+(would)\b", re.IGNORECASE), "it'd"),
+    (re.compile(r"\b(we)\s+(are)\b", re.IGNORECASE), "we're"),
+    (re.compile(r"\b(we)\s+(have)\b", re.IGNORECASE), "we've"),
+    (re.compile(r"\b(we)\s+(will)\b", re.IGNORECASE), "we'll"),
+    (re.compile(r"\b(we)\s+(would)\b", re.IGNORECASE), "we'd"),
+    (re.compile(r"\b(they)\s+(are)\b", re.IGNORECASE), "they're"),
+    (re.compile(r"\b(they)\s+(have)\b", re.IGNORECASE), "they've"),
+    (re.compile(r"\b(they)\s+(will)\b", re.IGNORECASE), "they'll"),
+    (re.compile(r"\b(they)\s+(would)\b", re.IGNORECASE), "they'd"),
+    (re.compile(r"\b(that)\s+(is)\b", re.IGNORECASE), "that's"),
+    (re.compile(r"\b(there)\s+(is)\b", re.IGNORECASE), "there's"),
+    (re.compile(r"\b(here)\s+(is)\b", re.IGNORECASE), "here's"),
+    (re.compile(r"\b(let)\s+(us)\b", re.IGNORECASE), "let's"),
+]
+
+# =============================================================================
+# INTENSIFIERS
+# =============================================================================
+# Words that amplify or emphasize meaning
+# Reference: Biber, D. (1988). Variation across speech and writing.
+
+INTENSIFIERS: set[str] = {
+    # Amplifiers (boosters)
+    "very",
+    "really",
+    "extremely",
+    "absolutely",
+    "completely",
+    "totally",
+    "entirely",
+    "utterly",
+    "thoroughly",
+    "perfectly",
+    "highly",
+    "deeply",
+    "greatly",
+    "strongly",
+    "immensely",
+    "incredibly",
+    "remarkably",
+    "exceptionally",
+    "extraordinarily",
+    "tremendously",
+    "enormously",
+    "vastly",
+    "significantly",
+    "substantially",
+    "considerably",
+    "profoundly",
+    "intensely",
+    "acutely",
+    "severely",
+    "seriously",
+    # Degree modifiers
+    "quite",
+    "rather",
+    "fairly",
+    "pretty",
+    "so",
+    "too",
+    "such",
+    "much",
+    "more",
+    "most",
+    "particularly",
+    "especially",
+    "decidedly",
+    "definitely",
+    "certainly",
+    "surely",
+    "indeed",
+    # Informal intensifiers
+    "super",
+    "mega",
+    "ultra",
+    "way",
+    "real",
+    "awful",
+    "awfully",
+    "terribly",
+    "dreadfully",
+    "frightfully",
+}
+
+# =============================================================================
+# HEDGES
+# =============================================================================
+# Words that weaken or qualify statements, showing uncertainty or politeness
+# Reference: Lakoff, G. (1972). Hedges: A study in meaning criteria.
+
+HEDGES: set[str] = {
+    # Epistemic hedges (expressing uncertainty)
+    "maybe",
+    "perhaps",
+    "possibly",
+    "probably",
+    "apparently",
+    "seemingly",
+    "supposedly",
+    "allegedly",
+    "presumably",
+    "conceivably",
+    "potentially",
+    "arguably",
+    "ostensibly",
+    # Approximators
+    "about",
+    "around",
+    "approximately",
+    "roughly",
+    "nearly",
+    "almost",
+    "virtually",
+    "practically",
+    "essentially",
+    "basically",
+    "generally",
+    "usually",
+    "typically",
+    "normally",
+    "ordinarily",
+    # Degree hedges
+    "somewhat",
+    "slightly",
+    "a bit",
+    "a little",
+    "kind of",
+    "sort of",
+    "more or less",
+    "to some extent",
+    "in a way",
+    "in some ways",
+    "to a degree",
+    "relatively",
+    "comparatively",
+    "partly",
+    "partially",
+    # Shield expressions
+    "seem",
+    "seems",
+    "seemed",
+    "appear",
+    "appears",
+    "appeared",
+    "suggest",
+    "suggests",
+    "suggested",
+    "indicate",
+    "indicates",
+    "indicated",
+    "tend",
+    "tends",
+    "tended",
+    # Attribution hedges
+    "reportedly",
+    "according to",
+    "i think",
+    "i believe",
+    "i suppose",
+    "i guess",
+    "i assume",
+    "it seems",
+    "it appears",
+}
+
+# =============================================================================
+# MODAL AUXILIARIES
+# =============================================================================
+# Epistemic modals express possibility/probability
+# Deontic modals express necessity/obligation/permission
+
+EPISTEMIC_MODALS: set[str] = {
+    "may",
+    "might",
+    "could",
+    "can",
+    "would",
+    "should",
+}
+
+DEONTIC_MODALS: set[str] = {
+    "must",
+    "shall",
+    "will",
+    "should",
+    "ought",
+    "need",
+    "have to",
+    "has to",
+    "had to",
+    "got to",
+}
+
+ALL_MODALS: set[str] = {
+    "can",
+    "could",
+    "may",
+    "might",
+    "must",
+    "shall",
+    "should",
+    "will",
+    "would",
+    "ought",
+    "need",
+}
+
+# =============================================================================
+# NEGATION MARKERS
+# =============================================================================
+# Words and patterns that express negation
+
+NEGATION_MARKERS: set[str] = {
+    "not",
+    "no",
+    "never",
+    "none",
+    "nothing",
+    "nobody",
+    "nowhere",
+    "neither",
+    "nor",
+    "without",
+    "hardly",
+    "barely",
+    "scarcely",
+    "rarely",
+    "seldom",
+}
+
+# =============================================================================
+# PUNCTUATION PATTERNS
+# =============================================================================
+
+# Patterns for punctuation detection
+ELLIPSIS_PATTERN = re.compile(r"\.{3}|…")
+DASH_PATTERN = re.compile(r"—|–|--")  # em-dash, en-dash, double hyphen
+PARENTHETICAL_PATTERN = re.compile(r"[()]")
+QUOTATION_PATTERN = re.compile(r'["“”\'‘’]')  # Various quote styles
+
+
+def _tokenize_simple(text: str) -> list[str]:
+    """Simple word tokenization for marker analysis.
+
+    Preserves contractions as single tokens while splitting on whitespace
+    and basic punctuation.
+    """
+    # First normalize apostrophes
+    text = text.replace("’", "'").replace("‘", "'")
+
+    # Split on whitespace and punctuation, keeping contractions together
+    # This pattern keeps words with apostrophes intact
+    tokens = re.findall(r"\b[\w']+\b", text.lower())
+
+    return tokens
+
+
+def _count_contractions(text: str) -> tuple[Counter[str], int]:
+    """Count contractions in text.
+
+    Returns:
+        Tuple of (contraction_counts, expanded_form_count)
+    """
+    text_lower = text.lower()
+    # Normalize apostrophes
+    text_lower = text_lower.replace("’", "'").replace("‘", "'")
+
+    contraction_counts: Counter[str] = Counter()
+
+    # Count each contraction
+    for contraction in CONTRACTIONS:
+        # Use word boundary matching
+        contraction_pattern = r"\b" + re.escape(contraction) + r"\b"
+        matches = re.findall(contraction_pattern, text_lower)
+        if matches:
+            contraction_counts[contraction] = len(matches)
+
+    # Count expanded forms
+    expanded_count = 0
+    for expanded_pattern, _ in EXPANDED_FORM_PATTERNS:
+        matches = expanded_pattern.findall(text_lower)
+        expanded_count += len(matches)
+
+    return contraction_counts, expanded_count
+
+
+def _count_markers(tokens: list[str], marker_set: set[str]) -> Counter[str]:
+    """Count occurrences of markers from a set in tokenized text."""
+    counts: Counter[str] = Counter()
+    for token in tokens:
+        if token in marker_set:
+            counts[token] += 1
+    return counts
+
+
+def _count_punctuation(text: str) -> dict[str, int]:
+    """Count various punctuation marks in text."""
+    return {
+        "exclamation": text.count("!"),
+        "question": text.count("?"),
+        "quotation": len(QUOTATION_PATTERN.findall(text)),
+        "parenthetical": len(PARENTHETICAL_PATTERN.findall(text)),
+        "ellipsis": len(ELLIPSIS_PATTERN.findall(text)),
+        "dash": len(DASH_PATTERN.findall(text)),
+        "semicolon": text.count(";"),
+        "colon": text.count(":"),
+    }
+
 
 def compute_stylistic_markers(text: str) -> StylisticMarkersResult:
     """
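
The contraction ratio that the next hunk computes is contracted / (contracted + expanded), where both counts come from the constants in the hunk above. A minimal standalone sketch of that arithmetic, assuming the module is importable at the path shown in the file list (the sample sentence and variable names are illustrative, not part of the package):

import re

from pystylometry.stylistic.markers import CONTRACTIONS, EXPANDED_FORM_PATTERNS

text = "We are not sure, but we aren't worried."
lower = text.lower()

# Contracted occurrences, word-boundary matched as in _count_contractions
contracted = sum(
    len(re.findall(r"\b" + re.escape(c) + r"\b", lower)) for c in CONTRACTIONS
)
# Expanded occurrences that could have been contracted
expanded = sum(len(pattern.findall(lower)) for pattern, _ in EXPANDED_FORM_PATTERNS)

total = contracted + expanded
ratio = contracted / total if total else 0.0
print(f"contracted={contracted} expanded={expanded} ratio={ratio:.2f}")
# -> contracted=1 expanded=2 ratio=0.33

Note that overlapping expanded forms each count once ("we are" and "are not" both fire inside "we are not"), so the denominator can exceed the number of contractable clauses.
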
@@ -101,31 +538,188 @@ def compute_stylistic_markers(text: str) -> StylisticMarkersResult:
     See _types.py for complete field list.
 
     Example:
-        >>> result = compute_stylistic_markers("
+        >>> result = compute_stylistic_markers("I can't believe it's really happening!")
         >>> print(f"Contraction ratio: {result.contraction_ratio * 100:.1f}%")
-        Contraction ratio: 42.3%
         >>> print(f"Intensifiers/100 words: {result.intensifier_density:.2f}")
-        Intensifiers/100 words: 3.45
-        >>> print(f"Top intensifiers: {result.top_intensifiers[:3]}")
-        Top intensifiers: [('very', 12), ('really', 8), ('quite', 5)]
         >>> print(f"Exclamation density: {result.exclamation_density:.2f}")
-        Exclamation density: 2.10
 
     Note:
         - Densities are per 100 words for interpretability
         - Contraction detection requires pattern matching
        - Modal auxiliaries classified as epistemic or deontic
         - Punctuation counts include all occurrences
-        - Empty text returns
+        - Empty text returns 0.0 for ratios, 0 for counts
     """
-    #
-
-
-
-
-
-
-
-
-
+    # Handle empty text
+    if not text or not text.strip():
+        return StylisticMarkersResult(
+            contraction_ratio=0.0,
+            contraction_count=0,
+            expanded_form_count=0,
+            top_contractions=[],
+            intensifier_density=0.0,
+            intensifier_count=0,
+            top_intensifiers=[],
+            hedging_density=0.0,
+            hedging_count=0,
+            top_hedges=[],
+            modal_density=0.0,
+            modal_distribution={},
+            epistemic_modal_ratio=0.0,
+            deontic_modal_ratio=0.0,
+            negation_density=0.0,
+            negation_count=0,
+            negation_types={},
+            exclamation_density=0.0,
+            question_density=0.0,
+            quotation_density=0.0,
+            parenthetical_density=0.0,
+            ellipsis_density=0.0,
+            dash_density=0.0,
+            semicolon_density=0.0,
+            colon_density=0.0,
+            metadata={"word_count": 0, "warning": "Empty text"},
+        )
+
+    # Tokenize
+    tokens = _tokenize_simple(text)
+    word_count = len(tokens)
+
+    if word_count == 0:
+        return StylisticMarkersResult(
+            contraction_ratio=0.0,
+            contraction_count=0,
+            expanded_form_count=0,
+            top_contractions=[],
+            intensifier_density=0.0,
+            intensifier_count=0,
+            top_intensifiers=[],
+            hedging_density=0.0,
+            hedging_count=0,
+            top_hedges=[],
+            modal_density=0.0,
+            modal_distribution={},
+            epistemic_modal_ratio=0.0,
+            deontic_modal_ratio=0.0,
+            negation_density=0.0,
+            negation_count=0,
+            negation_types={},
+            exclamation_density=0.0,
+            question_density=0.0,
+            quotation_density=0.0,
+            parenthetical_density=0.0,
+            ellipsis_density=0.0,
+            dash_density=0.0,
+            semicolon_density=0.0,
+            colon_density=0.0,
+            metadata={"word_count": 0, "warning": "No tokens found"},
+        )
+
+    # Calculate density multiplier (per 100 words)
+    density_multiplier = 100.0 / word_count
+
+    # ==========================================================================
+    # CONTRACTIONS
+    # ==========================================================================
+    contraction_counts, expanded_form_count = _count_contractions(text)
+    contraction_count = sum(contraction_counts.values())
+    total_contractable = contraction_count + expanded_form_count
+    contraction_ratio = contraction_count / total_contractable if total_contractable > 0 else 0.0
+    top_contractions = contraction_counts.most_common(10)
+
+    # ==========================================================================
+    # INTENSIFIERS
+    # ==========================================================================
+    intensifier_counts = _count_markers(tokens, INTENSIFIERS)
+    intensifier_count = sum(intensifier_counts.values())
+    intensifier_density = intensifier_count * density_multiplier
+    top_intensifiers = intensifier_counts.most_common(10)
+
+    # ==========================================================================
+    # HEDGES
+    # ==========================================================================
+    hedge_counts = _count_markers(tokens, HEDGES)
+    hedge_count = sum(hedge_counts.values())
+    hedging_density = hedge_count * density_multiplier
+    top_hedges = hedge_counts.most_common(10)
+
+    # ==========================================================================
+    # MODAL AUXILIARIES
+    # ==========================================================================
+    modal_counts = _count_markers(tokens, ALL_MODALS)
+    modal_distribution = dict(modal_counts)
+    total_modals = sum(modal_counts.values())
+    modal_density = total_modals * density_multiplier
+
+    # Calculate epistemic vs deontic ratios
+    epistemic_count = sum(modal_counts.get(m, 0) for m in EPISTEMIC_MODALS)
+    deontic_count = sum(modal_counts.get(m, 0) for m in DEONTIC_MODALS)
+    epistemic_modal_ratio = epistemic_count / total_modals if total_modals > 0 else 0.0
+    deontic_modal_ratio = deontic_count / total_modals if total_modals > 0 else 0.0
+
+    # ==========================================================================
+    # NEGATION
+    # ==========================================================================
+    negation_counts = _count_markers(tokens, NEGATION_MARKERS)
+    negation_count = sum(negation_counts.values())
+    negation_density = negation_count * density_multiplier
+    negation_types = dict(negation_counts)
+
+    # ==========================================================================
+    # PUNCTUATION
+    # ==========================================================================
+    punct_counts = _count_punctuation(text)
+    exclamation_density = punct_counts["exclamation"] * density_multiplier
+    question_density = punct_counts["question"] * density_multiplier
+    quotation_density = punct_counts["quotation"] * density_multiplier
+    parenthetical_density = punct_counts["parenthetical"] * density_multiplier
+    ellipsis_density = punct_counts["ellipsis"] * density_multiplier
+    dash_density = punct_counts["dash"] * density_multiplier
+    semicolon_density = punct_counts["semicolon"] * density_multiplier
+    colon_density = punct_counts["colon"] * density_multiplier
+
+    # ==========================================================================
+    # BUILD RESULT
+    # ==========================================================================
+    metadata: dict[str, Any] = {
+        "word_count": word_count,
+        "contraction_list": CONTRACTIONS,
+        "intensifier_list": sorted(INTENSIFIERS),
+        "hedge_list": sorted(HEDGES),
+        "modal_list": sorted(ALL_MODALS),
+        "negation_list": sorted(NEGATION_MARKERS),
+        "punctuation_counts": punct_counts,
+        "all_contraction_counts": dict(contraction_counts),
+        "all_intensifier_counts": dict(intensifier_counts),
+        "all_hedge_counts": dict(hedge_counts),
+        "all_negation_counts": dict(negation_counts),
+    }
+
+    return StylisticMarkersResult(
+        contraction_ratio=contraction_ratio,
+        contraction_count=contraction_count,
+        expanded_form_count=expanded_form_count,
+        top_contractions=top_contractions,
+        intensifier_density=intensifier_density,
+        intensifier_count=intensifier_count,
+        top_intensifiers=top_intensifiers,
+        hedging_density=hedging_density,
+        hedging_count=hedge_count,
+        top_hedges=top_hedges,
+        modal_density=modal_density,
+        modal_distribution=modal_distribution,
+        epistemic_modal_ratio=epistemic_modal_ratio,
+        deontic_modal_ratio=deontic_modal_ratio,
+        negation_density=negation_density,
+        negation_count=negation_count,
+        negation_types=negation_types,
+        exclamation_density=exclamation_density,
+        question_density=question_density,
+        quotation_density=quotation_density,
+        parenthetical_density=parenthetical_density,
+        ellipsis_density=ellipsis_density,
+        dash_density=dash_density,
+        semicolon_density=semicolon_density,
+        colon_density=colon_density,
+        metadata=metadata,
     )
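
End to end, the new public function can be exercised as in the short sketch below. The import path is assumed from the wheel's file layout; the field names match the result object populated in the diff, and printed values depend on the input text:

from pystylometry.stylistic.markers import compute_stylistic_markers

# Densities are per 100 words; ratios are fractions of matched candidates
# (e.g. the epistemic share of all modal hits).
result = compute_stylistic_markers(
    "Maybe we shouldn't worry. It's probably fine, really!"
)

print(f"words analyzed:        {result.metadata['word_count']}")
print(f"contraction ratio:     {result.contraction_ratio:.2f}")
print(f"hedges per 100 words:  {result.hedging_density:.2f}")
print(f"intensifier density:   {result.intensifier_density:.2f}")
print(f"modal density:         {result.modal_density:.2f}")
print(f"epistemic modal share: {result.epistemic_modal_ratio:.2f}")
print(f"exclamations per 100:  {result.exclamation_density:.2f}")
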