pystylometry 0.1.0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/__init__.py +1 -2
- pystylometry/_normalize.py +277 -0
- pystylometry/_types.py +1224 -2
- pystylometry/_utils.py +4 -0
- pystylometry/authorship/__init__.py +4 -0
- pystylometry/authorship/additional_methods.py +100 -0
- pystylometry/character/__init__.py +15 -0
- pystylometry/character/character_metrics.py +301 -0
- pystylometry/lexical/__init__.py +13 -6
- pystylometry/lexical/advanced_diversity.py +641 -0
- pystylometry/lexical/function_words.py +391 -0
- pystylometry/lexical/hapax.py +154 -7
- pystylometry/lexical/mtld.py +83 -7
- pystylometry/lexical/ttr.py +83 -0
- pystylometry/lexical/word_frequency_sophistication.py +581 -0
- pystylometry/lexical/yule.py +34 -7
- pystylometry/ngrams/__init__.py +2 -0
- pystylometry/ngrams/extended_ngrams.py +235 -0
- pystylometry/prosody/__init__.py +12 -0
- pystylometry/prosody/rhythm_prosody.py +53 -0
- pystylometry/readability/__init__.py +12 -0
- pystylometry/readability/additional_formulas.py +985 -0
- pystylometry/readability/ari.py +93 -17
- pystylometry/readability/coleman_liau.py +102 -9
- pystylometry/readability/complex_words.py +531 -0
- pystylometry/readability/flesch.py +59 -14
- pystylometry/readability/gunning_fog.py +194 -25
- pystylometry/readability/smog.py +31 -14
- pystylometry/readability/syllables.py +137 -30
- pystylometry/stylistic/__init__.py +20 -0
- pystylometry/stylistic/cohesion_coherence.py +45 -0
- pystylometry/stylistic/genre_register.py +45 -0
- pystylometry/stylistic/markers.py +131 -0
- pystylometry/stylistic/vocabulary_overlap.py +47 -0
- pystylometry/syntactic/__init__.py +4 -0
- pystylometry/syntactic/advanced_syntactic.py +432 -0
- pystylometry/syntactic/pos_ratios.py +104 -13
- pystylometry/syntactic/sentence_stats.py +57 -13
- pystylometry/syntactic/sentence_types.py +470 -0
- {pystylometry-0.1.0.dist-info → pystylometry-1.0.0.dist-info}/METADATA +49 -12
- pystylometry-1.0.0.dist-info/RECORD +46 -0
- {pystylometry-0.1.0.dist-info → pystylometry-1.0.0.dist-info}/WHEEL +1 -1
- pystylometry-0.1.0.dist-info/RECORD +0 -26
pystylometry/lexical/function_words.py
ADDED
@@ -0,0 +1,391 @@
+"""Function word analysis for authorship attribution.
+
+Function words (determiners, prepositions, conjunctions, pronouns, auxiliary
+verbs) are highly frequent, content-independent words that authors use
+subconsciously and consistently across different topics. This makes them
+powerful markers for authorship attribution.
+
+Related GitHub Issue:
+    #13 - Function Word Analysis
+    https://github.com/craigtrim/pystylometry/issues/13
+
+Features implemented:
+- Frequency profiles for all function word categories
+- Ratios for specific grammatical categories
+- Most/least frequently used function words
+- Function word diversity metrics
+
+Function word categories:
+- Determiners: the, a, an, this, that, these, those, my, your, etc.
+- Prepositions: in, on, at, by, for, with, from, to, of, etc.
+- Conjunctions: and, but, or, nor, for, yet, so, because, although, etc.
+- Pronouns: I, you, he, she, it, we, they, me, him, her, us, them, etc.
+- Auxiliary verbs: be, have, do, can, will, shall, may, must, etc.
+- Particles: up, down, out, off, over, away, back, etc.
+
+References:
+    Mosteller, F., & Wallace, D. L. (1964). Inference and disputed authorship:
+        The Federalist. Addison-Wesley.
+    Burrows, J. (2002). 'Delta': A measure of stylistic difference and a guide
+        to likely authorship. Literary and Linguistic Computing, 17(3), 267-287.
+    Argamon, S., & Levitan, S. (2005). Measuring the usefulness of function
+        words for authorship attribution. ACH/ALLC.
+"""
+
+from .._types import FunctionWordResult
+
+
+# Function word lists for English
+# GitHub Issue #13: https://github.com/craigtrim/pystylometry/issues/13
+# These lists should be comprehensive and cover all major function word categories.
+# Consider loading from external resource files for easier maintenance.
+
+# Determiners (articles, demonstratives, possessives, quantifiers)
+DETERMINERS = {
+    "the", "a", "an",  # Articles
+    "this", "that", "these", "those",  # Demonstratives
+    "my", "your", "his", "her", "its", "our", "their",  # Possessive determiners
+    "some", "any", "no", "every", "each", "either", "neither",  # Quantifiers
+    "much", "many", "more", "most", "few", "fewer", "less", "least",
+    "all", "both", "half", "several", "enough",
+}
+
+# Prepositions (locative, temporal, other)
+PREPOSITIONS = {
+    "in", "on", "at", "by", "for", "with", "from", "to", "of",
+    "about", "above", "across", "after", "against", "along", "among",
+    "around", "as", "before", "behind", "below", "beneath", "beside",
+    "between", "beyond", "but", "concerning", "considering", "despite",
+    "down", "during", "except", "inside", "into", "like", "near",
+    "off", "onto", "out", "outside", "over", "past", "regarding",
+    "since", "through", "throughout", "till", "toward", "under",
+    "underneath", "until", "up", "upon", "via", "within", "without",
+}
+
+# Conjunctions (coordinating, subordinating, correlative)
+CONJUNCTIONS = {
+    # Coordinating
+    "and", "but", "or", "nor", "for", "yet", "so",
+    # Subordinating
+    "although", "because", "since", "unless", "while", "if", "when",
+    "where", "after", "before", "once", "until", "as", "though",
+    "even", "whereas", "wherever", "whenever",
+    # Correlative components
+    "either", "neither", "both", "whether",
+}
+
+# Pronouns (personal, possessive, reflexive, demonstrative, relative, indefinite)
+PRONOUNS = {
+    # Personal (subject)
+    "i", "you", "he", "she", "it", "we", "they",
+    # Personal (object)
+    "me", "him", "her", "us", "them",
+    # Possessive
+    "mine", "yours", "his", "hers", "its", "ours", "theirs",
+    # Reflexive
+    "myself", "yourself", "himself", "herself", "itself",
+    "ourselves", "yourselves", "themselves",
+    # Demonstrative
+    "this", "that", "these", "those",
+    # Relative
+    "who", "whom", "whose", "which", "that",
+    # Indefinite
+    "anybody", "anyone", "anything", "everybody", "everyone",
+    "everything", "nobody", "no one", "nothing", "somebody",
+    "someone", "something", "one",
+}
+
+# Auxiliary verbs (modal, primary)
+AUXILIARIES = {
+    # Modals
+    "can", "could", "may", "might", "must", "shall", "should",
+    "will", "would", "ought",
+    # Primary auxiliaries (be, have, do)
+    "am", "is", "are", "was", "were", "be", "being", "been",
+    "have", "has", "had", "having",
+    "do", "does", "did", "doing",
+}
+
+# Particles (often used with phrasal verbs)
+PARTICLES = {
+    "up", "down", "out", "off", "over", "in", "away",
+    "back", "on", "along", "forth", "apart", "aside",
+}
+
+
+def compute_function_words(text: str) -> FunctionWordResult:
+    """
+    Compute function word frequency profiles for authorship analysis.
+
+    Function words are closed-class words (determiners, prepositions,
+    conjunctions, pronouns, auxiliaries) that authors use largely
+    subconsciously and consistently. Their frequency patterns are
+    powerful authorship markers because they're independent of topic.
+
+    Related GitHub Issue:
+        #13 - Function Word Analysis
+        https://github.com/craigtrim/pystylometry/issues/13
+
+    Why function words matter for authorship:
+    1. Topic-independent: Used consistently across different subjects
+    2. Subconscious usage: Authors don't deliberately vary their use
+    3. High frequency: Appear often enough for reliable statistics
+    4. Stable over time: Authors' function word patterns remain consistent
+    5. Discriminative power: Different authors show distinct patterns
+
+    Classic example: Mosteller & Wallace (1964) used function word
+    frequencies to resolve the disputed authorship of the Federalist Papers,
+    distinguishing between Hamilton and Madison based on their use of
+    "while" vs. "whilst", "upon" vs. "on", etc.
+
+    Args:
+        text: Input text to analyze. Should be at least a few hundred words
+            for reliable statistics. Function word analysis works best with
+            longer texts (1000+ words) where frequency patterns stabilize.
+
+    Returns:
+        FunctionWordResult containing:
+        - Ratios for each function word category (per total words)
+        - Total function word ratio
+        - Function word diversity (unique / total function words)
+        - Most/least frequent function words with counts
+        - Full distribution of all function words used
+        - Metadata with category-specific counts
+
+    Example:
+        >>> result = compute_function_words("Sample text for analysis...")
+        >>> print(f"Determiner ratio: {result.determiner_ratio:.3f}")
+        Determiner ratio: 0.156
+        >>> print(f"Preposition ratio: {result.preposition_ratio:.3f}")
+        Preposition ratio: 0.112
+        >>> print(f"Total function words: {result.total_function_word_ratio:.3f}")
+        Total function words: 0.487
+        >>> print(f"Most frequent: {result.most_frequent_function_words[:3]}")
+        Most frequent: [('the', 45), ('of', 32), ('to', 28)]
+
+        >>> # Authorship comparison example
+        >>> text1 = "Text by author 1..."
+        >>> text2 = "Text by author 2..."
+        >>> r1 = compute_function_words(text1)
+        >>> r2 = compute_function_words(text2)
+        >>> # Compare determiner ratios, preposition preferences, etc.
+
+    Note:
+        - Case-insensitive matching (all text lowercased for matching)
+        - Tokenization by whitespace and punctuation
+        - Words must match exactly (no stemming or lemmatization)
+        - Multi-word function words like "no one" are handled as separate tokens
+        - Empty or very short texts may have unreliable ratios
+        - Some words appear in multiple categories (e.g., "that" is both
+          determiner and pronoun) - each category is counted independently
+    """
+    # Step 1: Create union set of all function words (for total ratio calculation)
+    ALL_FUNCTION_WORDS = (
+        DETERMINERS
+        | PREPOSITIONS
+        | CONJUNCTIONS
+        | PRONOUNS
+        | AUXILIARIES
+        | PARTICLES
+    )
+
+    # Step 2: Tokenize text (lowercase, split on whitespace, strip punctuation)
+    if not text or not text.strip():
+        # Handle empty text edge case
+        return FunctionWordResult(
+            determiner_ratio=0.0,
+            preposition_ratio=0.0,
+            conjunction_ratio=0.0,
+            pronoun_ratio=0.0,
+            auxiliary_ratio=0.0,
+            particle_ratio=0.0,
+            total_function_word_ratio=0.0,
+            function_word_diversity=0.0,
+            most_frequent_function_words=[],
+            least_frequent_function_words=[],
+            function_word_distribution={},
+            metadata={
+                "total_word_count": 0,
+                "total_function_word_count": 0,
+                "unique_function_word_count": 0,
+                "determiner_count": 0,
+                "preposition_count": 0,
+                "conjunction_count": 0,
+                "pronoun_count": 0,
+                "auxiliary_count": 0,
+                "particle_count": 0,
+                "determiner_list": [],
+                "preposition_list": [],
+                "conjunction_list": [],
+                "pronoun_list": [],
+                "auxiliary_list": [],
+                "particle_list": [],
+                "overlapping_words": [],
+                "overlapping_word_categories": {},
+            },
+        )
+
+    # Lowercase entire text
+    text_lower = text.lower()
+
+    # Split on whitespace
+    raw_tokens = text_lower.split()
+
+    # Comprehensive punctuation set for stripping
+    PUNCTUATION = set(
+        ".,!?;:'\"()[]{}/-—–…*&@#$%^~`\\|<>«»„""''‚'"
+    )
+
+    # Strip punctuation from each token
+    tokens = []
+    for token in raw_tokens:
+        # Strip leading and trailing punctuation
+        clean_token = token.strip("".join(PUNCTUATION))
+        if clean_token:  # Only add non-empty tokens
+            tokens.append(clean_token)
+
+    total_words = len(tokens)
+
+    # Step 3: Initialize counters for each category
+    determiner_count = 0
+    preposition_count = 0
+    conjunction_count = 0
+    pronoun_count = 0
+    auxiliary_count = 0
+    particle_count = 0
+
+    # Step 4: Count tokens in each category (overlapping allowed)
+    for token in tokens:
+        if token in DETERMINERS:
+            determiner_count += 1
+        if token in PREPOSITIONS:
+            preposition_count += 1
+        if token in CONJUNCTIONS:
+            conjunction_count += 1
+        if token in PRONOUNS:
+            pronoun_count += 1
+        if token in AUXILIARIES:
+            auxiliary_count += 1
+        if token in PARTICLES:
+            particle_count += 1
+
+    # Step 5: Build distribution (count each function word only once per token)
+    function_word_counts: dict[str, int] = {}
+    for token in tokens:
+        if token in ALL_FUNCTION_WORDS:
+            function_word_counts[token] = function_word_counts.get(token, 0) + 1
+
+    # Step 6: Calculate ratios
+    if total_words > 0:
+        determiner_ratio = determiner_count / total_words
+        preposition_ratio = preposition_count / total_words
+        conjunction_ratio = conjunction_count / total_words
+        pronoun_ratio = pronoun_count / total_words
+        auxiliary_ratio = auxiliary_count / total_words
+        particle_ratio = particle_count / total_words
+
+        total_function_word_count = sum(function_word_counts.values())
+        total_function_word_ratio = total_function_word_count / total_words
+    else:
+        determiner_ratio = 0.0
+        preposition_ratio = 0.0
+        conjunction_ratio = 0.0
+        pronoun_ratio = 0.0
+        auxiliary_ratio = 0.0
+        particle_ratio = 0.0
+        total_function_word_count = 0
+        total_function_word_ratio = 0.0
+
+    # Step 7: Calculate diversity
+    unique_function_word_count = len(function_word_counts)
+    if total_function_word_count > 0:
+        function_word_diversity = unique_function_word_count / total_function_word_count
+    else:
+        function_word_diversity = 0.0
+
+    # Step 8: Find most/least frequent function words
+    if function_word_counts:
+        # Sort by count descending
+        sorted_by_count = sorted(
+            function_word_counts.items(), key=lambda x: x[1], reverse=True
+        )
+
+        # Top 10 most frequent
+        most_frequent = sorted_by_count[:10]
+
+        # Bottom 10 least frequent (reverse to get ascending order)
+        least_frequent = sorted_by_count[-10:]
+        least_frequent.reverse()
+    else:
+        most_frequent = []
+        least_frequent = []
+
+    # Step 9: Build category word lists (sorted)
+    determiner_list = sorted([w for w in function_word_counts if w in DETERMINERS])
+    preposition_list = sorted([w for w in function_word_counts if w in PREPOSITIONS])
+    conjunction_list = sorted([w for w in function_word_counts if w in CONJUNCTIONS])
+    pronoun_list = sorted([w for w in function_word_counts if w in PRONOUNS])
+    auxiliary_list = sorted([w for w in function_word_counts if w in AUXILIARIES])
+    particle_list = sorted([w for w in function_word_counts if w in PARTICLES])
+
+    # Step 10: Find overlapping words (words in multiple categories)
+    overlapping_words = []
+    overlapping_word_categories: dict[str, list[str]] = {}
+
+    for word in function_word_counts:
+        categories = []
+        if word in DETERMINERS:
+            categories.append("determiner")
+        if word in PREPOSITIONS:
+            categories.append("preposition")
+        if word in CONJUNCTIONS:
+            categories.append("conjunction")
+        if word in PRONOUNS:
+            categories.append("pronoun")
+        if word in AUXILIARIES:
+            categories.append("auxiliary")
+        if word in PARTICLES:
+            categories.append("particle")
+
+        if len(categories) > 1:
+            overlapping_words.append(word)
+            overlapping_word_categories[word] = categories
+
+    overlapping_words.sort()
+
+    # Step 11: Build metadata
+    metadata = {
+        "total_word_count": total_words,
+        "total_function_word_count": total_function_word_count,
+        "unique_function_word_count": unique_function_word_count,
+        "determiner_count": determiner_count,
+        "preposition_count": preposition_count,
+        "conjunction_count": conjunction_count,
+        "pronoun_count": pronoun_count,
+        "auxiliary_count": auxiliary_count,
+        "particle_count": particle_count,
+        "determiner_list": determiner_list,
+        "preposition_list": preposition_list,
+        "conjunction_list": conjunction_list,
+        "pronoun_list": pronoun_list,
+        "auxiliary_list": auxiliary_list,
+        "particle_list": particle_list,
+        "overlapping_words": overlapping_words,
+        "overlapping_word_categories": overlapping_word_categories,
+    }
+
+    # Step 12: Return result
+    return FunctionWordResult(
+        determiner_ratio=determiner_ratio,
+        preposition_ratio=preposition_ratio,
+        conjunction_ratio=conjunction_ratio,
+        pronoun_ratio=pronoun_ratio,
+        auxiliary_ratio=auxiliary_ratio,
+        particle_ratio=particle_ratio,
+        total_function_word_ratio=total_function_word_ratio,
+        function_word_diversity=function_word_diversity,
+        most_frequent_function_words=most_frequent,
+        least_frequent_function_words=least_frequent,
+        function_word_distribution=function_word_counts,
+        metadata=metadata,
+    )
pystylometry/lexical/hapax.py
CHANGED
@@ -1,9 +1,10 @@
 """Hapax legomena and related vocabulary richness metrics."""
 
+import math
 from collections import Counter
 
-from .._types import HapaxResult
-from .._utils import tokenize
+from .._types import HapaxLexiconResult, HapaxResult, LexiconCategories
+from .._utils import check_optional_dependency, tokenize
 
 
 def compute_hapax_ratios(text: str) -> HapaxResult:
@@ -30,10 +31,18 @@ def compute_hapax_ratios(text: str) -> HapaxResult:
     Returns:
         HapaxResult with counts, ratios, Sichel's S, Honoré's R, and metadata
 
+    Note: When all words are unique (V₁ = V), Honoré's R returns float('inf')
+        to indicate maximal vocabulary richness (division by zero case).
+
     Example:
-        >>>
-        >>>
+        >>> text = "The quick brown fox jumps over the lazy dog"
+        >>> result = compute_hapax_ratios(text)
+        >>> result.hapax_count  # Words appearing once
+        7
+        >>> result.dis_hapax_count  # Words appearing twice
+        1
         >>> print(f"Sichel's S: {result.sichel_s:.3f}")
+        Sichel's S: 0.125
     """
     tokens = tokenize(text.lower())
     N = len(tokens)  # noqa: N806
@@ -57,9 +66,18 @@ def compute_hapax_ratios(text: str) -> HapaxResult:
     V1 = sum(1 for count in freq_counter.values() if count == 1)  # noqa: N806
     V2 = sum(1 for count in freq_counter.values() if count == 2)  # noqa: N806
 
-    #
-
-
+    # Sichel's S: ratio of dislegomena to vocabulary size
+    # S = V₂ / V
+    sichel_s = V2 / V if V > 0 else 0.0
+
+    # Honoré's R: 100 × log(N) / (1 - V₁/V)
+    # R = 100 × log(N) / (1 - V₁/V)
+    # If V₁ = V (all words appear once), denominator is 0, return infinity
+    # This indicates maximal vocabulary richness (every word unique)
+    if V1 == V:
+        honore_r = float("inf")
+    else:
+        honore_r = 100 * math.log(N) / (1 - V1 / V)
 
     return HapaxResult(
         hapax_count=V1,
@@ -73,3 +91,132 @@ def compute_hapax_ratios(text: str) -> HapaxResult:
             "vocabulary_size": V,
         },
     )
+
+
+def compute_hapax_with_lexicon_analysis(text: str) -> HapaxLexiconResult:
+    """
+    Compute hapax legomena with lexicon-based categorization.
+
+    Extends standard hapax analysis by categorizing hapax legomena based on
+    presence in WordNet and British National Corpus (BNC). This distinguishes
+    between:
+
+    1. **Neologisms**: Words not in WordNet AND not in BNC
+       - True novel words or proper nouns
+       - High neologism ratio indicates vocabulary innovation
+
+    2. **Rare Words**: Words in BNC but not WordNet, or vice versa
+       - Technical jargon, specialized terminology
+       - Words at the edges of common vocabulary
+
+    3. **Common Words**: Words in both WordNet AND BNC
+       - Standard vocabulary that happens to appear once
+       - Low incidental usage of common words
+
+    This categorization is valuable for stylometric analysis:
+    - Authors with high neologism ratios are more innovative/creative
+    - Technical writing typically has higher rare word ratios
+    - Comparison of neologism vs common hapax distinguishes vocabulary
+      innovation from incidental word usage
+
+    Args:
+        text: Input text to analyze
+
+    Returns:
+        HapaxLexiconResult with standard hapax metrics and lexicon categorization
+
+    Raises:
+        ImportError: If bnc-lookup or wordnet-lookup packages are not installed
+
+    Example:
+        >>> text = "The xyzbot platform facilitates interdepartmental synergy."
+        >>> result = compute_hapax_with_lexicon_analysis(text)
+        >>> result.lexicon_analysis.neologisms
+        ['xyzbot', 'platform']
+        >>> result.lexicon_analysis.rare_words
+        ['facilitates', 'interdepartmental']
+        >>> result.lexicon_analysis.common_words
+        ['synergy']
+        >>> print(f"Neologism ratio: {result.lexicon_analysis.neologism_ratio:.2%}")
+        Neologism ratio: 40.00%
+
+    References:
+        British National Corpus: http://www.natcorp.ox.ac.uk/
+        WordNet: https://wordnet.princeton.edu/
+    """
+    # Check dependencies
+    check_optional_dependency("bnc_lookup", "lexical")
+    check_optional_dependency("wordnet_lookup", "lexical")
+
+    from bnc_lookup import is_bnc_term  # type: ignore[import-not-found]
+    from wordnet_lookup import is_wordnet_term  # type: ignore[import-not-found]
+
+    # First compute standard hapax metrics
+    hapax_result = compute_hapax_ratios(text)
+
+    # If no hapax legomena, return empty categorization
+    if hapax_result.hapax_count == 0:
+        return HapaxLexiconResult(
+            hapax_result=hapax_result,
+            lexicon_analysis=LexiconCategories(
+                neologisms=[],
+                rare_words=[],
+                common_words=[],
+                neologism_ratio=0.0,
+                rare_word_ratio=0.0,
+                metadata={"total_hapax": 0},
+            ),
+            metadata={"note": "No hapax legomena found"},
+        )
+
+    # Get tokens and identify hapax words
+    tokens = tokenize(text.lower())
+    freq_counter = Counter(tokens)
+    hapax_words = [word for word, count in freq_counter.items() if count == 1]
+
+    # Categorize each hapax word by lexicon presence
+    neologisms = []
+    rare_words = []
+    common_words = []
+
+    for word in hapax_words:
+        in_bnc = is_bnc_term(word)
+        in_wordnet = is_wordnet_term(word)
+
+        if not in_bnc and not in_wordnet:
+            # Not in either lexicon → true neologism
+            neologisms.append(word)
+        elif in_bnc and in_wordnet:
+            # In both lexicons → common word
+            common_words.append(word)
+        else:
+            # In one but not the other → rare word
+            rare_words.append(word)
+
+    # Calculate ratios
+    total_hapax = len(hapax_words)
+    neologism_ratio = len(neologisms) / total_hapax if total_hapax > 0 else 0.0
+    rare_word_ratio = len(rare_words) / total_hapax if total_hapax > 0 else 0.0
+    common_word_ratio = len(common_words) / total_hapax if total_hapax > 0 else 0.0
+
+    return HapaxLexiconResult(
+        hapax_result=hapax_result,
+        lexicon_analysis=LexiconCategories(
+            neologisms=sorted(neologisms),
+            rare_words=sorted(rare_words),
+            common_words=sorted(common_words),
+            neologism_ratio=neologism_ratio,
+            rare_word_ratio=rare_word_ratio,
+            metadata={
+                "total_hapax": total_hapax,
+                "neologism_count": len(neologisms),
+                "rare_word_count": len(rare_words),
+                "common_word_count": len(common_words),
+                "common_word_ratio": common_word_ratio,
+            },
+        ),
+        metadata={
+            "lexicons_used": ["bnc", "wordnet"],
+            "note": "Lexicon categorization based on BNC and WordNet presence",
+        },
+    )