pystylometry 0.1.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. pystylometry/__init__.py +1 -2
  2. pystylometry/_normalize.py +277 -0
  3. pystylometry/_types.py +1224 -2
  4. pystylometry/_utils.py +4 -0
  5. pystylometry/authorship/__init__.py +4 -0
  6. pystylometry/authorship/additional_methods.py +100 -0
  7. pystylometry/character/__init__.py +15 -0
  8. pystylometry/character/character_metrics.py +301 -0
  9. pystylometry/lexical/__init__.py +13 -6
  10. pystylometry/lexical/advanced_diversity.py +641 -0
  11. pystylometry/lexical/function_words.py +391 -0
  12. pystylometry/lexical/hapax.py +154 -7
  13. pystylometry/lexical/mtld.py +83 -7
  14. pystylometry/lexical/ttr.py +83 -0
  15. pystylometry/lexical/word_frequency_sophistication.py +581 -0
  16. pystylometry/lexical/yule.py +34 -7
  17. pystylometry/ngrams/__init__.py +2 -0
  18. pystylometry/ngrams/extended_ngrams.py +235 -0
  19. pystylometry/prosody/__init__.py +12 -0
  20. pystylometry/prosody/rhythm_prosody.py +53 -0
  21. pystylometry/readability/__init__.py +12 -0
  22. pystylometry/readability/additional_formulas.py +985 -0
  23. pystylometry/readability/ari.py +93 -17
  24. pystylometry/readability/coleman_liau.py +102 -9
  25. pystylometry/readability/complex_words.py +531 -0
  26. pystylometry/readability/flesch.py +59 -14
  27. pystylometry/readability/gunning_fog.py +194 -25
  28. pystylometry/readability/smog.py +31 -14
  29. pystylometry/readability/syllables.py +137 -30
  30. pystylometry/stylistic/__init__.py +20 -0
  31. pystylometry/stylistic/cohesion_coherence.py +45 -0
  32. pystylometry/stylistic/genre_register.py +45 -0
  33. pystylometry/stylistic/markers.py +131 -0
  34. pystylometry/stylistic/vocabulary_overlap.py +47 -0
  35. pystylometry/syntactic/__init__.py +4 -0
  36. pystylometry/syntactic/advanced_syntactic.py +432 -0
  37. pystylometry/syntactic/pos_ratios.py +104 -13
  38. pystylometry/syntactic/sentence_stats.py +57 -13
  39. pystylometry/syntactic/sentence_types.py +470 -0
  40. {pystylometry-0.1.0.dist-info → pystylometry-1.0.0.dist-info}/METADATA +49 -12
  41. pystylometry-1.0.0.dist-info/RECORD +46 -0
  42. {pystylometry-0.1.0.dist-info → pystylometry-1.0.0.dist-info}/WHEEL +1 -1
  43. pystylometry-0.1.0.dist-info/RECORD +0 -26
@@ -0,0 +1,391 @@
+ """Function word analysis for authorship attribution.
+
+ Function words (determiners, prepositions, conjunctions, pronouns, auxiliary
+ verbs) are highly frequent, content-independent words that authors use
+ subconsciously and consistently across different topics. This makes them
+ powerful markers for authorship attribution.
+
+ Related GitHub Issue:
+     #13 - Function Word Analysis
+     https://github.com/craigtrim/pystylometry/issues/13
+
+ Features implemented:
+ - Frequency profiles for all function word categories
+ - Ratios for specific grammatical categories
+ - Most/least frequently used function words
+ - Function word diversity metrics
+
+ Function word categories:
+ - Determiners: the, a, an, this, that, these, those, my, your, etc.
+ - Prepositions: in, on, at, by, for, with, from, to, of, etc.
+ - Conjunctions: and, but, or, nor, for, yet, so, because, although, etc.
+ - Pronouns: I, you, he, she, it, we, they, me, him, her, us, them, etc.
+ - Auxiliary verbs: be, have, do, can, will, shall, may, must, etc.
+ - Particles: up, down, out, off, over, away, back, etc.
+
+ References:
+     Mosteller, F., & Wallace, D. L. (1964). Inference and disputed authorship:
+         The Federalist. Addison-Wesley.
+     Burrows, J. (2002). 'Delta': A measure of stylistic difference and a guide
+         to likely authorship. Literary and Linguistic Computing, 17(3), 267-287.
+     Argamon, S., & Levitan, S. (2005). Measuring the usefulness of function
+         words for authorship attribution. ACH/ALLC.
+ """
+
+ from .._types import FunctionWordResult
+
+
+ # Function word lists for English
+ # GitHub Issue #13: https://github.com/craigtrim/pystylometry/issues/13
+ # These lists should be comprehensive and cover all major function word categories.
+ # Consider loading from external resource files for easier maintenance.
+
+ # Determiners (articles, demonstratives, possessives, quantifiers)
+ DETERMINERS = {
+     "the", "a", "an",  # Articles
+     "this", "that", "these", "those",  # Demonstratives
+     "my", "your", "his", "her", "its", "our", "their",  # Possessive determiners
+     "some", "any", "no", "every", "each", "either", "neither",  # Quantifiers
+     "much", "many", "more", "most", "few", "fewer", "less", "least",
+     "all", "both", "half", "several", "enough",
+ }
+
+ # Prepositions (locative, temporal, other)
+ PREPOSITIONS = {
+     "in", "on", "at", "by", "for", "with", "from", "to", "of",
+     "about", "above", "across", "after", "against", "along", "among",
+     "around", "as", "before", "behind", "below", "beneath", "beside",
+     "between", "beyond", "but", "concerning", "considering", "despite",
+     "down", "during", "except", "inside", "into", "like", "near",
+     "off", "onto", "out", "outside", "over", "past", "regarding",
+     "since", "through", "throughout", "till", "toward", "under",
+     "underneath", "until", "up", "upon", "via", "within", "without",
+ }
+
+ # Conjunctions (coordinating, subordinating, correlative)
+ CONJUNCTIONS = {
+     # Coordinating
+     "and", "but", "or", "nor", "for", "yet", "so",
+     # Subordinating
+     "although", "because", "since", "unless", "while", "if", "when",
+     "where", "after", "before", "once", "until", "as", "though",
+     "even", "whereas", "wherever", "whenever",
+     # Correlative components
+     "either", "neither", "both", "whether",
+ }
+
+ # Pronouns (personal, possessive, reflexive, demonstrative, relative, indefinite)
+ PRONOUNS = {
+     # Personal (subject)
+     "i", "you", "he", "she", "it", "we", "they",
+     # Personal (object)
+     "me", "him", "her", "us", "them",
+     # Possessive
+     "mine", "yours", "his", "hers", "its", "ours", "theirs",
+     # Reflexive
+     "myself", "yourself", "himself", "herself", "itself",
+     "ourselves", "yourselves", "themselves",
+     # Demonstrative
+     "this", "that", "these", "those",
+     # Relative
+     "who", "whom", "whose", "which", "that",
+     # Indefinite
+     "anybody", "anyone", "anything", "everybody", "everyone",
+     "everything", "nobody", "no one", "nothing", "somebody",
+     "someone", "something", "one",
+ }
+
+ # Auxiliary verbs (modal, primary)
+ AUXILIARIES = {
+     # Modals
+     "can", "could", "may", "might", "must", "shall", "should",
+     "will", "would", "ought",
+     # Primary auxiliaries (be, have, do)
+     "am", "is", "are", "was", "were", "be", "being", "been",
+     "have", "has", "had", "having",
+     "do", "does", "did", "doing",
+ }
+
+ # Particles (often used with phrasal verbs)
+ PARTICLES = {
+     "up", "down", "out", "off", "over", "in", "away",
+     "back", "on", "along", "forth", "apart", "aside",
+ }
+
+
+ def compute_function_words(text: str) -> FunctionWordResult:
+     """
+     Compute function word frequency profiles for authorship analysis.
+
+     Function words are closed-class words (determiners, prepositions,
+     conjunctions, pronouns, auxiliaries) that authors use largely
+     subconsciously and consistently. Their frequency patterns are
+     powerful authorship markers because they're independent of topic.
+
+     Related GitHub Issue:
+         #13 - Function Word Analysis
+         https://github.com/craigtrim/pystylometry/issues/13
+
+     Why function words matter for authorship:
+         1. Topic-independent: Used consistently across different subjects
+         2. Subconscious usage: Authors don't deliberately vary their use
+         3. High frequency: Appear often enough for reliable statistics
+         4. Stable over time: Authors' function word patterns remain consistent
+         5. Discriminative power: Different authors show distinct patterns
+
+     Classic example: Mosteller & Wallace (1964) used function word
+     frequencies to resolve the disputed authorship of the Federalist Papers,
+     distinguishing between Hamilton and Madison based on their use of
+     "while" vs. "whilst", "upon" vs. "on", etc.
+
+     Args:
+         text: Input text to analyze. Should be at least a few hundred words
+             for reliable statistics. Function word analysis works best with
+             longer texts (1000+ words) where frequency patterns stabilize.
+
+     Returns:
+         FunctionWordResult containing:
+         - Ratios for each function word category (per total words)
+         - Total function word ratio
+         - Function word diversity (unique / total function words)
+         - Most/least frequent function words with counts
+         - Full distribution of all function words used
+         - Metadata with category-specific counts
+
+     Example:
+         >>> result = compute_function_words("Sample text for analysis...")
+         >>> print(f"Determiner ratio: {result.determiner_ratio:.3f}")
+         Determiner ratio: 0.156
+         >>> print(f"Preposition ratio: {result.preposition_ratio:.3f}")
+         Preposition ratio: 0.112
+         >>> print(f"Total function words: {result.total_function_word_ratio:.3f}")
+         Total function words: 0.487
+         >>> print(f"Most frequent: {result.most_frequent_function_words[:3]}")
+         Most frequent: [('the', 45), ('of', 32), ('to', 28)]
+
+         >>> # Authorship comparison example
+         >>> text1 = "Text by author 1..."
+         >>> text2 = "Text by author 2..."
+         >>> r1 = compute_function_words(text1)
+         >>> r2 = compute_function_words(text2)
+         >>> # Compare determiner ratios, preposition preferences, etc.
+
+     Note:
+         - Case-insensitive matching (all text lowercased for matching)
+         - Tokenization by whitespace and punctuation
+         - Words must match exactly (no stemming or lemmatization)
+         - Multi-word function words like "no one" are handled as separate tokens
+         - Empty or very short texts may have unreliable ratios
+         - Some words appear in multiple categories (e.g., "that" is both
+           determiner and pronoun) - each category is counted independently
+     """
+     # Step 1: Create union set of all function words (for total ratio calculation)
+     ALL_FUNCTION_WORDS = (
+         DETERMINERS
+         | PREPOSITIONS
+         | CONJUNCTIONS
+         | PRONOUNS
+         | AUXILIARIES
+         | PARTICLES
+     )
+
+     # Step 2: Tokenize text (lowercase, split on whitespace, strip punctuation)
+     if not text or not text.strip():
+         # Handle empty text edge case
+         return FunctionWordResult(
+             determiner_ratio=0.0,
+             preposition_ratio=0.0,
+             conjunction_ratio=0.0,
+             pronoun_ratio=0.0,
+             auxiliary_ratio=0.0,
+             particle_ratio=0.0,
+             total_function_word_ratio=0.0,
+             function_word_diversity=0.0,
+             most_frequent_function_words=[],
+             least_frequent_function_words=[],
+             function_word_distribution={},
+             metadata={
+                 "total_word_count": 0,
+                 "total_function_word_count": 0,
+                 "unique_function_word_count": 0,
+                 "determiner_count": 0,
+                 "preposition_count": 0,
+                 "conjunction_count": 0,
+                 "pronoun_count": 0,
+                 "auxiliary_count": 0,
+                 "particle_count": 0,
+                 "determiner_list": [],
+                 "preposition_list": [],
+                 "conjunction_list": [],
+                 "pronoun_list": [],
+                 "auxiliary_list": [],
+                 "particle_list": [],
+                 "overlapping_words": [],
+                 "overlapping_word_categories": {},
+             },
+         )
+
+     # Lowercase entire text
+     text_lower = text.lower()
+
+     # Split on whitespace
+     raw_tokens = text_lower.split()
+
+     # Comprehensive punctuation set for stripping
+     PUNCTUATION = set(
+         ".,!?;:'\"()[]{}/-—–…*&@#$%^~`\\|<>«»„“”‘’‚‛"
+     )
+
+     # Strip punctuation from each token
+     tokens = []
+     for token in raw_tokens:
+         # Strip leading and trailing punctuation
+         clean_token = token.strip("".join(PUNCTUATION))
+         if clean_token:  # Only add non-empty tokens
+             tokens.append(clean_token)
+
+     total_words = len(tokens)
+
+     # Step 3: Initialize counters for each category
+     determiner_count = 0
+     preposition_count = 0
+     conjunction_count = 0
+     pronoun_count = 0
+     auxiliary_count = 0
+     particle_count = 0
+
+     # Step 4: Count tokens in each category (overlapping allowed)
+     for token in tokens:
+         if token in DETERMINERS:
+             determiner_count += 1
+         if token in PREPOSITIONS:
+             preposition_count += 1
+         if token in CONJUNCTIONS:
+             conjunction_count += 1
+         if token in PRONOUNS:
+             pronoun_count += 1
+         if token in AUXILIARIES:
+             auxiliary_count += 1
+         if token in PARTICLES:
+             particle_count += 1
+
+     # Step 5: Build distribution (count each function word only once per token)
+     function_word_counts: dict[str, int] = {}
+     for token in tokens:
+         if token in ALL_FUNCTION_WORDS:
+             function_word_counts[token] = function_word_counts.get(token, 0) + 1
+
+     # Step 6: Calculate ratios
+     if total_words > 0:
+         determiner_ratio = determiner_count / total_words
+         preposition_ratio = preposition_count / total_words
+         conjunction_ratio = conjunction_count / total_words
+         pronoun_ratio = pronoun_count / total_words
+         auxiliary_ratio = auxiliary_count / total_words
+         particle_ratio = particle_count / total_words
+
+         total_function_word_count = sum(function_word_counts.values())
+         total_function_word_ratio = total_function_word_count / total_words
+     else:
+         determiner_ratio = 0.0
+         preposition_ratio = 0.0
+         conjunction_ratio = 0.0
+         pronoun_ratio = 0.0
+         auxiliary_ratio = 0.0
+         particle_ratio = 0.0
+         total_function_word_count = 0
+         total_function_word_ratio = 0.0
+
+     # Step 7: Calculate diversity
+     unique_function_word_count = len(function_word_counts)
+     if total_function_word_count > 0:
+         function_word_diversity = unique_function_word_count / total_function_word_count
+     else:
+         function_word_diversity = 0.0
+
+     # Step 8: Find most/least frequent function words
+     if function_word_counts:
+         # Sort by count descending
+         sorted_by_count = sorted(
+             function_word_counts.items(), key=lambda x: x[1], reverse=True
+         )
+
+         # Top 10 most frequent
+         most_frequent = sorted_by_count[:10]
+
+         # Bottom 10 least frequent (reverse to get ascending order)
+         least_frequent = sorted_by_count[-10:]
+         least_frequent.reverse()
+     else:
+         most_frequent = []
+         least_frequent = []
+
+     # Step 9: Build category word lists (sorted)
+     determiner_list = sorted([w for w in function_word_counts if w in DETERMINERS])
+     preposition_list = sorted([w for w in function_word_counts if w in PREPOSITIONS])
+     conjunction_list = sorted([w for w in function_word_counts if w in CONJUNCTIONS])
+     pronoun_list = sorted([w for w in function_word_counts if w in PRONOUNS])
+     auxiliary_list = sorted([w for w in function_word_counts if w in AUXILIARIES])
+     particle_list = sorted([w for w in function_word_counts if w in PARTICLES])
+
+     # Step 10: Find overlapping words (words in multiple categories)
+     overlapping_words = []
+     overlapping_word_categories: dict[str, list[str]] = {}
+
+     for word in function_word_counts:
+         categories = []
+         if word in DETERMINERS:
+             categories.append("determiner")
+         if word in PREPOSITIONS:
+             categories.append("preposition")
+         if word in CONJUNCTIONS:
+             categories.append("conjunction")
+         if word in PRONOUNS:
+             categories.append("pronoun")
+         if word in AUXILIARIES:
+             categories.append("auxiliary")
+         if word in PARTICLES:
+             categories.append("particle")
+
+         if len(categories) > 1:
+             overlapping_words.append(word)
+             overlapping_word_categories[word] = categories
+
+     overlapping_words.sort()
+
+     # Step 11: Build metadata
+     metadata = {
+         "total_word_count": total_words,
+         "total_function_word_count": total_function_word_count,
+         "unique_function_word_count": unique_function_word_count,
+         "determiner_count": determiner_count,
+         "preposition_count": preposition_count,
+         "conjunction_count": conjunction_count,
+         "pronoun_count": pronoun_count,
+         "auxiliary_count": auxiliary_count,
+         "particle_count": particle_count,
+         "determiner_list": determiner_list,
+         "preposition_list": preposition_list,
+         "conjunction_list": conjunction_list,
+         "pronoun_list": pronoun_list,
+         "auxiliary_list": auxiliary_list,
+         "particle_list": particle_list,
+         "overlapping_words": overlapping_words,
+         "overlapping_word_categories": overlapping_word_categories,
+     }
+
+     # Step 12: Return result
+     return FunctionWordResult(
+         determiner_ratio=determiner_ratio,
+         preposition_ratio=preposition_ratio,
+         conjunction_ratio=conjunction_ratio,
+         pronoun_ratio=pronoun_ratio,
+         auxiliary_ratio=auxiliary_ratio,
+         particle_ratio=particle_ratio,
+         total_function_word_ratio=total_function_word_ratio,
+         function_word_diversity=function_word_diversity,
+         most_frequent_function_words=most_frequent,
+         least_frequent_function_words=least_frequent,
+         function_word_distribution=function_word_counts,
+         metadata=metadata,
+     )
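
An aside between the two file diffs: the docstring above sketches an "authorship comparison example" and cites Burrows (2002), so a minimal comparison sketch may help. The compare_function_word_profiles helper and its distance choice (mean absolute difference over the six category ratios) are illustrative assumptions, not part of pystylometry's API; only the module path and the FunctionWordResult attributes come from the diff itself.

    # Illustrative only: compare two texts by their function word category ratios.
    # The helper below is hypothetical; compute_function_words and its attributes
    # are taken from the pystylometry/lexical/function_words.py shown above.
    from pystylometry.lexical.function_words import compute_function_words

    def compare_function_word_profiles(text_a: str, text_b: str) -> float:
        """Mean absolute difference across the six category ratios (0 = identical)."""
        ra = compute_function_words(text_a)
        rb = compute_function_words(text_b)
        pairs = [
            (ra.determiner_ratio, rb.determiner_ratio),
            (ra.preposition_ratio, rb.preposition_ratio),
            (ra.conjunction_ratio, rb.conjunction_ratio),
            (ra.pronoun_ratio, rb.pronoun_ratio),
            (ra.auxiliary_ratio, rb.auxiliary_ratio),
            (ra.particle_ratio, rb.particle_ratio),
        ]
        return sum(abs(a - b) for a, b in pairs) / len(pairs)

    # Smaller values suggest more similar function word usage between the texts.
    distance = compare_function_word_profiles("Text by author 1...", "Text by author 2...")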
@@ -1,9 +1,10 @@
  """Hapax legomena and related vocabulary richness metrics."""

+ import math
  from collections import Counter

- from .._types import HapaxResult
- from .._utils import tokenize
+ from .._types import HapaxLexiconResult, HapaxResult, LexiconCategories
+ from .._utils import check_optional_dependency, tokenize


  def compute_hapax_ratios(text: str) -> HapaxResult:
@@ -30,10 +31,18 @@ def compute_hapax_ratios(text: str) -> HapaxResult:
      Returns:
          HapaxResult with counts, ratios, Sichel's S, Honoré's R, and metadata

+     Note: When all words are unique (V₁ = V), Honoré's R returns float('inf')
+         to indicate maximal vocabulary richness (division by zero case).
+
      Example:
-         >>> result = compute_hapax_ratios("The quick brown fox jumps over the lazy dog.")
-         >>> print(f"Hapax ratio: {result.hapax_ratio:.3f}")
+         >>> text = "The quick brown fox jumps over the lazy dog"
+         >>> result = compute_hapax_ratios(text)
+         >>> result.hapax_count  # Words appearing once
+         7
+         >>> result.dis_hapax_count  # Words appearing twice
+         1
          >>> print(f"Sichel's S: {result.sichel_s:.3f}")
+         Sichel's S: 0.125
      """
      tokens = tokenize(text.lower())
      N = len(tokens)  # noqa: N806
@@ -57,9 +66,18 @@ def compute_hapax_ratios(text: str) -> HapaxResult:
      V1 = sum(1 for count in freq_counter.values() if count == 1)  # noqa: N806
      V2 = sum(1 for count in freq_counter.values() if count == 2)  # noqa: N806

-     # TODO: Implement Sichel's S and Honoré's R
-     sichel_s = 0.0  # Placeholder
-     honore_r = 0.0  # Placeholder
+     # Sichel's S: ratio of dislegomena to vocabulary size
+     # S = V₂ / V
+     sichel_s = V2 / V if V > 0 else 0.0
+
+     # Honoré's R: 100 × log(N) / (1 - V₁/V)
+     # R = 100 × log(N) / (1 - V₁/V)
+     # If V₁ = V (all words appear once), denominator is 0, return infinity
+     # This indicates maximal vocabulary richness (every word unique)
+     if V1 == V:
+         honore_r = float("inf")
+     else:
+         honore_r = 100 * math.log(N) / (1 - V1 / V)

      return HapaxResult(
          hapax_count=V1,
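
Between these hunks, a quick hand-check of the two formulas just added may be useful. The sketch below re-derives the docstring's doctest values (7 hapax, 1 dislegomenon, S = 0.125) using a naive whitespace split and collections.Counter rather than pystylometry's own tokenize(), so it only assumes the standard library.

    # Hand-check of Sichel's S and Honoré's R on the docstring example sentence.
    import math
    from collections import Counter

    tokens = "the quick brown fox jumps over the lazy dog".split()
    freq = Counter(tokens)

    N = len(tokens)                               # 9 tokens
    V = len(freq)                                 # 8 distinct words
    V1 = sum(1 for c in freq.values() if c == 1)  # 7 hapax legomena
    V2 = sum(1 for c in freq.values() if c == 2)  # 1 dislegomenon ("the")

    sichel_s = V2 / V                             # 1 / 8 = 0.125
    honore_r = 100 * math.log(N) / (1 - V1 / V)   # 100 * ln(9) / 0.125 ≈ 1757.8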
@@ -73,3 +91,132 @@ def compute_hapax_ratios(text: str) -> HapaxResult:
              "vocabulary_size": V,
          },
      )
+
+
+ def compute_hapax_with_lexicon_analysis(text: str) -> HapaxLexiconResult:
+     """
+     Compute hapax legomena with lexicon-based categorization.
+
+     Extends standard hapax analysis by categorizing hapax legomena based on
+     presence in WordNet and British National Corpus (BNC). This distinguishes
+     between:
+
+     1. **Neologisms**: Words not in WordNet AND not in BNC
+        - True novel words or proper nouns
+        - High neologism ratio indicates vocabulary innovation
+
+     2. **Rare Words**: Words in BNC but not WordNet, or vice versa
+        - Technical jargon, specialized terminology
+        - Words at the edges of common vocabulary
+
+     3. **Common Words**: Words in both WordNet AND BNC
+        - Standard vocabulary that happens to appear once
+        - Usually incidental usage rather than a stylistic signal
+
+     This categorization is valuable for stylometric analysis:
+     - Authors with high neologism ratios are more innovative/creative
+     - Technical writing typically has higher rare word ratios
+     - Comparing neologism vs. common hapax distinguishes vocabulary
+       innovation from incidental word usage
+
+     Args:
+         text: Input text to analyze
+
+     Returns:
+         HapaxLexiconResult with standard hapax metrics and lexicon categorization
+
+     Raises:
+         ImportError: If bnc-lookup or wordnet-lookup packages are not installed
+
+     Example:
+         >>> text = "The xyzbot platform facilitates interdepartmental synergy."
+         >>> result = compute_hapax_with_lexicon_analysis(text)
+         >>> result.lexicon_analysis.neologisms
+         ['xyzbot', 'platform']
+         >>> result.lexicon_analysis.rare_words
+         ['facilitates', 'interdepartmental']
+         >>> result.lexicon_analysis.common_words
+         ['synergy']
+         >>> print(f"Neologism ratio: {result.lexicon_analysis.neologism_ratio:.2%}")
+         Neologism ratio: 40.00%
+
+     References:
+         British National Corpus: http://www.natcorp.ox.ac.uk/
+         WordNet: https://wordnet.princeton.edu/
+     """
+     # Check dependencies
+     check_optional_dependency("bnc_lookup", "lexical")
+     check_optional_dependency("wordnet_lookup", "lexical")
+
+     from bnc_lookup import is_bnc_term  # type: ignore[import-not-found]
+     from wordnet_lookup import is_wordnet_term  # type: ignore[import-not-found]
+
+     # First compute standard hapax metrics
+     hapax_result = compute_hapax_ratios(text)
+
+     # If no hapax legomena, return empty categorization
+     if hapax_result.hapax_count == 0:
+         return HapaxLexiconResult(
+             hapax_result=hapax_result,
+             lexicon_analysis=LexiconCategories(
+                 neologisms=[],
+                 rare_words=[],
+                 common_words=[],
+                 neologism_ratio=0.0,
+                 rare_word_ratio=0.0,
+                 metadata={"total_hapax": 0},
+             ),
+             metadata={"note": "No hapax legomena found"},
+         )
+
+     # Get tokens and identify hapax words
+     tokens = tokenize(text.lower())
+     freq_counter = Counter(tokens)
+     hapax_words = [word for word, count in freq_counter.items() if count == 1]
+
+     # Categorize each hapax word by lexicon presence
+     neologisms = []
+     rare_words = []
+     common_words = []
+
+     for word in hapax_words:
+         in_bnc = is_bnc_term(word)
+         in_wordnet = is_wordnet_term(word)
+
+         if not in_bnc and not in_wordnet:
+             # Not in either lexicon → true neologism
+             neologisms.append(word)
+         elif in_bnc and in_wordnet:
+             # In both lexicons → common word
+             common_words.append(word)
+         else:
+             # In one but not the other → rare word
+             rare_words.append(word)
+
+     # Calculate ratios
+     total_hapax = len(hapax_words)
+     neologism_ratio = len(neologisms) / total_hapax if total_hapax > 0 else 0.0
+     rare_word_ratio = len(rare_words) / total_hapax if total_hapax > 0 else 0.0
+     common_word_ratio = len(common_words) / total_hapax if total_hapax > 0 else 0.0
+
+     return HapaxLexiconResult(
+         hapax_result=hapax_result,
+         lexicon_analysis=LexiconCategories(
+             neologisms=sorted(neologisms),
+             rare_words=sorted(rare_words),
+             common_words=sorted(common_words),
+             neologism_ratio=neologism_ratio,
+             rare_word_ratio=rare_word_ratio,
+             metadata={
+                 "total_hapax": total_hapax,
+                 "neologism_count": len(neologisms),
+                 "rare_word_count": len(rare_words),
+                 "common_word_count": len(common_words),
+                 "common_word_ratio": common_word_ratio,
+             },
+         ),
+         metadata={
+             "lexicons_used": ["bnc", "wordnet"],
+             "note": "Lexicon categorization based on BNC and WordNet presence",
+         },
+     )
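
Since compute_hapax_with_lexicon_analysis raises ImportError when the optional bnc-lookup / wordnet-lookup packages are absent (per the docstring above), a caller-side fallback pattern may be worth sketching. This is a minimal sketch, assuming both functions are importable from the module shown in this diff; whether they are re-exported at the package level is not shown here.

    # Hypothetical usage pattern: fall back to plain hapax metrics when the
    # optional lexicon packages are not installed.
    from pystylometry.lexical.hapax import (
        compute_hapax_ratios,
        compute_hapax_with_lexicon_analysis,
    )

    def hapax_profile(text: str):
        try:
            return compute_hapax_with_lexicon_analysis(text)
        except ImportError:
            # bnc-lookup / wordnet-lookup missing: return the basic HapaxResult instead.
            return compute_hapax_ratios(text)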