pystylometry 0.1.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/__init__.py +30 -5
- pystylometry/_normalize.py +277 -0
- pystylometry/_types.py +1954 -28
- pystylometry/_utils.py +4 -0
- pystylometry/authorship/__init__.py +26 -1
- pystylometry/authorship/additional_methods.py +75 -0
- pystylometry/authorship/kilgarriff.py +347 -0
- pystylometry/character/__init__.py +15 -0
- pystylometry/character/character_metrics.py +389 -0
- pystylometry/cli.py +427 -0
- pystylometry/consistency/__init__.py +57 -0
- pystylometry/consistency/_thresholds.py +162 -0
- pystylometry/consistency/drift.py +549 -0
- pystylometry/dialect/__init__.py +65 -0
- pystylometry/dialect/_data/dialect_markers.json +1134 -0
- pystylometry/dialect/_loader.py +360 -0
- pystylometry/dialect/detector.py +533 -0
- pystylometry/lexical/__init__.py +13 -6
- pystylometry/lexical/advanced_diversity.py +680 -0
- pystylometry/lexical/function_words.py +590 -0
- pystylometry/lexical/hapax.py +310 -33
- pystylometry/lexical/mtld.py +180 -22
- pystylometry/lexical/ttr.py +149 -0
- pystylometry/lexical/word_frequency_sophistication.py +1805 -0
- pystylometry/lexical/yule.py +142 -29
- pystylometry/ngrams/__init__.py +2 -0
- pystylometry/ngrams/entropy.py +150 -49
- pystylometry/ngrams/extended_ngrams.py +235 -0
- pystylometry/prosody/__init__.py +12 -0
- pystylometry/prosody/rhythm_prosody.py +53 -0
- pystylometry/readability/__init__.py +12 -0
- pystylometry/readability/additional_formulas.py +2110 -0
- pystylometry/readability/ari.py +173 -35
- pystylometry/readability/coleman_liau.py +150 -30
- pystylometry/readability/complex_words.py +531 -0
- pystylometry/readability/flesch.py +181 -32
- pystylometry/readability/gunning_fog.py +208 -35
- pystylometry/readability/smog.py +126 -28
- pystylometry/readability/syllables.py +137 -30
- pystylometry/stylistic/__init__.py +20 -0
- pystylometry/stylistic/cohesion_coherence.py +45 -0
- pystylometry/stylistic/genre_register.py +45 -0
- pystylometry/stylistic/markers.py +131 -0
- pystylometry/stylistic/vocabulary_overlap.py +47 -0
- pystylometry/syntactic/__init__.py +4 -0
- pystylometry/syntactic/advanced_syntactic.py +494 -0
- pystylometry/syntactic/pos_ratios.py +172 -17
- pystylometry/syntactic/sentence_stats.py +105 -18
- pystylometry/syntactic/sentence_types.py +526 -0
- pystylometry/viz/__init__.py +71 -0
- pystylometry/viz/drift.py +589 -0
- pystylometry/viz/jsx/__init__.py +31 -0
- pystylometry/viz/jsx/_base.py +144 -0
- pystylometry/viz/jsx/report.py +677 -0
- pystylometry/viz/jsx/timeline.py +716 -0
- pystylometry/viz/jsx/viewer.py +1032 -0
- {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/METADATA +49 -9
- pystylometry-1.1.0.dist-info/RECORD +63 -0
- pystylometry-1.1.0.dist-info/entry_points.txt +4 -0
- pystylometry-0.1.0.dist-info/RECORD +0 -26
- {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/WHEEL +0 -0
pystylometry/readability/complex_words.py

@@ -0,0 +1,531 @@
"""Complex word detection for readability metrics with NLP enhancement.

This module implements complex word detection for the Gunning Fog Index,
addressing the issues raised in GitHub PR #4:
https://github.com/craigtrim/pystylometry/pull/4

Background:
-----------
The Gunning Fog Index (Gunning, 1952) defines complex words as:
    Words with 3+ syllables, EXCLUDING:
    1. Proper nouns (names, places, organizations)
    2. Compound words (hyphenated)
    3. Common verb forms (-es, -ed, -ing endings)

Reference:
    Gunning, R. (1952). The Technique of Clear Writing.
    McGraw-Hill, New York.

Issues Addressed from PR #4:
-----------------------------
Issue #1: Complex Word Detection Heuristics Are Unreliable
- OLD: Capitalization heuristic for proper nouns (fails on acronyms, all-caps)
- NEW: spaCy POS tagging (PROPN tag) for accurate proper noun detection

- OLD: Regex-based suffix stripping (-es, -ed, -ing only)
- NEW: spaCy lemmatization for true morphological analysis

Issue #3: Hyphenated Words Blanket Exclusion
- Hyphenated compounds are excluded from the complex-word count wholesale,
  per Gunning's unqualified rule "Do not count compound words"
  (see _is_hyphenated_complex below).
  e.g., "well-known" → not complex
        "self-education" → not complex

Dual-Mode Design:
-----------------
**Enhanced Mode** (when spaCy available):
- Uses Part-of-Speech (POS) tagging for proper noun detection
- Uses lemmatization for morphological analysis
- More accurate, handles edge cases (acronyms, irregular verbs)

**Basic Mode** (fallback when spaCy unavailable):
- Uses capitalization heuristic for proper nouns
- Uses simple suffix stripping for inflections
- Less accurate but requires no external dependencies

This dual-mode approach maintains backward compatibility while providing
enhanced accuracy when optional dependencies are available.
"""
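# For context: the count produced by this module feeds the Gunning Fog Index,
# conventionally computed as
#
#     fog = 0.4 * ((words / sentences) + 100 * (complex_words / words))
#
# (Gunning, 1952). The formula itself is implemented in gunning_fog.py
# (see the file list above).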

import logging
from typing import Optional

from .syllables import count_syllables

# Set up logging
_logger = logging.getLogger(__name__)

# Try to import spaCy (optional dependency)
# spaCy is in the [readability] extras group in pyproject.toml
try:
    import spacy

    _SPACY_AVAILABLE = True
except ImportError:
    _SPACY_AVAILABLE = False

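# Capability probe (sketch): callers that want to know up front which mode
# process_text_for_complex_words() will run in can pair the flag above with a
# guarded model load. _probe_mode is a hypothetical helper, not part of this
# module:
#
#   def _probe_mode(model: str = "en_core_web_sm") -> str:
#       if not _SPACY_AVAILABLE:
#           return "basic"
#       try:
#           spacy.load(model)
#           return "enhanced"
#       except OSError:
#           return "basic"
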
def is_complex_word(
    word: str,
    syllable_count: int,
    use_spacy: bool = True,
    pos: Optional[str] = None,
    lemma: Optional[str] = None,
    is_sentence_start: bool = False,
) -> bool:
    """
    Determine if a word is complex according to Gunning Fog criteria.

    Implementation of Gunning's (1952) complex word definition with
    NLP enhancements to address PR #4 issues.

    Gunning's Original Criteria:
    -----------------------------
    A word is complex if it has 3+ syllables AND is not:
    1. A proper noun (names, places, organizations)
    2. A compound word (hyphenated)
    3. A common verb form ending in -es, -ed, or -ing

    Reference:
        Gunning, R. (1952). The Technique of Clear Writing. McGraw-Hill.
        Pages 38-39: "Words of three or more syllables are hard words"

    Enhancement Rationale (PR #4):
    -------------------------------
    **Issue #1 - Proper Noun Detection:**

    OLD METHOD (Capitalization Heuristic):
    - if word[0].isupper() and not is_sentence_start: return False
    - FAILS on: "NASA" (all-caps), "iPhone" (mixed case), "O'Brien" (apostrophe)
    - FALSE POSITIVES: Excludes acronyms that ARE complex

    NEW METHOD (POS Tagging):
    - Uses spaCy's PROPN (proper noun) POS tag
    - ACCURATE: Correctly identifies proper nouns via linguistic analysis
    - HANDLES: "NASA", "iPhone", "O'Brien", "McDonald's", etc.

    **Issue #1 - Inflection Handling:**

    OLD METHOD (Suffix Stripping):
    - Strip -es/-ed/-ing, recount syllables
    - FAILS on: "running" (strips to "runn", not "run")
    - INCOMPLETE: Misses -s, -ly, -er, -est, -tion, -ness, etc.

    NEW METHOD (Lemmatization):
    - Uses spaCy's lemmatizer for true morphological analysis
    - ACCURATE: "companies" → "company", "running" → "run"
    - COMPLETE: Handles all inflections, irregular forms

    **Issue #3 - Hyphenated Words:**

    Hyphenated compounds are excluded wholesale, following Gunning's
    unqualified rule "Do not count compound words"
    (see _is_hyphenated_complex):
    - "well-known" → not complex
    - "self-education" → not complex

    Args:
        word: The word to check
        syllable_count: Number of syllables in the word
        use_spacy: Whether to use spaCy features if available
        pos: Part-of-speech tag from spaCy (e.g., "PROPN", "VERB", "ADJ")
        lemma: Lemmatized form from spaCy (e.g., "running" → "run")
        is_sentence_start: Whether word appears at start of sentence
            (affects capitalization heuristic in basic mode)

    Returns:
        True if word is considered complex, False otherwise

    Example:
        >>> # Enhanced mode (with spaCy POS tagging and lemmatization)
        >>> is_complex_word("beautiful", 3, use_spacy=True, pos="ADJ", lemma="beautiful")
        True  # 3 syllables, not a proper noun or inflection

        >>> is_complex_word("California", 4, use_spacy=True, pos="PROPN", lemma="California")
        False  # Proper noun excluded (PROPN tag)

        >>> is_complex_word("companies", 3, use_spacy=True, pos="NOUN", lemma="company")
        True  # Lemma "company" has 3 syllables, still complex

        >>> is_complex_word("running", 2, use_spacy=True, pos="VERB", lemma="run")
        False  # Lemma "run" has 1 syllable, not complex

        >>> # Basic mode (without spaCy, uses heuristics)
        >>> is_complex_word("beautiful", 3, use_spacy=False)
        True  # 3 syllables, no capitalization

        >>> is_complex_word("California", 4, use_spacy=False, is_sentence_start=False)
        False  # Capitalized mid-sentence, excluded as proper noun
    """
    # CRITERION 1: Must have 3+ syllables to be complex
    # Reference: Gunning (1952), p. 38: "Words of three or more syllables"
    if syllable_count < 3:
        return False

    # CRITERION 2: Exclude compound words (hyphenated)
    # Reference: Gunning (1952), p. 39: "Do not count compound words"
    # PR #4 Issue #3: hyphenated words are excluded wholesale
    # (see _is_hyphenated_complex)
    if "-" in word:
        return _is_hyphenated_complex(word)

    # NLP-ENHANCED MODE (when spaCy available and used)
    # Addresses PR #4 Issue #1: Use linguistic analysis instead of heuristics
    if use_spacy and pos and lemma:
        # CRITERION 3a: Exclude proper nouns (via POS tagging)
        # Reference: Gunning (1952), p. 39: "Do not count proper names"
        # PR #4 Fix: Use PROPN tag instead of capitalization heuristic
        if pos == "PROPN":
            return False

        # CRITERION 3b: Exclude common inflections (via lemmatization)
        # Reference: Gunning (1952), p. 39: "Do not count -ed, -es, -ing endings"
        # PR #4 Fix: Use lemmatization for accurate morphological analysis
        # Example: "running" (2 syl) → lemma "run" (1 syl) → not complex
        #          "companies" (3 syl) → lemma "company" (3 syl) → still complex
        #
        # Note on -ly adverbs:
        # --------------------
        # spaCy's lemmatizer does NOT strip -ly suffixes from adverbs because -ly
        # is a derivational morpheme (creates new words), not an inflectional one
        # (grammatical variation). Gunning (1952) explicitly mentioned "-ed, -es,
        # -ing" (all inflectional) but did NOT mention -ly. We follow Gunning's
        # specification.
        lemma_syllables = count_syllables(lemma)
        if lemma_syllables < 3:
            return False

        return True

    # BASIC MODE (fallback when spaCy unavailable)
    # Uses heuristics as approximation of Gunning's criteria
    # Less accurate but requires no external dependencies
    else:
        # CRITERION 3a: Exclude proper nouns (via capitalization heuristic)
        # LIMITATION: Fails on acronyms (NASA), mixed case (iPhone), all-caps text
        if not is_sentence_start and word:
            # All-caps check: Likely acronym (NASA, API, HTTP)
            # LIMITATION: These may actually BE complex, but Gunning excluded
            # proper nouns
            if word.isupper() and len(word) > 1:
                return False

            # Title case check: Likely proper noun (California, Massachusetts)
            # LIMITATION: Also excludes capitalized common nouns mid-sentence
            if word[0].isupper() and len(word) > 1 and word[1:].islower():
                return False

        # CRITERION 3b: Exclude common inflections (via suffix stripping)
        # LIMITATION: Only handles -es, -ed, -ing; misses irregular forms
        stripped = _strip_common_inflections(word)
        if stripped != word:
            stripped_syllables = count_syllables(stripped)
            if stripped_syllables < 3:
                return False

        return True

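# A minimal usage sketch of both modes (illustrative only; the enhanced-mode
# outcomes also assume count_syllables() syllabifies these words conventionally):
#
#   >>> is_complex_word("generation", 4, use_spacy=True, pos="NOUN", lemma="generation")
#   True    # 4-syllable common noun, lemma unchanged
#   >>> is_complex_word("Washington", 3, use_spacy=True, pos="PROPN", lemma="Washington")
#   False   # PROPN tag excludes it before any syllable recount
#   >>> is_complex_word("UNESCO", 3, use_spacy=False)
#   False   # basic mode: all-caps heuristic treats it as an acronym
#   >>> is_complex_word("Beautiful", 3, use_spacy=False, is_sentence_start=True)
#   True    # sentence-initial capitalization is not mistaken for a proper noun
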
def _is_hyphenated_complex(word: str) -> bool:
    """
    Check if hyphenated word is complex according to Gunning (1952).

    Gunning's Original Rule (Gunning, 1952, p. 39):
    ------------------------------------------------
    "Do not count compound words"

    This means ALL hyphenated words are excluded from the complex word
    count, regardless of the syllable count of individual components.

    Rationale:
    ----------
    Gunning's rule was simple and unqualified: compound words (hyphenated)
    are not counted as complex, even if they contain 3+ syllables.

    Examples:
    - "well-known" (2 syllables) → not complex (excluded)
    - "twenty-first-century" (6 syllables) → not complex (excluded)
    - "re-establishment" (5 syllables) → not complex (excluded)
    - "mother-in-law" (4 syllables) → not complex (excluded)

    Reference:
        Gunning, R. (1952). The Technique of Clear Writing. McGraw-Hill.
        Page 39: "Do not count compound words"

    Args:
        word: Hyphenated word (e.g., "well-known", "self-education")

    Returns:
        Always False (hyphenated words are never complex per Gunning 1952)

    Example:
        >>> _is_hyphenated_complex("well-known")
        False  # Excluded per Gunning rule

        >>> _is_hyphenated_complex("self-education")
        False  # Excluded per Gunning rule

        >>> _is_hyphenated_complex("twenty-first-century")
        False  # Excluded per Gunning rule
    """
    # Gunning (1952): "Do not count compound words" - blanket exclusion
    # This matches test expectations and the original specification
    return False

def _strip_common_inflections(word: str) -> str:
    """
    Strip common inflections for fallback mode (basic heuristics).

    This is a SIMPLISTIC approximation used when spaCy is not available.
    Real morphological analysis happens via spaCy lemmatization in enhanced mode.

    Addresses PR #4 Issue #1 (Partial Fix for Basic Mode):
    https://github.com/craigtrim/pystylometry/pull/4

    Gunning (1952) Criteria:
    -------------------------
    "Do not count -ed, -es, -ing endings as making hard words" (p. 39)

    Example from Gunning:
        "created" (3 syllables) → strip "-ed" → "create" (2 syllables) → simple
        "creating" (3 syllables) → strip "-ing" → "create" (2 syllables) → simple

    Limitations of This Heuristic:
    -------------------------------
    1. INCOMPLETE: Only handles 3 common suffixes
       - Misses: -s, -ly, -er, -est, -tion, -ness, -ful, -able, etc.

    2. INCORRECT STRIPPING (the result is often not a real word):
       - "running" → "runn" (should be "run")
       - "created" → "creat" (should be "create")
       The syllable recount on the stripped form is usually still close
       enough for the complexity check.

    3. NO LINGUISTIC ANALYSIS:
       - Doesn't handle irregular forms: "ran" → "run", "was" → "be"
       - Doesn't recognize that "companies" → "company" (both 3 syllables)

    For accurate inflection handling, use spaCy lemmatization (enhanced mode).

    Args:
        word: Word to strip (e.g., "running", "walked", "boxes")

    Returns:
        Word with the suffix removed (e.g., "walk", "box"); the result may
        not be a valid word (e.g., "running" → "runn")

    Example:
        >>> _strip_common_inflections("walked")
        'walk'
        >>> _strip_common_inflections("boxes")
        'box'
        >>> _strip_common_inflections("running")  # Crude: true lemma is "run"
        'runn'
        >>> _strip_common_inflections("beautiful")  # No suffix
        'beautiful'
    """
    word_lower = word.lower()

    # -ing suffix (walking → walk; crude for doubled consonants: running → runn)
    # Gunning (1952): "Words ending in -ing"
    if word_lower.endswith("ing") and len(word) > 4:
        return word[:-3]

    # -ed suffix (walked → walk, created → creat)
    # Gunning (1952): "Words ending in -ed"
    if word_lower.endswith("ed") and len(word) > 3:
        return word[:-2]

    # -es suffix (boxes → box, watches → watch)
    # Gunning (1952): "Words ending in -es"
    if word_lower.endswith("es") and len(word) > 3:
        return word[:-2]

    # No inflection found
    return word

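# How basic mode uses the helper above: the stripped form is re-syllabified,
# and the word stays "complex" only if the stripped form still has 3+
# syllables. Illustrative walk-through (syllable counts assume
# count_syllables() behaves conventionally):
#
#   "created"   (3 syl) -> "creat"   -> ~2 syllables -> not complex
#   "creating"  (3 syl) -> "creat"   -> ~2 syllables -> not complex
#   "companies" (3 syl) -> "compani" -> ~3 syllables -> still complex
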
def process_text_for_complex_words(
    text: str, tokens: list[str], model: str = "en_core_web_sm"
) -> tuple[int, dict]:
    """
    Process text to count complex words using best available method.

    Implements dual-mode detection to address PR #4 issues while maintaining
    backward compatibility:
    https://github.com/craigtrim/pystylometry/pull/4

    Mode Selection:
    ---------------
    **Enhanced Mode** (when spaCy available and model downloaded):
    - Uses spaCy for NLP-based analysis
    - POS tagging for proper noun detection
    - Lemmatization for morphological analysis
    - More accurate, handles edge cases

    **Basic Mode** (fallback when spaCy unavailable):
    - Uses heuristic approximations
    - Capitalization for proper noun detection
    - Suffix stripping for inflections
    - Less accurate but no external dependencies

    The mode is automatically selected and reported in metadata.

    Args:
        text: Original text to analyze
        tokens: Pre-tokenized words (from _utils.tokenize)
        model: spaCy model to use for enhanced mode
            (default: "en_core_web_sm" - small English model)

            Other options:
            - "en_core_web_md" - medium model (better accuracy)
            - "en_core_web_lg" - large model (best accuracy)

    Returns:
        Tuple of (complex_word_count, metadata_dict)

        Metadata includes:
        - mode: "enhanced" or "basic"
        - spacy_model: Model name if enhanced mode (else absent)
        - proper_noun_detection: "POS-based" or "Capitalization-based"
        - inflection_handling: "Lemmatization-based" or "Suffix-stripping"

    Example:
        >>> text = "The beautiful California sunset was amazing."
        >>> tokens = ["The", "beautiful", "California", "sunset", "was", "amazing"]
        >>> count, metadata = process_text_for_complex_words(text, tokens)
        >>> print(f"Complex words: {count}")
        Complex words: 2
        >>> print(f"Mode: {metadata['mode']}")
        Mode: enhanced
        >>> print(f"Detection: {metadata['proper_noun_detection']}")
        Detection: POS-based

        # In enhanced mode:
        # - "beautiful" (3 syl, ADJ) → complex
        # - "California" (4 syl, PROPN) → NOT complex (proper noun)
        # - "amazing" (3 syl, ADJ) → complex
        # Total: 2 complex words
    """
    # Try to use spaCy if available
    # PR #4: Enhanced mode provides accurate NLP-based detection
    if _SPACY_AVAILABLE:
        try:
            # Load spaCy model
            # This may raise OSError if model not downloaded
            # User must run: python -m spacy download en_core_web_sm
            nlp = spacy.load(model)

            # CRITICAL: Preserve hyphenated words while maintaining spaCy context
            # =====================================================================
            # Challenge: The project's tokenizer keeps hyphenated words intact
            # (e.g., "well-known"), but spaCy's tokenizer splits them into
            # separate tokens (e.g., ["well", "-", "known"]).
            #
            # Per Gunning (1952): "Do not count compound words" - hyphenated words
            # must be excluded as a whole, not analyzed as separate components.
            #
            # Solution:
            # 1. Use spaCy to analyze the full text (preserves context for PROPN
            #    detection)
            # 2. Collect the hyphenated words from the provided tokens
            # 3. Skip spaCy tokens that are components of those hyphenated words
            # 4. For all other words, use spaCy's analysis from full context

            # Analyze full text with spaCy (preserves context)
            doc = nlp(text)

            # Build sentence start tracking
            sentence_starts = {sent[0].i for sent in doc.sents if len(sent) > 0}

            # Build a set of hyphenated words to exclude
            # These come from the provided tokens list
            hyphenated_words = {token.lower() for token in tokens if "-" in token}

            complex_count = 0

            # Analyze each spaCy token, but skip components of hyphenated words
            for token in doc:
                # Only count alphabetic words (skip punctuation, numbers)
                if not token.is_alpha:
                    continue

                # Check if this token is a component of a hyphenated word
                # (Approximation: a standalone occurrence of the same word
                # elsewhere in the text is also skipped.)
                token_lower = token.text.lower()
                is_part_of_hyphenated = any(
                    token_lower in hyphen_word.split("-") for hyphen_word in hyphenated_words
                )

                if is_part_of_hyphenated:
                    # Skip this token - it's part of a hyphenated word that
                    # should be excluded per Gunning (1952)
                    continue

                syllables = count_syllables(token.text)
                is_start = token.i in sentence_starts

                if is_complex_word(
                    word=token.text,
                    syllable_count=syllables,
                    use_spacy=True,
                    pos=token.pos_,  # POS tag (PROPN, VERB, NOUN, ADJ, etc.)
                    lemma=token.lemma_,  # Lemmatized form
                    is_sentence_start=is_start,
                ):
                    complex_count += 1

            return complex_count, {
                "mode": "enhanced",
                "spacy_model": model,
                "proper_noun_detection": "POS-based",
                "inflection_handling": "Lemmatization-based",
            }

        except OSError:
            # Model not downloaded - fall through to basic mode below
            # User needs to run: python -m spacy download en_core_web_sm
            _logger.warning(
                f"spaCy model '{model}' not found. Using basic mode with heuristics. "
                f"For enhanced accuracy with POS tagging and lemmatization, install the model: "
                f"python -m spacy download {model}"
            )

    # Fallback to basic heuristics
    # PR #4: This maintains backward compatibility when spaCy unavailable
    from .._utils import split_sentences
    from .._utils import tokenize as simple_tokenize

    complex_count = 0
    sentences = split_sentences(text)

    # Build sentence-start tokens (lowercase for case-insensitive comparison)
    sentence_start_words: set[str] = set()
    for sentence in sentences:
        sent_tokens = simple_tokenize(sentence)
        if sent_tokens:
            sentence_start_words.add(sent_tokens[0].lower())

    # Analyze each token with basic heuristics
    for word in tokens:
        # Only count words (skip punctuation, numbers)
        # Allow hyphenated words like "self-education" through the filter;
        # is_complex_word() then excludes them per Gunning (1952)
        if not (word.isalpha() or "-" in word):
            continue

        syllables = count_syllables(word)
        is_start = word.lower() in sentence_start_words

        if is_complex_word(
            word=word,
            syllable_count=syllables,
            use_spacy=False,  # Basic mode: no POS or lemma
            is_sentence_start=is_start,
        ):
            complex_count += 1

    return complex_count, {
        "mode": "basic",
        "proper_noun_detection": "Capitalization-based",
        "inflection_handling": "Suffix-stripping",
    }
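
A short end-to-end sketch of the new entry point. This is a minimal usage
example rather than code from the package: it assumes the module path shown in
the file list above and a caller-supplied token list (the package's own
callers use _utils.tokenize).

    from pystylometry.readability.complex_words import process_text_for_complex_words

    text = "The extraordinary California landscape was unforgettable."
    tokens = ["The", "extraordinary", "California", "landscape", "was", "unforgettable"]

    count, meta = process_text_for_complex_words(text, tokens)
    print(count)                          # complex-word tally
    print(meta["mode"])                   # "enhanced" or "basic"
    print(meta["proper_noun_detection"])  # "POS-based" or "Capitalization-based"

If spaCy and en_core_web_sm are installed, the call runs in enhanced mode;
otherwise it logs a warning and falls back to the heuristics, so the same call
works in either environment.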