pystylometry 0.1.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. pystylometry/__init__.py +1 -2
  2. pystylometry/_normalize.py +277 -0
  3. pystylometry/_types.py +1224 -2
  4. pystylometry/_utils.py +4 -0
  5. pystylometry/authorship/__init__.py +4 -0
  6. pystylometry/authorship/additional_methods.py +100 -0
  7. pystylometry/character/__init__.py +15 -0
  8. pystylometry/character/character_metrics.py +301 -0
  9. pystylometry/lexical/__init__.py +13 -6
  10. pystylometry/lexical/advanced_diversity.py +641 -0
  11. pystylometry/lexical/function_words.py +391 -0
  12. pystylometry/lexical/hapax.py +154 -7
  13. pystylometry/lexical/mtld.py +83 -7
  14. pystylometry/lexical/ttr.py +83 -0
  15. pystylometry/lexical/word_frequency_sophistication.py +581 -0
  16. pystylometry/lexical/yule.py +34 -7
  17. pystylometry/ngrams/__init__.py +2 -0
  18. pystylometry/ngrams/extended_ngrams.py +235 -0
  19. pystylometry/prosody/__init__.py +12 -0
  20. pystylometry/prosody/rhythm_prosody.py +53 -0
  21. pystylometry/readability/__init__.py +12 -0
  22. pystylometry/readability/additional_formulas.py +985 -0
  23. pystylometry/readability/ari.py +93 -17
  24. pystylometry/readability/coleman_liau.py +102 -9
  25. pystylometry/readability/complex_words.py +531 -0
  26. pystylometry/readability/flesch.py +59 -14
  27. pystylometry/readability/gunning_fog.py +194 -25
  28. pystylometry/readability/smog.py +31 -14
  29. pystylometry/readability/syllables.py +137 -30
  30. pystylometry/stylistic/__init__.py +20 -0
  31. pystylometry/stylistic/cohesion_coherence.py +45 -0
  32. pystylometry/stylistic/genre_register.py +45 -0
  33. pystylometry/stylistic/markers.py +131 -0
  34. pystylometry/stylistic/vocabulary_overlap.py +47 -0
  35. pystylometry/syntactic/__init__.py +4 -0
  36. pystylometry/syntactic/advanced_syntactic.py +432 -0
  37. pystylometry/syntactic/pos_ratios.py +104 -13
  38. pystylometry/syntactic/sentence_stats.py +57 -13
  39. pystylometry/syntactic/sentence_types.py +470 -0
  40. {pystylometry-0.1.0.dist-info → pystylometry-1.0.0.dist-info}/METADATA +49 -12
  41. pystylometry-1.0.0.dist-info/RECORD +46 -0
  42. {pystylometry-0.1.0.dist-info → pystylometry-1.0.0.dist-info}/WHEEL +1 -1
  43. pystylometry-0.1.0.dist-info/RECORD +0 -26
pystylometry/syntactic/pos_ratios.py
@@ -40,22 +40,113 @@ def compute_pos_ratios(text: str, model: str = "en_core_web_sm") -> POSResult:
     """
     check_optional_dependency("spacy", "syntactic")

-    # TODO: Implement spaCy-based POS analysis
-    # import spacy
-    # nlp = spacy.load(model)
-    # doc = nlp(text)
+    import spacy
+
+    # Load spaCy model
+    try:
+        nlp = spacy.load(model)
+    except OSError:
+        raise OSError(
+            f"spaCy model '{model}' not found. "
+            f"Download it with: python -m spacy download {model}"
+        )
+
+    # Process text with spaCy
+    doc = nlp(text)
+
+    # Count POS tags
+    noun_count = 0
+    verb_count = 0
+    adj_count = 0
+    adv_count = 0
+    det_count = 0
+    adp_count = 0  # Adpositions (prepositions)
+    conj_count = 0  # Conjunctions (coordinating and subordinating)
+    total_tokens = 0
+
+    for token in doc:
+        # Only count alphabetic tokens (skip punctuation, numbers, etc.)
+        if not token.is_alpha:
+            continue
+
+        total_tokens += 1
+        pos = token.pos_
+
+        if pos == "NOUN" or pos == "PROPN":
+            noun_count += 1
+        elif pos == "VERB":
+            verb_count += 1
+        elif pos == "ADJ":
+            adj_count += 1
+        elif pos == "ADV":
+            adv_count += 1
+        elif pos == "DET":
+            det_count += 1
+        elif pos == "ADP":
+            adp_count += 1
+        elif pos in ("CCONJ", "SCONJ"):
+            conj_count += 1
+
+    # Handle empty text
+    if total_tokens == 0:
+        return POSResult(
+            noun_ratio=float("nan"),
+            verb_ratio=float("nan"),
+            adjective_ratio=float("nan"),
+            adverb_ratio=float("nan"),
+            noun_verb_ratio=float("nan"),
+            adjective_noun_ratio=float("nan"),
+            lexical_density=float("nan"),
+            function_word_ratio=float("nan"),
+            metadata={
+                "model": model,
+                "token_count": 0,
+                "noun_count": 0,
+                "verb_count": 0,
+                "adjective_count": 0,
+                "adverb_count": 0,
+            },
+        )
+
+    # Calculate ratios
+    noun_ratio = noun_count / total_tokens
+    verb_ratio = verb_count / total_tokens
+    adj_ratio = adj_count / total_tokens
+    adv_ratio = adv_count / total_tokens
+
+    # Noun-verb ratio (handle division by zero)
+    noun_verb_ratio = noun_count / verb_count if verb_count > 0 else float("nan")
+
+    # Adjective-noun ratio (handle division by zero)
+    adj_noun_ratio = adj_count / noun_count if noun_count > 0 else float("nan")
+
+    # Lexical density: (content words) / total words
+    # Content words = nouns + verbs + adjectives + adverbs
+    lexical_words = noun_count + verb_count + adj_count + adv_count
+    lexical_density = lexical_words / total_tokens
+
+    # Function word ratio: (determiners + prepositions + conjunctions) / total words
+    function_words = det_count + adp_count + conj_count
+    function_word_ratio = function_words / total_tokens

     return POSResult(
-        noun_ratio=0.0,
-        verb_ratio=0.0,
-        adjective_ratio=0.0,
-        adverb_ratio=0.0,
-        noun_verb_ratio=0.0,
-        adjective_noun_ratio=0.0,
-        lexical_density=0.0,
-        function_word_ratio=0.0,
+        noun_ratio=noun_ratio,
+        verb_ratio=verb_ratio,
+        adjective_ratio=adj_ratio,
+        adverb_ratio=adv_ratio,
+        noun_verb_ratio=noun_verb_ratio,
+        adjective_noun_ratio=adj_noun_ratio,
+        lexical_density=lexical_density,
+        function_word_ratio=function_word_ratio,
         metadata={
             "model": model,
-            "token_count": 0,
+            "token_count": total_tokens,
+            "noun_count": noun_count,
+            "verb_count": verb_count,
+            "adjective_count": adj_count,
+            "adverb_count": adv_count,
+            "determiner_count": det_count,
+            "adposition_count": adp_count,
+            "conjunction_count": conj_count,
         },
     )
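A quick way to sanity-check the new implementation (a minimal sketch, not part of the package; it assumes the en_core_web_sm model is installed and that POSResult exposes its fields as attributes):

    from pystylometry.syntactic.pos_ratios import compute_pos_ratios

    # Punctuation is skipped via token.is_alpha, so only the nine words are counted.
    result = compute_pos_ratios("The quick brown fox jumps over the lazy dog.")
    print(result.noun_ratio, result.verb_ratio)   # fractions of the alphabetic tokens
    print(result.metadata["token_count"])         # should be 9 with standard spaCy tokenization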
pystylometry/syntactic/sentence_stats.py
@@ -38,23 +38,67 @@ def compute_sentence_stats(text: str, model: str = "en_core_web_sm") -> SentenceStatsResult:
     """
     check_optional_dependency("spacy", "syntactic")

-    # TODO: Implement spaCy-based sentence analysis
-    # import spacy
-    # nlp = spacy.load(model)
-    # doc = nlp(text)
-    # sentences = list(doc.sents)
+    import spacy

-    # For now, use simple fallback
-    sentences = split_sentences(text)
+    # Load spaCy model
+    try:
+        nlp = spacy.load(model)
+    except OSError:
+        raise OSError(
+            f"spaCy model '{model}' not found. "
+            f"Download it with: python -m spacy download {model}"
+        )
+
+    # Process text with spaCy
+    doc = nlp(text)
+
+    # Extract sentences and count words in each
+    sentence_lengths = []
+    for sent in doc.sents:
+        # Count only alphabetic tokens (exclude punctuation)
+        word_count = sum(1 for token in sent if token.is_alpha)
+        if word_count > 0:  # Only include non-empty sentences
+            sentence_lengths.append(word_count)
+
+    # Handle empty text
+    if len(sentence_lengths) == 0:
+        return SentenceStatsResult(
+            mean_sentence_length=float("nan"),
+            sentence_length_std=float("nan"),
+            sentence_length_range=0,
+            min_sentence_length=0,
+            max_sentence_length=0,
+            sentence_count=0,
+            metadata={
+                "model": model,
+            },
+        )
+
+    # Calculate statistics
+    mean_length = sum(sentence_lengths) / len(sentence_lengths)
+
+    # Standard deviation
+    if len(sentence_lengths) > 1:
+        variance = sum((x - mean_length) ** 2 for x in sentence_lengths) / (
+            len(sentence_lengths) - 1
+        )
+        std_dev = variance**0.5
+    else:
+        std_dev = 0.0
+
+    min_length = min(sentence_lengths)
+    max_length = max(sentence_lengths)
+    length_range = max_length - min_length

     return SentenceStatsResult(
-        mean_sentence_length=0.0,
-        sentence_length_std=0.0,
-        sentence_length_range=0,
-        min_sentence_length=0,
-        max_sentence_length=0,
-        sentence_count=len(sentences),
+        mean_sentence_length=mean_length,
+        sentence_length_std=std_dev,
+        sentence_length_range=length_range,
+        min_sentence_length=min_length,
+        max_sentence_length=max_length,
+        sentence_count=len(sentence_lengths),
         metadata={
             "model": model,
+            "sentence_lengths": sentence_lengths,
         },
     )
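Note that the standard deviation above uses the sample (n-1) denominator. A minimal cross-check sketch (the sample text and attribute-style access are illustrative assumptions, not package fixtures):

    from statistics import stdev
    from pystylometry.syntactic.sentence_stats import compute_sentence_stats

    stats = compute_sentence_stats(
        "One short sentence here. Then a somewhat longer second sentence follows it."
    )
    lengths = stats.metadata["sentence_lengths"]   # per-sentence counts of alphabetic tokens
    # statistics.stdev also uses the n-1 (sample) formula, so the two should agree
    assert abs(stats.sentence_length_std - stdev(lengths)) < 1e-9
    print(stats.mean_sentence_length, stats.sentence_length_range)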
pystylometry/syntactic/sentence_types.py (new file)
@@ -0,0 +1,470 @@
+"""Sentence type classification for syntactic analysis.
+
+This module classifies sentences by their grammatical structure (simple, compound,
+complex, compound-complex) and communicative function (declarative, interrogative,
+imperative, exclamatory). These classifications reveal authorial preferences and
+genre-specific patterns.
+
+Related GitHub Issue:
+    #18 - Sentence Type Classification
+    https://github.com/craigtrim/pystylometry/issues/18
+
+Structural classifications:
+    - Simple: One independent clause
+    - Compound: Multiple independent clauses joined by coordination
+    - Complex: One independent clause + one or more dependent clauses
+    - Compound-Complex: Multiple independent + dependent clauses
+
+Functional classifications:
+    - Declarative: Makes a statement (ends with period)
+    - Interrogative: Asks a question (ends with question mark)
+    - Imperative: Gives a command (subject often implicit "you")
+    - Exclamatory: Expresses strong emotion (ends with exclamation mark)
+
+References:
+    Biber, D. (1988). Variation across speech and writing. Cambridge University Press.
+    Huddleston, R., & Pullum, G. K. (2002). The Cambridge Grammar of the English Language.
+    Quirk, R., et al. (1985). A Comprehensive Grammar of the English Language. Longman.
+"""
+
+from .._types import SentenceTypeResult
+from .._utils import check_optional_dependency
+
+
+def compute_sentence_types(
+    text: str,
+    model: str = "en_core_web_sm",
+) -> SentenceTypeResult:
+    """
+    Classify sentences by structure and function.
+
+    Analyzes text to determine the distribution of sentence types, both
+    structural (based on clause organization) and functional (based on
+    communicative purpose). Different authors and genres show characteristic
+    patterns in sentence type usage.
+
+    Related GitHub Issue:
+        #18 - Sentence Type Classification
+        https://github.com/craigtrim/pystylometry/issues/18
+
+    Why sentence types matter:
+
+    Structural complexity:
+        - Simple sentences: Direct, clear, easy to process
+        - Compound sentences: Coordinate ideas of equal importance
+        - Complex sentences: Subordinate ideas, show relationships
+        - Compound-complex: Sophisticated, academic style
+
+    Functional diversity:
+        - Declarative dominance: Expository/academic writing
+        - Interrogative use: Interactive, rhetorical questions
+        - Imperative use: Instructional texts, commands
+        - Exclamatory use: Emotional, emphatic style
+
+    Genre patterns:
+        - Academic: High proportion of complex sentences
+        - Fiction: Mix of simple and complex for variety
+        - Journalism: Mostly simple and compound for clarity
+        - Technical: Predominantly declarative complex sentences
+
+    Structural Classification Algorithm:
+
+    Simple Sentence:
+        - Contains exactly one independent clause
+        - No dependent clauses
+        - Example: "The cat sat on the mat."
+
+    Compound Sentence:
+        - Contains two or more independent clauses
+        - Joined by coordinating conjunction or semicolon
+        - No dependent clauses
+        - Example: "I came, and I saw."
+
+    Complex Sentence:
+        - Contains one independent clause
+        - Plus one or more dependent clauses
+        - Example: "When I arrived, I saw her."
+
+    Compound-Complex Sentence:
+        - Contains two or more independent clauses
+        - Plus one or more dependent clauses
+        - Example: "I came when called, and I stayed because I wanted to."
+
+    Functional Classification Algorithm:
+
+    Declarative:
+        - Makes a statement
+        - Typically ends with period
+        - Subject before verb
+        - Example: "The sky is blue."
+
+    Interrogative:
+        - Asks a question
+        - Ends with question mark
+        - Often inverted word order or question words
+        - Example: "Is the sky blue?"
+
+    Imperative:
+        - Gives a command or instruction
+        - Subject typically implicit ("you")
+        - Often begins with base verb
+        - Example: "Look at the sky!"
+
+    Exclamatory:
+        - Expresses strong emotion
+        - Ends with exclamation mark
+        - May have inverted structure
+        - Example: "What a blue sky!"
+
+    Args:
+        text: Input text to analyze. Should contain multiple sentences for
+            meaningful distributions. Single-sentence texts will have ratios
+            of 1.0 for one type and 0.0 for others.
+        model: spaCy model with dependency parser. Default is "en_core_web_sm".
+            Larger models provide better clause detection accuracy.
+
+    Returns:
+        SentenceTypeResult containing:
+
+        Structural ratios (sum to 1.0):
+            - simple_ratio: Simple sentences / total
+            - compound_ratio: Compound sentences / total
+            - complex_ratio: Complex sentences / total
+            - compound_complex_ratio: Compound-complex / total
+
+        Functional ratios (sum to 1.0):
+            - declarative_ratio: Declarative sentences / total
+            - interrogative_ratio: Questions / total
+            - imperative_ratio: Commands / total
+            - exclamatory_ratio: Exclamations / total
+
+        Counts:
+            - simple_count, compound_count, complex_count, compound_complex_count
+            - declarative_count, interrogative_count, imperative_count, exclamatory_count
+            - total_sentences
+
+        Diversity metrics:
+            - structural_diversity: Shannon entropy of structural distribution
+            - functional_diversity: Shannon entropy of functional distribution
+
+        Metadata:
+            - sentence_by_sentence_classifications
+            - clause_counts_per_sentence
+            - etc.
+
+    Example:
+        >>> result = compute_sentence_types("Mix of sentence types here...")
+        >>> print(f"Simple: {result.simple_ratio * 100:.1f}%")
+        Simple: 35.2%
+        >>> print(f"Complex: {result.complex_ratio * 100:.1f}%")
+        Complex: 41.3%
+        >>> print(f"Questions: {result.interrogative_ratio * 100:.1f}%")
+        Questions: 8.5%
+        >>> print(f"Structural diversity: {result.structural_diversity:.3f}")
+        Structural diversity: 0.847
+
+        >>> # Compare genres
+        >>> academic = compute_sentence_types("Academic paper text...")
+        >>> fiction = compute_sentence_types("Fiction narrative...")
+        >>> print(f"Academic complex: {academic.complex_ratio:.2f}")
+        >>> print(f"Fiction simple: {fiction.simple_ratio:.2f}")
+
+    Note:
+        - Requires spaCy with dependency parser
+        - Clause detection based on dependency relations
+        - Coordinating conjunctions: and, but, or, nor, for, yet, so
+        - Dependent clause markers: ccomp, advcl, acl, relcl
+        - Punctuation used for functional classification
+        - Imperative detection uses missing subject + base verb pattern
+        - Empty text returns NaN for ratios, 0 for counts
+    """
+    check_optional_dependency("spacy", "syntactic")
+
+    try:
+        import spacy  # type: ignore
+    except ImportError as e:
+        raise ImportError(
+            "spaCy is required for sentence type classification. "
+            "Install with: pip install spacy && python -m spacy download en_core_web_sm"
+        ) from e
+
+    # Load spaCy model
+    try:
+        nlp = spacy.load(model)
+    except OSError as e:
+        raise OSError(
+            f"spaCy model '{model}' not found. "
+            f"Download with: python -m spacy download {model}"
+        ) from e
+
+    # Parse text
+    doc = nlp(text)
+    sentences = list(doc.sents)
+
+    # Handle empty text
+    if len(sentences) == 0:
+        return SentenceTypeResult(
+            simple_ratio=float("nan"),
+            compound_ratio=float("nan"),
+            complex_ratio=float("nan"),
+            compound_complex_ratio=float("nan"),
+            declarative_ratio=float("nan"),
+            interrogative_ratio=float("nan"),
+            imperative_ratio=float("nan"),
+            exclamatory_ratio=float("nan"),
+            simple_count=0,
+            compound_count=0,
+            complex_count=0,
+            compound_complex_count=0,
+            declarative_count=0,
+            interrogative_count=0,
+            imperative_count=0,
+            exclamatory_count=0,
+            total_sentences=0,
+            structural_diversity=float("nan"),
+            functional_diversity=float("nan"),
+            metadata={
+                "warning": "Empty text or no sentences found",
+            },
+        )
+
+    # Classify each sentence
+    structural_counts = {"simple": 0, "compound": 0, "complex": 0, "compound_complex": 0}
+    functional_counts = {"declarative": 0, "interrogative": 0, "imperative": 0, "exclamatory": 0}
+    sentence_classifications = []
+    clause_counts_per_sentence = []
+
+    for sent in sentences:
+        # Count clauses
+        independent_count = _count_independent_clauses(sent)
+        dependent_count = _count_dependent_clauses(sent)
+        clause_counts_per_sentence.append((independent_count, dependent_count))
+
+        # Structural classification
+        structural_type = _classify_structural(independent_count, dependent_count)
+        structural_counts[structural_type] += 1
+
+        # Functional classification
+        functional_type = _classify_functional(sent)
+        functional_counts[functional_type] += 1
+
+        # Store classification
+        sentence_classifications.append({
+            "text": sent.text,
+            "structural_type": structural_type,
+            "functional_type": functional_type,
+            "independent_clauses": independent_count,
+            "dependent_clauses": dependent_count,
+        })
+
+    # Calculate ratios
+    total_sentences = len(sentences)
+    simple_ratio = structural_counts["simple"] / total_sentences
+    compound_ratio = structural_counts["compound"] / total_sentences
+    complex_ratio = structural_counts["complex"] / total_sentences
+    compound_complex_ratio = structural_counts["compound_complex"] / total_sentences
+
+    declarative_ratio = functional_counts["declarative"] / total_sentences
+    interrogative_ratio = functional_counts["interrogative"] / total_sentences
+    imperative_ratio = functional_counts["imperative"] / total_sentences
+    exclamatory_ratio = functional_counts["exclamatory"] / total_sentences
+
+    # Calculate diversity metrics
+    structural_ratios = [simple_ratio, compound_ratio, complex_ratio, compound_complex_ratio]
+    functional_ratios = [declarative_ratio, interrogative_ratio, imperative_ratio, exclamatory_ratio]
+
+    structural_diversity = _calculate_shannon_entropy(structural_ratios)
+    functional_diversity = _calculate_shannon_entropy(functional_ratios)
+
+    # Collect metadata
+    metadata = {
+        "sentence_count": total_sentences,
+        "sentence_classifications": sentence_classifications,
+        "clause_counts_per_sentence": clause_counts_per_sentence,
+        "structural_counts": structural_counts,
+        "functional_counts": functional_counts,
+        "model_used": model,
+    }
+
+    return SentenceTypeResult(
+        simple_ratio=simple_ratio,
+        compound_ratio=compound_ratio,
+        complex_ratio=complex_ratio,
+        compound_complex_ratio=compound_complex_ratio,
+        declarative_ratio=declarative_ratio,
+        interrogative_ratio=interrogative_ratio,
+        imperative_ratio=imperative_ratio,
+        exclamatory_ratio=exclamatory_ratio,
+        simple_count=structural_counts["simple"],
+        compound_count=structural_counts["compound"],
+        complex_count=structural_counts["complex"],
+        compound_complex_count=structural_counts["compound_complex"],
+        declarative_count=functional_counts["declarative"],
+        interrogative_count=functional_counts["interrogative"],
+        imperative_count=functional_counts["imperative"],
+        exclamatory_count=functional_counts["exclamatory"],
+        total_sentences=total_sentences,
+        structural_diversity=structural_diversity,
+        functional_diversity=functional_diversity,
+        metadata=metadata,
+    )
+
+
+def _count_independent_clauses(sent) -> int:
+    """
+    Count independent clauses in a sentence.
+
+    Independent clauses are:
+    1. The root clause (always 1)
+    2. Coordinated clauses (conj with VERB POS and cc child)
+
+    Args:
+        sent: spaCy Span representing a sentence
+
+    Returns:
+        Number of independent clauses
+    """
+    count = 1  # Always start with root clause
+
+    for token in sent:
+        # Coordinated independent clause
+        if token.dep_ == "conj" and token.pos_ == "VERB":
+            # Check if coordinating conjunction present
+            if any(child.dep_ == "cc" for child in token.head.children):
+                count += 1
+
+    return count
+
+
+def _count_dependent_clauses(sent) -> int:
+    """
+    Count dependent clauses in a sentence.
+
+    Dependent clauses are identified by dependency labels:
+    - ccomp: clausal complement
+    - advcl: adverbial clause
+    - acl: adnominal clause
+    - relcl: relative clause
+    - xcomp: open clausal complement (sometimes)
+
+    Args:
+        sent: spaCy Span representing a sentence
+
+    Returns:
+        Number of dependent clauses
+    """
+    dependent_labels = {"ccomp", "advcl", "acl", "relcl", "xcomp"}
+    count = sum(1 for token in sent if token.dep_ in dependent_labels)
+    return count
+
+
+def _classify_structural(independent: int, dependent: int) -> str:
+    """
+    Classify sentence structure based on clause counts.
+
+    Args:
+        independent: Number of independent clauses
+        dependent: Number of dependent clauses
+
+    Returns:
+        One of: "simple", "compound", "complex", "compound_complex"
+    """
+    if independent == 1 and dependent == 0:
+        return "simple"
+    elif independent >= 2 and dependent == 0:
+        return "compound"
+    elif independent == 1 and dependent >= 1:
+        return "complex"
+    elif independent >= 2 and dependent >= 1:
+        return "compound_complex"
+    else:
+        # Fallback (shouldn't happen with valid counts)
+        return "simple"
+
+
+def _classify_functional(sent) -> str:
+    """
+    Classify sentence function based on punctuation and structure.
+
+    Args:
+        sent: spaCy Span representing a sentence
+
+    Returns:
+        One of: "declarative", "interrogative", "imperative", "exclamatory"
+    """
+    # Get last token for punctuation
+    last_token = sent[-1]
+
+    # Check for question mark (interrogative)
+    if last_token.text == "?":
+        return "interrogative"
+
+    # Check for exclamation mark
+    if last_token.text == "!":
+        # Could be imperative or exclamatory
+        # Check if imperative structure
+        if _is_imperative_structure(sent):
+            return "imperative"
+        return "exclamatory"
+
+    # Check for imperative structure (missing subject + base verb)
+    if _is_imperative_structure(sent):
+        return "imperative"
+
+    # Default: declarative
+    return "declarative"
+
+
+def _is_imperative_structure(sent) -> bool:
+    """
+    Check if sentence has imperative structure.
+
+    Imperatives typically:
+    - Missing nominal subject (nsubj)
+    - Root verb is base form (VB) or imperative
+
+    Args:
+        sent: spaCy Span representing a sentence
+
+    Returns:
+        True if imperative structure detected
+    """
+    # Check for nominal subject
+    has_nominal_subject = any(token.dep_ == "nsubj" for token in sent)
+
+    # Get root verb
+    root_verb = sent.root
+
+    # If no nominal subject and root is a verb
+    if not has_nominal_subject and root_verb.pos_ == "VERB":
+        # Check if root is base form (VB) or present tense without subject
+        if root_verb.tag_ in {"VB", "VBP"}:
+            return True
+
+    return False
+
+
+def _calculate_shannon_entropy(probabilities: list[float]) -> float:
+    """
+    Calculate Shannon entropy for a probability distribution.
+
+    H = -sum(p * log2(p)) for p > 0
+
+    Args:
+        probabilities: List of probabilities (should sum to 1.0)
+
+    Returns:
+        Shannon entropy in bits (0.0 to log2(n) where n is number of categories)
+    """
+    import math
+
+    # Filter out zero probabilities (log(0) undefined)
+    non_zero_probs = [p for p in probabilities if p > 0]
+
+    if not non_zero_probs:
+        return 0.0
+
+    # Calculate entropy
+    entropy = -sum(p * math.log2(p) for p in non_zero_probs)
+
+    return entropy
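Two invariants of this design are easy to check end to end: each sentence receives exactly one structural label, so the structural ratios sum to 1.0, and the Shannon entropy over four categories is bounded by log2(4) = 2 bits. A minimal sketch (the sample text and attribute-style access to SentenceTypeResult are illustrative assumptions):

    import math
    from pystylometry.syntactic.sentence_types import compute_sentence_types

    text = "I stayed because it rained. Is it still raining? Close the window. What a storm!"
    result = compute_sentence_types(text)
    structural = [result.simple_ratio, result.compound_ratio,
                  result.complex_ratio, result.compound_complex_ratio]
    assert abs(sum(structural) - 1.0) < 1e-9                   # one structural label per sentence
    assert 0.0 <= result.structural_diversity <= math.log2(4)  # entropy over 4 categories <= 2 bits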