pystylometry 0.1.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff shows the contents of publicly available package versions as released to a supported public registry. It is provided for informational purposes only and reflects the changes between the two versions as published.
- pystylometry/__init__.py +30 -5
- pystylometry/_normalize.py +277 -0
- pystylometry/_types.py +1954 -28
- pystylometry/_utils.py +4 -0
- pystylometry/authorship/__init__.py +26 -1
- pystylometry/authorship/additional_methods.py +75 -0
- pystylometry/authorship/kilgarriff.py +347 -0
- pystylometry/character/__init__.py +15 -0
- pystylometry/character/character_metrics.py +389 -0
- pystylometry/cli.py +427 -0
- pystylometry/consistency/__init__.py +57 -0
- pystylometry/consistency/_thresholds.py +162 -0
- pystylometry/consistency/drift.py +549 -0
- pystylometry/dialect/__init__.py +65 -0
- pystylometry/dialect/_data/dialect_markers.json +1134 -0
- pystylometry/dialect/_loader.py +360 -0
- pystylometry/dialect/detector.py +533 -0
- pystylometry/lexical/__init__.py +13 -6
- pystylometry/lexical/advanced_diversity.py +680 -0
- pystylometry/lexical/function_words.py +590 -0
- pystylometry/lexical/hapax.py +310 -33
- pystylometry/lexical/mtld.py +180 -22
- pystylometry/lexical/ttr.py +149 -0
- pystylometry/lexical/word_frequency_sophistication.py +1805 -0
- pystylometry/lexical/yule.py +142 -29
- pystylometry/ngrams/__init__.py +2 -0
- pystylometry/ngrams/entropy.py +150 -49
- pystylometry/ngrams/extended_ngrams.py +235 -0
- pystylometry/prosody/__init__.py +12 -0
- pystylometry/prosody/rhythm_prosody.py +53 -0
- pystylometry/readability/__init__.py +12 -0
- pystylometry/readability/additional_formulas.py +2110 -0
- pystylometry/readability/ari.py +173 -35
- pystylometry/readability/coleman_liau.py +150 -30
- pystylometry/readability/complex_words.py +531 -0
- pystylometry/readability/flesch.py +181 -32
- pystylometry/readability/gunning_fog.py +208 -35
- pystylometry/readability/smog.py +126 -28
- pystylometry/readability/syllables.py +137 -30
- pystylometry/stylistic/__init__.py +20 -0
- pystylometry/stylistic/cohesion_coherence.py +45 -0
- pystylometry/stylistic/genre_register.py +45 -0
- pystylometry/stylistic/markers.py +131 -0
- pystylometry/stylistic/vocabulary_overlap.py +47 -0
- pystylometry/syntactic/__init__.py +4 -0
- pystylometry/syntactic/advanced_syntactic.py +494 -0
- pystylometry/syntactic/pos_ratios.py +172 -17
- pystylometry/syntactic/sentence_stats.py +105 -18
- pystylometry/syntactic/sentence_types.py +526 -0
- pystylometry/viz/__init__.py +71 -0
- pystylometry/viz/drift.py +589 -0
- pystylometry/viz/jsx/__init__.py +31 -0
- pystylometry/viz/jsx/_base.py +144 -0
- pystylometry/viz/jsx/report.py +677 -0
- pystylometry/viz/jsx/timeline.py +716 -0
- pystylometry/viz/jsx/viewer.py +1032 -0
- {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/METADATA +49 -9
- pystylometry-1.1.0.dist-info/RECORD +63 -0
- pystylometry-1.1.0.dist-info/entry_points.txt +4 -0
- pystylometry-0.1.0.dist-info/RECORD +0 -26
- {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/WHEEL +0 -0
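
The hunks below reproduce two of the new files in full: the sentence-type classifier (pystylometry/syntactic/sentence_types.py) and the new visualization package init (pystylometry/viz/__init__.py). As a quick orientation, here is a minimal sketch (not part of the diff) of how the 1.1.0 addition might be called; the deep import path is used because this diff does not show what pystylometry.syntactic re-exports, so treat the exact path as an assumption:

    from pystylometry.syntactic.sentence_types import compute_sentence_types

    # Classifies each sentence by structure (simple/compound/complex/compound-complex)
    # and by function (declarative/interrogative/imperative/exclamatory).
    result = compute_sentence_types(
        "I stayed home. When it rained, we read, and we talked. Close the window!"
    )
    print(result.total_sentences, result.complex_ratio, result.imperative_count)
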
pystylometry/syntactic/sentence_types.py (new file)
@@ -0,0 +1,526 @@
"""Sentence type classification for syntactic analysis.

This module classifies sentences by their grammatical structure (simple, compound,
complex, compound-complex) and communicative function (declarative, interrogative,
imperative, exclamatory). These classifications reveal authorial preferences and
genre-specific patterns.

Related GitHub Issue:
    #18 - Sentence Type Classification
    https://github.com/craigtrim/pystylometry/issues/18

Structural classifications:
    - Simple: One independent clause
    - Compound: Multiple independent clauses joined by coordination
    - Complex: One independent clause + one or more dependent clauses
    - Compound-Complex: Multiple independent + dependent clauses

Functional classifications:
    - Declarative: Makes a statement (ends with period)
    - Interrogative: Asks a question (ends with question mark)
    - Imperative: Gives a command (subject often implicit "you")
    - Exclamatory: Expresses strong emotion (ends with exclamation mark)

References:
    Biber, D. (1988). Variation across speech and writing. Cambridge University Press.
    Huddleston, R., & Pullum, G. K. (2002). The Cambridge Grammar of the English Language.
    Quirk, R., et al. (1985). A Comprehensive Grammar of the English Language. Longman.
"""

from typing import Any

from .._types import Distribution, SentenceTypeResult, make_distribution
from .._utils import check_optional_dependency

# Type alias for spaCy Span (loaded dynamically)
_SpaCySpan = Any


def compute_sentence_types(
    text: str,
    model: str = "en_core_web_sm",
    chunk_size: int = 1000,
) -> SentenceTypeResult:
    """
    Classify sentences by structure and function.

    Analyzes text to determine the distribution of sentence types, both
    structural (based on clause organization) and functional (based on
    communicative purpose). Different authors and genres show characteristic
    patterns in sentence type usage.

    Related GitHub Issue:
        #18 - Sentence Type Classification
        https://github.com/craigtrim/pystylometry/issues/18

    Why sentence types matter:

        Structural complexity:
        - Simple sentences: Direct, clear, easy to process
        - Compound sentences: Coordinate ideas of equal importance
        - Complex sentences: Subordinate ideas, show relationships
        - Compound-complex: Sophisticated, academic style

        Functional diversity:
        - Declarative dominance: Expository/academic writing
        - Interrogative use: Interactive, rhetorical questions
        - Imperative use: Instructional texts, commands
        - Exclamatory use: Emotional, emphatic style

        Genre patterns:
        - Academic: High proportion of complex sentences
        - Fiction: Mix of simple and complex for variety
        - Journalism: Mostly simple and compound for clarity
        - Technical: Predominantly declarative complex sentences

    Structural Classification Algorithm:

        Simple Sentence:
        - Contains exactly one independent clause
        - No dependent clauses
        - Example: "The cat sat on the mat."

        Compound Sentence:
        - Contains two or more independent clauses
        - Joined by coordinating conjunction or semicolon
        - No dependent clauses
        - Example: "I came, and I saw."

        Complex Sentence:
        - Contains one independent clause
        - Plus one or more dependent clauses
        - Example: "When I arrived, I saw her."

        Compound-Complex Sentence:
        - Contains two or more independent clauses
        - Plus one or more dependent clauses
        - Example: "I came when called, and I stayed because I wanted to."

    Functional Classification Algorithm:

        Declarative:
        - Makes a statement
        - Typically ends with period
        - Subject before verb
        - Example: "The sky is blue."

        Interrogative:
        - Asks a question
        - Ends with question mark
        - Often inverted word order or question words
        - Example: "Is the sky blue?"

        Imperative:
        - Gives a command or instruction
        - Subject typically implicit ("you")
        - Often begins with base verb
        - Example: "Look at the sky!"

        Exclamatory:
        - Expresses strong emotion
        - Ends with exclamation mark
        - May have inverted structure
        - Example: "What a blue sky!"

    Args:
        text: Input text to analyze. Should contain multiple sentences for
            meaningful distributions. Single-sentence texts will have ratios
            of 1.0 for one type and 0.0 for others.
        model: spaCy model with dependency parser. Default is "en_core_web_sm".
            Larger models provide better clause detection accuracy.

    Returns:
        SentenceTypeResult containing:

        Structural ratios (sum to 1.0):
        - simple_ratio: Simple sentences / total
        - compound_ratio: Compound sentences / total
        - complex_ratio: Complex sentences / total
        - compound_complex_ratio: Compound-complex / total

        Functional ratios (sum to 1.0):
        - declarative_ratio: Declarative sentences / total
        - interrogative_ratio: Questions / total
        - imperative_ratio: Commands / total
        - exclamatory_ratio: Exclamations / total

        Counts:
        - simple_count, compound_count, complex_count, compound_complex_count
        - declarative_count, interrogative_count, imperative_count, exclamatory_count
        - total_sentences

        Diversity metrics:
        - structural_diversity: Shannon entropy of structural distribution
        - functional_diversity: Shannon entropy of functional distribution

        Metadata:
        - sentence_by_sentence_classifications
        - clause_counts_per_sentence
        - etc.

    Example:
        >>> result = compute_sentence_types("Mix of sentence types here...")
        >>> print(f"Simple: {result.simple_ratio * 100:.1f}%")
        Simple: 35.2%
        >>> print(f"Complex: {result.complex_ratio * 100:.1f}%")
        Complex: 41.3%
        >>> print(f"Questions: {result.interrogative_ratio * 100:.1f}%")
        Questions: 8.5%
        >>> print(f"Structural diversity: {result.structural_diversity:.3f}")
        Structural diversity: 0.847

        >>> # Compare genres
        >>> academic = compute_sentence_types("Academic paper text...")
        >>> fiction = compute_sentence_types("Fiction narrative...")
        >>> print(f"Academic complex: {academic.complex_ratio:.2f}")
        >>> print(f"Fiction simple: {fiction.simple_ratio:.2f}")

    Note:
        - Requires spaCy with dependency parser
        - Clause detection based on dependency relations
        - Coordinating conjunctions: and, but, or, nor, for, yet, so
        - Dependent clause markers: ccomp, advcl, acl, relcl
        - Punctuation used for functional classification
        - Imperative detection uses missing subject + base verb pattern
        - Empty text returns NaN for ratios, 0 for counts
    """
    check_optional_dependency("spacy", "syntactic")

    try:
        import spacy  # type: ignore
    except ImportError as e:
        raise ImportError(
            "spaCy is required for sentence type classification. "
            "Install with: pip install spacy && python -m spacy download en_core_web_sm"
        ) from e

    # Load spaCy model
    try:
        nlp = spacy.load(model)
    except OSError as e:
        raise OSError(
            f"spaCy model '{model}' not found. Download with: python -m spacy download {model}"
        ) from e

    # Parse text
    doc = nlp(text)
    sentences = list(doc.sents)

    # Handle empty text
    if len(sentences) == 0:
        empty_dist = Distribution(
            values=[],
            mean=float("nan"),
            median=float("nan"),
            std=0.0,
            range=0.0,
            iqr=0.0,
        )
        return SentenceTypeResult(
            simple_ratio=float("nan"),
            compound_ratio=float("nan"),
            complex_ratio=float("nan"),
            compound_complex_ratio=float("nan"),
            declarative_ratio=float("nan"),
            interrogative_ratio=float("nan"),
            imperative_ratio=float("nan"),
            exclamatory_ratio=float("nan"),
            simple_count=0,
            compound_count=0,
            complex_count=0,
            compound_complex_count=0,
            declarative_count=0,
            interrogative_count=0,
            imperative_count=0,
            exclamatory_count=0,
            total_sentences=0,
            structural_diversity=float("nan"),
            functional_diversity=float("nan"),
            simple_ratio_dist=empty_dist,
            compound_ratio_dist=empty_dist,
            complex_ratio_dist=empty_dist,
            compound_complex_ratio_dist=empty_dist,
            declarative_ratio_dist=empty_dist,
            interrogative_ratio_dist=empty_dist,
            imperative_ratio_dist=empty_dist,
            exclamatory_ratio_dist=empty_dist,
            structural_diversity_dist=empty_dist,
            functional_diversity_dist=empty_dist,
            chunk_size=chunk_size,
            chunk_count=0,
            metadata={
                "warning": "Empty text or no sentences found",
            },
        )

    # Classify each sentence
    structural_counts = {"simple": 0, "compound": 0, "complex": 0, "compound_complex": 0}
    functional_counts = {"declarative": 0, "interrogative": 0, "imperative": 0, "exclamatory": 0}
    sentence_classifications = []
    clause_counts_per_sentence = []

    for sent in sentences:
        # Count clauses
        independent_count = _count_independent_clauses(sent)
        dependent_count = _count_dependent_clauses(sent)
        clause_counts_per_sentence.append((independent_count, dependent_count))

        # Structural classification
        structural_type = _classify_structural(independent_count, dependent_count)
        structural_counts[structural_type] += 1

        # Functional classification
        functional_type = _classify_functional(sent)
        functional_counts[functional_type] += 1

        # Store classification
        sentence_classifications.append(
            {
                "text": sent.text,
                "structural_type": structural_type,
                "functional_type": functional_type,
                "independent_clauses": independent_count,
                "dependent_clauses": dependent_count,
            }
        )

    # Calculate ratios
    total_sentences = len(sentences)
    simple_ratio = structural_counts["simple"] / total_sentences
    compound_ratio = structural_counts["compound"] / total_sentences
    complex_ratio = structural_counts["complex"] / total_sentences
    compound_complex_ratio = structural_counts["compound_complex"] / total_sentences

    declarative_ratio = functional_counts["declarative"] / total_sentences
    interrogative_ratio = functional_counts["interrogative"] / total_sentences
    imperative_ratio = functional_counts["imperative"] / total_sentences
    exclamatory_ratio = functional_counts["exclamatory"] / total_sentences

    # Calculate diversity metrics
    structural_ratios = [simple_ratio, compound_ratio, complex_ratio, compound_complex_ratio]
    functional_ratios = [
        declarative_ratio,
        interrogative_ratio,
        imperative_ratio,
        exclamatory_ratio,
    ]

    structural_diversity = _calculate_shannon_entropy(structural_ratios)
    functional_diversity = _calculate_shannon_entropy(functional_ratios)

    # Create single-value distributions (sentence analysis is done on full text)
    simple_ratio_dist = make_distribution([simple_ratio])
    compound_ratio_dist = make_distribution([compound_ratio])
    complex_ratio_dist = make_distribution([complex_ratio])
    compound_complex_ratio_dist = make_distribution([compound_complex_ratio])
    declarative_ratio_dist = make_distribution([declarative_ratio])
    interrogative_ratio_dist = make_distribution([interrogative_ratio])
    imperative_ratio_dist = make_distribution([imperative_ratio])
    exclamatory_ratio_dist = make_distribution([exclamatory_ratio])
    structural_diversity_dist = make_distribution([structural_diversity])
    functional_diversity_dist = make_distribution([functional_diversity])

    # Collect metadata
    metadata = {
        "sentence_count": total_sentences,
        "sentence_classifications": sentence_classifications,
        "clause_counts_per_sentence": clause_counts_per_sentence,
        "structural_counts": structural_counts,
        "functional_counts": functional_counts,
        "model_used": model,
    }

    return SentenceTypeResult(
        simple_ratio=simple_ratio,
        compound_ratio=compound_ratio,
        complex_ratio=complex_ratio,
        compound_complex_ratio=compound_complex_ratio,
        declarative_ratio=declarative_ratio,
        interrogative_ratio=interrogative_ratio,
        imperative_ratio=imperative_ratio,
        exclamatory_ratio=exclamatory_ratio,
        simple_count=structural_counts["simple"],
        compound_count=structural_counts["compound"],
        complex_count=structural_counts["complex"],
        compound_complex_count=structural_counts["compound_complex"],
        declarative_count=functional_counts["declarative"],
        interrogative_count=functional_counts["interrogative"],
        imperative_count=functional_counts["imperative"],
        exclamatory_count=functional_counts["exclamatory"],
        total_sentences=total_sentences,
        structural_diversity=structural_diversity,
        functional_diversity=functional_diversity,
        simple_ratio_dist=simple_ratio_dist,
        compound_ratio_dist=compound_ratio_dist,
        complex_ratio_dist=complex_ratio_dist,
        compound_complex_ratio_dist=compound_complex_ratio_dist,
        declarative_ratio_dist=declarative_ratio_dist,
        interrogative_ratio_dist=interrogative_ratio_dist,
        imperative_ratio_dist=imperative_ratio_dist,
        exclamatory_ratio_dist=exclamatory_ratio_dist,
        structural_diversity_dist=structural_diversity_dist,
        functional_diversity_dist=functional_diversity_dist,
        chunk_size=chunk_size,
        chunk_count=1,  # Single pass analysis
        metadata=metadata,
    )


def _count_independent_clauses(sent: _SpaCySpan) -> int:
    """
    Count independent clauses in a sentence.

    Independent clauses are:
    1. The root clause (always 1)
    2. Coordinated clauses (conj with VERB POS and cc child)

    Args:
        sent: spaCy Span representing a sentence

    Returns:
        Number of independent clauses
    """
    count = 1  # Always start with root clause

    for token in sent:
        # Coordinated independent clause
        if token.dep_ == "conj" and token.pos_ == "VERB":
            # Check if coordinating conjunction present
            if any(child.dep_ == "cc" for child in token.head.children):
                count += 1

    return count


def _count_dependent_clauses(sent: _SpaCySpan) -> int:
    """
    Count dependent clauses in a sentence.

    Dependent clauses are identified by dependency labels:
    - ccomp: clausal complement
    - advcl: adverbial clause
    - acl: adnominal clause
    - relcl: relative clause
    - xcomp: open clausal complement (sometimes)

    Args:
        sent: spaCy Span representing a sentence

    Returns:
        Number of dependent clauses
    """
    dependent_labels = {"ccomp", "advcl", "acl", "relcl", "xcomp"}
    count = sum(1 for token in sent if token.dep_ in dependent_labels)
    return count


def _classify_structural(independent: int, dependent: int) -> str:
    """
    Classify sentence structure based on clause counts.

    Args:
        independent: Number of independent clauses
        dependent: Number of dependent clauses

    Returns:
        One of: "simple", "compound", "complex", "compound_complex"
    """
    if independent == 1 and dependent == 0:
        return "simple"
    elif independent >= 2 and dependent == 0:
        return "compound"
    elif independent == 1 and dependent >= 1:
        return "complex"
    elif independent >= 2 and dependent >= 1:
        return "compound_complex"
    else:
        # Fallback (shouldn't happen with valid counts)
        return "simple"


def _classify_functional(sent: _SpaCySpan) -> str:
    """
    Classify sentence function based on punctuation and structure.

    Args:
        sent: spaCy Span representing a sentence

    Returns:
        One of: "declarative", "interrogative", "imperative", "exclamatory"
    """
    # Get last token for punctuation
    last_token = sent[-1]

    # Check for question mark (interrogative)
    if last_token.text == "?":
        return "interrogative"

    # Check for exclamation mark
    if last_token.text == "!":
        # Could be imperative or exclamatory
        # Check if imperative structure
        if _is_imperative_structure(sent):
            return "imperative"
        return "exclamatory"

    # Check for imperative structure (missing subject + base verb)
    if _is_imperative_structure(sent):
        return "imperative"

    # Default: declarative
    return "declarative"


def _is_imperative_structure(sent: _SpaCySpan) -> bool:
    """
    Check if sentence has imperative structure.

    Imperatives typically:
    - Missing nominal subject (nsubj)
    - Root verb is base form (VB) or imperative

    Args:
        sent: spaCy Span representing a sentence

    Returns:
        True if imperative structure detected
    """
    # Check for nominal subject
    has_nominal_subject = any(token.dep_ == "nsubj" for token in sent)

    # Get root verb
    root_verb = sent.root

    # If no nominal subject and root is a verb
    if not has_nominal_subject and root_verb.pos_ == "VERB":
        # Check if root is base form (VB) or present tense without subject
        if root_verb.tag_ in {"VB", "VBP"}:
            return True

    return False


def _calculate_shannon_entropy(probabilities: list[float]) -> float:
    """
    Calculate Shannon entropy for a probability distribution.

    H = -sum(p * log2(p)) for p > 0

    Args:
        probabilities: List of probabilities (should sum to 1.0)

    Returns:
        Shannon entropy in bits (0.0 to log2(n) where n is number of categories)
    """
    import math

    # Filter out zero probabilities (log(0) undefined)
    non_zero_probs = [p for p in probabilities if p > 0]

    if not non_zero_probs:
        return 0.0

    # Calculate entropy
    entropy = -sum(p * math.log2(p) for p in non_zero_probs)

    return entropy
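
The structural_diversity and functional_diversity fields above are plain Shannon entropy over the four-way ratio lists, exactly as _calculate_shannon_entropy computes them. A small arithmetic check (not part of the package) for a hypothetical text of eight sentences split 4/2/2/0 across the structural types:

    import math

    ratios = [0.5, 0.25, 0.25, 0.0]  # simple, compound, complex, compound-complex

    # H = -sum(p * log2(p)) over non-zero probabilities, as in the module above.
    entropy = -sum(p * math.log2(p) for p in ratios if p > 0)

    print(entropy)       # 1.5 bits
    print(math.log2(4))  # 2.0 bits: the maximum, reached by a uniform 25/25/25/25 mix
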
pystylometry/viz/__init__.py (new file)
@@ -0,0 +1,71 @@
"""Visualization module for pystylometry.

This module provides visualization functions for stylometric analysis results.

Matplotlib Functions (PNG output):
    Requires optional dependencies: pip install pystylometry[viz]

    plot_drift_timeline: Line chart of chi-squared values over document
    plot_drift_scatter: Scatter plot with reference zones (tic-tac-toe style)
    plot_drift_report: Combined multi-panel visualization

Interactive JSX Functions (HTML output):
    No additional dependencies required (uses React via CDN)

    export_drift_timeline_jsx: Interactive timeline chart
    export_drift_report_jsx: Interactive multi-panel dashboard
    export_drift_viewer: Standalone viewer with file upload

Related GitHub Issues:
    #38 - Visualization Options for Style Drift Detection
    https://github.com/craigtrim/pystylometry/issues/38

Example:
    >>> from pystylometry.consistency import compute_kilgarriff_drift
    >>> from pystylometry.viz import plot_drift_timeline, export_drift_timeline_jsx
    >>>
    >>> result = compute_kilgarriff_drift(text)
    >>> plot_drift_timeline(result, output="timeline.png")  # Static PNG
    >>> export_drift_timeline_jsx(result, "timeline.html")  # Interactive HTML
"""

from .drift import (  # noqa: E402
    plot_drift_report,
    plot_drift_scatter,
    plot_drift_timeline,
)
from .jsx import (  # noqa: E402
    export_drift_report_jsx,
    export_drift_timeline_jsx,
    export_drift_viewer,
)

try:
    import matplotlib  # noqa: F401
    import seaborn  # noqa: F401  # type: ignore[import-untyped]

    _VIZ_AVAILABLE = True
except ImportError:
    _VIZ_AVAILABLE = False


def _check_viz_available() -> None:
    """Raise ImportError if visualization dependencies are not installed."""
    if not _VIZ_AVAILABLE:
        raise ImportError(
            "Visualization requires optional dependencies. "
            "Install with: pip install pystylometry[viz] or poetry install --with viz"
        )


__all__ = [
    # Matplotlib (PNG)
    "plot_drift_timeline",
    "plot_drift_scatter",
    "plot_drift_report",
    # JSX (HTML)
    "export_drift_timeline_jsx",
    "export_drift_report_jsx",
    # Standalone viewer
    "export_drift_viewer",
]
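
The try/except block above only records whether matplotlib and seaborn are importable; the PNG-producing functions are the ones expected to call _check_viz_available(), while the JSX exporters run without the optional extras. A sketch (not part of the diff) of the resulting caller-side behaviour, following the module's own docstring example; the manuscript.txt path is purely illustrative:

    from pystylometry.consistency import compute_kilgarriff_drift
    from pystylometry.viz import export_drift_timeline_jsx, plot_drift_timeline

    with open("manuscript.txt", encoding="utf-8") as fh:
        result = compute_kilgarriff_drift(fh.read())

    # Interactive HTML works without the [viz] extra (React is loaded via CDN).
    export_drift_timeline_jsx(result, "timeline.html")

    try:
        # Static PNG needs the matplotlib/seaborn extras from `pip install pystylometry[viz]`.
        plot_drift_timeline(result, output="timeline.png")
    except ImportError as exc:
        print(f"Skipping PNG output: {exc}")
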