pystylometry 1.0.0-py3-none-any.whl → 1.3.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/README.md +42 -0
- pystylometry/__init__.py +45 -3
- pystylometry/_types.py +1017 -259
- pystylometry/authorship/README.md +21 -0
- pystylometry/authorship/__init__.py +28 -4
- pystylometry/authorship/additional_methods.py +260 -40
- pystylometry/authorship/compression.py +175 -0
- pystylometry/authorship/kilgarriff.py +354 -0
- pystylometry/character/README.md +17 -0
- pystylometry/character/character_metrics.py +267 -179
- pystylometry/cli.py +427 -0
- pystylometry/consistency/README.md +27 -0
- pystylometry/consistency/__init__.py +57 -0
- pystylometry/consistency/_thresholds.py +162 -0
- pystylometry/consistency/drift.py +549 -0
- pystylometry/dialect/README.md +26 -0
- pystylometry/dialect/__init__.py +65 -0
- pystylometry/dialect/_data/dialect_markers.json +1134 -0
- pystylometry/dialect/_loader.py +360 -0
- pystylometry/dialect/detector.py +533 -0
- pystylometry/lexical/README.md +23 -0
- pystylometry/lexical/advanced_diversity.py +61 -22
- pystylometry/lexical/function_words.py +255 -56
- pystylometry/lexical/hapax.py +182 -52
- pystylometry/lexical/mtld.py +108 -26
- pystylometry/lexical/ttr.py +76 -10
- pystylometry/lexical/word_frequency_sophistication.py +1522 -298
- pystylometry/lexical/yule.py +136 -50
- pystylometry/ngrams/README.md +18 -0
- pystylometry/ngrams/entropy.py +150 -49
- pystylometry/ngrams/extended_ngrams.py +314 -69
- pystylometry/prosody/README.md +17 -0
- pystylometry/prosody/rhythm_prosody.py +773 -11
- pystylometry/readability/README.md +23 -0
- pystylometry/readability/additional_formulas.py +1887 -762
- pystylometry/readability/ari.py +144 -82
- pystylometry/readability/coleman_liau.py +136 -109
- pystylometry/readability/flesch.py +177 -73
- pystylometry/readability/gunning_fog.py +165 -161
- pystylometry/readability/smog.py +123 -42
- pystylometry/stylistic/README.md +20 -0
- pystylometry/stylistic/cohesion_coherence.py +669 -13
- pystylometry/stylistic/genre_register.py +1560 -17
- pystylometry/stylistic/markers.py +611 -17
- pystylometry/stylistic/vocabulary_overlap.py +354 -13
- pystylometry/syntactic/README.md +20 -0
- pystylometry/syntactic/advanced_syntactic.py +76 -14
- pystylometry/syntactic/pos_ratios.py +70 -6
- pystylometry/syntactic/sentence_stats.py +55 -12
- pystylometry/syntactic/sentence_types.py +71 -15
- pystylometry/viz/README.md +27 -0
- pystylometry/viz/__init__.py +71 -0
- pystylometry/viz/drift.py +589 -0
- pystylometry/viz/jsx/__init__.py +31 -0
- pystylometry/viz/jsx/_base.py +144 -0
- pystylometry/viz/jsx/report.py +677 -0
- pystylometry/viz/jsx/timeline.py +716 -0
- pystylometry/viz/jsx/viewer.py +1032 -0
- pystylometry-1.3.0.dist-info/METADATA +136 -0
- pystylometry-1.3.0.dist-info/RECORD +76 -0
- {pystylometry-1.0.0.dist-info → pystylometry-1.3.0.dist-info}/WHEEL +1 -1
- pystylometry-1.3.0.dist-info/entry_points.txt +4 -0
- pystylometry-1.0.0.dist-info/METADATA +0 -275
- pystylometry-1.0.0.dist-info/RECORD +0 -46
pystylometry/stylistic/vocabulary_overlap.py

@@ -10,15 +10,275 @@ Related GitHub Issue:
 
 References:
     Jaccard, P. (1912). The distribution of the flora in the alpine zone.
-
+        New Phytologist, 11(2), 37-50.
+    Sørensen, T. (1948). A method of establishing groups of equal amplitude in
+        plant sociology based on similarity of species. Kongelige Danske
+        Videnskabernes Selskab, 5(4), 1-34.
+    Salton, G., & McGill, M. J. (1983). Introduction to Modern Information
+        Retrieval. McGraw-Hill.
+    Kullback, S., & Leibler, R. A. (1951). On Information and Sufficiency.
+        Annals of Mathematical Statistics, 22(1), 79-86.
+    Manning, C. D., & Schütze, H. (1999). Foundations of Statistical NLP.
+        MIT Press.
 """
 
+from __future__ import annotations
+
+import math
+import re
+from collections import Counter
+
 from .._types import VocabularyOverlapResult
 
 
-def compute_vocabulary_overlap(text1: str, text2: str) -> VocabularyOverlapResult:
+def _tokenize(text: str) -> list[str]:
+    """Tokenize text into lowercase words.
+
+    Uses a simple regex-based tokenizer that extracts word characters.
+    Converts to lowercase for case-insensitive comparison.
+
+    Args:
+        text: Input text to tokenize
+
+    Returns:
+        List of lowercase word tokens
+    """
+    # Match word characters, convert to lowercase
+    tokens = re.findall(r"\b[a-zA-Z]+\b", text.lower())
+    return tokens
+
+
+def _compute_jaccard(set1: set[str], set2: set[str]) -> float:
+    """Compute Jaccard similarity coefficient.
+
+    The Jaccard index measures similarity as the size of the intersection
+    divided by the size of the union of two sets.
+
+        J(A, B) = |A ∩ B| / |A ∪ B|
+
+    Args:
+        set1: First vocabulary set
+        set2: Second vocabulary set
+
+    Returns:
+        Jaccard similarity coefficient (0.0 to 1.0)
+
+    References:
+        Jaccard, P. (1912). The distribution of the flora in the alpine zone.
+    """
+    if not set1 and not set2:
+        return 1.0  # Both empty = identical
+
+    intersection = len(set1 & set2)
+    union = len(set1 | set2)
+
+    return intersection / union if union > 0 else 0.0
+
+
+def _compute_dice(set1: set[str], set2: set[str]) -> float:
+    """Compute Sørensen-Dice coefficient.
+
+    The Dice coefficient is similar to Jaccard but weights the intersection
+    more heavily. Also known as the Sørensen-Dice index.
+
+        D(A, B) = 2|A ∩ B| / (|A| + |B|)
+
+    Args:
+        set1: First vocabulary set
+        set2: Second vocabulary set
+
+    Returns:
+        Dice coefficient (0.0 to 1.0)
+
+    References:
+        Sørensen, T. (1948). A method of establishing groups of equal amplitude
+        in plant sociology based on similarity of species.
     """
-
+    if not set1 and not set2:
+        return 1.0  # Both empty = identical
+
+    intersection = len(set1 & set2)
+    total_size = len(set1) + len(set2)
+
+    return (2 * intersection) / total_size if total_size > 0 else 0.0
+
+
+def _compute_overlap_coefficient(set1: set[str], set2: set[str]) -> float:
+    """Compute overlap coefficient.
+
+    The overlap coefficient measures the overlap relative to the smaller set.
+    Useful when comparing texts of very different lengths.
+
+        O(A, B) = |A ∩ B| / min(|A|, |B|)
+
+    Args:
+        set1: First vocabulary set
+        set2: Second vocabulary set
+
+    Returns:
+        Overlap coefficient (0.0 to 1.0)
+    """
+    if not set1 or not set2:
+        return 0.0 if set1 or set2 else 1.0
+
+    intersection = len(set1 & set2)
+    min_size = min(len(set1), len(set2))
+
+    return intersection / min_size if min_size > 0 else 0.0
+
+
+def _compute_cosine_similarity(freq1: Counter[str], freq2: Counter[str], vocab: set[str]) -> float:
+    """Compute cosine similarity between term frequency vectors.
+
+    Treats each text as a vector in vocabulary space where each dimension
+    is the frequency of a word. Computes the cosine of the angle between vectors.
+
+        cos(θ) = (A · B) / (||A|| × ||B||)
+
+    Args:
+        freq1: Word frequencies for text 1
+        freq2: Word frequencies for text 2
+        vocab: Combined vocabulary (union of both texts)
+
+    Returns:
+        Cosine similarity (-1.0 to 1.0, though word frequencies yield 0.0 to 1.0)
+
+    References:
+        Salton, G., & McGill, M. J. (1983). Introduction to Modern Information
+        Retrieval.
+    """
+    if not vocab:
+        return 1.0  # Both empty = identical
+
+    # Compute dot product and magnitudes
+    dot_product = 0.0
+    magnitude1 = 0.0
+    magnitude2 = 0.0
+
+    for word in vocab:
+        f1 = freq1.get(word, 0)
+        f2 = freq2.get(word, 0)
+        dot_product += f1 * f2
+        magnitude1 += f1 * f1
+        magnitude2 += f2 * f2
+
+    magnitude1 = math.sqrt(magnitude1)
+    magnitude2 = math.sqrt(magnitude2)
+
+    if magnitude1 == 0 or magnitude2 == 0:
+        return 0.0
+
+    return dot_product / (magnitude1 * magnitude2)
+
+
+def _compute_kl_divergence(
+    freq1: Counter[str], freq2: Counter[str], vocab: set[str], smoothing: float = 1e-10
+) -> float:
+    """Compute Kullback-Leibler divergence from text1 to text2.
+
+    KL divergence measures how one probability distribution diverges from
+    another. It is asymmetric: D_KL(P || Q) ≠ D_KL(Q || P).
+
+        D_KL(P || Q) = Σ P(x) log(P(x) / Q(x))
+
+    A small smoothing value is added to avoid division by zero when Q(x) = 0.
+
+    Args:
+        freq1: Word frequencies for text 1 (P distribution)
+        freq2: Word frequencies for text 2 (Q distribution)
+        vocab: Combined vocabulary (union of both texts)
+        smoothing: Small value added to probabilities to avoid log(0)
+
+    Returns:
+        KL divergence (non-negative, unbounded above)
+
+    Note:
+        Returns 0.0 for identical distributions. Higher values indicate
+        greater difference between distributions.
+
+    References:
+        Kullback, S., & Leibler, R. A. (1951). On Information and Sufficiency.
+    """
+    if not vocab:
+        return 0.0  # Both empty = identical
+
+    # Convert frequencies to probabilities
+    total1 = sum(freq1.values())
+    total2 = sum(freq2.values())
+
+    if total1 == 0 or total2 == 0:
+        return 0.0
+
+    kl_div = 0.0
+    for word in vocab:
+        p = (freq1.get(word, 0) / total1) + smoothing
+        q = (freq2.get(word, 0) / total2) + smoothing
+        kl_div += p * math.log(p / q)
+
+    return max(0.0, kl_div)  # Ensure non-negative due to smoothing artifacts
+
+
+def _compute_tfidf_distinctive_words(
+    freq1: Counter[str],
+    freq2: Counter[str],
+    unique_to_1: set[str],
+    unique_to_2: set[str],
+    top_n: int = 20,
+) -> tuple[list[tuple[str, float]], list[tuple[str, float]]]:
+    """Compute distinctive words for each text using TF-IDF-like scoring.
+
+    Words unique to each text are scored by their frequency, providing
+    a measure of how "distinctive" they are for that text.
+
+    For texts with shared vocabulary, the scoring considers relative
+    frequency differences.
+
+    Args:
+        freq1: Word frequencies for text 1
+        freq2: Word frequencies for text 2
+        unique_to_1: Words appearing only in text 1
+        unique_to_2: Words appearing only in text 2
+        top_n: Number of top distinctive words to return
+
+    Returns:
+        Tuple of (text1_distinctive, text2_distinctive) lists,
+        each containing (word, score) tuples sorted by score descending
+    """
+    # For unique words, score by frequency
+    text1_scores: list[tuple[str, float]] = []
+    for word in unique_to_1:
+        score = float(freq1[word])
+        text1_scores.append((word, score))
+
+    text2_scores: list[tuple[str, float]] = []
+    for word in unique_to_2:
+        score = float(freq2[word])
+        text2_scores.append((word, score))
+
+    # Sort by score descending
+    text1_scores.sort(key=lambda x: x[1], reverse=True)
+    text2_scores.sort(key=lambda x: x[1], reverse=True)
+
+    return text1_scores[:top_n], text2_scores[:top_n]
+
+
+def compute_vocabulary_overlap(
+    text1: str,
+    text2: str,
+    top_distinctive: int = 20,
+) -> VocabularyOverlapResult:
+    """Compute vocabulary overlap and similarity between two texts.
+
+    This function computes multiple similarity metrics based on vocabulary
+    comparison, useful for authorship verification, plagiarism detection,
+    and measuring stylistic consistency across texts.
+
+    Metrics computed:
+    - Jaccard similarity: intersection / union (set-based)
+    - Sørensen-Dice coefficient: 2 * intersection / (size1 + size2)
+    - Overlap coefficient: intersection / min(size1, size2)
+    - Cosine similarity: dot product of frequency vectors
+    - KL divergence: distributional difference (asymmetric)
 
     Related GitHub Issue:
         #21 - Vocabulary Overlap and Similarity Metrics
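The set-based formulas introduced above (Jaccard, Dice, overlap coefficient) are easy to sanity-check by hand. A minimal standalone sketch, independent of pystylometry's internals and using only the standard library:

```python
import re

def tokenize(text: str) -> list[str]:
    # Mirrors the _tokenize helper above: lowercase alphabetic tokens only.
    return re.findall(r"\b[a-zA-Z]+\b", text.lower())

a = set(tokenize("Green eggs and ham"))
b = set(tokenize("Green eggs and spam"))

inter, union = len(a & b), len(a | b)
print(inter / union)                  # Jaccard  |A ∩ B| / |A ∪ B|       -> 0.6
print(2 * inter / (len(a) + len(b)))  # Dice     2|A ∩ B| / (|A| + |B|)  -> 0.75
print(inter / min(len(a), len(b)))    # Overlap  |A ∩ B| / min(|A|, |B|) -> 0.75
```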
@@ -27,21 +287,102 @@ def compute_vocabulary_overlap(text1: str, text2: str) -> VocabularyOverlapResult:
     Args:
         text1: First text to compare
         text2: Second text to compare
+        top_distinctive: Number of most distinctive words to return per text
 
     Returns:
-        VocabularyOverlapResult with
-        shared vocabulary
+        VocabularyOverlapResult with similarity scores, vocabulary statistics,
+        shared vocabulary, and distinctive words for each text.
 
     Example:
-        >>> result = compute_vocabulary_overlap(
+        >>> result = compute_vocabulary_overlap(
+        ...     "The quick brown fox jumps over the lazy dog",
+        ...     "The fast brown fox leaps over the sleepy dog"
+        ... )
         >>> print(f"Jaccard similarity: {result.jaccard_similarity:.3f}")
-        Jaccard similarity: 0.
+        Jaccard similarity: 0.583
         >>> print(f"Shared words: {result.shared_vocab_size}")
-        Shared words:
+        Shared words: 7
+        >>> print(f"Text1 distinctive: {result.text1_distinctive_words}")
+        [('quick', 1.0), ('jumps', 1.0), ('lazy', 1.0)]
+
+    References:
+        Jaccard, P. (1912). The distribution of the flora in the alpine zone.
+        New Phytologist, 11(2), 37-50.
+        Sørensen, T. (1948). A method of establishing groups of equal amplitude
+        in plant sociology based on similarity of species.
+        Salton, G., & McGill, M. J. (1983). Introduction to Modern Information
+        Retrieval. McGraw-Hill.
+        Kullback, S., & Leibler, R. A. (1951). On Information and Sufficiency.
+        Annals of Mathematical Statistics, 22(1), 79-86.
+        Manning, C. D., & Schütze, H. (1999). Foundations of Statistical NLP.
+        MIT Press.
     """
-    #
-
-
-
-
+    # Tokenize texts
+    tokens1 = _tokenize(text1)
+    tokens2 = _tokenize(text2)
+
+    # Build frequency counters and vocabulary sets
+    freq1: Counter[str] = Counter(tokens1)
+    freq2: Counter[str] = Counter(tokens2)
+
+    vocab1 = set(freq1.keys())
+    vocab2 = set(freq2.keys())
+
+    # Compute set operations
+    shared = vocab1 & vocab2
+    union = vocab1 | vocab2
+    unique_to_1 = vocab1 - vocab2
+    unique_to_2 = vocab2 - vocab1
+
+    # Compute similarity metrics
+    jaccard = _compute_jaccard(vocab1, vocab2)
+    dice = _compute_dice(vocab1, vocab2)
+    overlap = _compute_overlap_coefficient(vocab1, vocab2)
+    cosine = _compute_cosine_similarity(freq1, freq2, union)
+    kl_div = _compute_kl_divergence(freq1, freq2, union)
+
+    # Compute coverage ratios
+    text1_coverage = len(shared) / len(vocab1) if vocab1 else 0.0
+    text2_coverage = len(shared) / len(vocab2) if vocab2 else 0.0
+
+    # Get distinctive words
+    text1_distinctive, text2_distinctive = _compute_tfidf_distinctive_words(
+        freq1, freq2, unique_to_1, unique_to_2, top_distinctive
+    )
+
+    # Build shared words list (sorted by combined frequency)
+    shared_with_freq = [(word, freq1[word] + freq2[word]) for word in shared]
+    shared_with_freq.sort(key=lambda x: x[1], reverse=True)
+    shared_words = [word for word, _ in shared_with_freq]
+
+    return VocabularyOverlapResult(
+        # Similarity scores
+        jaccard_similarity=jaccard,
+        dice_coefficient=dice,
+        overlap_coefficient=overlap,
+        cosine_similarity=cosine,
+        kl_divergence=kl_div,
+        # Vocabulary sizes
+        text1_vocab_size=len(vocab1),
+        text2_vocab_size=len(vocab2),
+        shared_vocab_size=len(shared),
+        union_vocab_size=len(union),
+        text1_unique_count=len(unique_to_1),
+        text2_unique_count=len(unique_to_2),
+        # Shared and distinctive vocabulary
+        shared_words=shared_words,
+        text1_distinctive_words=text1_distinctive,
+        text2_distinctive_words=text2_distinctive,
+        # Coverage ratios
+        text1_coverage=text1_coverage,
+        text2_coverage=text2_coverage,
+        # Metadata
+        metadata={
+            "text1_token_count": len(tokens1),
+            "text2_token_count": len(tokens2),
+            "text1_frequencies": dict(freq1),
+            "text2_frequencies": dict(freq2),
+            "unique_to_text1": sorted(unique_to_1),
+            "unique_to_text2": sorted(unique_to_2),
+        },
     )
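Taken together, the new public entry point is straightforward to call. A hedged usage sketch based on the signature and result fields visible in this diff (the import path mirrors the module location; the package may also re-export the function elsewhere):

```python
from pystylometry.stylistic.vocabulary_overlap import compute_vocabulary_overlap

result = compute_vocabulary_overlap(
    "We hold these truths to be self-evident.",
    "It is a truth universally acknowledged.",
    top_distinctive=10,
)

# Set-based and frequency-based similarity scores
print(result.jaccard_similarity, result.dice_coefficient, result.overlap_coefficient)
print(result.cosine_similarity, result.kl_divergence)

# Vocabulary bookkeeping and per-text coverage
print(result.shared_vocab_size, result.union_vocab_size)
print(result.text1_coverage, result.text2_coverage)
print(result.text1_distinctive_words[:5])     # [(word, score), ...]
print(result.metadata["text1_token_count"])   # raw token counts kept in metadata
```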
pystylometry/syntactic/README.md

@@ -0,0 +1,20 @@
+# syntactic
+
+
+
+
+Sentence structure, part-of-speech, and parse tree analysis.
+
+## Catalogue
+
+| File | Function | What It Measures |
+|------|----------|-----------------|
+| `pos_ratios.py` | `compute_pos_ratios` | Noun/verb/adjective/adverb ratios |
+| `sentence_stats.py` | `compute_sentence_stats` | Sentence length, word length distributions |
+| `sentence_types.py` | `compute_sentence_types` | Declarative, interrogative, imperative, exclamatory classification |
+| `advanced_syntactic.py` | `compute_advanced_syntactic` | Parse tree depth, clausal density, passive voice, T-units, dependency distance, subordination/coordination ratios |
+
+## See Also
+
+- [`stylistic/`](../stylistic/) for higher-level style features built on syntactic foundations
+- [`ngrams/`](../ngrams/) for POS n-gram sequences via `compute_extended_ngrams(text, pos=True)`
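The catalogue above maps one public `compute_*` function to each module file. A rough usage sketch follows; the function names and file locations come from the table, but their exact signatures and return types are not shown in this diff, so the calls below are illustrative only:

```python
from pystylometry.syntactic.pos_ratios import compute_pos_ratios
from pystylometry.syntactic.sentence_stats import compute_sentence_stats
from pystylometry.syntactic.sentence_types import compute_sentence_types

text = "Stop. Is this a question? It certainly looks like one!"

# Assumed to follow the library's compute_*(text) convention; not verified here.
pos_result = compute_pos_ratios(text)
stats_result = compute_sentence_stats(text)
types_result = compute_sentence_types(text)
```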
pystylometry/syntactic/advanced_syntactic.py

@@ -28,13 +28,21 @@ References:
         of linguistic complexity. In Image, language, brain (pp. 95-126).
 """
 
-from .._types import AdvancedSyntacticResult
+from typing import Any
+
+from .._types import AdvancedSyntacticResult, Distribution, make_distribution
 from .._utils import check_optional_dependency
 
+# Type aliases for spaCy objects (loaded dynamically)
+_SpaCyToken = Any
+_SpaCyDoc = Any
+_SpaCySpan = Any
+
 
 def compute_advanced_syntactic(
     text: str,
     model: str = "en_core_web_sm",
+    chunk_size: int = 1000,
 ) -> AdvancedSyntacticResult:
     """
     Compute advanced syntactic complexity metrics using dependency parsing.
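The new `chunk_size` parameter and the `*_dist` fields surface per-chunk `Distribution` statistics alongside the existing scalar metrics. A hedged usage sketch, with the signature and field names taken from this diff (requires spaCy and the `en_core_web_sm` model):

```python
from pystylometry.syntactic.advanced_syntactic import compute_advanced_syntactic

text = (
    "Although the committee had met twice, no decision was reached. "
    "The report, which ran to eighty pages, was quietly shelved."
)

result = compute_advanced_syntactic(text, model="en_core_web_sm", chunk_size=1000)

print(result.mean_parse_tree_depth, result.max_parse_tree_depth)
print(result.dependency_distance, result.left_branching_ratio)
print(result.chunk_count)                      # 1: this version analyzes the full text in one pass
print(result.mean_parse_tree_depth_dist.mean)  # Distribution wrapper around the same value
```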
@@ -147,7 +155,6 @@ def compute_advanced_syntactic(
 
     try:
         import spacy  # type: ignore
-        from spacy.tokens import Doc, Span, Token  # type: ignore
     except ImportError as e:
         raise ImportError(
             "spaCy is required for advanced syntactic analysis. "

@@ -159,8 +166,7 @@ def compute_advanced_syntactic(
         nlp = spacy.load(model)
     except OSError as e:
         raise OSError(
-            f"spaCy model '{model}' not found. "
-            f"Download with: python -m spacy download {model}"
+            f"spaCy model '{model}' not found. Download with: python -m spacy download {model}"
         ) from e
 
     # Parse text

@@ -169,6 +175,14 @@ def compute_advanced_syntactic(
 
     # Handle empty text
     if len(sentences) == 0 or len(doc) == 0:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
         return AdvancedSyntacticResult(
             mean_parse_tree_depth=float("nan"),
             max_parse_tree_depth=0,

@@ -183,6 +197,20 @@ def compute_advanced_syntactic(
             dependency_distance=float("nan"),
             left_branching_ratio=float("nan"),
             right_branching_ratio=float("nan"),
+            mean_parse_tree_depth_dist=empty_dist,
+            max_parse_tree_depth_dist=empty_dist,
+            mean_t_unit_length_dist=empty_dist,
+            clausal_density_dist=empty_dist,
+            dependent_clause_ratio_dist=empty_dist,
+            passive_voice_ratio_dist=empty_dist,
+            subordination_index_dist=empty_dist,
+            coordination_index_dist=empty_dist,
+            sentence_complexity_score_dist=empty_dist,
+            dependency_distance_dist=empty_dist,
+            left_branching_ratio_dist=empty_dist,
+            right_branching_ratio_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=0,
             metadata={
                 "sentence_count": 0,
                 "word_count": 0,

@@ -229,9 +257,7 @@ def compute_advanced_syntactic(
     coordinate_clause_count = 0
 
     for sent in sentences:
-        sent_total, sent_dependent, sent_subordinate, sent_coordinate = _count_clauses(
-            sent
-        )
+        sent_total, sent_dependent, sent_subordinate, sent_coordinate = _count_clauses(sent)
         total_clauses += sent_total
         dependent_clause_count += sent_dependent
         subordinate_clause_count += sent_subordinate

@@ -279,14 +305,22 @@ def compute_advanced_syntactic(
     # Normalize individual metrics to 0-1 range
     normalized_parse_depth = min(mean_parse_tree_depth / 10, 1.0)
     normalized_clausal_density = (
-        min(clausal_density / 3, 1.0)
+        min(clausal_density / 3, 1.0)
+        if not isinstance(clausal_density, float) or not (clausal_density != clausal_density)
+        else 0.0
     )
     normalized_t_unit_length = (
-        min(mean_t_unit_length / 25, 1.0)
+        min(mean_t_unit_length / 25, 1.0)
+        if not isinstance(mean_t_unit_length, float)
+        or not (mean_t_unit_length != mean_t_unit_length)
+        else 0.0
     )
     normalized_dependency_distance = min(mean_dependency_distance / 5, 1.0)
     normalized_subordination = (
-        subordination_index
+        subordination_index
+        if not isinstance(subordination_index, float)
+        or not (subordination_index != subordination_index)
+        else 0.0
     )
 
     # Weighted combination
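The guards added above rely on the `x != x` idiom: NaN is the only float that compares unequal to itself, so `not (x != x)` reads as "x is not NaN". A small standalone illustration of the same fallback logic (not the package's own code):

```python
import math

def normalize(value: float, scale: float) -> float:
    # Same pattern as the guards above: fall back to 0.0 when the metric is NaN.
    if isinstance(value, float) and value != value:  # equivalent to math.isnan(value)
        return 0.0
    return min(value / scale, 1.0)

print(normalize(2.4, 3))           # 0.8
print(normalize(float("nan"), 3))  # 0.0
print(math.isnan(float("nan")))    # True; the stdlib spelling of the same test
```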
@@ -298,6 +332,20 @@ def compute_advanced_syntactic(
         + 0.1 * normalized_dependency_distance
     )
 
+    # Create single-value distributions (analysis is done on full text)
+    mean_parse_tree_depth_dist = make_distribution([mean_parse_tree_depth])
+    max_parse_tree_depth_dist = make_distribution([float(max_parse_tree_depth)])
+    mean_t_unit_length_dist = make_distribution([mean_t_unit_length])
+    clausal_density_dist = make_distribution([clausal_density])
+    dependent_clause_ratio_dist = make_distribution([dependent_clause_ratio])
+    passive_voice_ratio_dist = make_distribution([passive_voice_ratio])
+    subordination_index_dist = make_distribution([subordination_index])
+    coordination_index_dist = make_distribution([coordination_index])
+    sentence_complexity_score_dist = make_distribution([sentence_complexity_score])
+    dependency_distance_dist = make_distribution([mean_dependency_distance])
+    left_branching_ratio_dist = make_distribution([left_branching_ratio])
+    right_branching_ratio_dist = make_distribution([right_branching_ratio])
+
     # Collect metadata
     metadata = {
         "sentence_count": len(sentences),
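`make_distribution` itself is not part of this diff, but the `Distribution` fields used earlier (`values`, `mean`, `median`, `std`, `range`, `iqr`) suggest what a single-value input presumably collapses to: the value itself with zero spread. A sketch of that assumption using only the standard library:

```python
import statistics

values = [4.2]  # e.g. [mean_parse_tree_depth] wrapped for a single-pass analysis

mean = statistics.fmean(values)
median = statistics.median(values)
std = statistics.pstdev(values)           # 0.0 for a single observation
value_range = max(values) - min(values)   # 0.0
if len(values) >= 2:
    q1, _, q3 = statistics.quantiles(values, n=4)
    iqr = q3 - q1
else:
    iqr = 0.0                             # assumed degenerate case for one value

print(mean, median, std, value_range, iqr)  # 4.2 4.2 0.0 0.0 0.0
```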
@@ -331,11 +379,25 @@ def compute_advanced_syntactic(
         dependency_distance=mean_dependency_distance,
         left_branching_ratio=left_branching_ratio,
         right_branching_ratio=right_branching_ratio,
+        mean_parse_tree_depth_dist=mean_parse_tree_depth_dist,
+        max_parse_tree_depth_dist=max_parse_tree_depth_dist,
+        mean_t_unit_length_dist=mean_t_unit_length_dist,
+        clausal_density_dist=clausal_density_dist,
+        dependent_clause_ratio_dist=dependent_clause_ratio_dist,
+        passive_voice_ratio_dist=passive_voice_ratio_dist,
+        subordination_index_dist=subordination_index_dist,
+        coordination_index_dist=coordination_index_dist,
+        sentence_complexity_score_dist=sentence_complexity_score_dist,
+        dependency_distance_dist=dependency_distance_dist,
+        left_branching_ratio_dist=left_branching_ratio_dist,
+        right_branching_ratio_dist=right_branching_ratio_dist,
+        chunk_size=chunk_size,
+        chunk_count=1,  # Single pass analysis
         metadata=metadata,
     )
 
 
-def _calculate_max_tree_depth(token) -> int:
+def _calculate_max_tree_depth(token: _SpaCyToken) -> int:
     """
     Calculate maximum depth of dependency tree starting from token.
 

@@ -352,7 +414,7 @@ def _calculate_max_tree_depth(token) -> int:
     return max(child_depths) + 1
 
 
-def _identify_t_units(doc) -> list:
+def _identify_t_units(doc: _SpaCyDoc) -> list[_SpaCySpan]:
     """
     Identify T-units (minimal terminable units) in document.
 

@@ -371,7 +433,7 @@ def _identify_t_units(doc) -> list:
     return list(doc.sents)
 
 
-def _count_clauses(sent) -> tuple[int, int, int, int]:
+def _count_clauses(sent: _SpaCySpan) -> tuple[int, int, int, int]:
     """
     Count different types of clauses in sentence.
 

@@ -406,7 +468,7 @@ def _count_clauses(sent) -> tuple[int, int, int, int]:
     return total, dependent, subordinate, coordinate
 
 
-def _is_passive_voice(sent) -> bool:
+def _is_passive_voice(sent: _SpaCySpan) -> bool:
     """
     Detect if sentence contains passive voice construction.
 