pystylometry 1.1.0__py3-none-any.whl → 1.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- a/pystylometry/stylistic/vocabulary_overlap.py
+++ b/pystylometry/stylistic/vocabulary_overlap.py
@@ -10,15 +10,275 @@ Related GitHub Issue:
 
  References:
      Jaccard, P. (1912). The distribution of the flora in the alpine zone.
-     Salton, G., & McGill, M. J. (1983). Introduction to Modern Information Retrieval.
+         New Phytologist, 11(2), 37-50.
+     Sørensen, T. (1948). A method of establishing groups of equal amplitude in
+         plant sociology based on similarity of species. Kongelige Danske
+         Videnskabernes Selskab, 5(4), 1-34.
+     Salton, G., & McGill, M. J. (1983). Introduction to Modern Information
+         Retrieval. McGraw-Hill.
+     Kullback, S., & Leibler, R. A. (1951). On Information and Sufficiency.
+         Annals of Mathematical Statistics, 22(1), 79-86.
+     Manning, C. D., & Schütze, H. (1999). Foundations of Statistical Natural
+         Language Processing. MIT Press.
  """
 
+ from __future__ import annotations
+
+ import math
+ import re
+ from collections import Counter
+
  from .._types import VocabularyOverlapResult
 
 
- def compute_vocabulary_overlap(text1: str, text2: str) -> VocabularyOverlapResult:
+ def _tokenize(text: str) -> list[str]:
+     """Tokenize text into lowercase words.
+
+     Uses a simple regex-based tokenizer that extracts runs of ASCII letters.
+     Converts to lowercase for case-insensitive comparison.
+
+     Args:
+         text: Input text to tokenize
+
+     Returns:
+         List of lowercase word tokens
+     """
+     # Extract runs of ASCII letters from the lowercased text
+     tokens = re.findall(r"\b[a-zA-Z]+\b", text.lower())
+     return tokens
+
+
+ def _compute_jaccard(set1: set[str], set2: set[str]) -> float:
+     """Compute Jaccard similarity coefficient.
+
+     The Jaccard index measures similarity as the size of the intersection
+     divided by the size of the union of two sets.
+
+         J(A, B) = |A ∩ B| / |A ∪ B|
+
+     Args:
+         set1: First vocabulary set
+         set2: Second vocabulary set
+
+     Returns:
+         Jaccard similarity coefficient (0.0 to 1.0)
+
+     References:
+         Jaccard, P. (1912). The distribution of the flora in the alpine zone.
+     """
+     if not set1 and not set2:
+         return 1.0  # Both empty = identical
+
+     intersection = len(set1 & set2)
+     union = len(set1 | set2)
+
+     return intersection / union if union > 0 else 0.0
+
+
+ def _compute_dice(set1: set[str], set2: set[str]) -> float:
+     """Compute Sørensen-Dice coefficient.
+
+     The Dice coefficient is similar to Jaccard but weights the intersection
+     more heavily. Also known as the Sørensen-Dice index.
+
+         D(A, B) = 2|A ∩ B| / (|A| + |B|)
+
+     Args:
+         set1: First vocabulary set
+         set2: Second vocabulary set
+
+     Returns:
+         Dice coefficient (0.0 to 1.0)
+
+     References:
+         Sørensen, T. (1948). A method of establishing groups of equal amplitude
+         in plant sociology based on similarity of species.
      """
-     Compute vocabulary overlap and similarity between two texts.
+     if not set1 and not set2:
+         return 1.0  # Both empty = identical
+
+     intersection = len(set1 & set2)
+     total_size = len(set1) + len(set2)
+
+     return (2 * intersection) / total_size if total_size > 0 else 0.0
+
+
+ def _compute_overlap_coefficient(set1: set[str], set2: set[str]) -> float:
+     """Compute overlap coefficient.
+
+     The overlap coefficient measures the overlap relative to the smaller set.
+     Useful when comparing texts of very different lengths.
+
+         O(A, B) = |A ∩ B| / min(|A|, |B|)
+
+     Args:
+         set1: First vocabulary set
+         set2: Second vocabulary set
+
+     Returns:
+         Overlap coefficient (0.0 to 1.0)
+     """
+     if not set1 or not set2:
+         return 0.0 if set1 or set2 else 1.0
+
+     intersection = len(set1 & set2)
+     min_size = min(len(set1), len(set2))
+
+     return intersection / min_size if min_size > 0 else 0.0
+
+
+ def _compute_cosine_similarity(freq1: Counter[str], freq2: Counter[str], vocab: set[str]) -> float:
+     """Compute cosine similarity between term frequency vectors.
+
+     Treats each text as a vector in vocabulary space where each dimension
+     is the frequency of a word. Computes the cosine of the angle between vectors.
+
+         cos(θ) = (A · B) / (||A|| × ||B||)
+
+     Args:
+         freq1: Word frequencies for text 1
+         freq2: Word frequencies for text 2
+         vocab: Combined vocabulary (union of both texts)
+
+     Returns:
+         Cosine similarity (-1.0 to 1.0, though word frequencies yield 0.0 to 1.0)
+
+     References:
+         Salton, G., & McGill, M. J. (1983). Introduction to Modern Information
+         Retrieval.
+     """
+     if not vocab:
+         return 1.0  # Both empty = identical
+
+     # Compute dot product and magnitudes
+     dot_product = 0.0
+     magnitude1 = 0.0
+     magnitude2 = 0.0
+
+     for word in vocab:
+         f1 = freq1.get(word, 0)
+         f2 = freq2.get(word, 0)
+         dot_product += f1 * f2
+         magnitude1 += f1 * f1
+         magnitude2 += f2 * f2
+
+     magnitude1 = math.sqrt(magnitude1)
+     magnitude2 = math.sqrt(magnitude2)
+
+     if magnitude1 == 0 or magnitude2 == 0:
+         return 0.0
+
+     return dot_product / (magnitude1 * magnitude2)
+
+
+ def _compute_kl_divergence(
+     freq1: Counter[str], freq2: Counter[str], vocab: set[str], smoothing: float = 1e-10
+ ) -> float:
+     """Compute Kullback-Leibler divergence from text1 to text2.
+
+     KL divergence measures how one probability distribution diverges from
+     another. It is asymmetric: D_KL(P || Q) ≠ D_KL(Q || P).
+
+         D_KL(P || Q) = Σ P(x) log(P(x) / Q(x))
+
+     A small smoothing value is added to both probabilities to avoid a zero
+     ratio and division by zero when Q(x) = 0.
+
+     Args:
+         freq1: Word frequencies for text 1 (P distribution)
+         freq2: Word frequencies for text 2 (Q distribution)
+         vocab: Combined vocabulary (union of both texts)
+         smoothing: Small value added to probabilities to avoid log(0)
+
+     Returns:
+         KL divergence (non-negative, unbounded above)
+
+     Note:
+         Returns 0.0 for identical distributions. Higher values indicate
+         greater difference between distributions.
+
+     References:
+         Kullback, S., & Leibler, R. A. (1951). On Information and Sufficiency.
+     """
+     if not vocab:
+         return 0.0  # Both empty = identical
+
+     # Convert frequencies to probabilities
+     total1 = sum(freq1.values())
+     total2 = sum(freq2.values())
+
+     if total1 == 0 or total2 == 0:
+         return 0.0
+
+     kl_div = 0.0
+     for word in vocab:
+         p = (freq1.get(word, 0) / total1) + smoothing
+         q = (freq2.get(word, 0) / total2) + smoothing
+         kl_div += p * math.log(p / q)
+
+     return max(0.0, kl_div)  # Ensure non-negative despite smoothing artifacts
+
+
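The asymmetry noted in the docstring is easy to verify numerically. A standalone sketch (not package code) that mirrors the smoothed formula above:

```python
import math

def kl(p_counts: dict, q_counts: dict, eps: float = 1e-10) -> float:
    # Smoothed KL divergence over the union vocabulary, as in the diff above
    vocab = set(p_counts) | set(q_counts)
    tp, tq = sum(p_counts.values()), sum(q_counts.values())
    total = 0.0
    for w in vocab:
        p = p_counts.get(w, 0) / tp + eps
        q = q_counts.get(w, 0) / tq + eps
        total += p * math.log(p / q)
    return max(0.0, total)

a = {"the": 8, "cat": 2}   # P: skewed toward "the"
b = {"the": 5, "cat": 5}   # Q: balanced
print(round(kl(a, b), 3))  # 0.193 -- D_KL(P || Q)
print(round(kl(b, a), 3))  # 0.223 -- D_KL(Q || P): a different value
```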
+ def _compute_tfidf_distinctive_words(
+     freq1: Counter[str],
+     freq2: Counter[str],
+     unique_to_1: set[str],
+     unique_to_2: set[str],
+     top_n: int = 20,
+ ) -> tuple[list[tuple[str, float]], list[tuple[str, float]]]:
+     """Compute distinctive words for each text using TF-IDF-like scoring.
+
+     Words unique to each text are scored by their raw frequency, providing
+     a measure of how "distinctive" they are for that text.
+
+     Only unique words are scored; in a two-document collection, a word that
+     appears in just one document carries maximal inverse document frequency,
+     which is what makes this frequency-based scoring TF-IDF-like.
+
+     Args:
+         freq1: Word frequencies for text 1
+         freq2: Word frequencies for text 2
+         unique_to_1: Words appearing only in text 1
+         unique_to_2: Words appearing only in text 2
+         top_n: Number of top distinctive words to return
+
+     Returns:
+         Tuple of (text1_distinctive, text2_distinctive) lists,
+         each containing (word, score) tuples sorted by score descending
+     """
+     # For unique words, score by frequency
+     text1_scores: list[tuple[str, float]] = []
+     for word in unique_to_1:
+         score = float(freq1[word])
+         text1_scores.append((word, score))
+
+     text2_scores: list[tuple[str, float]] = []
+     for word in unique_to_2:
+         score = float(freq2[word])
+         text2_scores.append((word, score))
+
+     # Sort by score descending
+     text1_scores.sort(key=lambda x: x[1], reverse=True)
+     text2_scores.sort(key=lambda x: x[1], reverse=True)
+
+     return text1_scores[:top_n], text2_scores[:top_n]
+
+
+ def compute_vocabulary_overlap(
+     text1: str,
+     text2: str,
+     top_distinctive: int = 20,
+ ) -> VocabularyOverlapResult:
+     """Compute vocabulary overlap and similarity between two texts.
+
+     This function computes multiple similarity metrics based on vocabulary
+     comparison, useful for authorship verification, plagiarism detection,
+     and measuring stylistic consistency across texts.
+
+     Metrics computed:
+     - Jaccard similarity: intersection / union (set-based)
+     - Sørensen-Dice coefficient: 2 * intersection / (size1 + size2)
+     - Overlap coefficient: intersection / min(size1, size2)
+     - Cosine similarity: normalized dot product of frequency vectors
+     - KL divergence: distributional difference (asymmetric)
 
      Related GitHub Issue:
          #21 - Vocabulary Overlap and Similarity Metrics
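The set-based formulas listed above are easy to check by hand. A standalone sketch with toy vocabularies (not package code):

```python
A = {"the", "quick", "brown", "fox"}
B = {"the", "brown", "dog"}

inter, union = A & B, A | B  # {"the", "brown"} and 5 distinct words

jaccard = len(inter) / len(union)           # 2 / 5 = 0.4
dice = 2 * len(inter) / (len(A) + len(B))   # 4 / 7 ≈ 0.571
overlap = len(inter) / min(len(A), len(B))  # 2 / 3 ≈ 0.667

# Dice never falls below Jaccard; the overlap coefficient is highest
# here because it normalizes by the smaller set, B.
print(jaccard, dice, overlap)
```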
@@ -27,21 +287,102 @@ def compute_vocabulary_overlap(text1: str, text2: str) -> VocabularyOverlapResul
      Args:
          text1: First text to compare
          text2: Second text to compare
+         top_distinctive: Number of most distinctive words to return per text
 
      Returns:
-         VocabularyOverlapResult with Jaccard, Dice, cosine similarities,
-         shared vocabulary statistics, and distinctive words for each text.
+         VocabularyOverlapResult with similarity scores, vocabulary statistics,
+         shared vocabulary, and distinctive words for each text.
 
      Example:
-         >>> result = compute_vocabulary_overlap(text1, text2)
+         >>> result = compute_vocabulary_overlap(
+         ...     "The quick brown fox jumps over the lazy dog",
+         ...     "The fast brown fox leaps over the sleepy dog"
+         ... )
          >>> print(f"Jaccard similarity: {result.jaccard_similarity:.3f}")
-         Jaccard similarity: 0.456
+         Jaccard similarity: 0.455
          >>> print(f"Shared words: {result.shared_vocab_size}")
-         Shared words: 234
+         Shared words: 5
+         >>> print(f"Text1 distinctive: {result.text1_distinctive_words}")
+         Text1 distinctive: [('quick', 1.0), ('jumps', 1.0), ('lazy', 1.0)]
+
+     References:
+         Jaccard, P. (1912). The distribution of the flora in the alpine zone.
+             New Phytologist, 11(2), 37-50.
+         Sørensen, T. (1948). A method of establishing groups of equal amplitude
+             in plant sociology based on similarity of species.
+         Salton, G., & McGill, M. J. (1983). Introduction to Modern Information
+             Retrieval. McGraw-Hill.
+         Kullback, S., & Leibler, R. A. (1951). On Information and Sufficiency.
+             Annals of Mathematical Statistics, 22(1), 79-86.
+         Manning, C. D., & Schütze, H. (1999). Foundations of Statistical Natural
+             Language Processing. MIT Press.
      """
-     # TODO: Implement vocabulary overlap analysis
-     # GitHub Issue #21: https://github.com/craigtrim/pystylometry/issues/21
-     raise NotImplementedError(
-         "Vocabulary overlap not yet implemented. "
-         "See GitHub Issue #21: https://github.com/craigtrim/pystylometry/issues/21"
+     # Tokenize texts
+     tokens1 = _tokenize(text1)
+     tokens2 = _tokenize(text2)
+
+     # Build frequency counters and vocabulary sets
+     freq1: Counter[str] = Counter(tokens1)
+     freq2: Counter[str] = Counter(tokens2)
+
+     vocab1 = set(freq1.keys())
+     vocab2 = set(freq2.keys())
+
+     # Compute set operations
+     shared = vocab1 & vocab2
+     union = vocab1 | vocab2
+     unique_to_1 = vocab1 - vocab2
+     unique_to_2 = vocab2 - vocab1
+
+     # Compute similarity metrics
+     jaccard = _compute_jaccard(vocab1, vocab2)
+     dice = _compute_dice(vocab1, vocab2)
+     overlap = _compute_overlap_coefficient(vocab1, vocab2)
+     cosine = _compute_cosine_similarity(freq1, freq2, union)
+     kl_div = _compute_kl_divergence(freq1, freq2, union)
+
+     # Compute coverage ratios
+     text1_coverage = len(shared) / len(vocab1) if vocab1 else 0.0
+     text2_coverage = len(shared) / len(vocab2) if vocab2 else 0.0
+
+     # Get distinctive words
+     text1_distinctive, text2_distinctive = _compute_tfidf_distinctive_words(
+         freq1, freq2, unique_to_1, unique_to_2, top_distinctive
+     )
+
+     # Build shared words list (sorted by combined frequency)
+     shared_with_freq = [(word, freq1[word] + freq2[word]) for word in shared]
+     shared_with_freq.sort(key=lambda x: x[1], reverse=True)
+     shared_words = [word for word, _ in shared_with_freq]
+
+     return VocabularyOverlapResult(
+         # Similarity scores
+         jaccard_similarity=jaccard,
+         dice_coefficient=dice,
+         overlap_coefficient=overlap,
+         cosine_similarity=cosine,
+         kl_divergence=kl_div,
+         # Vocabulary sizes
+         text1_vocab_size=len(vocab1),
+         text2_vocab_size=len(vocab2),
+         shared_vocab_size=len(shared),
+         union_vocab_size=len(union),
+         text1_unique_count=len(unique_to_1),
+         text2_unique_count=len(unique_to_2),
+         # Shared and distinctive vocabulary
+         shared_words=shared_words,
+         text1_distinctive_words=text1_distinctive,
+         text2_distinctive_words=text2_distinctive,
+         # Coverage ratios
+         text1_coverage=text1_coverage,
+         text2_coverage=text2_coverage,
+         # Metadata
+         metadata={
+             "text1_token_count": len(tokens1),
+             "text2_token_count": len(tokens2),
+             "text1_frequencies": dict(freq1),
+             "text2_frequencies": dict(freq2),
+             "unique_to_text1": sorted(unique_to_1),
+             "unique_to_text2": sorted(unique_to_2),
+         },
      )
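Putting the new module together, a usage sketch with values worked out from the code above. The deep import path follows the file's location in RECORD; whether the function is re-exported from `pystylometry.stylistic` is not confirmed by this diff:

```python
from pystylometry.stylistic.vocabulary_overlap import compute_vocabulary_overlap

result = compute_vocabulary_overlap(
    "The quick brown fox jumps over the lazy dog",
    "The fast brown fox leaps over the sleepy dog",
)

# Two 8-word vocabularies share {"the", "brown", "fox", "over", "dog"}:
print(f"{result.jaccard_similarity:.3f}")  # 5 / 11 = 0.455
print(result.shared_vocab_size)            # 5
print(result.text1_distinctive_words)      # 'quick', 'jumps', 'lazy', each scored
                                           # 1.0; order may vary with set iteration
```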
--- /dev/null
+++ b/pystylometry/syntactic/README.md
@@ -0,0 +1,20 @@
+ # syntactic
+
+ ![4 public functions](https://img.shields.io/badge/functions-4-blue)
+ ![Requires spaCy](https://img.shields.io/badge/requires-spaCy-orange)
+
+ Sentence structure, part-of-speech, and parse tree analysis.
+
+ ## Catalogue
+
+ | File | Function | What It Measures |
+ |------|----------|-----------------|
+ | `pos_ratios.py` | `compute_pos_ratios` | Noun/verb/adjective/adverb ratios |
+ | `sentence_stats.py` | `compute_sentence_stats` | Sentence length, word length distributions |
+ | `sentence_types.py` | `compute_sentence_types` | Declarative, interrogative, imperative, exclamatory classification |
+ | `advanced_syntactic.py` | `compute_advanced_syntactic` | Parse tree depth, clausal density, passive voice, T-units, dependency distance, subordination/coordination ratios |
+
+ ## See Also
+
+ - [`stylistic/`](../stylistic/) for higher-level style features built on syntactic foundations
+ - [`ngrams/`](../ngrams/) for POS n-gram sequences via `compute_extended_ngrams(text, pos=True)`
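A hypothetical usage sketch to tie the catalogue together. The function names come from the table above; the import path and call signatures are assumptions, not confirmed by this diff:

```python
# Hypothetical sketch -- signatures and result fields are assumed.
from pystylometry.syntactic import compute_pos_ratios, compute_sentence_types

text = "Close the window. Why is it so cold in here?"
pos = compute_pos_ratios(text)        # noun/verb/adjective/adverb ratios
kinds = compute_sentence_types(text)  # declarative vs. interrogative, etc.
```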
--- /dev/null
+++ b/pystylometry/viz/README.md
@@ -0,0 +1,27 @@
+ # viz
+
+ ![6 public functions](https://img.shields.io/badge/functions-6-blue)
+ ![Optional: matplotlib](https://img.shields.io/badge/optional-matplotlib-yellow)
+
+ Visualization for drift detection results. Two output modes: static PNG (matplotlib) and interactive HTML (React JSX).
+
+ ## Catalogue
+
+ | File | Functions | Output |
+ |------|-----------|--------|
+ | `drift.py` | `plot_drift_timeline`, `plot_drift_scatter`, `plot_drift_report` | PNG via matplotlib/seaborn |
+ | `jsx/report.py` | `export_drift_report_jsx` | Interactive HTML dashboard |
+ | `jsx/timeline.py` | `export_drift_timeline_jsx` | Interactive HTML timeline |
+ | `jsx/viewer.py` | `export_drift_viewer` | Standalone HTML viewer with file upload |
+ | `jsx/_base.py` | _(internal)_ | React/JSX rendering base |
+
+ ## Install
+
+ ```bash
+ pip install pystylometry[viz]  # For PNG output (matplotlib + seaborn)
+ # JSX/HTML output requires no additional dependencies
+ ```
+
+ ## See Also
+
+ - [`consistency/`](../consistency/) produces the `KilgarriffDriftResult` consumed by all viz functions
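A hypothetical end-to-end sketch. Only the function names, the `[viz]` extra, and the `KilgarriffDriftResult` type appear in this diff; import paths and signatures are assumptions:

```python
# Hypothetical sketch -- import paths and signatures are assumed.
from pystylometry.viz import plot_drift_timeline
from pystylometry.viz.jsx import export_drift_report_jsx

drift = ...  # a KilgarriffDriftResult produced by the consistency module

plot_drift_timeline(drift)                  # static PNG; needs pystylometry[viz]
export_drift_report_jsx(drift, "out.html")  # interactive HTML; no extra deps
```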
--- /dev/null
+++ b/pystylometry-1.3.1.dist-info/LICENSE
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 Craig Trim
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
--- /dev/null
+++ b/pystylometry-1.3.1.dist-info/METADATA
@@ -0,0 +1,79 @@
+ Metadata-Version: 2.1
+ Name: pystylometry
+ Version: 1.3.1
+ Summary: Comprehensive Python package for stylometric analysis
+ License: MIT
+ Keywords: stylometry,nlp,text-analysis,authorship,readability,lexical-diversity,readability-metrics
+ Author: Craig Trim
+ Author-email: craigtrim@gmail.com
+ Requires-Python: >=3.9,<4.0
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Developers
+ Classifier: Intended Audience :: Science/Research
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
+ Classifier: Topic :: Text Processing :: Linguistic
+ Classifier: Typing :: Typed
+ Requires-Dist: stylometry-ttr (>=1.0.3,<2.0.0)
+ Project-URL: Homepage, https://github.com/craigtrim/pystylometry
+ Project-URL: Issues, https://github.com/craigtrim/pystylometry/issues
+ Project-URL: Repository, https://github.com/craigtrim/pystylometry
+ Description-Content-Type: text/markdown
+
+ # pystylometry
+
+ [![PyPI version](https://badge.fury.io/py/pystylometry.svg)](https://badge.fury.io/py/pystylometry)
+ [![Downloads](https://static.pepy.tech/badge/pystylometry)](https://pepy.tech/project/pystylometry)
+ [![Downloads/Month](https://static.pepy.tech/badge/pystylometry/month)](https://pepy.tech/project/pystylometry)
+ [![Python 3.9+](https://img.shields.io/badge/python-3.9%2B-blue.svg)](https://www.python.org/downloads/)
+ [![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](https://opensource.org/licenses/MIT)
+ [![Tests](https://img.shields.io/badge/tests-1022%20passed-brightgreen)]()
+
+ Stylometric analysis and authorship attribution for Python. 50+ metrics across 11 modules, from vocabulary diversity to AI-generation detection.
+
+ ## Install
+
+ ```bash
+ pip install pystylometry       # Core (lexical metrics)
+ pip install pystylometry[all]  # Everything
+ ```
+
+ ## Modules
+
+ | Module | Metrics | Description |
+ |--------|---------|-------------|
+ | [**lexical**](https://github.com/craigtrim/pystylometry/tree/master/pystylometry/lexical) | TTR, MTLD, Yule's K/I, Hapax, MATTR, VocD-D, HD-D, MSTTR, function words, word frequency | Vocabulary diversity and richness |
+ | [**readability**](https://github.com/craigtrim/pystylometry/tree/master/pystylometry/readability) | Flesch, Flesch-Kincaid, SMOG, Gunning Fog, Coleman-Liau, ARI, Dale-Chall, Fry, FORCAST, Linsear Write, Powers-Sumner-Kearl | Grade-level and difficulty scoring |
+ | [**syntactic**](https://github.com/craigtrim/pystylometry/tree/master/pystylometry/syntactic) | POS ratios, sentence types, parse tree depth, clausal density, passive voice, T-units, dependency distance | Sentence and parse structure (requires spaCy) |
+ | [**authorship**](https://github.com/craigtrim/pystylometry/tree/master/pystylometry/authorship) | Burrows' Delta, Cosine Delta, Zeta, Kilgarriff chi-squared, MinMax, John's Delta, NCD | Author attribution and text comparison |
+ | [**stylistic**](https://github.com/craigtrim/pystylometry/tree/master/pystylometry/stylistic) | Contractions, hedges, intensifiers, modals, punctuation, vocabulary overlap (Jaccard/Dice/Cosine/KL), cohesion, genre/register | Style markers and text similarity |
+ | [**character**](https://github.com/craigtrim/pystylometry/tree/master/pystylometry/character) | Letter frequencies, digit/uppercase ratios, special characters, whitespace | Character-level fingerprinting |
+ | [**ngrams**](https://github.com/craigtrim/pystylometry/tree/master/pystylometry/ngrams) | Word/character/POS n-grams, Shannon entropy, skipgrams | N-gram profiles and entropy |
+ | [**dialect**](https://github.com/craigtrim/pystylometry/tree/master/pystylometry/dialect) | British/American classification, spelling/grammar/vocabulary markers, markedness | Regional dialect detection |
+ | [**consistency**](https://github.com/craigtrim/pystylometry/tree/master/pystylometry/consistency) | Sliding-window chi-squared drift, pattern classification | Intra-document style analysis |
+ | [**prosody**](https://github.com/craigtrim/pystylometry/tree/master/pystylometry/prosody) | Syllable stress, rhythm regularity | Prose rhythm (requires spaCy) |
+ | [**viz**](https://github.com/craigtrim/pystylometry/tree/master/pystylometry/viz) | Timeline, scatter, report (PNG + interactive HTML) | Drift detection visualization |
+
+ ## Development
+
+ ```bash
+ git clone https://github.com/craigtrim/pystylometry && cd pystylometry
+ pip install -e ".[dev,all]"
+ make test  # 1022 tests
+ make lint  # ruff + mypy
+ make all   # lint + test + build
+ ```
+
+ ## License
+
+ MIT
+
+ ## Author
+
+ Craig Trim -- craigtrim@gmail.com
+
--- a/pystylometry-1.1.0.dist-info/RECORD
+++ b/pystylometry-1.3.1.dist-info/RECORD
@@ -1,35 +1,46 @@
- pystylometry/__init__.py,sha256=UQGe2EJUdMh1rE1zmIGNqrMgbipAhPDDU5Cvp_w--64,8594
+ pystylometry/README.md,sha256=WFOtCAF3qtDTgGG3a_jTjNSwVgpQEXI1PKqbVBfyo1M,2366
+ pystylometry/__init__.py,sha256=Z6zkHlX05SUeObDca9dL1Gkfq4UPBWbU2M4sp4fVj78,9220
  pystylometry/_normalize.py,sha256=7tdfgAKg5CI2d4eoDypmFqOVByoxpwgUUZD6vyBH86A,8679
- pystylometry/_types.py,sha256=OOKJ0Y_2OtaiQn_Y0EVHTOkPrNlWDhdu5Jl-4quuNZw,74257
+ pystylometry/_types.py,sha256=g6XzwCHeMAIBfexId6Pd9EQfJzvZ0KYMfD4kpS5T7BQ,82284
  pystylometry/_utils.py,sha256=CXTx4KDJ_6iiHcc2OXqOYs-izhLf_ZEmJFKdHyd7q34,5282
- pystylometry/authorship/__init__.py,sha256=QJMZ9xx5mf3u9X-HAdZsgqzZGhi4G6Pgj7wjNYGholE,1343
- pystylometry/authorship/additional_methods.py,sha256=ZCYwM_OEq3fCZGkCL0wsOUBiMSvrF1paVON4xueJDW4,2583
+ pystylometry/authorship/README.md,sha256=zNXCpLj7nczPnYykJnCUw3y-kxfC9mWZmngi3nfw6us,1016
+ pystylometry/authorship/__init__.py,sha256=D7m38hWi_62o1ZDSrghLCfob9YsykTht4K37wiVgHfg,1530
+ pystylometry/authorship/additional_methods.py,sha256=jvEg6TMI55jhkDt1jpC-08iXTzz6TaNmKOkJy5qNF0c,11487
  pystylometry/authorship/burrows_delta.py,sha256=6XC8I7EcBTLbn9BNKZsOtL0otL4vKFX10aHBlU4Bki4,5677
- pystylometry/authorship/kilgarriff.py,sha256=Hqv5Ww7s_Tn4KSpcDAE_dTVESv4X3pglkQt0bYjQGW0,13097
+ pystylometry/authorship/compression.py,sha256=qqUHDd7wWOB6Q2E97-cczBEWhKDTF3ynJUhbRqGq_RA,6296
+ pystylometry/authorship/kilgarriff.py,sha256=oz4JbLnFEuPXZYLmhfkuapg516A554FvXvVNIVu7uKk,13379
  pystylometry/authorship/zeta.py,sha256=oOi9Y6ZPq15ILLVl6So9O9ERvzig26en6_dpQJWeoOc,4338
+ pystylometry/character/README.md,sha256=poQwhbI8MabVD_626CWjEL87IOX5YDGS0ZJTH1hNwEE,607
  pystylometry/character/__init__.py,sha256=CiiKJmZ10UJE8qAecavpOKyw-vGonsOew_mFH34ZOC0,371
  pystylometry/character/character_metrics.py,sha256=OCIGP_ivtwtzcifcxcbmp2R5SIKh2tKyvKcHAv64S8g,14029
  pystylometry/cli.py,sha256=z0yx2O_E05tHT9_BHgSaQ2zq5_fBERXfhbYHcuQ2y-A,15477
+ pystylometry/consistency/README.md,sha256=HG_Rd6WRBnIz3M7J11dVDv1S2ARkMABFYrTn-VV8xRY,1058
  pystylometry/consistency/__init__.py,sha256=l7nzpS7M4yHDBbM2LGAtW0XGT2n7YjSey_1xKf45224,2181
  pystylometry/consistency/_thresholds.py,sha256=5fZwdJ_cnDy0ED7CCYs6V_zP6kIAR1p0h0NYkbZ0HRg,6381
  pystylometry/consistency/drift.py,sha256=ZqK7YJXic8ceIfQLkH9ZtXFJCFyOuto5Mktz4qLG9ps,20682
+ pystylometry/dialect/README.md,sha256=Bz0oGFRaWXjfZQqlMgvQ75rA9U0E67am2mJ9nWcSBhQ,1089
  pystylometry/dialect/__init__.py,sha256=6S4OKymniuDXPm3ZMqWyy9179RlWoLJoDzkCP4P7Jss,2486
  pystylometry/dialect/_data/dialect_markers.json,sha256=DthluOA6q0rG_8IrCrFIYWh_EMvINqYv7W664sEjNN4,51799
  pystylometry/dialect/_loader.py,sha256=M2ATp-5754v_yX9EWvBP0r5qgNf8xlL8XadVsVb_Hco,12989
  pystylometry/dialect/detector.py,sha256=9x0ZuIfTIjsmdNSx0Ezy5AC0SAFtC4kVw11iOSBd9gQ,20147
- pystylometry/lexical/__init__.py,sha256=HTncnGVZgpktZqpf-r4_HI_9Jq42WkZZKXn8nho3y3s,751
+ pystylometry/lexical/README.md,sha256=cFQ7KRZV4ubsQwIlOH3YHTbhhNl5X91Sr3zcn-3x0HI,1185
+ pystylometry/lexical/__init__.py,sha256=_VpemdfVYZYXHP4ulTItoyegJ-3lE85wlfzDCpseaNE,898
  pystylometry/lexical/advanced_diversity.py,sha256=rL1hlNqTnaEFcA2v4oBJlojHZMTqdvvm4jYXTFGVpYE,25664
  pystylometry/lexical/function_words.py,sha256=eel9bq_qWgWlvG0NtDiouilMt9kaFqz2rh3add2UC4U,17832
  pystylometry/lexical/hapax.py,sha256=djTqZyZIYXa3GRiPoy6TTGHPm0wCRNJ9U0Rwnf5NoDk,12173
  pystylometry/lexical/mtld.py,sha256=XpeCF8sOXZhWbaazHGuqm08mrOf_DYfkfGGAltWnyy4,7101
+ pystylometry/lexical/repetition.py,sha256=A9L0oNwfnCepVkWy57kjHV47Pw4M6fZXEl25hBVdq2s,18318
  pystylometry/lexical/ttr.py,sha256=iEsXkoSPyZEyiiFwKatKA8KhLRukD7RDRvyRkRQOTsk,5848
  pystylometry/lexical/word_frequency_sophistication.py,sha256=OHOS0fBvd1Bz8zsJk-pJbWLTgImmBd-aewQnp_kq8BY,38828
  pystylometry/lexical/yule.py,sha256=NXggha8jmQCu4i-qKZpISwyJBqNpuPHyVR86BLDLgio,5192
+ pystylometry/ngrams/README.md,sha256=50wyaWcLGbosLzTPR1cXdE_xAVU8jVY7fd3ReEk9KnY,802
  pystylometry/ngrams/__init__.py,sha256=eyITmSG4QP1NtVSagPsvc4j6W_E8TdB9wvBvXQHUnwo,379
  pystylometry/ngrams/entropy.py,sha256=i2RzYXrcTTIv6QaUCNQjAahL5LFOctG3ZE1OJ_tY4II,7246
- pystylometry/ngrams/extended_ngrams.py,sha256=OsBHTaaK73ZOhpS_yG2aWip1yWY2Fitdte0qx3wIshc,9475
+ pystylometry/ngrams/extended_ngrams.py,sha256=288nrXbY6-PIJiQ3NaspnuRZ7qWakantnNKvtb5LhWI,18316
+ pystylometry/prosody/README.md,sha256=YNTU0sTnXbCJ9GBPDDfTqHELr4YoF59_bg99ejPiqEE,608
  pystylometry/prosody/__init__.py,sha256=9tiD-U4sqEtUV8n9X339oF_C5tBNingjL-shGBXOrnY,265
- pystylometry/prosody/rhythm_prosody.py,sha256=V9OoxV5d4AZRZAb2HDY7-iEK1ijE7gtHhvFRD2DJvdA,1960
+ pystylometry/prosody/rhythm_prosody.py,sha256=fifKW0FiRwC6xPX1NX0Yr4Il3APNfQiBEXB-uXXgZo8,28697
+ pystylometry/readability/README.md,sha256=jj5I5525WRJceMJR8lECiZb-7y1nFzSK00GSotqupFs,1173
  pystylometry/readability/__init__.py,sha256=bJenjlGpNx7FF5AfOb6VA-wODdIa7Hc9iqoba1DLlh0,637
  pystylometry/readability/additional_formulas.py,sha256=nlVegnn_RRh6TP0BoLWlLBNnAgtFqLqyDsxFN_fUrAg,44993
  pystylometry/readability/ari.py,sha256=_wPl0FjEReLRHN0v4JQbRaU_kbikIxkr9mLO6hmNVyI,6833
@@ -39,17 +50,20 @@ pystylometry/readability/flesch.py,sha256=7kMeqpYnm-oqQGsDw7yJBhFecXB5ZRU9C8P4UK
  pystylometry/readability/gunning_fog.py,sha256=ntV90NUfqSm_84H1jBa2Fhr5DhlkderHLq8_z3khb48,8375
  pystylometry/readability/smog.py,sha256=8hdQQHUR9UBP-02AyZK3TbNhyyE1LQuZmlnVrs5Yvrk,5742
  pystylometry/readability/syllables.py,sha256=U_tO1fmdOh2xyIJVkFooGMhmZs1hqlFPBa9wBjEwLw8,4272
+ pystylometry/stylistic/README.md,sha256=1GBo3AQ8f4ATap723is6pJtgUM9jmLy-hDOTcVWuI48,1020
  pystylometry/stylistic/__init__.py,sha256=nMykFZUCUKj-ZTk5H0OSKn24w6CSVEVIWieNG2B2hhc,581
- pystylometry/stylistic/cohesion_coherence.py,sha256=M_Pqfj0ZfCLDZBKFQCPx7rX9k6mxWFOjIsm1gsLdFyg,1618
- pystylometry/stylistic/genre_register.py,sha256=R32csC0M3eRcnACJNqMsyN-1ucMwdK8Twm5Tsa0Dd4k,1664
- pystylometry/stylistic/markers.py,sha256=s0ybwUZ6_wE064NXL9kQeTLKVeSHScFgZip7zkKYi2U,5134
- pystylometry/stylistic/vocabulary_overlap.py,sha256=TD8Rn32htB6MPHjc9xkr0LepJ6Q9k7f6uJvZt9_5aXA,1717
+ pystylometry/stylistic/cohesion_coherence.py,sha256=9al3AYH2KQ62aluQJQr0pQHcNf1Aec6G8Oa9zux_uZk,23286
+ pystylometry/stylistic/genre_register.py,sha256=4s-TxEBnFB-iog2yIO1RT6D66AQ3ChOjakRmOZzL8LM,41279
+ pystylometry/stylistic/markers.py,sha256=AsuBsq5ZNTGHEp12AEL0mHj9XCJBKf3bwt7JW4H_xKs,24204
+ pystylometry/stylistic/vocabulary_overlap.py,sha256=6ujoiE7TqrCiGEBrBuDeU6sdKSQYAG6IbrYVR3o9lMY,12931
+ pystylometry/syntactic/README.md,sha256=0eQGqQz9MIE024_Oge4pq9LNdi-GmuTuAlz-DrK2jDI,982
  pystylometry/syntactic/__init__.py,sha256=B9qe0R7w9t5x2s2dXygSuvciuEHrScgD3CkxvPWKMPE,391
  pystylometry/syntactic/advanced_syntactic.py,sha256=ygbm7y1hrNJCaIxRCfZsafvt6BInh2iCTY1eWk2PdaE,19195
  pystylometry/syntactic/pos_ratios.py,sha256=lcvtx6tshVG6MpTWivyWnqFsjFXIHK3LCqyg2AL2AjY,7444
  pystylometry/syntactic/sentence_stats.py,sha256=SJg6TYCiT3gs2bXHYuEMSRgzFnxqOCH5q6WyhjXKgH4,4947
  pystylometry/syntactic/sentence_types.py,sha256=xEQPieGqTInCz9BinvItBX5Z_ofQ-BbFwTFNgY0jWx0,18730
  pystylometry/tokenizer.py,sha256=03FEF4kKp72v-ypbtMg8u0WyVJGk3YJx6Nw3SGzyAnA,18166
+ pystylometry/viz/README.md,sha256=mizuBpUzWgJqjC2u9C-Lu4sVDCcTQOgGsarRSkeWPf4,1031
  pystylometry/viz/__init__.py,sha256=3kHMAcJJi8oPhTqUZIRdyf311cdyPOHWaJIUv-w0V04,2219
  pystylometry/viz/drift.py,sha256=r98gQ4s_IlrEuaouxDMyue3cTjGqj10i4IeKC01IuCo,18956
  pystylometry/viz/jsx/__init__.py,sha256=ZCgbpMPhG5PiJ92IkJRrZwrb7RodZB9MyauO0MGgbRM,1107
@@ -57,7 +71,8 @@ pystylometry/viz/jsx/_base.py,sha256=nd7kEc13fUcRMom3A5jqjGyTy-djIeydq2k3oPHZIHY
  pystylometry/viz/jsx/report.py,sha256=DbbHnnNAEi5tmVg4PmiHb17vkBBXujyE4x1CfVBiOBw,25857
  pystylometry/viz/jsx/timeline.py,sha256=hor-xnBa6oVkSqN0AEZUCQFBOB-iTfHSFZHiEfeakPA,30716
  pystylometry/viz/jsx/viewer.py,sha256=3LO49d_2bRf_P-P-2oSKpKx4N8Ugo4oCLb3DtvyNxXI,43716
- pystylometry-1.1.0.dist-info/METADATA,sha256=QEXVX6buqxGwilLuOIvjFT2ZxoPNazUWX1iyhaM8vI8,8348
- pystylometry-1.1.0.dist-info/WHEEL,sha256=3ny-bZhpXrU6vSQ1UPG34FoxZBp3lVcvK0LkgUz6VLk,88
- pystylometry-1.1.0.dist-info/entry_points.txt,sha256=iHOaFXlyiwcQM1LlID2gWGmN4DBLdTSpKGjttU8tgm8,113
- pystylometry-1.1.0.dist-info/RECORD,,
+ pystylometry-1.3.1.dist-info/LICENSE,sha256=vou5JCLAT5nHcsUv-AkjUYAihYfN9mwPDXxV2DHyHBo,1067
+ pystylometry-1.3.1.dist-info/METADATA,sha256=Nn-0-ABq9tykuxWpC79GkhHO71oWLnAseh0z9R3mycs,4813
+ pystylometry-1.3.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+ pystylometry-1.3.1.dist-info/entry_points.txt,sha256=iHOaFXlyiwcQM1LlID2gWGmN4DBLdTSpKGjttU8tgm8,113
+ pystylometry-1.3.1.dist-info/RECORD,,
--- a/pystylometry-1.1.0.dist-info/WHEEL
+++ b/pystylometry-1.3.1.dist-info/WHEEL
@@ -1,4 +1,4 @@
  Wheel-Version: 1.0
- Generator: poetry-core 2.3.0
+ Generator: poetry-core 1.9.0
  Root-Is-Purelib: true
  Tag: py3-none-any