pystylometry 0.1.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. pystylometry/__init__.py +1 -2
  2. pystylometry/_normalize.py +277 -0
  3. pystylometry/_types.py +1224 -2
  4. pystylometry/_utils.py +4 -0
  5. pystylometry/authorship/__init__.py +4 -0
  6. pystylometry/authorship/additional_methods.py +100 -0
  7. pystylometry/character/__init__.py +15 -0
  8. pystylometry/character/character_metrics.py +301 -0
  9. pystylometry/lexical/__init__.py +13 -6
  10. pystylometry/lexical/advanced_diversity.py +641 -0
  11. pystylometry/lexical/function_words.py +391 -0
  12. pystylometry/lexical/hapax.py +154 -7
  13. pystylometry/lexical/mtld.py +83 -7
  14. pystylometry/lexical/ttr.py +83 -0
  15. pystylometry/lexical/word_frequency_sophistication.py +581 -0
  16. pystylometry/lexical/yule.py +34 -7
  17. pystylometry/ngrams/__init__.py +2 -0
  18. pystylometry/ngrams/extended_ngrams.py +235 -0
  19. pystylometry/prosody/__init__.py +12 -0
  20. pystylometry/prosody/rhythm_prosody.py +53 -0
  21. pystylometry/readability/__init__.py +12 -0
  22. pystylometry/readability/additional_formulas.py +985 -0
  23. pystylometry/readability/ari.py +93 -17
  24. pystylometry/readability/coleman_liau.py +102 -9
  25. pystylometry/readability/complex_words.py +531 -0
  26. pystylometry/readability/flesch.py +59 -14
  27. pystylometry/readability/gunning_fog.py +194 -25
  28. pystylometry/readability/smog.py +31 -14
  29. pystylometry/readability/syllables.py +137 -30
  30. pystylometry/stylistic/__init__.py +20 -0
  31. pystylometry/stylistic/cohesion_coherence.py +45 -0
  32. pystylometry/stylistic/genre_register.py +45 -0
  33. pystylometry/stylistic/markers.py +131 -0
  34. pystylometry/stylistic/vocabulary_overlap.py +47 -0
  35. pystylometry/syntactic/__init__.py +4 -0
  36. pystylometry/syntactic/advanced_syntactic.py +432 -0
  37. pystylometry/syntactic/pos_ratios.py +104 -13
  38. pystylometry/syntactic/sentence_stats.py +57 -13
  39. pystylometry/syntactic/sentence_types.py +470 -0
  40. {pystylometry-0.1.0.dist-info → pystylometry-1.0.0.dist-info}/METADATA +49 -12
  41. pystylometry-1.0.0.dist-info/RECORD +46 -0
  42. {pystylometry-0.1.0.dist-info → pystylometry-1.0.0.dist-info}/WHEEL +1 -1
  43. pystylometry-0.1.0.dist-info/RECORD +0 -26
pystylometry/_utils.py CHANGED
@@ -9,9 +9,13 @@ from .tokenizer import Tokenizer
 # ===== Convenience Functions =====
 
 # Default tokenizer instance for backward compatibility
+# Preserves emails and URLs to allow readability metrics (like Coleman-Liau)
+# to count their alphabetic characters
 _default_tokenizer = Tokenizer(
     lowercase=False,
     strip_punctuation=False,
+    preserve_urls=True,
+    preserve_emails=True,
 )
 
 
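Why the new defaults matter: Coleman-Liau is driven by letters per word, so a tokenizer that drops URL and email tokens deflates the per-word character count. A minimal plain-Python sketch of the effect (illustration only, not the package's Tokenizer API; the sample text is hypothetical):

    text = "Contact me at alice@example.com for details"
    tokens = text.split()

    def letters_per_word(tokens: list[str]) -> float:
        # Mean number of alphabetic characters per whitespace token.
        return sum(sum(ch.isalpha() for ch in tok) for tok in tokens) / len(tokens)

    print(f"{letters_per_word(tokens):.2f}")                               # 6.00, email kept
    print(f"{letters_per_word([t for t in tokens if '@' not in t]):.2f}")  # 4.20, email dropped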
pystylometry/authorship/__init__.py CHANGED
@@ -1,5 +1,6 @@
 """Authorship attribution metrics."""
 
+from .additional_methods import compute_johns_delta, compute_kilgarriff, compute_minmax
 from .burrows_delta import compute_burrows_delta, compute_cosine_delta
 from .zeta import compute_zeta
 
@@ -7,4 +8,7 @@ __all__ = [
     "compute_burrows_delta",
     "compute_cosine_delta",
     "compute_zeta",
+    "compute_kilgarriff",
+    "compute_minmax",
+    "compute_johns_delta",
 ]
pystylometry/authorship/additional_methods.py ADDED
@@ -0,0 +1,100 @@
+"""Additional authorship attribution methods.
+
+This module provides alternative distance/similarity metrics for authorship
+attribution beyond Burrows' Delta and Zeta.
+
+Related GitHub Issue:
+    #24 - Additional Authorship Attribution Methods
+    https://github.com/craigtrim/pystylometry/issues/24
+
+Methods implemented:
+    - Kilgarriff's Chi-squared
+    - Min-Max (Burrows' original method)
+    - John Burrows' Delta variations
+
+References:
+    Kilgarriff, A. (2001). Comparing corpora. International Journal of Corpus Linguistics.
+    Burrows, J. F. (1992). Not unless you ask nicely. Literary and Linguistic Computing.
+    Burrows, J. (2005). Who wrote Shamela? Literary and Linguistic Computing.
+"""
+
+from .._types import JohnsBurrowsResult, KilgarriffResult, MinMaxResult
+
+
+def compute_kilgarriff(text1: str, text2: str, mfw: int = 100) -> KilgarriffResult:
+    """
+    Compute Kilgarriff's Chi-squared distance between two texts.
+
+    Related GitHub Issue:
+        #24 - Additional Authorship Attribution Methods
+        https://github.com/craigtrim/pystylometry/issues/24
+
+    Args:
+        text1: First text for comparison
+        text2: Second text for comparison
+        mfw: Number of most frequent words to analyze
+
+    Returns:
+        KilgarriffResult with chi-squared statistic, p-value, and
+        most distinctive features.
+    """
+    # TODO: Implement Kilgarriff's chi-squared
+    # GitHub Issue #24: https://github.com/craigtrim/pystylometry/issues/24
+    raise NotImplementedError(
+        "Kilgarriff's chi-squared not yet implemented. "
+        "See GitHub Issue #24: https://github.com/craigtrim/pystylometry/issues/24"
+    )
+
+
+def compute_minmax(text1: str, text2: str, mfw: int = 100) -> MinMaxResult:
+    """
+    Compute Min-Max distance (Burrows' original method).
+
+    Related GitHub Issue:
+        #24 - Additional Authorship Attribution Methods
+        https://github.com/craigtrim/pystylometry/issues/24
+
+    Args:
+        text1: First text for comparison
+        text2: Second text for comparison
+        mfw: Number of most frequent words to analyze
+
+    Returns:
+        MinMaxResult with min-max distance and distinctive features.
+    """
+    # TODO: Implement Min-Max distance
+    # GitHub Issue #24: https://github.com/craigtrim/pystylometry/issues/24
+    raise NotImplementedError(
+        "Min-Max distance not yet implemented. "
+        "See GitHub Issue #24: https://github.com/craigtrim/pystylometry/issues/24"
+    )
+
+
+def compute_johns_delta(
+    text1: str,
+    text2: str,
+    mfw: int = 100,
+    method: str = "quadratic",
+) -> JohnsBurrowsResult:
+    """
+    Compute John Burrows' Delta variations.
+
+    Related GitHub Issue:
+        #24 - Additional Authorship Attribution Methods
+        https://github.com/craigtrim/pystylometry/issues/24
+
+    Args:
+        text1: First text for comparison
+        text2: Second text for comparison
+        mfw: Number of most frequent words to analyze
+        method: Delta variation ("quadratic", "weighted", "rotated")
+
+    Returns:
+        JohnsBurrowsResult with delta score and method details.
+    """
+    # TODO: Implement John's Delta variations
+    # GitHub Issue #24: https://github.com/craigtrim/pystylometry/issues/24
+    raise NotImplementedError(
+        "John's Delta variations not yet implemented. "
+        "See GitHub Issue #24: https://github.com/craigtrim/pystylometry/issues/24"
+    )
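All three functions are stubs that raise NotImplementedError pending Issue #24. For orientation, here is a hedged sketch of what Kilgarriff's chi-squared over the mfw most frequent words could look like, following Kilgarriff (2001); this is an illustration of the method, not the package's pending implementation, and it assumes whitespace tokenization and non-empty texts:

    from collections import Counter

    def kilgarriff_chi2_sketch(text1: str, text2: str, mfw: int = 100) -> float:
        c1 = Counter(text1.lower().split())
        c2 = Counter(text2.lower().split())
        n1, n2 = sum(c1.values()), sum(c2.values())

        chi2 = 0.0
        # Rank words by combined frequency and keep the top `mfw`.
        for word, _ in (c1 + c2).most_common(mfw):
            observed1, observed2 = c1[word], c2[word]
            pooled = observed1 + observed2
            # Expected counts split the pooled total in proportion to corpus size.
            expected1 = pooled * n1 / (n1 + n2)
            expected2 = pooled * n2 / (n1 + n2)
            chi2 += (observed1 - expected1) ** 2 / expected1
            chi2 += (observed2 - expected2) ** 2 / expected2
        return chi2

Lower scores indicate more similar word-frequency profiles; the p-value and distinctive features promised by KilgarriffResult would fall out of the same table of observed and expected counts.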
pystylometry/character/__init__.py ADDED
@@ -0,0 +1,15 @@
+"""Character-level metrics for stylometric analysis.
+
+This package provides character-level features for authorship attribution
+and style analysis.
+
+Related GitHub Issue:
+    #12 - Character-Level Metrics
+    https://github.com/craigtrim/pystylometry/issues/12
+"""
+
+from .character_metrics import compute_character_metrics
+
+__all__ = [
+    "compute_character_metrics",
+]
pystylometry/character/character_metrics.py ADDED
@@ -0,0 +1,301 @@
+"""Character-level metrics for stylometric analysis.
+
+This module provides character-level features that capture low-level patterns
+in writing style. Character-level metrics are fundamental for authorship
+attribution and can reveal distinctive patterns in punctuation usage,
+word construction, and formatting preferences.
+
+Related GitHub Issue:
+    #12 - Character-Level Metrics
+    https://github.com/craigtrim/pystylometry/issues/12
+
+Features implemented:
+    - Average word length (characters per word)
+    - Average sentence length (characters per sentence)
+    - Punctuation density and variety
+    - Letter frequency distribution
+    - Vowel-to-consonant ratio
+    - Digit frequency and ratio
+    - Uppercase ratio
+    - Whitespace ratio
+
+References:
+    Grieve, J. (2007). Quantitative authorship attribution: An evaluation
+        of techniques. Literary and Linguistic Computing, 22(3), 251-270.
+    Stamatatos, E. (2009). A survey of modern authorship attribution methods.
+        JASIST, 60(3), 538-556.
+"""
+
+from .._types import CharacterMetricsResult
+
+
+def compute_character_metrics(text: str) -> CharacterMetricsResult:
+    """
+    Compute character-level stylometric metrics.
+
+    This function analyzes text at the character level to extract features
+    related to word length, punctuation usage, letter distribution, and
+    other low-level patterns that can be distinctive for authorship
+    attribution and style analysis.
+
+    Related GitHub Issue:
+        #12 - Character-Level Metrics
+        https://github.com/craigtrim/pystylometry/issues/12
+
+    Character-level features are particularly valuable because:
+    1. They are language-independent (work across languages)
+    2. They capture subconscious writing patterns
+    3. They are resistant to topic variation
+    4. They complement higher-level metrics (words, syntax)
+
+    Metrics computed:
+        - Average word length: Mean characters per word
+        - Average sentence length (chars): Mean characters per sentence
+        - Punctuation density: Punctuation marks per 100 words
+        - Punctuation variety: Count of unique punctuation types used
+        - Letter frequency: Distribution of a-z (case-insensitive)
+        - Vowel-to-consonant ratio: Ratio of vowels to consonants
+        - Digit count/ratio: Numeric character usage
+        - Uppercase ratio: Uppercase letters / total letters
+        - Whitespace ratio: Whitespace characters / total characters
+
+    Args:
+        text: Input text to analyze. Should contain at least one sentence
+            for meaningful results. Empty text will return NaN for ratios
+            and 0 for counts.
+
+    Returns:
+        CharacterMetricsResult with all character-level features and metadata.
+        For empty text, all ratios will be NaN and counts will be 0.
+
+    Example:
+        >>> result = compute_character_metrics("The quick brown fox jumps!")
+        >>> print(f"Avg word length: {result.avg_word_length:.2f}")
+        Avg word length: 4.20
+        >>> print(f"Punctuation density: {result.punctuation_density:.2f}")
+        Punctuation density: 20.00
+        >>> print(f"Vowel/consonant ratio: {result.vowel_consonant_ratio:.2f}")
+        Vowel/consonant ratio: 0.40
+
+        >>> # Empty text handling
+        >>> result = compute_character_metrics("")
+        >>> import math
+        >>> math.isnan(result.avg_word_length)
+        True
+        >>> result.digit_count
+        0
+
+    Note:
+        - Punctuation marks include: . , ! ? ; : - ' " ( ) [ ] { } ... etc.
+        - Whitespace includes spaces, tabs, newlines
+        - Letter frequency is case-insensitive (lowercase normalized; non-ASCII letters are skipped)
+        - Words are tokenized by whitespace for length calculation
+        - Sentences are split using standard sentence delimiters (. ! ?)
+    """
+    # Define character sets
+    # GitHub Issue #12: https://github.com/craigtrim/pystylometry/issues/12
+    PUNCTUATION = {
+        ".", ",", "!", "?", ";", ":", "-", "—", "–",  # Basic punctuation
+        "'", '"', """, """, "'", "'",  # Quotes
+        "(", ")", "[", "]", "{", "}",  # Brackets
+        "/", "\\", "|",  # Slashes
+        "…",  # Ellipsis
+        "*", "&", "@", "#", "$", "%", "^", "~", "`",  # Special symbols
+    }
+    VOWELS = {"a", "e", "i", "o", "u"}
+
+    # Handle empty text
+    if not text:
+        # Return NaN for all ratios, 0 for all counts
+        empty_letter_freq = {letter: 0.0 for letter in "abcdefghijklmnopqrstuvwxyz"}
+        return CharacterMetricsResult(
+            avg_word_length=float("nan"),
+            avg_sentence_length_chars=float("nan"),
+            punctuation_density=float("nan"),
+            punctuation_variety=0,
+            letter_frequency=empty_letter_freq,
+            vowel_consonant_ratio=float("nan"),
+            digit_count=0,
+            digit_ratio=float("nan"),
+            uppercase_ratio=float("nan"),
+            whitespace_ratio=float("nan"),
+            metadata={
+                "total_characters": 0,
+                "total_letters": 0,
+                "total_words": 0,
+                "total_sentences": 0,
+                "total_punctuation": 0,
+                "total_whitespace": 0,
+                "total_digits": 0,
+                "punctuation_types": [],
+                "vowel_count": 0,
+                "consonant_count": 0,
+                "uppercase_count": 0,
+                "lowercase_count": 0,
+            },
+        )
+
+    # Initialize counters
+    total_chars = len(text)
+    letter_counts = {letter: 0 for letter in "abcdefghijklmnopqrstuvwxyz"}
+    vowel_count = 0
+    consonant_count = 0
+    uppercase_count = 0
+    lowercase_count = 0
+    digit_count = 0
+    whitespace_count = 0
+    punctuation_count = 0
+    punctuation_types = set()
+
+    # Single pass through text to classify and count all characters
+    for char in text:
+        if char.isalpha():
+            if char.lower() in letter_counts:  # a-z frequency; skip non-ASCII letters (avoids KeyError)
+                letter_counts[char.lower()] += 1
+
+            # Count vowels and consonants
+            if char.lower() in VOWELS:
+                vowel_count += 1
+            else:
+                consonant_count += 1
+
+            # Count uppercase and lowercase
+            if char.isupper():
+                uppercase_count += 1
+            else:
+                lowercase_count += 1
+
+        elif char.isdigit():
+            digit_count += 1
+
+        elif char.isspace():
+            whitespace_count += 1
+
+        elif char in PUNCTUATION:
+            punctuation_count += 1
+            punctuation_types.add(char)
+
+    total_letters = vowel_count + consonant_count
+
+    # Calculate letter frequency distribution (normalize to sum to 1.0)
+    if total_letters > 0:
+        letter_frequency = {letter: count / total_letters for letter, count in letter_counts.items()}
+    else:
+        letter_frequency = {letter: 0.0 for letter in "abcdefghijklmnopqrstuvwxyz"}
+
+    # Tokenize into words (split on whitespace, then strip punctuation for length)
+    words = text.split()
+    total_words = len(words)
+
+    # Calculate average word length (count only letters and digits in words)
+    if total_words > 0:
+        word_lengths = []
+        for word in words:
+            # Count only alphanumeric characters for word length
+            word_length = sum(1 for char in word if char.isalnum())
+            if word_length > 0:  # Only count words with at least one alphanumeric char
+                word_lengths.append(word_length)
+
+        if word_lengths:
+            avg_word_length = sum(word_lengths) / len(word_lengths)
+        else:
+            avg_word_length = float("nan")
+    else:
+        avg_word_length = float("nan")
+
+    # Segment text into sentences (split on . ! ?)
+    # Simple approach: split on sentence delimiters
+    sentence_delimiters = {".", "!", "?"}
+    sentences = []
+    current_sentence = []
+
+    for char in text:
+        current_sentence.append(char)
+        if char in sentence_delimiters:
+            # End of sentence
+            sentence_text = "".join(current_sentence).strip()
+            if sentence_text:  # Only add non-empty sentences
+                sentences.append(sentence_text)
+            current_sentence = []
+
+    # Add any remaining text as a sentence if it's non-empty and doesn't end with delimiter
+    if current_sentence:
+        sentence_text = "".join(current_sentence).strip()
+        if sentence_text:
+            sentences.append(sentence_text)
+
+    total_sentences = len(sentences)
+
+    # Calculate average sentence length in characters
+    if total_sentences > 0:
+        sentence_lengths = [len(sent) for sent in sentences]
+        avg_sentence_length_chars = sum(sentence_lengths) / total_sentences
+    else:
+        avg_sentence_length_chars = float("nan")
+
+    # Calculate punctuation density (per 100 words)
+    if total_words > 0:
+        punctuation_density = (punctuation_count / total_words) * 100
+    else:
+        punctuation_density = float("nan")
+
+    # Punctuation variety (count of unique punctuation types)
+    punctuation_variety = len(punctuation_types)
+
+    # Calculate vowel-to-consonant ratio
+    if consonant_count > 0:
+        vowel_consonant_ratio = vowel_count / consonant_count
+    elif vowel_count > 0:
+        # Vowels but no consonants - ratio is infinity
+        vowel_consonant_ratio = float("inf")
+    else:
+        # No letters at all
+        vowel_consonant_ratio = float("nan")
+
+    # Calculate digit ratio
+    if total_chars > 0:
+        digit_ratio = digit_count / total_chars
+    else:
+        digit_ratio = float("nan")
+
+    # Calculate uppercase ratio
+    if total_letters > 0:
+        uppercase_ratio = uppercase_count / total_letters
+    else:
+        uppercase_ratio = float("nan")
+
+    # Calculate whitespace ratio
+    if total_chars > 0:
+        whitespace_ratio = whitespace_count / total_chars
+    else:
+        whitespace_ratio = float("nan")
+
+    # Build metadata
+    metadata = {
+        "total_characters": total_chars,
+        "total_letters": total_letters,
+        "total_words": total_words,
+        "total_sentences": total_sentences,
+        "total_punctuation": punctuation_count,
+        "total_whitespace": whitespace_count,
+        "total_digits": digit_count,
+        "punctuation_types": sorted(punctuation_types),
+        "vowel_count": vowel_count,
+        "consonant_count": consonant_count,
+        "uppercase_count": uppercase_count,
+        "lowercase_count": lowercase_count,
+    }
+
+    return CharacterMetricsResult(
+        avg_word_length=avg_word_length,
+        avg_sentence_length_chars=avg_sentence_length_chars,
+        punctuation_density=punctuation_density,
+        punctuation_variety=punctuation_variety,
+        letter_frequency=letter_frequency,
+        vowel_consonant_ratio=vowel_consonant_ratio,
+        digit_count=digit_count,
+        digit_ratio=digit_ratio,
+        uppercase_ratio=uppercase_ratio,
+        whitespace_ratio=whitespace_ratio,
+        metadata=metadata,
+    )
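Usage, via the export added in pystylometry/character/__init__.py (output comments reflect the logic above):

    from pystylometry.character import compute_character_metrics

    result = compute_character_metrics("Hello, world! Numbers like 42 count too.")
    print(result.avg_word_length)              # mean alphanumeric characters per word
    print(result.punctuation_variety)          # 3 distinct marks here: , ! .
    print(result.metadata["total_sentences"])  # 2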
pystylometry/lexical/__init__.py CHANGED
@@ -1,17 +1,24 @@
 """Lexical diversity metrics."""
 
-# Re-export from stylometry-ttr
-# from stylometry_ttr import compute_ttr, TTRResult
-
 # Local implementations
-from .hapax import compute_hapax_ratios
+from .advanced_diversity import compute_hdd, compute_mattr, compute_msttr, compute_vocd_d
+from .function_words import compute_function_words
+from .hapax import compute_hapax_ratios, compute_hapax_with_lexicon_analysis
 from .mtld import compute_mtld
+from .ttr import compute_ttr
+from .word_frequency_sophistication import compute_word_frequency_sophistication
 from .yule import compute_yule
 
 __all__ = [
-    # "compute_ttr",  # From stylometry-ttr
-    # "TTRResult",  # From stylometry-ttr
+    "compute_ttr",
     "compute_mtld",
    "compute_yule",
     "compute_hapax_ratios",
+    "compute_hapax_with_lexicon_analysis",
+    "compute_function_words",
+    "compute_vocd_d",
+    "compute_mattr",
+    "compute_hdd",
+    "compute_msttr",
+    "compute_word_frequency_sophistication",
 ]
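The lexical surface now spans eleven exported functions. A hedged usage sketch (the new signatures are not shown in this diff; this assumes each compute_* call takes a raw text string with sensible defaults):

    from pystylometry.lexical import compute_mtld, compute_ttr

    sample = "the cat sat on the mat while the dog slept on the rug"
    print(compute_ttr(sample))   # type-token ratio
    print(compute_mtld(sample))  # measure of textual lexical diversity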