pystylometry-0.1.0-py3-none-any.whl

pystylometry/__init__.py ADDED
@@ -0,0 +1,206 @@
1
+ """
2
+ pystylometry - Comprehensive Python package for stylometric analysis.
3
+
4
+ A modular package for text analysis with lexical, readability, syntactic,
5
+ authorship, and n-gram metrics.
6
+
7
+ Installation:
8
+ pip install pystylometry # Core (lexical only)
9
+ pip install pystylometry[readability] # With readability metrics
10
+ pip install pystylometry[syntactic] # With syntactic analysis
11
+ pip install pystylometry[authorship] # With authorship attribution
12
+ pip install pystylometry[all] # Everything
13
+
14
+ Usage:
15
+ # Direct module imports
16
+ from pystylometry.lexical import compute_mtld, compute_yule
17
+ from pystylometry.readability import compute_flesch
18
+ from pystylometry.syntactic import compute_pos_ratios
19
+ from pystylometry.authorship import compute_burrows_delta
20
+
21
+ # Or use the unified analyze() function
22
+ from pystylometry import analyze
23
+
24
+ results = analyze(text, lexical_metrics=True, readability_metrics=True)
25
+ print(results.lexical['mtld'].mtld_average)
26
+ print(results.readability['flesch'].reading_ease)
27
+ """
28
+
29
+ from ._types import AnalysisResult
30
+
31
+ # Version
32
+ __version__ = "0.1.0"
33
+
34
+ # Core exports - always available
35
+ from . import lexical
36
+
37
+ # Optional exports - skipped if their optional dependencies are not installed
38
+ try:
39
+ from . import readability # noqa: F401
40
+
41
+ _READABILITY_AVAILABLE = True
42
+ except ImportError:
43
+ _READABILITY_AVAILABLE = False
44
+
45
+ try:
46
+ from . import syntactic # noqa: F401
47
+
48
+ _SYNTACTIC_AVAILABLE = True
49
+ except ImportError:
50
+ _SYNTACTIC_AVAILABLE = False
51
+
52
+ # Authorship and ngrams use only stdlib (no external dependencies)
53
+ from . import (
54
+ authorship, # noqa: F401
55
+ ngrams, # noqa: F401
56
+ )
57
+
58
+ _AUTHORSHIP_AVAILABLE = True
59
+ _NGRAMS_AVAILABLE = True
60
+
61
+
62
+ def analyze(
63
+ text: str,
64
+ lexical_metrics: bool = True,
65
+ readability_metrics: bool = False,
66
+ syntactic_metrics: bool = False,
67
+ authorship_metrics: bool = False,
68
+ ngram_metrics: bool = False,
69
+ ) -> AnalysisResult:
70
+ """
71
+ Unified interface to compute multiple stylometric metrics at once.
72
+
73
+ This is a convenience function that calls all requested metric computations
74
+ and returns a unified result object. Requesting a metric group whose optional
75
+ dependencies are not installed raises an ImportError.
76
+
77
+ Args:
78
+ text: Input text to analyze
79
+ lexical_metrics: Compute lexical diversity metrics (default: True)
80
+ readability_metrics: Compute readability metrics (default: False)
81
+ syntactic_metrics: Compute syntactic metrics (default: False)
82
+ authorship_metrics: Compute authorship metrics (default: False)
83
+ Note: Authorship metrics typically require multiple texts for comparison.
84
+ Setting this flag only attaches a note pointing to the pairwise comparison functions.
85
+ ngram_metrics: Compute n-gram entropy metrics (default: False)
86
+
87
+ Returns:
88
+ AnalysisResult with requested metrics in nested dictionaries
89
+
90
+ Raises:
91
+ ImportError: If requested analysis requires uninstalled dependencies
92
+
93
+ Example:
94
+ >>> from pystylometry import analyze
95
+ >>> results = analyze(text, lexical_metrics=True, readability_metrics=True)
96
+ >>> print(results.lexical['mtld'].mtld_average)
97
+ >>> print(results.readability['flesch'].reading_ease)
98
+
99
+ Example with all metrics:
100
+ >>> results = analyze(text, lexical_metrics=True, readability_metrics=True,
101
+ ...                   syntactic_metrics=True, ngram_metrics=True)
102
+ >>> print(f"MTLD: {results.lexical['mtld'].mtld_average:.2f}")
103
+ >>> print(f"Flesch: {results.readability['flesch'].reading_ease:.1f}")
104
+ >>> print(f"Noun ratio: {results.syntactic['pos'].noun_ratio:.3f}")
105
+ >>> print(f"Bigram entropy: {results.ngrams['word_bigram'].entropy:.3f}")
106
+ """
107
+ result = AnalysisResult(metadata={"text_length": len(text)})
108
+
109
+ # Lexical metrics (always available)
110
+ if lexical_metrics:
111
+ result.lexical = {}
112
+ # TODO: Add when stylometry-ttr is integrated
113
+ # result.lexical['ttr'] = lexical.compute_ttr(text)
114
+ result.lexical["mtld"] = lexical.compute_mtld(text)
115
+ result.lexical["yule"] = lexical.compute_yule(text)
116
+ result.lexical["hapax"] = lexical.compute_hapax_ratios(text)
117
+
118
+ # Readability metrics (optional dependency)
119
+ if readability_metrics:
120
+ if not _READABILITY_AVAILABLE:
121
+ raise ImportError(
122
+ "Readability metrics require optional dependencies. "
123
+ "Install with: pip install pystylometry[readability]"
124
+ )
125
+ # Import locally to avoid name conflict
126
+ from . import readability as readability_module
127
+
128
+ result.readability = {}
129
+ result.readability["flesch"] = readability_module.compute_flesch(text)
130
+ result.readability["smog"] = readability_module.compute_smog(text)
131
+ result.readability["gunning_fog"] = readability_module.compute_gunning_fog(text)
132
+ result.readability["coleman_liau"] = readability_module.compute_coleman_liau(text)
133
+ result.readability["ari"] = readability_module.compute_ari(text)
134
+
135
+ # Syntactic metrics (optional dependency)
136
+ if syntactic_metrics:
137
+ if not _SYNTACTIC_AVAILABLE:
138
+ raise ImportError(
139
+ "Syntactic metrics require optional dependencies. "
140
+ "Install with: pip install pystylometry[syntactic]"
141
+ )
142
+ # Import locally to avoid name conflict
143
+ from . import syntactic as syntactic_module
144
+
145
+ result.syntactic = {}
146
+ result.syntactic["pos"] = syntactic_module.compute_pos_ratios(text)
147
+ result.syntactic["sentence_stats"] = syntactic_module.compute_sentence_stats(text)
148
+
149
+ # Authorship metrics (stdlib only)
150
+ # Note: These are typically used for comparison between texts
151
+ # Here we just note that they're available but don't compute them
152
+ # since they require multiple texts as input
153
+ if authorship_metrics:
154
+ result.authorship = {
155
+ "note": "Authorship metrics require multiple texts for comparison. "
156
+ "Use pystylometry.authorship.compute_burrows_delta(text1, text2) directly."
157
+ }
158
+
159
+ # N-gram metrics (stdlib only)
160
+ if ngram_metrics:
161
+ result.ngrams = {}
162
+ result.ngrams["character_bigram"] = ngrams.compute_character_bigram_entropy(text)
163
+ result.ngrams["word_bigram"] = ngrams.compute_word_bigram_entropy(text)
164
+
165
+ return result
166
+
167
+
168
+ # Convenient access to availability flags
169
+ def get_available_modules() -> dict[str, bool]:
170
+ """
171
+ Get dictionary of available optional modules.
172
+
173
+ Returns:
174
+ Dictionary mapping module names to availability status
175
+
176
+ Example:
177
+ >>> from pystylometry import get_available_modules
178
+ >>> available = get_available_modules()
179
+ >>> if available['readability']:
180
+ ... from pystylometry.readability import compute_flesch
181
+ """
182
+ return {
183
+ "lexical": True, # Always available
184
+ "readability": _READABILITY_AVAILABLE,
185
+ "syntactic": _SYNTACTIC_AVAILABLE,
186
+ "authorship": _AUTHORSHIP_AVAILABLE,
187
+ "ngrams": _NGRAMS_AVAILABLE,
188
+ }
189
+
190
+
191
+ __all__ = [
192
+ "__version__",
193
+ "analyze",
194
+ "get_available_modules",
195
+ "lexical",
196
+ ]
197
+
198
+ # Conditionally add to __all__ based on availability
199
+ if _READABILITY_AVAILABLE:
200
+ __all__.append("readability")
201
+ if _SYNTACTIC_AVAILABLE:
202
+ __all__.append("syntactic")
203
+ if _AUTHORSHIP_AVAILABLE:
204
+ __all__.append("authorship")
205
+ if _NGRAMS_AVAILABLE:
206
+ __all__.append("ngrams")
pystylometry/_types.py ADDED
@@ -0,0 +1,172 @@
1
+ """Result dataclasses for all pystylometry metrics."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from typing import Any
7
+
8
+ # ===== Lexical Results =====
9
+
10
+
11
+ @dataclass
12
+ class MTLDResult:
13
+ """Result from MTLD (Measure of Textual Lexical Diversity) computation."""
14
+
15
+ mtld_forward: float
16
+ mtld_backward: float
17
+ mtld_average: float
18
+ metadata: dict[str, Any]
19
+
20
+
21
+ @dataclass
22
+ class YuleResult:
23
+ """Result from Yule's K and I computation."""
24
+
25
+ yule_k: float
26
+ yule_i: float
27
+ metadata: dict[str, Any]
28
+
29
+
30
+ @dataclass
31
+ class HapaxResult:
32
+ """Result from Hapax Legomena analysis."""
33
+
34
+ hapax_count: int
35
+ hapax_ratio: float
36
+ dis_hapax_count: int
37
+ dis_hapax_ratio: float
38
+ sichel_s: float
39
+ honore_r: float
40
+ metadata: dict[str, Any]
41
+
42
+
43
+ # ===== Readability Results =====
44
+
45
+
46
+ @dataclass
47
+ class FleschResult:
48
+ """Result from Flesch Reading Ease and Flesch-Kincaid Grade computation."""
49
+
50
+ reading_ease: float
51
+ grade_level: float
52
+ difficulty: str # "Very Easy", "Easy", "Fairly Easy", "Standard", etc.
53
+ metadata: dict[str, Any]
54
+
55
+
56
+ @dataclass
57
+ class SMOGResult:
58
+ """Result from SMOG Index computation."""
59
+
60
+ smog_index: float
61
+ grade_level: int
62
+ metadata: dict[str, Any]
63
+
64
+
65
+ @dataclass
66
+ class GunningFogResult:
67
+ """Result from Gunning Fog Index computation."""
68
+
69
+ fog_index: float
70
+ grade_level: int
71
+ metadata: dict[str, Any]
72
+
73
+
74
+ @dataclass
75
+ class ColemanLiauResult:
76
+ """Result from Coleman-Liau Index computation."""
77
+
78
+ cli_index: float
79
+ grade_level: int
80
+ metadata: dict[str, Any]
81
+
82
+
83
+ @dataclass
84
+ class ARIResult:
85
+ """Result from Automated Readability Index computation."""
86
+
87
+ ari_score: float
88
+ grade_level: int
89
+ age_range: str
90
+ metadata: dict[str, Any]
91
+
92
+
93
+ # ===== Syntactic Results =====
94
+
95
+
96
+ @dataclass
97
+ class POSResult:
98
+ """Result from Part-of-Speech ratio analysis."""
99
+
100
+ noun_ratio: float
101
+ verb_ratio: float
102
+ adjective_ratio: float
103
+ adverb_ratio: float
104
+ noun_verb_ratio: float
105
+ adjective_noun_ratio: float
106
+ lexical_density: float
107
+ function_word_ratio: float
108
+ metadata: dict[str, Any]
109
+
110
+
111
+ @dataclass
112
+ class SentenceStatsResult:
113
+ """Result from sentence-level statistics."""
114
+
115
+ mean_sentence_length: float
116
+ sentence_length_std: float
117
+ sentence_length_range: int
118
+ min_sentence_length: int
119
+ max_sentence_length: int
120
+ sentence_count: int
121
+ metadata: dict[str, Any]
122
+
123
+
124
+ # ===== Authorship Results =====
125
+
126
+
127
+ @dataclass
128
+ class BurrowsDeltaResult:
129
+ """Result from Burrows' Delta computation."""
130
+
131
+ delta_score: float
132
+ distance_type: str # "burrows", "cosine", "eder"
133
+ mfw_count: int
134
+ metadata: dict[str, Any]
135
+
136
+
137
+ @dataclass
138
+ class ZetaResult:
139
+ """Result from Zeta score computation."""
140
+
141
+ zeta_score: float
142
+ marker_words: list[str]
143
+ anti_marker_words: list[str]
144
+ metadata: dict[str, Any]
145
+
146
+
147
+ # ===== N-gram Results =====
148
+
149
+
150
+ @dataclass
151
+ class EntropyResult:
152
+ """Result from n-gram entropy computation."""
153
+
154
+ entropy: float
155
+ perplexity: float
156
+ ngram_type: str # "character_bigram", "word_bigram", "word_trigram"
157
+ metadata: dict[str, Any]
158
+
159
+
160
+ # ===== Unified Analysis Result =====
161
+
162
+
163
+ @dataclass
164
+ class AnalysisResult:
165
+ """Unified result from comprehensive stylometric analysis."""
166
+
167
+ lexical: dict[str, Any] | None = None
168
+ readability: dict[str, Any] | None = None
169
+ syntactic: dict[str, Any] | None = None
170
+ authorship: dict[str, Any] | None = None
171
+ ngrams: dict[str, Any] | None = None
172
+ metadata: dict[str, Any] | None = None
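
For illustration, a short sketch of how these result dataclasses nest inside AnalysisResult; the numeric values and metadata keys are made-up placeholders, not real metric output, and the private _types module is imported directly only for this example.

    from pystylometry._types import AnalysisResult, MTLDResult

    # Placeholder values for illustration only.
    mtld = MTLDResult(
        mtld_forward=72.4,
        mtld_backward=69.8,
        mtld_average=71.1,
        metadata={"token_count": 250},
    )
    result = AnalysisResult(lexical={"mtld": mtld}, metadata={"text_length": 1400})

    # Each metric group on AnalysisResult is either None or a dict of result objects.
    if result.lexical is not None:
        print(result.lexical["mtld"].mtld_average)  # 71.1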
pystylometry/_utils.py ADDED
@@ -0,0 +1,197 @@
1
+ """Shared utility functions for pystylometry."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+
7
+ from .tokenizer import Tokenizer
8
+
9
+ # ===== Convenience Functions =====
10
+
11
+ # Default tokenizer instance for backward compatibility
12
+ _default_tokenizer = Tokenizer(
13
+ lowercase=False,
14
+ strip_punctuation=False,
15
+ )
16
+
17
+
18
+ def tokenize(text: str) -> list[str]:
19
+ """
20
+ Simple tokenization using default settings.
21
+
22
+ Convenience function that maintains backward compatibility
23
+ with the original simple tokenizer interface.
24
+
25
+ Args:
26
+ text: Input text to tokenize
27
+
28
+ Returns:
29
+ List of tokens
30
+
31
+ Example:
32
+ >>> tokens = tokenize("Hello, world!")
33
+ >>> print(tokens)
34
+ ['Hello', ',', 'world', '!']
35
+ """
36
+ return _default_tokenizer.tokenize(text)
37
+
38
+
39
+ def advanced_tokenize(
40
+ text: str,
41
+ lowercase: bool = True,
42
+ strip_punctuation: bool = True,
43
+ expand_contractions: bool = False,
44
+ ) -> list[str]:
45
+ """
46
+ Tokenization with commonly-used advanced options.
47
+
48
+ Args:
49
+ text: Input text to tokenize
50
+ lowercase: Convert to lowercase (default: True)
51
+ strip_punctuation: Remove punctuation tokens (default: True)
52
+ expand_contractions: Expand contractions (default: False)
53
+
54
+ Returns:
55
+ List of tokens
56
+
57
+ Example:
58
+ >>> tokens = advanced_tokenize("Hello, world! It's nice.", lowercase=True)
59
+ >>> print(tokens)
60
+ ['hello', 'world', "it's", 'nice']
61
+ """
62
+ tokenizer = Tokenizer(
63
+ lowercase=lowercase,
64
+ strip_punctuation=strip_punctuation,
65
+ expand_contractions=expand_contractions,
66
+ )
67
+ return tokenizer.tokenize(text)
68
+
69
+
70
+ # ===== Sentence Splitting =====
71
+
72
+ # Common abbreviations that shouldn't trigger sentence boundaries
73
+ _ABBREVIATIONS = {
74
+ "mr.",
75
+ "mrs.",
76
+ "ms.",
77
+ "dr.",
78
+ "prof.",
79
+ "sr.",
80
+ "jr.",
81
+ "st.",
82
+ "vs.",
83
+ "etc.",
84
+ "e.g.",
85
+ "i.e.",
86
+ "al.",
87
+ "fig.",
88
+ "vol.",
89
+ "no.",
90
+ "inc.",
91
+ "corp.",
92
+ "ltd.",
93
+ "co.",
94
+ "ph.d.",
95
+ "m.d.",
96
+ "b.a.",
97
+ "m.a.",
98
+ "j.d.",
99
+ "rev.",
100
+ "gen.",
101
+ "rep.",
102
+ "sen.",
103
+ "capt.",
104
+ }
105
+
106
+
107
+ def split_sentences(text: str) -> list[str]:
108
+ """
109
+ Split text into sentences with improved boundary detection.
110
+
111
+ Handles common abbreviations and edge cases better than simple
112
+ splitting on sentence-ending punctuation. Uses a three-step approach:
113
+ 1. Protect known abbreviations from splitting
114
+ 2. Split on sentence boundaries
115
+ 3. Restore abbreviations
116
+
117
+ Args:
118
+ text: Input text to split
119
+
120
+ Returns:
121
+ List of sentences
122
+
123
+ Example:
124
+ >>> sentences = split_sentences("Dr. Smith arrived. He was happy.")
125
+ >>> print(sentences)
126
+ ['Dr. Smith arrived.', 'He was happy.']
127
+ """
128
+ if not text:
129
+ return []
130
+
131
+ # Temporarily replace abbreviations with placeholders
132
+ protected_text = text
133
+ replacements = {}
134
+ for i, abbr in enumerate(_ABBREVIATIONS):
135
+ if abbr in text.lower():
136
+ placeholder = f"__ABBR{i}__"
137
+ # Case-insensitive replacement (only the first occurrence is protected)
138
+ pattern = re.compile(re.escape(abbr), re.IGNORECASE)
139
+ matches = pattern.findall(protected_text)
140
+ if matches:
141
+ replacements[placeholder] = matches[0]
142
+ protected_text = pattern.sub(placeholder, protected_text, count=1)
143
+
144
+ # Split on sentence boundaries: period/question/exclamation + whitespace + capital letter
145
+ # Simple pattern that avoids variable-width look-behind
146
+ sentences = re.split(r"([.!?]+)\s+(?=[A-Z])", protected_text)
147
+
148
+ # Reconstruct sentences (regex split includes the captured groups)
149
+ result = []
150
+ i = 0
151
+ while i < len(sentences):
152
+ if i + 1 < len(sentences) and re.fullmatch(r"[.!?]+", sentences[i + 1]):
153
+ # Combine text with its punctuation
154
+ sentence = sentences[i] + sentences[i + 1]
155
+ i += 2
156
+ else:
157
+ sentence = sentences[i]
158
+ i += 1
159
+
160
+ # Restore abbreviations
161
+ for placeholder, original in replacements.items():
162
+ sentence = sentence.replace(placeholder, original)
163
+
164
+ sentence = sentence.strip()
165
+ if sentence:
166
+ result.append(sentence)
167
+
168
+ # Fallback: if we only got one sentence, try simpler split
169
+ if len(result) <= 1 and text:
170
+ sentences = re.split(r"[.!?]+\s+", text)
171
+ result = [s.strip() for s in sentences if s.strip()]
172
+
173
+ return result
174
+
175
+
176
+ def check_optional_dependency(module_name: str, extra_name: str) -> bool:
177
+ """
178
+ Check if an optional dependency is installed.
179
+
180
+ Args:
181
+ module_name: Name of the module to check
182
+ extra_name: Name of the extra in pyproject.toml
183
+
184
+ Returns:
185
+ True if module is available
186
+
187
+ Raises:
188
+ ImportError: If the module is not installed; the message includes install instructions
189
+ """
190
+ try:
191
+ __import__(module_name)
192
+ return True
193
+ except ImportError:
194
+ raise ImportError(
195
+ f"The '{module_name}' package is required for this functionality. "
196
+ f"Install it with: pip install pystylometry[{extra_name}]"
197
+ )
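
A quick sketch exercising the helpers above; the expected outputs mirror the docstring examples and assume the default Tokenizer (not shown in this diff) behaves as documented.

    from pystylometry._utils import advanced_tokenize, split_sentences, tokenize

    sample = "Dr. Smith arrived. He was happy."

    # "Dr." is in the abbreviation list, so it does not end a sentence.
    print(split_sentences(sample))
    # ['Dr. Smith arrived.', 'He was happy.']

    # Default tokenizer keeps case and punctuation tokens.
    print(tokenize("Hello, world!"))
    # ['Hello', ',', 'world', '!']

    # advanced_tokenize lowercases and drops punctuation by default.
    print(advanced_tokenize("Hello, world! It's nice."))
    # ['hello', 'world', "it's", 'nice']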
pystylometry/authorship/__init__.py ADDED
@@ -0,0 +1,10 @@
1
+ """Authorship attribution metrics."""
2
+
3
+ from .burrows_delta import compute_burrows_delta, compute_cosine_delta
4
+ from .zeta import compute_zeta
5
+
6
+ __all__ = [
7
+ "compute_burrows_delta",
8
+ "compute_cosine_delta",
9
+ "compute_zeta",
10
+ ]
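
As the note in analyze() indicates, authorship metrics compare two texts directly. A hedged sketch, assuming compute_burrows_delta follows the two-argument form shown in that note and returns a BurrowsDeltaResult; the sample strings are placeholders.

    from pystylometry.authorship import compute_burrows_delta

    known_text = "The quick brown fox jumps over the lazy dog. " * 50
    disputed_text = "A quick brown fox leaps over a sleepy dog. " * 50

    # Lower delta values indicate more similar word-frequency profiles.
    delta = compute_burrows_delta(known_text, disputed_text)
    print(delta.delta_score)    # distance between the two texts
    print(delta.distance_type)  # e.g. "burrows"
    print(delta.mfw_count)      # number of most-frequent words compared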