pystylometry 0.1.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. pystylometry/__init__.py +30 -5
  2. pystylometry/_normalize.py +277 -0
  3. pystylometry/_types.py +1954 -28
  4. pystylometry/_utils.py +4 -0
  5. pystylometry/authorship/__init__.py +26 -1
  6. pystylometry/authorship/additional_methods.py +75 -0
  7. pystylometry/authorship/kilgarriff.py +347 -0
  8. pystylometry/character/__init__.py +15 -0
  9. pystylometry/character/character_metrics.py +389 -0
  10. pystylometry/cli.py +427 -0
  11. pystylometry/consistency/__init__.py +57 -0
  12. pystylometry/consistency/_thresholds.py +162 -0
  13. pystylometry/consistency/drift.py +549 -0
  14. pystylometry/dialect/__init__.py +65 -0
  15. pystylometry/dialect/_data/dialect_markers.json +1134 -0
  16. pystylometry/dialect/_loader.py +360 -0
  17. pystylometry/dialect/detector.py +533 -0
  18. pystylometry/lexical/__init__.py +13 -6
  19. pystylometry/lexical/advanced_diversity.py +680 -0
  20. pystylometry/lexical/function_words.py +590 -0
  21. pystylometry/lexical/hapax.py +310 -33
  22. pystylometry/lexical/mtld.py +180 -22
  23. pystylometry/lexical/ttr.py +149 -0
  24. pystylometry/lexical/word_frequency_sophistication.py +1805 -0
  25. pystylometry/lexical/yule.py +142 -29
  26. pystylometry/ngrams/__init__.py +2 -0
  27. pystylometry/ngrams/entropy.py +150 -49
  28. pystylometry/ngrams/extended_ngrams.py +235 -0
  29. pystylometry/prosody/__init__.py +12 -0
  30. pystylometry/prosody/rhythm_prosody.py +53 -0
  31. pystylometry/readability/__init__.py +12 -0
  32. pystylometry/readability/additional_formulas.py +2110 -0
  33. pystylometry/readability/ari.py +173 -35
  34. pystylometry/readability/coleman_liau.py +150 -30
  35. pystylometry/readability/complex_words.py +531 -0
  36. pystylometry/readability/flesch.py +181 -32
  37. pystylometry/readability/gunning_fog.py +208 -35
  38. pystylometry/readability/smog.py +126 -28
  39. pystylometry/readability/syllables.py +137 -30
  40. pystylometry/stylistic/__init__.py +20 -0
  41. pystylometry/stylistic/cohesion_coherence.py +45 -0
  42. pystylometry/stylistic/genre_register.py +45 -0
  43. pystylometry/stylistic/markers.py +131 -0
  44. pystylometry/stylistic/vocabulary_overlap.py +47 -0
  45. pystylometry/syntactic/__init__.py +4 -0
  46. pystylometry/syntactic/advanced_syntactic.py +494 -0
  47. pystylometry/syntactic/pos_ratios.py +172 -17
  48. pystylometry/syntactic/sentence_stats.py +105 -18
  49. pystylometry/syntactic/sentence_types.py +526 -0
  50. pystylometry/viz/__init__.py +71 -0
  51. pystylometry/viz/drift.py +589 -0
  52. pystylometry/viz/jsx/__init__.py +31 -0
  53. pystylometry/viz/jsx/_base.py +144 -0
  54. pystylometry/viz/jsx/report.py +677 -0
  55. pystylometry/viz/jsx/timeline.py +716 -0
  56. pystylometry/viz/jsx/viewer.py +1032 -0
  57. {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/METADATA +49 -9
  58. pystylometry-1.1.0.dist-info/RECORD +63 -0
  59. pystylometry-1.1.0.dist-info/entry_points.txt +4 -0
  60. pystylometry-0.1.0.dist-info/RECORD +0 -26
  61. {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/WHEEL +0 -0
pystylometry/readability/smog.py
@@ -1,14 +1,62 @@
-"""SMOG (Simple Measure of Gobbledygook) Index."""
+"""SMOG (Simple Measure of Gobbledygook) Index.
 
-from .._types import SMOGResult
+This module implements the SMOG readability formula with native chunked
+analysis for stylometric fingerprinting.
+
+Related GitHub Issue:
+    #27 - Native chunked analysis with Distribution dataclass
+    https://github.com/craigtrim/pystylometry/issues/27
+"""
+
+import math
+
+from .._normalize import normalize_for_readability
+from .._types import Distribution, SMOGResult, chunk_text, make_distribution
 from .._utils import split_sentences, tokenize
 from .syllables import count_syllables
 
 
-def compute_smog(text: str) -> SMOGResult:
+def _compute_smog_single(text: str) -> tuple[float, float, dict]:
+    """Compute SMOG metrics for a single chunk of text.
+
+    Returns:
+        Tuple of (smog_index, grade_level, metadata_dict).
+        Returns (nan, nan, metadata) for empty/invalid input.
+    """
+    sentences = split_sentences(text)
+    tokens = tokenize(text)
+    word_tokens = normalize_for_readability(tokens)
+
+    if len(sentences) == 0 or len(word_tokens) == 0:
+        return (
+            float("nan"),
+            float("nan"),
+            {"sentence_count": 0, "word_count": 0, "polysyllable_count": 0},
+        )
+
+    # Count polysyllables (words with 3+ syllables)
+    polysyllable_count = sum(1 for word in word_tokens if count_syllables(word) >= 3)
+
+    # SMOG formula
+    smog_index = 1.043 * math.sqrt(polysyllable_count * 30 / len(sentences)) + 3.1291
+    grade_level = max(0, min(20, math.floor(smog_index + 0.5)))
+
+    metadata = {
+        "sentence_count": len(sentences),
+        "word_count": len(word_tokens),
+        "polysyllable_count": polysyllable_count,
+    }
+
+    return (smog_index, float(grade_level), metadata)
+
+
+def compute_smog(text: str, chunk_size: int = 1000) -> SMOGResult:
     """
     Compute SMOG (Simple Measure of Gobbledygook) Index.
 
+    This function uses native chunked analysis to capture variance and patterns
+    across the text, which is essential for stylometric fingerprinting.
+
     Formula:
         SMOG = 1.043 × √(polysyllables × 30/sentences) + 3.1291
 
@@ -17,55 +65,105 @@ def compute_smog(text: str) -> SMOGResult:
     The SMOG index estimates the years of education needed to understand the text.
     It's particularly useful for healthcare materials.
 
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
     References:
         McLaughlin, G. H. (1969). SMOG grading: A new readability formula.
         Journal of Reading, 12(8), 639-646.
 
     Args:
         text: Input text to analyze
+        chunk_size: Number of words per chunk (default: 1000).
+            The text is divided into chunks of this size, and metrics are
+            computed per-chunk.
 
     Returns:
-        SMOGResult with SMOG index and grade level
+        SMOGResult with:
+        - smog_index: Mean SMOG index across chunks
+        - grade_level: Mean grade level across chunks
+        - smog_index_dist: Distribution with per-chunk values and stats
+        - grade_level_dist: Distribution with per-chunk values and stats
+        - chunk_size: The chunk size used
+        - chunk_count: Number of chunks analyzed
 
     Example:
-        >>> result = compute_smog("The quick brown fox jumps over the lazy dog.")
-        >>> print(f"SMOG Index: {result.smog_index:.1f}")
-        >>> print(f"Grade Level: {result.grade_level}")
+        >>> result = compute_smog("Long text here...", chunk_size=1000)
+        >>> result.smog_index  # Mean across chunks
+        12.5
+        >>> result.smog_index_dist.std  # Variance reveals fingerprint
+        1.8
     """
-    sentences = split_sentences(text)
-    tokens = tokenize(text)
+    # Chunk the text
+    chunks = chunk_text(text, chunk_size)
+
+    # Compute metrics per chunk
+    smog_values = []
+    grade_values = []
+    total_sentences = 0
+    total_words = 0
+    total_polysyllables = 0
 
-    if len(sentences) < 30:
-        # SMOG requires at least 30 sentences for accuracy
-        # We'll compute anyway but note in metadata
-        pass
+    for chunk in chunks:
+        si, gl, meta = _compute_smog_single(chunk)
+        if not math.isnan(si):
+            smog_values.append(si)
+            grade_values.append(gl)
+        total_sentences += meta.get("sentence_count", 0)
+        total_words += meta.get("word_count", 0)
+        total_polysyllables += meta.get("polysyllable_count", 0)
 
-    if len(sentences) == 0 or len(tokens) == 0:
+    # Handle empty or all-invalid chunks
+    if not smog_values:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
         return SMOGResult(
-            smog_index=0.0,
-            grade_level=0,
+            smog_index=float("nan"),
+            grade_level=float("nan"),
+            smog_index_dist=empty_dist,
+            grade_level_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=len(chunks),
             metadata={
+                # Backward-compatible keys
                "sentence_count": 0,
                "word_count": 0,
                "polysyllable_count": 0,
+                # New prefixed keys for consistency
+                "total_sentence_count": 0,
+                "total_word_count": 0,
+                "total_polysyllable_count": 0,
                "warning": "Insufficient text",
            },
        )
 
-    # Count polysyllables (words with 3+ syllables)
-    polysyllable_count = sum(1 for word in tokens if count_syllables(word) >= 3)
-
-    # TODO: Implement SMOG formula
-    smog_index = 0.0  # Placeholder
-    grade_level = 0  # Placeholder
+    # Build distributions
+    smog_dist = make_distribution(smog_values)
+    grade_dist = make_distribution(grade_values)
 
     return SMOGResult(
-        smog_index=smog_index,
-        grade_level=grade_level,
+        smog_index=smog_dist.mean,
+        grade_level=grade_dist.mean,
+        smog_index_dist=smog_dist,
+        grade_level_dist=grade_dist,
+        chunk_size=chunk_size,
+        chunk_count=len(chunks),
        metadata={
-            "sentence_count": len(sentences),
-            "word_count": len(tokens),
-            "polysyllable_count": polysyllable_count,
-            "warning": "Less than 30 sentences" if len(sentences) < 30 else None,
+            # Backward-compatible keys
+            "sentence_count": total_sentences,
+            "word_count": total_words,
+            "polysyllable_count": total_polysyllables,
+            # New prefixed keys for consistency
+            "total_sentence_count": total_sentences,
+            "total_word_count": total_words,
+            "total_polysyllable_count": total_polysyllables,
+            "warning": "Less than 30 sentences" if total_sentences < 30 else None,
        },
    )
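The Distribution dataclass and make_distribution helper imported above live in pystylometry/_types.py and are not shown in this diff. The sketch below is inferred only from how smog.py constructs and consumes them (the field names values/mean/median/std/range/iqr and the .mean/.std accesses); the actual definitions in _types.py may differ.

# Sketch only: assumed shape of the _types helpers, inferred from their usage in smog.py.
import statistics
from dataclasses import dataclass, field

@dataclass
class Distribution:
    values: list[float] = field(default_factory=list)
    mean: float = float("nan")
    median: float = float("nan")
    std: float = 0.0
    range: float = 0.0
    iqr: float = 0.0

def make_distribution(values: list[float]) -> Distribution:
    # Assumed behavior: summarize the per-chunk metric values.
    if not values:
        return Distribution()
    # quantiles() needs at least two points; fall back to a degenerate spread for one value
    q = statistics.quantiles(values, n=4) if len(values) >= 2 else [values[0]] * 3
    return Distribution(
        values=list(values),
        mean=statistics.fmean(values),
        median=statistics.median(values),
        std=statistics.pstdev(values),
        range=max(values) - min(values),
        iqr=q[2] - q[0],
    )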
pystylometry/readability/syllables.py
@@ -1,54 +1,161 @@
-"""Syllable counting utilities using CMU Pronouncing Dictionary."""
+"""
+Syllable counting using CMU Pronouncing Dictionary.
 
+Uses the pronouncing library which provides access to the CMU Pronouncing
+Dictionary for high-accuracy syllable counting based on phonetic transcriptions.
+"""
 
+import re
+from functools import lru_cache
+
+try:
+    import pronouncing  # type: ignore[import-untyped]
+except ImportError:
+    raise ImportError(
+        "The 'pronouncing' library is required for syllable counting. "
+        "Install it with: pip install pystylometry[readability]"
+    )
+
+
+@lru_cache(maxsize=4096)
 def count_syllables(word: str) -> int:
     """
-    Count syllables in a word using CMU Pronouncing Dictionary with heuristic fallback.
+    Count syllables using CMU Pronouncing Dictionary.
+
+    Uses phonetic transcriptions from CMU dictionary. For words with multiple
+    pronunciations, uses the first pronunciation (typically the most common).
+    Falls back to simple vowel counting for words not in the dictionary.
 
     Args:
-        word: The word to count syllables for
+        word: Input word (handles mixed case, strips whitespace)
 
     Returns:
-        Number of syllables in the word
+        Syllable count (minimum 1 for non-empty input)
+
+    Example:
+        >>> count_syllables("beautiful")
+        3
+        >>> count_syllables("fire")
+        2
+        >>> count_syllables("cruel")
+        1
     """
-    # TODO: Implement with pronouncing library
-    # For now, use simple heuristic fallback
-    return _heuristic_syllable_count(word)
+    word = word.lower().strip()
+    if not word:
+        return 0
 
+    # Strip common punctuation
+    word = word.strip(".,;:!?\"'()-")
+    if not word:
+        return 0
 
-def _heuristic_syllable_count(word: str) -> int:
-    """
-    Simple heuristic syllable counter for fallback.
+    # Handle contractions by removing apostrophes
+    if "'" in word:
+        word = word.replace("'", "")
 
-    This is a basic implementation that counts vowel groups.
-    Should be replaced with CMU dict lookup when pronouncing is available.
+    # Handle hyphenated compounds
+    if "-" in word:
+        return sum(count_syllables(part) for part in word.split("-") if part)
 
-    Args:
-        word: The word to count syllables for
+    # Get pronunciations from CMU dictionary
+    phones_list = pronouncing.phones_for_word(word)
 
-    Returns:
-        Estimated number of syllables
+    if phones_list:
+        # Use first pronunciation (most common)
+        # Count stress markers (0, 1, 2) in phoneme representation
+        phones = phones_list[0]
+        return pronouncing.syllable_count(phones)  # type: ignore[no-any-return]
+
+    # Fallback for words not in dictionary: simple vowel counting
+    return _fallback_count(word)
+
+
+def _fallback_count(word: str) -> int:
     """
-    word = word.lower().strip()
-    if len(word) == 0:
-        return 0
+    Simple fallback syllable counter for words not in CMU dictionary.
 
+    Uses basic vowel counting with silent-e adjustment.
+    Less accurate than CMU but handles rare/technical words.
+    """
     vowels = "aeiouy"
-    syllable_count = 0
-    previous_was_vowel = False
+    count = 0
+    prev_was_vowel = False
 
     for char in word:
         is_vowel = char in vowels
-        if is_vowel and not previous_was_vowel:
-            syllable_count += 1
-        previous_was_vowel = is_vowel
+        if is_vowel and not prev_was_vowel:
+            count += 1
+        prev_was_vowel = is_vowel
 
     # Adjust for silent 'e'
-    if word.endswith("e") and syllable_count > 1:
-        syllable_count -= 1
+    if word.endswith("e") and count > 1:
+        count -= 1
+
+    # Ensure minimum of 1
+    return max(1, count)
+
+
+def count_syllables_text(text: str) -> list[tuple[str, int]]:
+    """
+    Count syllables for all words in a text.
+
+    Args:
+        text: Input text
+
+    Returns:
+        List of (word, syllable_count) tuples
+
+    Example:
+        >>> count_syllables_text("The quick brown fox")
+        [('The', 1), ('quick', 1), ('brown', 1), ('fox', 1)]
+    """
+
+    words = re.findall(r"[a-zA-Z']+", text)
+    return [(w, count_syllables(w)) for w in words]
+
+
+def total_syllables(text: str) -> int:
+    """
+    Return total syllable count for text.
+
+    Args:
+        text: Input text
+
+    Returns:
+        Total number of syllables
+
+    Example:
+        >>> total_syllables("The quick brown fox")
+        4
+    """
+    return sum(count for _, count in count_syllables_text(text))
+
+
+def validate_accuracy(
+    test_pairs: list[tuple[str, int]],
+) -> tuple[float, list[tuple[str, int, int]]]:
+    """
+    Test accuracy against known word-syllable pairs.
+
+    Args:
+        test_pairs: List of (word, expected_syllables) tuples
+
+    Returns:
+        (accuracy_percentage, list of (word, expected, got) for failures)
+
+    Example:
+        >>> test_pairs = [("hello", 2), ("world", 1), ("beautiful", 3)]
+        >>> accuracy, failures = validate_accuracy(test_pairs)
+        >>> print(f"Accuracy: {accuracy:.1f}%")
+    """
+    failures = []
+    for word, expected in test_pairs:
+        got = count_syllables(word)
+        if got != expected:
+            failures.append((word, expected, got))
 
-    # Ensure at least one syllable
-    if syllable_count == 0:
-        syllable_count = 1
+    if not test_pairs:
+        return 0.0, []
 
-    return syllable_count
+    accuracy = (len(test_pairs) - len(failures)) / len(test_pairs) * 100
+    return accuracy, failures
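The new count_syllables relies on two calls from the third-party pronouncing package (a thin interface to the CMU Pronouncing Dictionary). A quick illustration of those calls; the exact phone string shown is typical CMU output and may vary with the installed dictionary version.

import pronouncing

phones = pronouncing.phones_for_word("beautiful")  # e.g. ['B Y UW1 T AH0 F AH0 L']
if phones:
    # syllable_count() counts the stress digits (0/1/2) in the phone string
    print(pronouncing.syllable_count(phones[0]))   # 3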
pystylometry/stylistic/__init__.py
@@ -0,0 +1,20 @@
+"""Stylistic analysis metrics.
+
+Related GitHub Issues:
+    #20 - Stylistic Markers
+    #21 - Vocabulary Overlap and Similarity Metrics
+    #22 - Cohesion and Coherence Metrics
+    #23 - Genre and Register Features
+"""
+
+from .cohesion_coherence import compute_cohesion_coherence
+from .genre_register import compute_genre_register
+from .markers import compute_stylistic_markers
+from .vocabulary_overlap import compute_vocabulary_overlap
+
+__all__ = [
+    "compute_stylistic_markers",
+    "compute_vocabulary_overlap",
+    "compute_cohesion_coherence",
+    "compute_genre_register",
+]
pystylometry/stylistic/cohesion_coherence.py
@@ -0,0 +1,45 @@
+"""Cohesion and coherence metrics.
+
+This module measures how well a text holds together structurally (cohesion)
+and semantically (coherence). Important for analyzing writing quality and
+authorial sophistication.
+
+Related GitHub Issue:
+    #22 - Cohesion and Coherence Metrics
+    https://github.com/craigtrim/pystylometry/issues/22
+
+References:
+    Halliday, M. A. K., & Hasan, R. (1976). Cohesion in English. Longman.
+    Graesser, A. C., McNamara, D. S., & Kulikowich, J. M. (2011). Coh-Metrix.
+"""
+
+from .._types import CohesionCoherenceResult
+
+
+def compute_cohesion_coherence(text: str, model: str = "en_core_web_sm") -> CohesionCoherenceResult:
+    """
+    Compute cohesion and coherence metrics.
+
+    Related GitHub Issue:
+        #22 - Cohesion and Coherence Metrics
+        https://github.com/craigtrim/pystylometry/issues/22
+
+    Args:
+        text: Input text to analyze
+        model: spaCy model for linguistic analysis
+
+    Returns:
+        CohesionCoherenceResult with referential cohesion, lexical cohesion,
+        connective density, and coherence scores.
+
+    Example:
+        >>> result = compute_cohesion_coherence("Multi-paragraph text...")
+        >>> print(f"Pronoun density: {result.pronoun_density:.2f}")
+        >>> print(f"Connective density: {result.connective_density:.2f}")
+    """
+    # TODO: Implement cohesion/coherence analysis
+    # GitHub Issue #22: https://github.com/craigtrim/pystylometry/issues/22
+    raise NotImplementedError(
+        "Cohesion/coherence metrics not yet implemented. "
+        "See GitHub Issue #22: https://github.com/craigtrim/pystylometry/issues/22"
+    )
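Since Issue #22 is still a stub, here is a minimal sketch of one metric the docstring names, connective density, counted per 100 words. This is not the package's implementation; the connective word list is illustrative only and a real analyzer would use a much larger inventory (and spaCy, per the model argument).

import re

CONNECTIVES = {"however", "therefore", "moreover", "because", "although", "thus"}  # illustrative subset

def connective_density(text: str) -> float:
    # Connectives per 100 words, mirroring the per-100-word convention used elsewhere in the package
    words = re.findall(r"[A-Za-z']+", text.lower())
    if not words:
        return float("nan")
    hits = sum(1 for w in words if w in CONNECTIVES)
    return 100.0 * hits / len(words)

print(connective_density("It rained. However, we went out because we had to."))  # 20.0 (2 of 10 words)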
pystylometry/stylistic/genre_register.py
@@ -0,0 +1,45 @@
+"""Genre and register classification features.
+
+This module extracts features that distinguish between different text types
+(academic, journalistic, fiction, legal, etc.) and formality levels.
+
+Related GitHub Issue:
+    #23 - Genre and Register Features
+    https://github.com/craigtrim/pystylometry/issues/23
+
+References:
+    Biber, D. (1988). Variation across speech and writing. Cambridge University Press.
+    Biber, D., & Conrad, S. (2009). Register, genre, and style.
+"""
+
+from .._types import GenreRegisterResult
+
+
+def compute_genre_register(text: str, model: str = "en_core_web_sm") -> GenreRegisterResult:
+    """
+    Analyze genre and register features for text classification.
+
+    Related GitHub Issue:
+        #23 - Genre and Register Features
+        https://github.com/craigtrim/pystylometry/issues/23
+
+    Args:
+        text: Input text to analyze
+        model: spaCy model for linguistic analysis
+
+    Returns:
+        GenreRegisterResult with formality scores, register classification,
+        genre predictions, and feature scores for major genres.
+
+    Example:
+        >>> result = compute_genre_register("Academic paper text...")
+        >>> print(f"Formality score: {result.formality_score:.2f}")
+        >>> print(f"Predicted genre: {result.predicted_genre}")
+        >>> print(f"Academic score: {result.academic_score:.3f}")
+    """
+    # TODO: Implement genre/register analysis
+    # GitHub Issue #23: https://github.com/craigtrim/pystylometry/issues/23
+    raise NotImplementedError(
+        "Genre/register classification not yet implemented. "
+        "See GitHub Issue #23: https://github.com/craigtrim/pystylometry/issues/23"
+    )
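Issue #23 is also a stub, so the package's formality_score is not defined here. For orientation only, a sketch of one common formality heuristic (the Heylighen & Dewaele F-score) approximated with spaCy UPOS tags; it assumes spaCy and the named model are installed, and it uses DET as a stand-in for articles, so treat it as a rough illustration rather than the planned implementation.

import spacy

def formality_score(text: str, model: str = "en_core_web_sm") -> float:
    # F = (noun% + adj% + adposition% + article% - pronoun% - verb% - adverb% - interjection% + 100) / 2
    nlp = spacy.load(model)
    doc = nlp(text)
    counts: dict[str, int] = {}
    total = 0
    for tok in doc:
        if tok.is_alpha:
            counts[tok.pos_] = counts.get(tok.pos_, 0) + 1
            total += 1
    if total == 0:
        return float("nan")
    pct = lambda pos: 100.0 * counts.get(pos, 0) / total
    return (pct("NOUN") + pct("ADJ") + pct("ADP") + pct("DET")
            - pct("PRON") - pct("VERB") - pct("ADV") - pct("INTJ") + 100) / 2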
pystylometry/stylistic/markers.py
@@ -0,0 +1,131 @@
+"""Stylistic markers for authorship attribution.
+
+This module identifies and analyzes specific linguistic features that authors
+use consistently and often subconsciously. These markers include contraction
+preferences, intensifier usage, hedging patterns, modal auxiliaries, negation
+patterns, and punctuation style habits.
+
+Related GitHub Issue:
+    #20 - Stylistic Markers
+    https://github.com/craigtrim/pystylometry/issues/20
+
+Categories of stylistic markers:
+    - Contraction patterns (can't vs. cannot, I'm vs. I am)
+    - Intensifiers (very, really, extremely, quite)
+    - Hedges (maybe, perhaps, probably, somewhat)
+    - Modal auxiliaries (can, could, may, might, must, should, will, would)
+    - Negation patterns (not, no, never, none, neither)
+    - Punctuation style (exclamations, questions, quotes, parentheticals)
+
+References:
+    Argamon, S., & Levitan, S. (2005). Measuring the usefulness of function
+    words for authorship attribution. ACH/ALLC.
+    Pennebaker, J. W. (2011). The secret life of pronouns. Bloomsbury Press.
+    Biber, D. (1988). Variation across speech and writing. Cambridge University Press.
+"""
+
+from .._types import StylisticMarkersResult
+
+
+def compute_stylistic_markers(text: str) -> StylisticMarkersResult:
+    """
+    Analyze stylistic markers for authorship attribution.
+
+    Identifies and quantifies specific linguistic features that reveal authorial
+    style. These features are often used subconsciously and remain consistent
+    across an author's works, making them valuable for attribution.
+
+    Related GitHub Issue:
+        #20 - Stylistic Markers
+        https://github.com/craigtrim/pystylometry/issues/20
+
+    Why stylistic markers matter:
+
+    Subconscious usage:
+        - Authors don't deliberately vary these features
+        - Remain consistent even when author tries to disguise style
+        - Difficult to consciously control
+
+    Genre-independent:
+        - Used similarly across different topics
+        - More stable than content words
+        - Complement content-based features
+
+    Psychologically meaningful:
+        - Reveal personality traits (Pennebaker's research)
+        - Indicate emotional state
+        - Show cognitive patterns
+
+    Marker Categories Analyzed:
+
+    1. Contractions:
+       - Preference for contracted vs. expanded forms
+       - Examples: can't/cannot, I'm/I am, won't/will not
+       - Formality indicator (more contractions = informal)
+
+    2. Intensifiers:
+       - Words that amplify meaning
+       - Examples: very, really, extremely, quite, rather
+       - Indicate emphatic style
+
+    3. Hedges:
+       - Words that weaken or qualify statements
+       - Examples: maybe, perhaps, probably, somewhat, kind of
+       - Indicate tentative or cautious style
+
+    4. Modal Auxiliaries:
+       - Express necessity, possibility, permission
+       - Epistemic modals: may, might, could (possibility)
+       - Deontic modals: must, should, ought (obligation)
+
+    5. Negation:
+       - Patterns of negative expression
+       - not, no, never, none, neither, nowhere
+       - Frequency and type vary by author
+
+    6. Punctuation Style:
+       - Exclamation marks: Emphatic, emotional
+       - Question marks: Interactive, rhetorical
+       - Quotation marks: Dialogue, scare quotes
+       - Parentheticals: Asides, additional info
+       - Ellipses: Trailing off, suspense
+       - Dashes: Interruptions, emphasis
+       - Semicolons/colons: Sophisticated syntax
+
+    Args:
+        text: Input text to analyze. Should contain at least 200+ words for
+            reliable statistics. Shorter texts may have unstable marker ratios.
+
+    Returns:
+        StylisticMarkersResult containing extensive marker statistics.
+        See _types.py for complete field list.
+
+    Example:
+        >>> result = compute_stylistic_markers("Sample text with markers...")
+        >>> print(f"Contraction ratio: {result.contraction_ratio * 100:.1f}%")
+        Contraction ratio: 42.3%
+        >>> print(f"Intensifiers/100 words: {result.intensifier_density:.2f}")
+        Intensifiers/100 words: 3.45
+        >>> print(f"Top intensifiers: {result.top_intensifiers[:3]}")
+        Top intensifiers: [('very', 12), ('really', 8), ('quite', 5)]
+        >>> print(f"Exclamation density: {result.exclamation_density:.2f}")
+        Exclamation density: 2.10
+
+    Note:
+        - Densities are per 100 words for interpretability
+        - Contraction detection requires pattern matching
+        - Modal auxiliaries classified as epistemic or deontic
+        - Punctuation counts include all occurrences
+        - Empty text returns NaN for ratios, 0 for counts
+    """
+    # TODO: Implement stylistic marker analysis
+    # GitHub Issue #20: https://github.com/craigtrim/pystylometry/issues/20
+    #
+    # This is a comprehensive implementation with many components.
+    # Break it down into logical sections.
+    #
+    # See GitHub issue for full implementation plan and word lists.
+    raise NotImplementedError(
+        "Stylistic markers not yet implemented. "
+        "See GitHub Issue #20: https://github.com/craigtrim/pystylometry/issues/20"
+    )
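The marker densities described in the stub's docstring (intensifiers or hedges per 100 words) reduce to simple list lookups. A minimal sketch, not the package's implementation; the word lists below are illustrative subsets, whereas the real feature would use the full inventories from Issue #20.

import re

INTENSIFIERS = {"very", "really", "extremely", "quite", "rather"}  # illustrative subset
HEDGES = {"maybe", "perhaps", "probably", "somewhat"}              # illustrative subset

def marker_density(text: str, markers: set[str]) -> float:
    # Occurrences of the marker words per 100 words, as the docstring's Note describes
    words = re.findall(r"[A-Za-z']+", text.lower())
    if not words:
        return float("nan")
    return 100.0 * sum(w in markers for w in words) / len(words)

print(marker_density("It was really very good, perhaps the best.", INTENSIFIERS))  # 25.0 (2 of 8 words)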
pystylometry/stylistic/vocabulary_overlap.py
@@ -0,0 +1,47 @@
+"""Vocabulary overlap and similarity metrics.
+
+This module computes similarity measures between two texts based on their
+shared vocabulary. Useful for authorship verification, plagiarism detection,
+and measuring stylistic consistency.
+
+Related GitHub Issue:
+    #21 - Vocabulary Overlap and Similarity Metrics
+    https://github.com/craigtrim/pystylometry/issues/21
+
+References:
+    Jaccard, P. (1912). The distribution of the flora in the alpine zone.
+    Salton, G., & McGill, M. J. (1983). Introduction to Modern Information Retrieval.
+"""
+
+from .._types import VocabularyOverlapResult
+
+
+def compute_vocabulary_overlap(text1: str, text2: str) -> VocabularyOverlapResult:
+    """
+    Compute vocabulary overlap and similarity between two texts.
+
+    Related GitHub Issue:
+        #21 - Vocabulary Overlap and Similarity Metrics
+        https://github.com/craigtrim/pystylometry/issues/21
+
+    Args:
+        text1: First text to compare
+        text2: Second text to compare
+
+    Returns:
+        VocabularyOverlapResult with Jaccard, Dice, cosine similarities,
+        shared vocabulary statistics, and distinctive words for each text.
+
+    Example:
+        >>> result = compute_vocabulary_overlap(text1, text2)
+        >>> print(f"Jaccard similarity: {result.jaccard_similarity:.3f}")
+        Jaccard similarity: 0.456
+        >>> print(f"Shared words: {result.shared_vocab_size}")
+        Shared words: 234
+    """
+    # TODO: Implement vocabulary overlap analysis
+    # GitHub Issue #21: https://github.com/craigtrim/pystylometry/issues/21
+    raise NotImplementedError(
+        "Vocabulary overlap not yet implemented. "
+        "See GitHub Issue #21: https://github.com/craigtrim/pystylometry/issues/21"
+    )
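For reference while Issue #21 remains open, the three similarities the stub names have standard definitions: Jaccard = |A∩B| / |A∪B|, Dice = 2|A∩B| / (|A| + |B|) over word-type sets, and cosine similarity over word-count vectors. A self-contained sketch (not the package's implementation, and the simple tokenizer is an assumption):

import math
import re
from collections import Counter

def _words(text: str) -> list[str]:
    return re.findall(r"[A-Za-z']+", text.lower())

def overlap_metrics(text1: str, text2: str) -> dict[str, float]:
    a, b = set(_words(text1)), set(_words(text2))
    if not a or not b:
        return {"jaccard": float("nan"), "dice": float("nan"), "cosine": float("nan")}
    inter = len(a & b)
    # Cosine over raw token counts
    ca, cb = Counter(_words(text1)), Counter(_words(text2))
    dot = sum(ca[w] * cb[w] for w in a & b)
    norm = math.sqrt(sum(v * v for v in ca.values())) * math.sqrt(sum(v * v for v in cb.values()))
    return {
        "jaccard": inter / len(a | b),
        "dice": 2 * inter / (len(a) + len(b)),
        "cosine": dot / norm,
    }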