pystylometry-0.1.0-py3-none-any.whl → pystylometry-1.1.0-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (61)
  1. pystylometry/__init__.py +30 -5
  2. pystylometry/_normalize.py +277 -0
  3. pystylometry/_types.py +1954 -28
  4. pystylometry/_utils.py +4 -0
  5. pystylometry/authorship/__init__.py +26 -1
  6. pystylometry/authorship/additional_methods.py +75 -0
  7. pystylometry/authorship/kilgarriff.py +347 -0
  8. pystylometry/character/__init__.py +15 -0
  9. pystylometry/character/character_metrics.py +389 -0
  10. pystylometry/cli.py +427 -0
  11. pystylometry/consistency/__init__.py +57 -0
  12. pystylometry/consistency/_thresholds.py +162 -0
  13. pystylometry/consistency/drift.py +549 -0
  14. pystylometry/dialect/__init__.py +65 -0
  15. pystylometry/dialect/_data/dialect_markers.json +1134 -0
  16. pystylometry/dialect/_loader.py +360 -0
  17. pystylometry/dialect/detector.py +533 -0
  18. pystylometry/lexical/__init__.py +13 -6
  19. pystylometry/lexical/advanced_diversity.py +680 -0
  20. pystylometry/lexical/function_words.py +590 -0
  21. pystylometry/lexical/hapax.py +310 -33
  22. pystylometry/lexical/mtld.py +180 -22
  23. pystylometry/lexical/ttr.py +149 -0
  24. pystylometry/lexical/word_frequency_sophistication.py +1805 -0
  25. pystylometry/lexical/yule.py +142 -29
  26. pystylometry/ngrams/__init__.py +2 -0
  27. pystylometry/ngrams/entropy.py +150 -49
  28. pystylometry/ngrams/extended_ngrams.py +235 -0
  29. pystylometry/prosody/__init__.py +12 -0
  30. pystylometry/prosody/rhythm_prosody.py +53 -0
  31. pystylometry/readability/__init__.py +12 -0
  32. pystylometry/readability/additional_formulas.py +2110 -0
  33. pystylometry/readability/ari.py +173 -35
  34. pystylometry/readability/coleman_liau.py +150 -30
  35. pystylometry/readability/complex_words.py +531 -0
  36. pystylometry/readability/flesch.py +181 -32
  37. pystylometry/readability/gunning_fog.py +208 -35
  38. pystylometry/readability/smog.py +126 -28
  39. pystylometry/readability/syllables.py +137 -30
  40. pystylometry/stylistic/__init__.py +20 -0
  41. pystylometry/stylistic/cohesion_coherence.py +45 -0
  42. pystylometry/stylistic/genre_register.py +45 -0
  43. pystylometry/stylistic/markers.py +131 -0
  44. pystylometry/stylistic/vocabulary_overlap.py +47 -0
  45. pystylometry/syntactic/__init__.py +4 -0
  46. pystylometry/syntactic/advanced_syntactic.py +494 -0
  47. pystylometry/syntactic/pos_ratios.py +172 -17
  48. pystylometry/syntactic/sentence_stats.py +105 -18
  49. pystylometry/syntactic/sentence_types.py +526 -0
  50. pystylometry/viz/__init__.py +71 -0
  51. pystylometry/viz/drift.py +589 -0
  52. pystylometry/viz/jsx/__init__.py +31 -0
  53. pystylometry/viz/jsx/_base.py +144 -0
  54. pystylometry/viz/jsx/report.py +677 -0
  55. pystylometry/viz/jsx/timeline.py +716 -0
  56. pystylometry/viz/jsx/viewer.py +1032 -0
  57. {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/METADATA +49 -9
  58. pystylometry-1.1.0.dist-info/RECORD +63 -0
  59. pystylometry-1.1.0.dist-info/entry_points.txt +4 -0
  60. pystylometry-0.1.0.dist-info/RECORD +0 -26
  61. {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/WHEEL +0 -0
pystylometry/syntactic/pos_ratios.py
@@ -1,10 +1,17 @@
-"""Part-of-Speech ratio analysis using spaCy."""
+"""Part-of-Speech ratio analysis using spaCy.
 
-from .._types import POSResult
+Related GitHub Issue:
+    #27 - Native chunked analysis with Distribution dataclass
+    https://github.com/craigtrim/pystylometry/issues/27
+"""
+
+from .._types import Distribution, POSResult, make_distribution
 from .._utils import check_optional_dependency
 
 
-def compute_pos_ratios(text: str, model: str = "en_core_web_sm") -> POSResult:
+def compute_pos_ratios(
+    text: str, model: str = "en_core_web_sm", chunk_size: int = 1000
+) -> POSResult:
     """
     Compute Part-of-Speech ratios and lexical density using spaCy.
 
@@ -18,6 +25,10 @@ def compute_pos_ratios(text: str, model: str = "en_core_web_sm") -> POSResult:
     - Lexical density: (nouns + verbs + adjectives + adverbs) / total words
     - Function word ratio: (determiners + prepositions + conjunctions) / total words
 
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
     References:
         Biber, D. (1988). Variation across speech and writing.
         Cambridge University Press.
@@ -25,9 +36,13 @@ def compute_pos_ratios(text: str, model: str = "en_core_web_sm") -> POSResult:
     Args:
         text: Input text to analyze
         model: spaCy model name (default: "en_core_web_sm")
+        chunk_size: Number of words per chunk (default: 1000).
+            Note: POS analysis is performed on the full text for accuracy,
+            so this parameter is included for API consistency but actual
+            results are from a single pass.
 
     Returns:
-        POSResult with all POS ratios and metadata
+        POSResult with all POS ratios, distributions, and metadata
 
     Raises:
         ImportError: If spaCy is not installed
@@ -40,22 +55,162 @@ def compute_pos_ratios(text: str, model: str = "en_core_web_sm") -> POSResult:
     """
     check_optional_dependency("spacy", "syntactic")
 
-    # TODO: Implement spaCy-based POS analysis
-    # import spacy
-    # nlp = spacy.load(model)
-    # doc = nlp(text)
+    import spacy
+
+    # Load spaCy model
+    try:
+        nlp = spacy.load(model)
+    except OSError:
+        raise OSError(
+            f"spaCy model '{model}' not found. Download it with: python -m spacy download {model}"
+        )
+
+    # Process text with spaCy
+    doc = nlp(text)
+
+    # Count POS tags
+    noun_count = 0
+    verb_count = 0
+    adj_count = 0
+    adv_count = 0
+    det_count = 0
+    adp_count = 0  # Adpositions (prepositions)
+    conj_count = 0  # Conjunctions (coordinating and subordinating)
+    total_tokens = 0
+
+    for token in doc:
+        # Only count alphabetic tokens (skip punctuation, numbers, etc.)
+        if not token.is_alpha:
+            continue
+
+        total_tokens += 1
+        pos = token.pos_
+
+        if pos == "NOUN" or pos == "PROPN":
+            noun_count += 1
+        elif pos == "VERB":
+            verb_count += 1
+        elif pos == "ADJ":
+            adj_count += 1
+        elif pos == "ADV":
+            adv_count += 1
+        elif pos == "DET":
+            det_count += 1
+        elif pos == "ADP":
+            adp_count += 1
+        elif pos in ("CCONJ", "SCONJ"):
+            conj_count += 1
+
+    # Handle empty text
+    if total_tokens == 0:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
+        return POSResult(
+            noun_ratio=float("nan"),
+            verb_ratio=float("nan"),
+            adjective_ratio=float("nan"),
+            adverb_ratio=float("nan"),
+            noun_verb_ratio=float("nan"),
+            adjective_noun_ratio=float("nan"),
+            lexical_density=float("nan"),
+            function_word_ratio=float("nan"),
+            noun_ratio_dist=empty_dist,
+            verb_ratio_dist=empty_dist,
+            adjective_ratio_dist=empty_dist,
+            adverb_ratio_dist=empty_dist,
+            noun_verb_ratio_dist=empty_dist,
+            adjective_noun_ratio_dist=empty_dist,
+            lexical_density_dist=empty_dist,
+            function_word_ratio_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=0,
+            metadata={
+                "model": model,
+                "token_count": 0,
+                "noun_count": 0,
+                "verb_count": 0,
+                "adjective_count": 0,
+                "adverb_count": 0,
+            },
+        )
+
+    # Calculate ratios
+    noun_ratio = noun_count / total_tokens
+    verb_ratio = verb_count / total_tokens
+    adj_ratio = adj_count / total_tokens
+    adv_ratio = adv_count / total_tokens
+
+    # Noun-verb ratio (handle division by zero)
+    noun_verb_ratio = noun_count / verb_count if verb_count > 0 else float("nan")
+
+    # Adjective-noun ratio (handle division by zero)
+    adj_noun_ratio = adj_count / noun_count if noun_count > 0 else float("nan")
+
+    # Lexical density: (content words) / total words
+    # Content words = nouns + verbs + adjectives + adverbs
+    lexical_words = noun_count + verb_count + adj_count + adv_count
+    lexical_density = lexical_words / total_tokens
+
+    # Function word ratio: (determiners + prepositions + conjunctions) / total words
+    function_words = det_count + adp_count + conj_count
+    function_word_ratio = function_words / total_tokens
+
+    # Create single-value distributions (POS analysis is done on full text)
+    noun_ratio_dist = make_distribution([noun_ratio])
+    verb_ratio_dist = make_distribution([verb_ratio])
+    adj_ratio_dist = make_distribution([adj_ratio])
+    adv_ratio_dist = make_distribution([adv_ratio])
+    noun_verb_dist = (
+        make_distribution([noun_verb_ratio])
+        if not (noun_verb_ratio != noun_verb_ratio)
+        else Distribution(
+            values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
+        )
+    )
+    adj_noun_dist = (
+        make_distribution([adj_noun_ratio])
+        if not (adj_noun_ratio != adj_noun_ratio)
+        else Distribution(
+            values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
+        )
+    )
+    lexical_density_dist = make_distribution([lexical_density])
+    function_word_dist = make_distribution([function_word_ratio])
 
     return POSResult(
-        noun_ratio=0.0,
-        verb_ratio=0.0,
-        adjective_ratio=0.0,
-        adverb_ratio=0.0,
-        noun_verb_ratio=0.0,
-        adjective_noun_ratio=0.0,
-        lexical_density=0.0,
-        function_word_ratio=0.0,
+        noun_ratio=noun_ratio,
+        verb_ratio=verb_ratio,
+        adjective_ratio=adj_ratio,
+        adverb_ratio=adv_ratio,
+        noun_verb_ratio=noun_verb_ratio,
+        adjective_noun_ratio=adj_noun_ratio,
+        lexical_density=lexical_density,
+        function_word_ratio=function_word_ratio,
+        noun_ratio_dist=noun_ratio_dist,
+        verb_ratio_dist=verb_ratio_dist,
+        adjective_ratio_dist=adj_ratio_dist,
+        adverb_ratio_dist=adv_ratio_dist,
+        noun_verb_ratio_dist=noun_verb_dist,
+        adjective_noun_ratio_dist=adj_noun_dist,
+        lexical_density_dist=lexical_density_dist,
+        function_word_ratio_dist=function_word_dist,
+        chunk_size=chunk_size,
+        chunk_count=1,  # Single pass analysis
         metadata={
             "model": model,
-            "token_count": 0,
+            "token_count": total_tokens,
+            "noun_count": noun_count,
+            "verb_count": verb_count,
+            "adjective_count": adj_count,
+            "adverb_count": adv_count,
+            "determiner_count": det_count,
+            "adposition_count": adp_count,
+            "conjunction_count": conj_count,
         },
     )
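The pos_ratios.py hunks above replace the 0.1.0 stub (which returned zeros) with a real spaCy pass over the full text. As a quick orientation, a minimal usage sketch follows; it assumes spaCy and the en_core_web_sm model are installed, that POSResult exposes its fields as attributes (the diff constructs it with keyword arguments), and the sample text is purely illustrative.

    from pystylometry.syntactic.pos_ratios import compute_pos_ratios

    text = "The quick brown fox jumps over the lazy dog. It runs fast."
    result = compute_pos_ratios(text)

    # Ratios are computed over alphabetic tokens only; punctuation and numbers are skipped.
    print(result.noun_ratio, result.verb_ratio)
    # Lexical density = (nouns + verbs + adjectives + adverbs) / total words.
    print(result.lexical_density)
    # Raw counts land in metadata, e.g. token_count, noun_count, conjunction_count.
    print(result.metadata["token_count"])

Note that chunk_size is accepted for API consistency only: the analysis is a single pass over the whole text, so chunk_count is 1 (0 for empty input) and each *_dist field is a single-value Distribution.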
pystylometry/syntactic/sentence_stats.py
@@ -1,10 +1,17 @@
-"""Sentence-level statistics using spaCy."""
+"""Sentence-level statistics using spaCy.
 
-from .._types import SentenceStatsResult
-from .._utils import check_optional_dependency, split_sentences
+Related GitHub Issue:
+    #27 - Native chunked analysis with Distribution dataclass
+    https://github.com/craigtrim/pystylometry/issues/27
+"""
 
+from .._types import Distribution, SentenceStatsResult, make_distribution
+from .._utils import check_optional_dependency
 
-def compute_sentence_stats(text: str, model: str = "en_core_web_sm") -> SentenceStatsResult:
+
+def compute_sentence_stats(
+    text: str, model: str = "en_core_web_sm", chunk_size: int = 1000
+) -> SentenceStatsResult:
     """
     Compute sentence-level statistics using spaCy.
 
@@ -16,6 +23,10 @@ def compute_sentence_stats(text: str, model: str = "en_core_web_sm") -> Sentence
     - Maximum sentence length
     - Total sentence count
 
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
     References:
         Hunt, K. W. (1965). Grammatical structures written at three grade levels.
         NCTE Research Report No. 3.
@@ -23,9 +34,13 @@ def compute_sentence_stats(text: str, model: str = "en_core_web_sm") -> Sentence
     Args:
         text: Input text to analyze
        model: spaCy model name (default: "en_core_web_sm")
+        chunk_size: Number of words per chunk (default: 1000).
+            Note: Sentence analysis is performed on the full text for accuracy,
+            so this parameter is included for API consistency but actual
+            results are from a single pass.
 
     Returns:
-        SentenceStatsResult with sentence statistics and metadata
+        SentenceStatsResult with sentence statistics, distributions, and metadata
 
     Raises:
         ImportError: If spaCy is not installed
@@ -38,23 +53,95 @@ def compute_sentence_stats(text: str, model: str = "en_core_web_sm") -> Sentence
     """
    check_optional_dependency("spacy", "syntactic")
 
-    # TODO: Implement spaCy-based sentence analysis
-    # import spacy
-    # nlp = spacy.load(model)
-    # doc = nlp(text)
-    # sentences = list(doc.sents)
+    import spacy
+
+    # Load spaCy model
+    try:
+        nlp = spacy.load(model)
+    except OSError:
+        raise OSError(
+            f"spaCy model '{model}' not found. Download it with: python -m spacy download {model}"
+        )
+
+    # Process text with spaCy
+    doc = nlp(text)
+
+    # Extract sentences and count words in each
+    sentence_lengths = []
+    for sent in doc.sents:
+        # Count only alphabetic tokens (exclude punctuation)
+        word_count = sum(1 for token in sent if token.is_alpha)
+        if word_count > 0:  # Only include non-empty sentences
+            sentence_lengths.append(word_count)
+
+    # Handle empty text
+    if len(sentence_lengths) == 0:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
+        return SentenceStatsResult(
+            mean_sentence_length=float("nan"),
+            sentence_length_std=float("nan"),
+            sentence_length_range=0.0,
+            min_sentence_length=0.0,
+            max_sentence_length=0.0,
+            sentence_count=0,
+            mean_sentence_length_dist=empty_dist,
+            sentence_length_std_dist=empty_dist,
+            sentence_length_range_dist=empty_dist,
+            min_sentence_length_dist=empty_dist,
+            max_sentence_length_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=0,
+            metadata={
+                "model": model,
+            },
+        )
+
+    # Calculate statistics
+    mean_length = sum(sentence_lengths) / len(sentence_lengths)
+
+    # Standard deviation
+    if len(sentence_lengths) > 1:
+        variance = sum((x - mean_length) ** 2 for x in sentence_lengths) / (
+            len(sentence_lengths) - 1
+        )
+        std_dev = variance**0.5
+    else:
+        std_dev = 0.0
+
+    min_length = float(min(sentence_lengths))
+    max_length = float(max(sentence_lengths))
+    length_range = max_length - min_length
 
-    # For now, use simple fallback
-    sentences = split_sentences(text)
+    # Create single-value distributions (sentence analysis is done on full text)
+    mean_dist = make_distribution([mean_length])
+    std_dist = make_distribution([std_dev])
+    range_dist = make_distribution([length_range])
+    min_dist = make_distribution([min_length])
+    max_dist = make_distribution([max_length])
 
     return SentenceStatsResult(
-        mean_sentence_length=0.0,
-        sentence_length_std=0.0,
-        sentence_length_range=0,
-        min_sentence_length=0,
-        max_sentence_length=0,
-        sentence_count=len(sentences),
+        mean_sentence_length=mean_length,
+        sentence_length_std=std_dev,
+        sentence_length_range=length_range,
+        min_sentence_length=min_length,
+        max_sentence_length=max_length,
+        sentence_count=len(sentence_lengths),
+        mean_sentence_length_dist=mean_dist,
+        sentence_length_std_dist=std_dist,
+        sentence_length_range_dist=range_dist,
+        min_sentence_length_dist=min_dist,
+        max_sentence_length_dist=max_dist,
+        chunk_size=chunk_size,
+        chunk_count=1,  # Single pass analysis
         metadata={
             "model": model,
+            "sentence_lengths": sentence_lengths,
         },
     )
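The sentence_stats.py changes follow the same pattern: the split_sentences fallback is dropped in favor of spaCy sentence segmentation, word counts exclude punctuation, and the standard deviation uses the sample (n - 1) formula, falling back to 0.0 when only one sentence is found. A matching usage sketch, under the same assumptions as above (installed model, attribute access on the result, illustrative text):

    from pystylometry.syntactic.sentence_stats import compute_sentence_stats

    result = compute_sentence_stats(
        "Short sentence. A somewhat longer second sentence follows here."
    )

    print(result.sentence_count)                # sentences with at least one alphabetic token
    print(result.mean_sentence_length)          # mean words per sentence
    print(result.sentence_length_std)           # sample std dev; 0.0 if only one sentence
    print(result.metadata["sentence_lengths"])  # per-sentence word counts from the spaCy pass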