pystylometry 0.1.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/__init__.py +30 -5
- pystylometry/_normalize.py +277 -0
- pystylometry/_types.py +1954 -28
- pystylometry/_utils.py +4 -0
- pystylometry/authorship/__init__.py +26 -1
- pystylometry/authorship/additional_methods.py +75 -0
- pystylometry/authorship/kilgarriff.py +347 -0
- pystylometry/character/__init__.py +15 -0
- pystylometry/character/character_metrics.py +389 -0
- pystylometry/cli.py +427 -0
- pystylometry/consistency/__init__.py +57 -0
- pystylometry/consistency/_thresholds.py +162 -0
- pystylometry/consistency/drift.py +549 -0
- pystylometry/dialect/__init__.py +65 -0
- pystylometry/dialect/_data/dialect_markers.json +1134 -0
- pystylometry/dialect/_loader.py +360 -0
- pystylometry/dialect/detector.py +533 -0
- pystylometry/lexical/__init__.py +13 -6
- pystylometry/lexical/advanced_diversity.py +680 -0
- pystylometry/lexical/function_words.py +590 -0
- pystylometry/lexical/hapax.py +310 -33
- pystylometry/lexical/mtld.py +180 -22
- pystylometry/lexical/ttr.py +149 -0
- pystylometry/lexical/word_frequency_sophistication.py +1805 -0
- pystylometry/lexical/yule.py +142 -29
- pystylometry/ngrams/__init__.py +2 -0
- pystylometry/ngrams/entropy.py +150 -49
- pystylometry/ngrams/extended_ngrams.py +235 -0
- pystylometry/prosody/__init__.py +12 -0
- pystylometry/prosody/rhythm_prosody.py +53 -0
- pystylometry/readability/__init__.py +12 -0
- pystylometry/readability/additional_formulas.py +2110 -0
- pystylometry/readability/ari.py +173 -35
- pystylometry/readability/coleman_liau.py +150 -30
- pystylometry/readability/complex_words.py +531 -0
- pystylometry/readability/flesch.py +181 -32
- pystylometry/readability/gunning_fog.py +208 -35
- pystylometry/readability/smog.py +126 -28
- pystylometry/readability/syllables.py +137 -30
- pystylometry/stylistic/__init__.py +20 -0
- pystylometry/stylistic/cohesion_coherence.py +45 -0
- pystylometry/stylistic/genre_register.py +45 -0
- pystylometry/stylistic/markers.py +131 -0
- pystylometry/stylistic/vocabulary_overlap.py +47 -0
- pystylometry/syntactic/__init__.py +4 -0
- pystylometry/syntactic/advanced_syntactic.py +494 -0
- pystylometry/syntactic/pos_ratios.py +172 -17
- pystylometry/syntactic/sentence_stats.py +105 -18
- pystylometry/syntactic/sentence_types.py +526 -0
- pystylometry/viz/__init__.py +71 -0
- pystylometry/viz/drift.py +589 -0
- pystylometry/viz/jsx/__init__.py +31 -0
- pystylometry/viz/jsx/_base.py +144 -0
- pystylometry/viz/jsx/report.py +677 -0
- pystylometry/viz/jsx/timeline.py +716 -0
- pystylometry/viz/jsx/viewer.py +1032 -0
- {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/METADATA +49 -9
- pystylometry-1.1.0.dist-info/RECORD +63 -0
- pystylometry-1.1.0.dist-info/entry_points.txt +4 -0
- pystylometry-0.1.0.dist-info/RECORD +0 -26
- {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/WHEEL +0 -0
pystylometry/syntactic/pos_ratios.py

@@ -1,10 +1,17 @@
-"""Part-of-Speech ratio analysis using spaCy.
+"""Part-of-Speech ratio analysis using spaCy.
 
-
+Related GitHub Issue:
+    #27 - Native chunked analysis with Distribution dataclass
+    https://github.com/craigtrim/pystylometry/issues/27
+"""
+
+from .._types import Distribution, POSResult, make_distribution
 from .._utils import check_optional_dependency
 
 
-def compute_pos_ratios(text: str, model: str = "en_core_web_sm") -> POSResult:
+def compute_pos_ratios(
+    text: str, model: str = "en_core_web_sm", chunk_size: int = 1000
+) -> POSResult:
     """
     Compute Part-of-Speech ratios and lexical density using spaCy.
 
@@ -18,6 +25,10 @@ def compute_pos_ratios(text: str, model: str = "en_core_web_sm") -> POSResult:
     - Lexical density: (nouns + verbs + adjectives + adverbs) / total words
     - Function word ratio: (determiners + prepositions + conjunctions) / total words
 
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
     References:
         Biber, D. (1988). Variation across speech and writing.
         Cambridge University Press.
@@ -25,9 +36,13 @@ def compute_pos_ratios(text: str, model: str = "en_core_web_sm") -> POSResult:
     Args:
         text: Input text to analyze
         model: spaCy model name (default: "en_core_web_sm")
+        chunk_size: Number of words per chunk (default: 1000).
+            Note: POS analysis is performed on the full text for accuracy,
+            so this parameter is included for API consistency but actual
+            results are from a single pass.
 
     Returns:
-        POSResult with all POS ratios and metadata
+        POSResult with all POS ratios, distributions, and metadata
 
     Raises:
         ImportError: If spaCy is not installed
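Taken together, the signature and docstring hunks above define the 1.1.0 API for this module. A minimal usage sketch, assuming spaCy and the en_core_web_sm model are installed; the sample text is illustrative only, and the field names come from the diff below:

from pystylometry.syntactic.pos_ratios import compute_pos_ratios

# chunk_size is accepted for API consistency with the other metrics;
# per the docstring, POS analysis still runs in a single pass.
result = compute_pos_ratios(
    "The quick brown fox jumps over the lazy dog.",
    model="en_core_web_sm",
    chunk_size=1000,
)

print(result.noun_ratio, result.lexical_density)
print(result.noun_ratio_dist.mean)  # wraps a single value, so it should equal noun_ratio
print(result.metadata["token_count"])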
@@ -40,22 +55,162 @@ def compute_pos_ratios(text: str, model: str = "en_core_web_sm") -> POSResult:
     """
     check_optional_dependency("spacy", "syntactic")
 
-
-
-    #
-
+    import spacy
+
+    # Load spaCy model
+    try:
+        nlp = spacy.load(model)
+    except OSError:
+        raise OSError(
+            f"spaCy model '{model}' not found. Download it with: python -m spacy download {model}"
+        )
+
+    # Process text with spaCy
+    doc = nlp(text)
+
+    # Count POS tags
+    noun_count = 0
+    verb_count = 0
+    adj_count = 0
+    adv_count = 0
+    det_count = 0
+    adp_count = 0  # Adpositions (prepositions)
+    conj_count = 0  # Conjunctions (coordinating and subordinating)
+    total_tokens = 0
+
+    for token in doc:
+        # Only count alphabetic tokens (skip punctuation, numbers, etc.)
+        if not token.is_alpha:
+            continue
+
+        total_tokens += 1
+        pos = token.pos_
+
+        if pos == "NOUN" or pos == "PROPN":
+            noun_count += 1
+        elif pos == "VERB":
+            verb_count += 1
+        elif pos == "ADJ":
+            adj_count += 1
+        elif pos == "ADV":
+            adv_count += 1
+        elif pos == "DET":
+            det_count += 1
+        elif pos == "ADP":
+            adp_count += 1
+        elif pos in ("CCONJ", "SCONJ"):
+            conj_count += 1
+
+    # Handle empty text
+    if total_tokens == 0:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
+        return POSResult(
+            noun_ratio=float("nan"),
+            verb_ratio=float("nan"),
+            adjective_ratio=float("nan"),
+            adverb_ratio=float("nan"),
+            noun_verb_ratio=float("nan"),
+            adjective_noun_ratio=float("nan"),
+            lexical_density=float("nan"),
+            function_word_ratio=float("nan"),
+            noun_ratio_dist=empty_dist,
+            verb_ratio_dist=empty_dist,
+            adjective_ratio_dist=empty_dist,
+            adverb_ratio_dist=empty_dist,
+            noun_verb_ratio_dist=empty_dist,
+            adjective_noun_ratio_dist=empty_dist,
+            lexical_density_dist=empty_dist,
+            function_word_ratio_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=0,
+            metadata={
+                "model": model,
+                "token_count": 0,
+                "noun_count": 0,
+                "verb_count": 0,
+                "adjective_count": 0,
+                "adverb_count": 0,
+            },
+        )
+
+    # Calculate ratios
+    noun_ratio = noun_count / total_tokens
+    verb_ratio = verb_count / total_tokens
+    adj_ratio = adj_count / total_tokens
+    adv_ratio = adv_count / total_tokens
+
+    # Noun-verb ratio (handle division by zero)
+    noun_verb_ratio = noun_count / verb_count if verb_count > 0 else float("nan")
+
+    # Adjective-noun ratio (handle division by zero)
+    adj_noun_ratio = adj_count / noun_count if noun_count > 0 else float("nan")
+
+    # Lexical density: (content words) / total words
+    # Content words = nouns + verbs + adjectives + adverbs
+    lexical_words = noun_count + verb_count + adj_count + adv_count
+    lexical_density = lexical_words / total_tokens
+
+    # Function word ratio: (determiners + prepositions + conjunctions) / total words
+    function_words = det_count + adp_count + conj_count
+    function_word_ratio = function_words / total_tokens
+
+    # Create single-value distributions (POS analysis is done on full text)
+    noun_ratio_dist = make_distribution([noun_ratio])
+    verb_ratio_dist = make_distribution([verb_ratio])
+    adj_ratio_dist = make_distribution([adj_ratio])
+    adv_ratio_dist = make_distribution([adv_ratio])
+    noun_verb_dist = (
+        make_distribution([noun_verb_ratio])
+        if not (noun_verb_ratio != noun_verb_ratio)
+        else Distribution(
+            values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
+        )
+    )
+    adj_noun_dist = (
+        make_distribution([adj_noun_ratio])
+        if not (adj_noun_ratio != adj_noun_ratio)
+        else Distribution(
+            values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
+        )
+    )
+    lexical_density_dist = make_distribution([lexical_density])
+    function_word_dist = make_distribution([function_word_ratio])
 
     return POSResult(
-        noun_ratio=
-        verb_ratio=
-        adjective_ratio=
-        adverb_ratio=
-        noun_verb_ratio=
-        adjective_noun_ratio=
-        lexical_density=
-        function_word_ratio=
+        noun_ratio=noun_ratio,
+        verb_ratio=verb_ratio,
+        adjective_ratio=adj_ratio,
+        adverb_ratio=adv_ratio,
+        noun_verb_ratio=noun_verb_ratio,
+        adjective_noun_ratio=adj_noun_ratio,
+        lexical_density=lexical_density,
+        function_word_ratio=function_word_ratio,
+        noun_ratio_dist=noun_ratio_dist,
+        verb_ratio_dist=verb_ratio_dist,
+        adjective_ratio_dist=adj_ratio_dist,
+        adverb_ratio_dist=adv_ratio_dist,
+        noun_verb_ratio_dist=noun_verb_dist,
+        adjective_noun_ratio_dist=adj_noun_dist,
+        lexical_density_dist=lexical_density_dist,
+        function_word_ratio_dist=function_word_dist,
+        chunk_size=chunk_size,
+        chunk_count=1,  # Single pass analysis
         metadata={
             "model": model,
-            "token_count":
+            "token_count": total_tokens,
+            "noun_count": noun_count,
+            "verb_count": verb_count,
+            "adjective_count": adj_count,
+            "adverb_count": adv_count,
+            "determiner_count": det_count,
+            "adposition_count": adp_count,
+            "conjunction_count": conj_count,
         },
     )
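A note on the implementation hunk above: the guards `if not (noun_verb_ratio != noun_verb_ratio)` rely on the IEEE 754 rule that NaN is the only value that compares unequal to itself. A standalone snippet showing the equivalence with the more explicit math.isnan:

import math

nan = float("nan")

# IEEE 754: NaN compares unequal to everything, including itself,
# so `x != x` is a dependency-free NaN test.
assert nan != nan
assert math.isnan(nan)
assert not (1.5 != 1.5) and not math.isnan(1.5)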
pystylometry/syntactic/sentence_stats.py

@@ -1,10 +1,17 @@
-"""Sentence-level statistics using spaCy.
+"""Sentence-level statistics using spaCy.
 
-
-
+Related GitHub Issue:
+    #27 - Native chunked analysis with Distribution dataclass
+    https://github.com/craigtrim/pystylometry/issues/27
+"""
 
+from .._types import Distribution, SentenceStatsResult, make_distribution
+from .._utils import check_optional_dependency
 
-def compute_sentence_stats(text: str, model: str = "en_core_web_sm") -> SentenceStatsResult:
+
+def compute_sentence_stats(
+    text: str, model: str = "en_core_web_sm", chunk_size: int = 1000
+) -> SentenceStatsResult:
     """
     Compute sentence-level statistics using spaCy.
 
@@ -16,6 +23,10 @@ def compute_sentence_stats(text: str, model: str = "en_core_web_sm") -> SentenceStatsResult:
     - Maximum sentence length
     - Total sentence count
 
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
     References:
         Hunt, K. W. (1965). Grammatical structures written at three grade levels.
         NCTE Research Report No. 3.
@@ -23,9 +34,13 @@ def compute_sentence_stats(text: str, model: str = "en_core_web_sm") -> SentenceStatsResult:
     Args:
         text: Input text to analyze
         model: spaCy model name (default: "en_core_web_sm")
+        chunk_size: Number of words per chunk (default: 1000).
+            Note: Sentence analysis is performed on the full text for accuracy,
+            so this parameter is included for API consistency but actual
+            results are from a single pass.
 
     Returns:
-        SentenceStatsResult with sentence statistics and metadata
+        SentenceStatsResult with sentence statistics, distributions, and metadata
 
     Raises:
         ImportError: If spaCy is not installed
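As with compute_pos_ratios, a minimal usage sketch of the new compute_sentence_stats signature, assuming spaCy and en_core_web_sm are installed; the two-sentence input is illustrative only:

from pystylometry.syntactic.sentence_stats import compute_sentence_stats

result = compute_sentence_stats(
    "Short sentence. This one is a little bit longer than the first.",
    chunk_size=1000,  # accepted for API consistency; analysis runs in one pass
)

print(result.sentence_count)                # 2, if spaCy splits both sentences
print(result.mean_sentence_length)          # mean words per sentence
print(result.metadata["sentence_lengths"])  # raw per-sentence word counts (new in 1.1.0)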
@@ -38,23 +53,95 @@ def compute_sentence_stats(text: str, model: str = "en_core_web_sm") -> SentenceStatsResult:
     """
     check_optional_dependency("spacy", "syntactic")
 
-
-
-    #
-
-
+    import spacy
+
+    # Load spaCy model
+    try:
+        nlp = spacy.load(model)
+    except OSError:
+        raise OSError(
+            f"spaCy model '{model}' not found. Download it with: python -m spacy download {model}"
+        )
+
+    # Process text with spaCy
+    doc = nlp(text)
+
+    # Extract sentences and count words in each
+    sentence_lengths = []
+    for sent in doc.sents:
+        # Count only alphabetic tokens (exclude punctuation)
+        word_count = sum(1 for token in sent if token.is_alpha)
+        if word_count > 0:  # Only include non-empty sentences
+            sentence_lengths.append(word_count)
+
+    # Handle empty text
+    if len(sentence_lengths) == 0:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
+        return SentenceStatsResult(
+            mean_sentence_length=float("nan"),
+            sentence_length_std=float("nan"),
+            sentence_length_range=0.0,
+            min_sentence_length=0.0,
+            max_sentence_length=0.0,
+            sentence_count=0,
+            mean_sentence_length_dist=empty_dist,
+            sentence_length_std_dist=empty_dist,
+            sentence_length_range_dist=empty_dist,
+            min_sentence_length_dist=empty_dist,
+            max_sentence_length_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=0,
+            metadata={
+                "model": model,
+            },
+        )
+
+    # Calculate statistics
+    mean_length = sum(sentence_lengths) / len(sentence_lengths)
+
+    # Standard deviation
+    if len(sentence_lengths) > 1:
+        variance = sum((x - mean_length) ** 2 for x in sentence_lengths) / (
+            len(sentence_lengths) - 1
+        )
+        std_dev = variance**0.5
+    else:
+        std_dev = 0.0
+
+    min_length = float(min(sentence_lengths))
+    max_length = float(max(sentence_lengths))
+    length_range = max_length - min_length
 
-    #
-
+    # Create single-value distributions (sentence analysis is done on full text)
+    mean_dist = make_distribution([mean_length])
+    std_dist = make_distribution([std_dev])
+    range_dist = make_distribution([length_range])
+    min_dist = make_distribution([min_length])
+    max_dist = make_distribution([max_length])
 
     return SentenceStatsResult(
-        mean_sentence_length=
-        sentence_length_std=
-        sentence_length_range=
-        min_sentence_length=
-        max_sentence_length=
-        sentence_count=len(
+        mean_sentence_length=mean_length,
+        sentence_length_std=std_dev,
+        sentence_length_range=length_range,
+        min_sentence_length=min_length,
+        max_sentence_length=max_length,
+        sentence_count=len(sentence_lengths),
+        mean_sentence_length_dist=mean_dist,
+        sentence_length_std_dist=std_dist,
+        sentence_length_range_dist=range_dist,
+        min_sentence_length_dist=min_dist,
+        max_sentence_length_dist=max_dist,
+        chunk_size=chunk_size,
+        chunk_count=1,  # Single pass analysis
         metadata={
             "model": model,
+            "sentence_lengths": sentence_lengths,
         },
     )
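The inlined standard deviation in the hunk above uses the sample (n - 1) denominator. A quick cross-check against statistics.stdev from the standard library, which applies the same Bessel correction; the word counts below are made up for illustration:

import statistics

sentence_lengths = [4, 9, 7, 12]  # illustrative per-sentence word counts

mean_length = sum(sentence_lengths) / len(sentence_lengths)
variance = sum((x - mean_length) ** 2 for x in sentence_lengths) / (
    len(sentence_lengths) - 1
)
std_dev = variance**0.5

# statistics.stdev uses the same n - 1 denominator, so the results agree.
assert abs(std_dev - statistics.stdev(sentence_lengths)) < 1e-12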