pystylometry 1.0.0__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/README.md +42 -0
- pystylometry/__init__.py +45 -3
- pystylometry/_types.py +1017 -259
- pystylometry/authorship/README.md +21 -0
- pystylometry/authorship/__init__.py +28 -4
- pystylometry/authorship/additional_methods.py +260 -40
- pystylometry/authorship/compression.py +175 -0
- pystylometry/authorship/kilgarriff.py +354 -0
- pystylometry/character/README.md +17 -0
- pystylometry/character/character_metrics.py +267 -179
- pystylometry/cli.py +427 -0
- pystylometry/consistency/README.md +27 -0
- pystylometry/consistency/__init__.py +57 -0
- pystylometry/consistency/_thresholds.py +162 -0
- pystylometry/consistency/drift.py +549 -0
- pystylometry/dialect/README.md +26 -0
- pystylometry/dialect/__init__.py +65 -0
- pystylometry/dialect/_data/dialect_markers.json +1134 -0
- pystylometry/dialect/_loader.py +360 -0
- pystylometry/dialect/detector.py +533 -0
- pystylometry/lexical/README.md +23 -0
- pystylometry/lexical/advanced_diversity.py +61 -22
- pystylometry/lexical/function_words.py +255 -56
- pystylometry/lexical/hapax.py +182 -52
- pystylometry/lexical/mtld.py +108 -26
- pystylometry/lexical/ttr.py +76 -10
- pystylometry/lexical/word_frequency_sophistication.py +1522 -298
- pystylometry/lexical/yule.py +136 -50
- pystylometry/ngrams/README.md +18 -0
- pystylometry/ngrams/entropy.py +150 -49
- pystylometry/ngrams/extended_ngrams.py +314 -69
- pystylometry/prosody/README.md +17 -0
- pystylometry/prosody/rhythm_prosody.py +773 -11
- pystylometry/readability/README.md +23 -0
- pystylometry/readability/additional_formulas.py +1887 -762
- pystylometry/readability/ari.py +144 -82
- pystylometry/readability/coleman_liau.py +136 -109
- pystylometry/readability/flesch.py +177 -73
- pystylometry/readability/gunning_fog.py +165 -161
- pystylometry/readability/smog.py +123 -42
- pystylometry/stylistic/README.md +20 -0
- pystylometry/stylistic/cohesion_coherence.py +669 -13
- pystylometry/stylistic/genre_register.py +1560 -17
- pystylometry/stylistic/markers.py +611 -17
- pystylometry/stylistic/vocabulary_overlap.py +354 -13
- pystylometry/syntactic/README.md +20 -0
- pystylometry/syntactic/advanced_syntactic.py +76 -14
- pystylometry/syntactic/pos_ratios.py +70 -6
- pystylometry/syntactic/sentence_stats.py +55 -12
- pystylometry/syntactic/sentence_types.py +71 -15
- pystylometry/viz/README.md +27 -0
- pystylometry/viz/__init__.py +71 -0
- pystylometry/viz/drift.py +589 -0
- pystylometry/viz/jsx/__init__.py +31 -0
- pystylometry/viz/jsx/_base.py +144 -0
- pystylometry/viz/jsx/report.py +677 -0
- pystylometry/viz/jsx/timeline.py +716 -0
- pystylometry/viz/jsx/viewer.py +1032 -0
- pystylometry-1.3.0.dist-info/METADATA +136 -0
- pystylometry-1.3.0.dist-info/RECORD +76 -0
- {pystylometry-1.0.0.dist-info → pystylometry-1.3.0.dist-info}/WHEEL +1 -1
- pystylometry-1.3.0.dist-info/entry_points.txt +4 -0
- pystylometry-1.0.0.dist-info/METADATA +0 -275
- pystylometry-1.0.0.dist-info/RECORD +0 -46
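Most of the rewritten metric modules in this release share one pattern: the input text is split into fixed-size word chunks, the metric is computed per chunk, and the per-chunk values are summarised in a `Distribution` dataclass returned alongside the familiar scalar fields (see the `yule.py` and `entropy.py` diffs below). The `Distribution` type and the `chunk_text`/`make_distribution` helpers live in `pystylometry/_types.py`, which this view does not expand; the sketch below only illustrates the shape implied by the call sites, it is not the package's actual implementation.

```python
# Hypothetical sketch of the chunked-analysis result shape used throughout the diffs
# below. The real Distribution / make_distribution live in pystylometry/_types.py
# (not expanded in this view); field names are inferred from the call sites only.
import statistics
from dataclasses import dataclass


@dataclass
class Distribution:
    values: list[float]
    mean: float
    median: float
    std: float
    range: float
    iqr: float


def make_distribution(values: list[float]) -> Distribution:
    """Summarise per-chunk metric values (assumed behaviour, non-empty input)."""
    quartiles = statistics.quantiles(values, n=4) if len(values) >= 2 else [values[0]] * 3
    return Distribution(
        values=list(values),
        mean=statistics.fmean(values),
        median=statistics.median(values),
        std=statistics.pstdev(values),
        range=max(values) - min(values),
        iqr=quartiles[2] - quartiles[0],
    )


print(make_distribution([118.2, 125.9, 117.4]))
```

The field names (`values`, `mean`, `median`, `std`, `range`, `iqr`) are exactly the ones the new `yule.py` and `entropy.py` code constructs when a text yields no valid chunks.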
pystylometry/lexical/yule.py
CHANGED
```diff
@@ -1,15 +1,71 @@
-"""Yule's K and I statistics for vocabulary richness.
+"""Yule's K and I statistics for vocabulary richness.
 
+This module implements Yule's K and I metrics with native chunked analysis
+for stylometric fingerprinting.
+
+Related GitHub Issue:
+    #27 - Native chunked analysis with Distribution dataclass
+    https://github.com/craigtrim/pystylometry/issues/27
+"""
+
+import math
 from collections import Counter
 
-from .._types import YuleResult
+from .._types import Distribution, YuleResult, chunk_text, make_distribution
 from .._utils import tokenize
 
 
-def
+def _compute_yule_single(text: str) -> tuple[float, float, dict]:
+    """Compute Yule's K and I for a single chunk of text.
+
+    Returns:
+        Tuple of (yule_k, yule_i, metadata_dict).
+        Returns (nan, nan, metadata) for empty/invalid input.
+    """
+    tokens = tokenize(text.lower())
+    N = len(tokens)  # noqa: N806
+
+    if N == 0:
+        return (
+            float("nan"),
+            float("nan"),
+            {"token_count": 0, "vocabulary_size": 0},
+        )
+
+    # Count frequency of each token
+    freq_counter = Counter(tokens)
+    V = len(freq_counter)  # noqa: N806
+
+    # Count how many words occur with each frequency
+    freq_of_freqs = Counter(freq_counter.values())
+
+    # Calculate Σm²×Vm
+    sum_m2_vm = sum(m * m * vm for m, vm in freq_of_freqs.items())
+
+    # Yule's K: 10⁴ × (Σm²×Vm - N) / N²
+    yule_k = 10_000 * (sum_m2_vm - N) / (N * N)
+
+    # Yule's I: V² / (Σm²×Vm - N)
+    denominator = sum_m2_vm - N
+    if denominator == 0:
+        yule_i = float("nan")
+    else:
+        yule_i = (V * V) / denominator
+
+    return (
+        yule_k,
+        yule_i,
+        {"token_count": N, "vocabulary_size": V},
+    )
+
+
+def compute_yule(text: str, chunk_size: int = 1000) -> YuleResult:
     """
     Compute Yule's K and I metrics for vocabulary richness.
 
+    This function uses native chunked analysis to capture variance and patterns
+    across the text, which is essential for stylometric fingerprinting.
+
     Yule's K measures vocabulary repetitiveness (higher = more repetitive).
     Yule's I is the inverse measure (higher = more diverse).
 
@@ -23,71 +79,101 @@ def compute_yule(text: str) -> YuleResult:
         - Vm = number of types occurring m times
         - m = frequency count
 
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
     References:
         Yule, G. U. (1944). The Statistical Study of Literary Vocabulary.
         Cambridge University Press.
 
     Args:
         text: Input text to analyze
+        chunk_size: Number of words per chunk (default: 1000)
 
     Returns:
-        YuleResult with
-
-        Note: For empty input or when Σm²×Vm = N (perfectly uniform vocabulary),
-        metrics will be float('nan') to indicate undefined values.
+        YuleResult with yule_k, yule_i, distributions, and metadata
 
     Example:
-        >>> result = compute_yule("
-        >>>
-
-
-
-        >>> import math
-        >>> result_empty = compute_yule("")
-        >>> math.isnan(result_empty.yule_k)
-        True
+        >>> result = compute_yule("Long text here...", chunk_size=1000)
+        >>> result.yule_k  # Mean across chunks
+        120.5
+        >>> result.yule_k_dist.std  # Variance reveals fingerprint
+        15.2
     """
-
-
-
-
+    # Chunk the text
+    chunks = chunk_text(text, chunk_size)
+
+    # Compute metrics per chunk
+    yule_k_values = []
+    yule_i_values = []
+    total_tokens = 0
+    total_vocab = 0
+
+    for chunk in chunks:
+        k, i, meta = _compute_yule_single(chunk)
+        if not math.isnan(k):
+            yule_k_values.append(k)
+        if not math.isnan(i):
+            yule_i_values.append(i)
+        total_tokens += meta.get("token_count", 0)
+        total_vocab += meta.get("vocabulary_size", 0)
+
+    # Handle empty or all-invalid chunks
+    if not yule_k_values:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
         return YuleResult(
             yule_k=float("nan"),
             yule_i=float("nan"),
-
+            yule_k_dist=empty_dist,
+            yule_i_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=len(chunks),
+            metadata={
+                # Backward-compatible keys
+                "token_count": 0,
+                "vocabulary_size": 0,
+                # New prefixed keys for consistency
+                "total_token_count": 0,
+                "total_vocabulary_size": 0,
+            },
         )
 
-    #
-
-
-
-
-
-
-
-
-
-
-
-
-
-    # K measures vocabulary repetitiveness (higher K = more repetitive)
-    yule_k = 10_000 * (sum_m2_vm - N) / (N * N)
-
-    # Yule's I: V² / (Σm²×Vm - N)
-    # I is the inverse measure (higher I = more diverse)
-    # If Σm²×Vm = N (perfectly uniform vocabulary), denominator is 0, return NaN
-    denominator = sum_m2_vm - N
-    if denominator == 0:
-        yule_i = float("nan")
-    else:
-        yule_i = (V * V) / denominator
+    # Build distributions
+    yule_k_dist = make_distribution(yule_k_values)
+    yule_i_dist = (
+        make_distribution(yule_i_values)
+        if yule_i_values
+        else Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
+    )
 
     return YuleResult(
-        yule_k=
-        yule_i=
+        yule_k=yule_k_dist.mean,
+        yule_i=yule_i_dist.mean,
+        yule_k_dist=yule_k_dist,
+        yule_i_dist=yule_i_dist,
+        chunk_size=chunk_size,
+        chunk_count=len(chunks),
         metadata={
-
-            "
+            # Backward-compatible keys
+            "token_count": total_tokens,
+            "vocabulary_size": total_vocab,
+            # New prefixed keys for consistency
+            "total_token_count": total_tokens,
+            "total_vocabulary_size": total_vocab,
        },
     )
```
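As a sanity check on the formulas carried into `_compute_yule_single` above, here is a small stand-alone recomputation (standard library only, no pystylometry imports) of K = 10⁴ × (Σm²Vm − N) / N² and I = V² / (Σm²Vm − N) on a toy eight-token text.

```python
# Stand-alone check of the Yule's K / I arithmetic used in _compute_yule_single above.
from collections import Counter

tokens = "the cat sat on the mat the cat".split()
N = len(tokens)                            # 8 tokens
freq = Counter(tokens)                     # {'the': 3, 'cat': 2, 'sat': 1, 'on': 1, 'mat': 1}
V = len(freq)                              # 5 types
freq_of_freqs = Counter(freq.values())     # {1: 3, 2: 1, 3: 1}

sum_m2_vm = sum(m * m * vm for m, vm in freq_of_freqs.items())  # 1*3 + 4*1 + 9*1 = 16
yule_k = 10_000 * (sum_m2_vm - N) / (N * N)                     # 10000 * 8 / 64 = 1250.0
yule_i = (V * V) / (sum_m2_vm - N)                              # 25 / 8 = 3.125

print(yule_k, yule_i)  # 1250.0 3.125
```

With the chunked API, `compute_yule` reports the mean of these per-chunk values in `yule_k`/`yule_i` and their spread in `yule_k_dist`/`yule_i_dist`.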
pystylometry/ngrams/README.md
ADDED
```diff
@@ -0,0 +1,18 @@
+# ngrams
+
+
+
+
+N-gram generation, entropy computation, and sequence analysis.
+
+## Catalogue
+
+| File | Functions | What It Measures |
+|------|-----------|-----------------|
+| `entropy.py` | `compute_ngram_entropy`, `compute_character_bigram_entropy`, `compute_word_bigram_entropy` | Shannon entropy at character and word n-gram levels |
+| `extended_ngrams.py` | `compute_extended_ngrams` | Word, character, and POS n-gram profiles with frequency distributions |
+
+## See Also
+
+- [`syntactic/`](../syntactic/) provides POS tags consumed by `compute_extended_ngrams(text, pos=True)`
+- [`character/`](../character/) for character-level features without n-gram structure
```
pystylometry/ngrams/entropy.py
CHANGED
```diff
@@ -1,16 +1,83 @@
-"""N-gram entropy and perplexity calculations.
+"""N-gram entropy and perplexity calculations.
+
+This module implements n-gram entropy computation with native chunked analysis
+for stylometric fingerprinting.
+
+Related GitHub Issue:
+    #27 - Native chunked analysis with Distribution dataclass
+    https://github.com/craigtrim/pystylometry/issues/27
+"""
 
 import math
 from collections import Counter
 
-from .._types import EntropyResult
+from .._types import Distribution, EntropyResult, chunk_text, make_distribution
 from .._utils import tokenize
 
 
-def
+def _compute_ngram_entropy_single(text: str, n: int, ngram_type: str) -> tuple[float, float, dict]:
+    """Compute n-gram entropy for a single chunk of text.
+
+    Returns:
+        Tuple of (entropy, perplexity, metadata_dict).
+        Returns (nan, nan, metadata) for empty/invalid input.
+    """
+    # Generate n-grams
+    if ngram_type == "character":
+        items = list(text)
+    else:  # word
+        items = tokenize(text)
+
+    if len(items) < n:
+        return (
+            float("nan"),
+            float("nan"),
+            {
+                "item_count": len(items),
+                "unique_ngrams": 0,
+                "total_ngrams": 0,
+            },
+        )
+
+    # Create n-grams using sliding window
+    ngram_list = []
+    for i in range(len(items) - n + 1):
+        ngram = tuple(items[i : i + n])
+        ngram_list.append(ngram)
+
+    # Count n-gram frequencies
+    ngram_counts = Counter(ngram_list)
+    total_ngrams = len(ngram_list)
+
+    # Calculate entropy: H(X) = -Σ p(x) × log₂(p(x))
+    entropy = 0.0
+    for count in ngram_counts.values():
+        probability = count / total_ngrams
+        entropy -= probability * math.log2(probability)
+
+    # Calculate perplexity: 2^H(X)
+    perplexity = 2**entropy
+
+    return (
+        entropy,
+        perplexity,
+        {
+            "item_count": len(items),
+            "unique_ngrams": len(ngram_counts),
+            "total_ngrams": total_ngrams,
+        },
+    )
+
+
+def compute_ngram_entropy(
+    text: str, n: int = 2, ngram_type: str = "word", chunk_size: int = 1000
+) -> EntropyResult:
     """
     Compute n-gram entropy and perplexity for text.
 
+    This function uses native chunked analysis to capture variance and patterns
+    across the text, which is essential for stylometric fingerprinting.
+
     Entropy measures the unpredictability of the next item in a sequence.
     Higher entropy = more unpredictable = more diverse/complex text.
 
@@ -20,6 +87,10 @@ def compute_ngram_entropy(text: str, n: int = 2, ngram_type: str = "word") -> En
 
     Where p(x) is the probability of n-gram x occurring.
 
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
     References:
         Shannon, C. E. (1948). A mathematical theory of communication.
         Bell System Technical Journal, 27(3), 379-423.
@@ -31,100 +102,130 @@ def compute_ngram_entropy(text: str, n: int = 2, ngram_type: str = "word") -> En
         text: Input text to analyze
         n: N-gram size (2 for bigrams, 3 for trigrams, etc.)
         ngram_type: "word" or "character" (default: "word")
+        chunk_size: Number of words per chunk (default: 1000)
 
     Returns:
-        EntropyResult with entropy, perplexity, and metadata
+        EntropyResult with entropy, perplexity, distributions, and metadata
 
     Example:
-        >>> result = compute_ngram_entropy("
-        >>>
-
+        >>> result = compute_ngram_entropy("Long text here...", n=2, chunk_size=1000)
+        >>> result.entropy  # Mean across chunks
+        5.2
+        >>> result.entropy_dist.std  # Variance reveals fingerprint
+        0.3
     """
-    #
-
-
-
-
-
-
+    # Chunk the text
+    chunks = chunk_text(text, chunk_size)
+
+    # Compute metrics per chunk
+    entropy_values = []
+    perplexity_values = []
+    total_items = 0
+    total_unique_ngrams = 0
+    total_ngrams = 0
+
+    for chunk in chunks:
+        ent, perp, meta = _compute_ngram_entropy_single(chunk, n, ngram_type)
+        if not math.isnan(ent):
+            entropy_values.append(ent)
+            perplexity_values.append(perp)
+        total_items += meta.get("item_count", 0)
+        total_unique_ngrams += meta.get("unique_ngrams", 0)
+        total_ngrams += meta.get("total_ngrams", 0)
+
+    # Handle empty or all-invalid chunks
+    if not entropy_values:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
         return EntropyResult(
-            entropy=
-            perplexity=
+            entropy=float("nan"),
+            perplexity=float("nan"),
             ngram_type=f"{ngram_type}_{n}gram",
+            entropy_dist=empty_dist,
+            perplexity_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=len(chunks),
            metadata={
                 "n": n,
                 "ngram_type": ngram_type,
-                "
+                "total_item_count": total_items,
                 "warning": "Text too short for n-gram analysis",
             },
         )
 
-    #
-
-
-        ngram = tuple(items[i : i + n])
-        ngram_list.append(ngram)
-
-    # Count n-gram frequencies
-    ngram_counts = Counter(ngram_list)
-    total_ngrams = len(ngram_list)
-
-    # Calculate entropy: H(X) = -Σ p(x) × log₂(p(x))
-    entropy = 0.0
-    for count in ngram_counts.values():
-        probability = count / total_ngrams
-        entropy -= probability * math.log2(probability)
-
-    # Calculate perplexity: 2^H(X)
-    perplexity = 2**entropy
+    # Build distributions
+    entropy_dist = make_distribution(entropy_values)
+    perplexity_dist = make_distribution(perplexity_values)
 
     return EntropyResult(
-        entropy=
-        perplexity=
+        entropy=entropy_dist.mean,
+        perplexity=perplexity_dist.mean,
        ngram_type=f"{ngram_type}_{n}gram",
+        entropy_dist=entropy_dist,
+        perplexity_dist=perplexity_dist,
+        chunk_size=chunk_size,
+        chunk_count=len(chunks),
         metadata={
             "n": n,
             "ngram_type": ngram_type,
-            "
-            "
+            "total_item_count": total_items,
+            "total_unique_ngrams": total_unique_ngrams,
             "total_ngrams": total_ngrams,
         },
     )
 
 
-def compute_character_bigram_entropy(text: str) -> EntropyResult:
+def compute_character_bigram_entropy(text: str, chunk_size: int = 1000) -> EntropyResult:
     """
     Compute character bigram entropy.
 
     Convenience function that calls compute_ngram_entropy with n=2, ngram_type="character".
 
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
     Args:
         text: Input text to analyze
+        chunk_size: Number of words per chunk (default: 1000)
 
     Returns:
-        EntropyResult with character bigram entropy and
+        EntropyResult with character bigram entropy, perplexity, and distributions
 
     Example:
-        >>> result = compute_character_bigram_entropy("
-        >>>
+        >>> result = compute_character_bigram_entropy("Long text here...", chunk_size=1000)
+        >>> result.entropy  # Mean across chunks
+        3.8
     """
-    return compute_ngram_entropy(text, n=2, ngram_type="character")
+    return compute_ngram_entropy(text, n=2, ngram_type="character", chunk_size=chunk_size)
 
 
-def compute_word_bigram_entropy(text: str) -> EntropyResult:
+def compute_word_bigram_entropy(text: str, chunk_size: int = 1000) -> EntropyResult:
     """
     Compute word bigram entropy.
 
     Convenience function that calls compute_ngram_entropy with n=2, ngram_type="word".
 
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
     Args:
         text: Input text to analyze
+        chunk_size: Number of words per chunk (default: 1000)
 
     Returns:
-        EntropyResult with word bigram entropy and
+        EntropyResult with word bigram entropy, perplexity, and distributions
 
     Example:
-        >>> result = compute_word_bigram_entropy("
-        >>>
+        >>> result = compute_word_bigram_entropy("Long text here...", chunk_size=1000)
+        >>> result.entropy  # Mean across chunks
+        5.2
     """
-    return compute_ngram_entropy(text, n=2, ngram_type="word")
+    return compute_ngram_entropy(text, n=2, ngram_type="word", chunk_size=chunk_size)
```
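Taken together, the new signatures keep the 1.0.0 call sites working (the scalar `yule_k`, `yule_i`, `entropy`, and `perplexity` fields are now the mean across chunks) while exposing the per-chunk spread. A hedged usage sketch; the input path is a placeholder and the printed values depend on the text:

```python
# Illustrative use of the 1.3.0 chunked results. Field names are taken from the
# diffs above; "novel.txt" is a placeholder input and the comments describe the
# fields rather than real output values.
from pathlib import Path

from pystylometry.lexical.yule import compute_yule
from pystylometry.ngrams.entropy import compute_word_bigram_entropy

text = Path("novel.txt").read_text(encoding="utf-8")

yule = compute_yule(text, chunk_size=1000)
print(yule.yule_k)             # mean Yule's K across 1000-word chunks
print(yule.yule_k_dist.std)    # spread across chunks (the fingerprint signal)
print(yule.chunk_count)        # number of chunks analysed

bigrams = compute_word_bigram_entropy(text, chunk_size=1000)
print(bigrams.entropy, bigrams.entropy_dist.iqr)
```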