pystylometry 1.3.1__py3-none-any.whl → 1.3.6__py3-none-any.whl
This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
- pystylometry/__init__.py +42 -3
- pystylometry/_types.py +53 -3
- pystylometry/cli.py +695 -0
- pystylometry/lexical/__init__.py +4 -1
- pystylometry/lexical/bnc_frequency.py +309 -0
- pystylometry/lexical/ttr.py +288 -97
- pystylometry/viz/jsx/__init__.py +2 -0
- pystylometry/viz/jsx/bnc_frequency.py +495 -0
- {pystylometry-1.3.1.dist-info → pystylometry-1.3.6.dist-info}/METADATA +16 -3
- {pystylometry-1.3.1.dist-info → pystylometry-1.3.6.dist-info}/RECORD +13 -11
- {pystylometry-1.3.1.dist-info → pystylometry-1.3.6.dist-info}/entry_points.txt +2 -0
- {pystylometry-1.3.1.dist-info → pystylometry-1.3.6.dist-info}/LICENSE +0 -0
- {pystylometry-1.3.1.dist-info → pystylometry-1.3.6.dist-info}/WHEEL +0 -0
pystylometry/__init__.py
CHANGED
@@ -40,14 +40,13 @@ Usage:
     print(result.pattern_confidence)
 """
 
+from . import lexical  # noqa: E402
 from ._types import AnalysisResult
+from .tokenizer import TokenizationStats, Tokenizer, TokenMetadata
 
 # Version
 __version__ = "0.1.0"
 
-# Core exports - always available
-from . import lexical
-
 # Optional exports - may raise ImportError if dependencies not installed
 try:
     from . import readability  # noqa: F401
@@ -87,6 +86,41 @@ _CONSISTENCY_AVAILABLE = True
 _STYLISTIC_AVAILABLE = True
 
 
+def tokenize(text: str, **kwargs: object) -> list[str]:
+    """Tokenize text using the stylometric tokenizer.
+
+    Convenience wrapper around Tokenizer.tokenize(). All keyword arguments
+    are forwarded to the Tokenizer constructor.
+
+    Args:
+        text: Input text to tokenize.
+        **kwargs: Options forwarded to Tokenizer (lowercase, strip_numbers,
+            expand_contractions, etc.).
+
+    Returns:
+        List of token strings.
+
+    Example:
+        >>> from pystylometry import tokenize
+        >>> tokenize("Hello, world! It's a test.")
+        ['hello', 'world', "it's", 'a', 'test']
+    """
+    return Tokenizer(**kwargs).tokenize(text)  # type: ignore[arg-type]
+
+
+def tokenize_with_metadata(text: str, **kwargs: object) -> list[TokenMetadata]:
+    """Tokenize text and return tokens with positional and type metadata.
+
+    Args:
+        text: Input text to tokenize.
+        **kwargs: Options forwarded to Tokenizer.
+
+    Returns:
+        List of TokenMetadata objects.
+    """
+    return Tokenizer(**kwargs).tokenize_with_metadata(text)  # type: ignore[arg-type]
+
+
 def analyze(
     text: str,
     lexical_metrics: bool = True,
@@ -225,6 +259,11 @@ __all__ = [
     "__version__",
     "analyze",
     "get_available_modules",
+    "tokenize",
+    "tokenize_with_metadata",
+    "Tokenizer",
+    "TokenMetadata",
+    "TokenizationStats",
     "lexical",
 ]
 
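In practice this hunk promotes tokenization to the package root: tokenize, tokenize_with_metadata, and the Tokenizer/TokenMetadata/TokenizationStats types become importable from pystylometry directly. A minimal usage sketch, assuming only the signatures and docstrings shown above (the lowercase keyword comes from the docstring's option list; other Tokenizer options are not confirmed by this diff):

    from pystylometry import tokenize, tokenize_with_metadata

    # Keyword arguments are forwarded to the Tokenizer constructor.
    tokens = tokenize("Hello, world! It's a test.", lowercase=True)
    print(tokens)  # ['hello', 'world', "it's", 'a', 'test']

    # Same input, but each token comes back as a TokenMetadata object
    # carrying positional and type information.
    for meta in tokenize_with_metadata("Hello, world!"):
        print(meta)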
pystylometry/_types.py
CHANGED
@@ -23,7 +23,7 @@ from __future__ import annotations
 
 import statistics
 from dataclasses import dataclass
-from typing import Any
+from typing import Any, Optional
 
 # ===== Distribution and Chunking =====
 # Related to GitHub Issue #27: Native chunked analysis with Distribution dataclass
@@ -316,8 +316,8 @@ class HapaxLexiconResult:
 class TTRResult:
     """Result from Type-Token Ratio (TTR) analysis.
 
-
-
+    Measures vocabulary richness through the ratio of unique words (types)
+    to total words (tokens).
 
     All numeric metrics include both a mean value (convenient access) and
     a full distribution with per-chunk values and statistics.
@@ -370,6 +370,56 @@ class TTRResult:
     metadata: dict[str, Any]
 
 
+@dataclass
+class TTRAggregateResult:
+    """Aggregated TTR statistics for a collection of texts.
+
+    Computes group-level summary statistics (mean, std, min, max, median)
+    across multiple ``TTRResult`` objects. Useful for comparative analysis
+    across authors, genres, or time periods.
+
+    Related GitHub Issue:
+        #43 - Inline stylometry-ttr into pystylometry (remove external dependency)
+        https://github.com/craigtrim/pystylometry/issues/43
+
+    Example:
+        >>> from pystylometry.lexical import compute_ttr, TTRAggregator
+        >>> results = [compute_ttr(t) for t in texts]
+        >>> agg = TTRAggregator()
+        >>> stats = agg.aggregate(results, group_id="Austen")
+        >>> stats.ttr_mean
+        0.412
+    """
+
+    group_id: str
+    text_count: int
+    total_words: int
+
+    # Raw TTR statistics
+    ttr_mean: float
+    ttr_std: float
+    ttr_min: float
+    ttr_max: float
+    ttr_median: float
+
+    # Root TTR (Guiraud's index) statistics
+    root_ttr_mean: float
+    root_ttr_std: float
+
+    # Log TTR (Herdan's C) statistics
+    log_ttr_mean: float
+    log_ttr_std: float
+
+    # STTR statistics (None if no texts had enough words for STTR)
+    sttr_mean: Optional[float]
+    sttr_std: Optional[float]
+
+    # Delta std mean (None if no texts had delta metrics)
+    delta_std_mean: Optional[float]
+
+    metadata: dict[str, Any]
+
+
 # ===== Repetition Detection Results =====
 # Related to GitHub Issue #28: Verbal tics detection for slop analysis
 # https://github.com/craigtrim/pystylometry/issues/28
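The new dataclass pairs with a TTRAggregator (referenced in its docstring example) for corpus-level comparison. A sketch of the intended flow, assuming only what this diff shows — compute_ttr, TTRAggregator.aggregate(results, group_id=...), and the fields declared above; the corpus literals are placeholders:

    from pystylometry.lexical import compute_ttr, TTRAggregator

    # Placeholder snippets; in practice, one full document per string.
    austen_texts = [
        "It is a truth universally acknowledged ...",
        "Emma Woodhouse, handsome, clever, and rich ...",
    ]
    results = [compute_ttr(t) for t in austen_texts]

    stats = TTRAggregator().aggregate(results, group_id="Austen")
    print(stats.ttr_mean, stats.ttr_std)  # raw TTR, group mean and spread
    print(stats.root_ttr_mean)            # Guiraud's R = V / sqrt(N)
    print(stats.log_ttr_mean)             # Herdan's C = log V / log N
    if stats.sttr_mean is not None:       # None when no text was long enough
        print(stats.sttr_mean)

Here V is the type count and N the token count; Guiraud's R and Herdan's C are the standard length-corrected variants of raw TTR, which is what makes group means across texts of different lengths meaningful.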
|