pystylometry 1.3.1__py3-none-any.whl → 1.3.6__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
pystylometry/__init__.py CHANGED
@@ -40,14 +40,13 @@ Usage:
     print(result.pattern_confidence)
 """
 
+from . import lexical  # noqa: E402
 from ._types import AnalysisResult
+from .tokenizer import TokenizationStats, Tokenizer, TokenMetadata
 
 # Version
 __version__ = "0.1.0"
 
-# Core exports - always available
-from . import lexical
-
 # Optional exports - may raise ImportError if dependencies not installed
 try:
     from . import readability  # noqa: F401
@@ -87,6 +86,41 @@ _CONSISTENCY_AVAILABLE = True
 _STYLISTIC_AVAILABLE = True
 
 
+def tokenize(text: str, **kwargs: object) -> list[str]:
+    """Tokenize text using the stylometric tokenizer.
+
+    Convenience wrapper around Tokenizer.tokenize(). All keyword arguments
+    are forwarded to the Tokenizer constructor.
+
+    Args:
+        text: Input text to tokenize.
+        **kwargs: Options forwarded to Tokenizer (lowercase, strip_numbers,
+            expand_contractions, etc.).
+
+    Returns:
+        List of token strings.
+
+    Example:
+        >>> from pystylometry import tokenize
+        >>> tokenize("Hello, world! It's a test.")
+        ['hello', 'world', "it's", 'a', 'test']
+    """
+    return Tokenizer(**kwargs).tokenize(text)  # type: ignore[arg-type]
+
+
+def tokenize_with_metadata(text: str, **kwargs: object) -> list[TokenMetadata]:
+    """Tokenize text and return tokens with positional and type metadata.
+
+    Args:
+        text: Input text to tokenize.
+        **kwargs: Options forwarded to Tokenizer.
+
+    Returns:
+        List of TokenMetadata objects.
+    """
+    return Tokenizer(**kwargs).tokenize_with_metadata(text)  # type: ignore[arg-type]
+
+
 def analyze(
     text: str,
     lexical_metrics: bool = True,
@@ -225,6 +259,11 @@ __all__ = [
     "__version__",
     "analyze",
     "get_available_modules",
+    "tokenize",
+    "tokenize_with_metadata",
+    "Tokenizer",
+    "TokenMetadata",
+    "TokenizationStats",
     "lexical",
 ]
 
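Taken together, these three hunks promote the tokenizer to the package's public API: a tokenize() convenience wrapper, a tokenize_with_metadata() variant, and top-level re-exports of Tokenizer, TokenMetadata, and TokenizationStats. A minimal usage sketch of the new surface, based on the docstring example above (keyword options such as lowercase are only named in the docstring, not defined in this diff):

    from pystylometry import Tokenizer, tokenize, tokenize_with_metadata

    # Plain token list, as in the tokenize() docstring example
    tokens = tokenize("Hello, world! It's a test.")
    # -> ['hello', 'world', "it's", 'a', 'test']

    # Same tokenization, but each item is a TokenMetadata object carrying
    # positional and type information (exact fields are not shown in this diff)
    meta = tokenize_with_metadata("Hello, world! It's a test.")

    # Equivalent without the wrappers: construct a Tokenizer once and reuse it
    tokenizer = Tokenizer()
    tokens = tokenizer.tokenize("Hello, world! It's a test.")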
pystylometry/_types.py CHANGED
@@ -23,7 +23,7 @@ from __future__ import annotations
 
 import statistics
 from dataclasses import dataclass
-from typing import Any
+from typing import Any, Optional
 
 # ===== Distribution and Chunking =====
 # Related to GitHub Issue #27: Native chunked analysis with Distribution dataclass
@@ -316,8 +316,8 @@ class HapaxLexiconResult:
 class TTRResult:
     """Result from Type-Token Ratio (TTR) analysis.
 
-    Wraps stylometry-ttr package functionality to measure vocabulary richness
-    through the ratio of unique words (types) to total words (tokens).
+    Measures vocabulary richness through the ratio of unique words (types)
+    to total words (tokens).
 
     All numeric metrics include both a mean value (convenient access) and
     a full distribution with per-chunk values and statistics.
@@ -370,6 +370,56 @@ class TTRResult:
     metadata: dict[str, Any]
 
 
+@dataclass
+class TTRAggregateResult:
+    """Aggregated TTR statistics for a collection of texts.
+
+    Computes group-level summary statistics (mean, std, min, max, median)
+    across multiple ``TTRResult`` objects. Useful for comparative analysis
+    across authors, genres, or time periods.
+
+    Related GitHub Issue:
+        #43 - Inline stylometry-ttr into pystylometry (remove external dependency)
+        https://github.com/craigtrim/pystylometry/issues/43
+
+    Example:
+        >>> from pystylometry.lexical import compute_ttr, TTRAggregator
+        >>> results = [compute_ttr(t) for t in texts]
+        >>> agg = TTRAggregator()
+        >>> stats = agg.aggregate(results, group_id="Austen")
+        >>> stats.ttr_mean
+        0.412
+    """
+
+    group_id: str
+    text_count: int
+    total_words: int
+
+    # Raw TTR statistics
+    ttr_mean: float
+    ttr_std: float
+    ttr_min: float
+    ttr_max: float
+    ttr_median: float
+
+    # Root TTR (Guiraud's index) statistics
+    root_ttr_mean: float
+    root_ttr_std: float
+
+    # Log TTR (Herdan's C) statistics
+    log_ttr_mean: float
+    log_ttr_std: float
+
+    # STTR statistics (None if no texts had enough words for STTR)
+    sttr_mean: Optional[float]
+    sttr_std: Optional[float]
+
+    # Delta std mean (None if no texts had delta metrics)
+    delta_std_mean: Optional[float]
+
+    metadata: dict[str, Any]
+
+
 # ===== Repetition Detection Results =====
 # Related to GitHub Issue #28: Verbal tics detection for slop analysis
 # https://github.com/craigtrim/pystylometry/issues/28
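The new dataclass only defines the aggregate record; based on its own docstring example, usage looks roughly like the sketch below (compute_ttr and TTRAggregator are referenced in that docstring as living in pystylometry.lexical but are not part of this diff, and load_author_texts is a hypothetical helper):

    from pystylometry.lexical import TTRAggregator, compute_ttr

    texts = load_author_texts("austen")  # hypothetical helper returning a list of strings
    results = [compute_ttr(t) for t in texts]

    stats = TTRAggregator().aggregate(results, group_id="Austen")  # -> TTRAggregateResult

    print(stats.text_count, stats.total_words)
    print(stats.ttr_mean, stats.ttr_std, stats.ttr_median)  # raw TTR summary
    print(stats.root_ttr_mean, stats.log_ttr_mean)          # Guiraud / Herdan variants
    print(stats.sttr_mean)  # None when no text was long enough for STTR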