pystylometry 0.1.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. pystylometry/__init__.py +30 -5
  2. pystylometry/_normalize.py +277 -0
  3. pystylometry/_types.py +1954 -28
  4. pystylometry/_utils.py +4 -0
  5. pystylometry/authorship/__init__.py +26 -1
  6. pystylometry/authorship/additional_methods.py +75 -0
  7. pystylometry/authorship/kilgarriff.py +347 -0
  8. pystylometry/character/__init__.py +15 -0
  9. pystylometry/character/character_metrics.py +389 -0
  10. pystylometry/cli.py +427 -0
  11. pystylometry/consistency/__init__.py +57 -0
  12. pystylometry/consistency/_thresholds.py +162 -0
  13. pystylometry/consistency/drift.py +549 -0
  14. pystylometry/dialect/__init__.py +65 -0
  15. pystylometry/dialect/_data/dialect_markers.json +1134 -0
  16. pystylometry/dialect/_loader.py +360 -0
  17. pystylometry/dialect/detector.py +533 -0
  18. pystylometry/lexical/__init__.py +13 -6
  19. pystylometry/lexical/advanced_diversity.py +680 -0
  20. pystylometry/lexical/function_words.py +590 -0
  21. pystylometry/lexical/hapax.py +310 -33
  22. pystylometry/lexical/mtld.py +180 -22
  23. pystylometry/lexical/ttr.py +149 -0
  24. pystylometry/lexical/word_frequency_sophistication.py +1805 -0
  25. pystylometry/lexical/yule.py +142 -29
  26. pystylometry/ngrams/__init__.py +2 -0
  27. pystylometry/ngrams/entropy.py +150 -49
  28. pystylometry/ngrams/extended_ngrams.py +235 -0
  29. pystylometry/prosody/__init__.py +12 -0
  30. pystylometry/prosody/rhythm_prosody.py +53 -0
  31. pystylometry/readability/__init__.py +12 -0
  32. pystylometry/readability/additional_formulas.py +2110 -0
  33. pystylometry/readability/ari.py +173 -35
  34. pystylometry/readability/coleman_liau.py +150 -30
  35. pystylometry/readability/complex_words.py +531 -0
  36. pystylometry/readability/flesch.py +181 -32
  37. pystylometry/readability/gunning_fog.py +208 -35
  38. pystylometry/readability/smog.py +126 -28
  39. pystylometry/readability/syllables.py +137 -30
  40. pystylometry/stylistic/__init__.py +20 -0
  41. pystylometry/stylistic/cohesion_coherence.py +45 -0
  42. pystylometry/stylistic/genre_register.py +45 -0
  43. pystylometry/stylistic/markers.py +131 -0
  44. pystylometry/stylistic/vocabulary_overlap.py +47 -0
  45. pystylometry/syntactic/__init__.py +4 -0
  46. pystylometry/syntactic/advanced_syntactic.py +494 -0
  47. pystylometry/syntactic/pos_ratios.py +172 -17
  48. pystylometry/syntactic/sentence_stats.py +105 -18
  49. pystylometry/syntactic/sentence_types.py +526 -0
  50. pystylometry/viz/__init__.py +71 -0
  51. pystylometry/viz/drift.py +589 -0
  52. pystylometry/viz/jsx/__init__.py +31 -0
  53. pystylometry/viz/jsx/_base.py +144 -0
  54. pystylometry/viz/jsx/report.py +677 -0
  55. pystylometry/viz/jsx/timeline.py +716 -0
  56. pystylometry/viz/jsx/viewer.py +1032 -0
  57. {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/METADATA +49 -9
  58. pystylometry-1.1.0.dist-info/RECORD +63 -0
  59. pystylometry-1.1.0.dist-info/entry_points.txt +4 -0
  60. pystylometry-0.1.0.dist-info/RECORD +0 -26
  61. {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/WHEEL +0 -0
@@ -0,0 +1,149 @@
1
+ """Type-Token Ratio (TTR) analysis using stylometry-ttr package.
2
+
3
+ This module provides a facade wrapper around the stylometry-ttr package,
4
+ maintaining consistent API patterns with other pystylometry metrics.
5
+
6
+ Related GitHub Issue:
7
+ #27 - Native chunked analysis with Distribution dataclass
8
+ https://github.com/craigtrim/pystylometry/issues/27
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from .._types import Distribution, TTRResult, make_distribution
14
+
15
+
16
def _empty_distribution() -> Distribution:
    # Placeholder for a metric the stylometry-ttr package could not compute
    # (e.g. chunk-based metrics on very short texts): no samples, NaN center,
    # zero spread. Mirrors the shape make_distribution() would produce.
    return Distribution(
        values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
    )


def _scalar_distribution(value: float | None) -> Distribution:
    """Wrap one aggregate metric value in a single-value Distribution.

    stylometry-ttr performs its own internal chunking and returns aggregate
    scalars, so each metric is wrapped as a one-element Distribution. A
    ``None`` value (metric unavailable for the given text) maps to an empty
    placeholder Distribution instead.

    Args:
        value: Aggregate metric value from stylometry-ttr, or None.

    Returns:
        Distribution over ``[value]``, or an empty Distribution when
        ``value`` is None.
    """
    if value is None:
        return _empty_distribution()
    return make_distribution([value])


def compute_ttr(text: str, text_id: str | None = None, chunk_size: int = 1000) -> TTRResult:
    """
    Compute Type-Token Ratio (TTR) metrics for vocabulary richness.

    This is a facade wrapper around the stylometry-ttr package that provides
    multiple TTR variants for measuring lexical diversity. TTR measures the
    ratio of unique words (types) to total words (tokens).

    Metrics computed:
        - Raw TTR: unique_words / total_words
        - Root TTR (Guiraud's index): unique_words / sqrt(total_words)
        - Log TTR (Herdan's C): log(unique_words) / log(total_words)
        - STTR: Standardized TTR across fixed-size chunks (reduces length bias)
        - Delta Std: Standard deviation of TTR across chunks (vocabulary consistency)

    Related GitHub Issue:
        #27 - Native chunked analysis with Distribution dataclass
        https://github.com/craigtrim/pystylometry/issues/27

    References:
        Guiraud, P. (1960). Problèmes et méthodes de la statistique linguistique.
        Herdan, G. (1960). Type-token Mathematics: A Textbook of Mathematical
            Linguistics. Mouton.
        Johnson, W. (1944). Studies in language behavior: I. A program of research.
            Psychological Monographs, 56(2), 1-15.

    Args:
        text: Input text to analyze
        text_id: Optional identifier for the text (for tracking purposes)
        chunk_size: Number of words per chunk (default: 1000).
            Note: The stylometry-ttr package handles its own internal chunking,
            so this parameter is included for API consistency but actual chunking
            behavior is delegated to stylometry-ttr.

    Returns:
        TTRResult with all TTR variants and metadata, including Distribution
        objects for stylometric fingerprinting.

    Raises:
        ImportError: If the stylometry-ttr package is not installed.

    Example:
        >>> result = compute_ttr("The quick brown fox jumps over the lazy dog.")
        >>> print(f"Raw TTR: {result.ttr:.3f}")
        Raw TTR: 0.900
        >>> print(f"Root TTR: {result.root_ttr:.3f}")
        Root TTR: 2.846
        >>> print(f"STTR: {result.sttr:.3f}")
        STTR: 1.000

        >>> # With text identifier
        >>> result = compute_ttr("Sample text here.", text_id="sample-001")
        >>> print(result.metadata["text_id"])
        sample-001
    """
    # Imported lazily so the rest of the package works even if the optional
    # backend is missing; the error message tells the user how to fix it.
    try:
        from stylometry_ttr import compute_ttr as _compute_ttr
    except ImportError as e:
        raise ImportError(
            "TTR metrics require the stylometry-ttr package. "
            "This should have been installed as a core dependency. "
            "Install with: pip install stylometry-ttr"
        ) from e

    # stylometry-ttr requires text_id to be a string, not None.
    ttr_result = _compute_ttr(text, text_id=text_id or "")

    # Metrics may be None for short texts. STTR and delta-std fall back to
    # 0.0 in the result fields; the metadata flags below record whether the
    # real values were available.
    ttr_val = ttr_result.ttr
    root_ttr_val = ttr_result.root_ttr
    log_ttr_val = ttr_result.log_ttr
    sttr_val = ttr_result.sttr if ttr_result.sttr is not None else 0.0
    delta_std_val = ttr_result.delta_std if ttr_result.delta_std is not None else 0.0

    # Convert to our TTRResult dataclass. Each aggregate scalar is wrapped in
    # a single-value Distribution (stylometry-ttr already did its own internal
    # chunking); None metrics get an empty placeholder Distribution.
    return TTRResult(
        total_words=ttr_result.total_words,
        unique_words=ttr_result.unique_words,
        ttr=ttr_val if ttr_val is not None else float("nan"),
        root_ttr=root_ttr_val if root_ttr_val is not None else float("nan"),
        log_ttr=log_ttr_val if log_ttr_val is not None else float("nan"),
        sttr=sttr_val,
        delta_std=delta_std_val,
        ttr_dist=_scalar_distribution(ttr_val),
        root_ttr_dist=_scalar_distribution(root_ttr_val),
        log_ttr_dist=_scalar_distribution(log_ttr_val),
        sttr_dist=_scalar_distribution(ttr_result.sttr),
        delta_std_dist=_scalar_distribution(ttr_result.delta_std),
        chunk_size=chunk_size,
        chunk_count=1,  # stylometry-ttr returns aggregate results
        metadata={
            "text_id": text_id or "",
            "source": "stylometry-ttr",
            "sttr_available": ttr_result.sttr is not None,
            "delta_std_available": ttr_result.delta_std is not None,
        },
    )