pystylometry-0.1.0-py3-none-any.whl
This diff shows the content of a publicly available package version as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
- pystylometry/__init__.py +206 -0
- pystylometry/_types.py +172 -0
- pystylometry/_utils.py +197 -0
- pystylometry/authorship/__init__.py +10 -0
- pystylometry/authorship/burrows_delta.py +152 -0
- pystylometry/authorship/zeta.py +109 -0
- pystylometry/lexical/__init__.py +17 -0
- pystylometry/lexical/hapax.py +75 -0
- pystylometry/lexical/mtld.py +61 -0
- pystylometry/lexical/yule.py +66 -0
- pystylometry/ngrams/__init__.py +13 -0
- pystylometry/ngrams/entropy.py +130 -0
- pystylometry/readability/__init__.py +15 -0
- pystylometry/readability/ari.py +70 -0
- pystylometry/readability/coleman_liau.py +67 -0
- pystylometry/readability/flesch.py +81 -0
- pystylometry/readability/gunning_fog.py +63 -0
- pystylometry/readability/smog.py +71 -0
- pystylometry/readability/syllables.py +54 -0
- pystylometry/syntactic/__init__.py +9 -0
- pystylometry/syntactic/pos_ratios.py +61 -0
- pystylometry/syntactic/sentence_stats.py +60 -0
- pystylometry/tokenizer.py +598 -0
- pystylometry-0.1.0.dist-info/METADATA +238 -0
- pystylometry-0.1.0.dist-info/RECORD +26 -0
- pystylometry-0.1.0.dist-info/WHEEL +4 -0
pystylometry/tokenizer.py
@@ -0,0 +1,598 @@
"""Advanced tokenizer for stylometric analysis."""

from __future__ import annotations

import re
import unicodedata
from dataclasses import dataclass
from typing import Iterator

# ===== Unicode Normalization Tables =====

# Single-character replacements (fast lookup with str.maketrans)
_UNICODE_REPLACEMENTS = str.maketrans(
    {
        # Smart quotes
        "\u2018": "'",  # Left single quote
        "\u2019": "'",  # Right single quote
        "\u201a": "'",  # Single low-9 quote
        "\u201b": "'",  # Single high-reversed-9 quote
        "\u201c": '"',  # Left double quote
        "\u201d": '"',  # Right double quote
        "\u201e": '"',  # Double low-9 quote
        "\u201f": '"',  # Double high-reversed-9 quote
        # Dashes
        "\u2013": "-",  # En dash
        "\u2014": "-",  # Em dash
        "\u2015": "-",  # Horizontal bar
        "\u2212": "-",  # Minus sign
        # Spaces
        "\u00a0": " ",  # Non-breaking space
        "\u2002": " ",  # En space
        "\u2003": " ",  # Em space
        "\u2009": " ",  # Thin space
        "\u200a": " ",  # Hair space
        # Apostrophes and primes
        "\u02bc": "'",  # Modifier letter apostrophe
        "\u2032": "'",  # Prime
        "\u2033": '"',  # Double prime
        # Ellipsis
        "\u2026": "...",  # Horizontal ellipsis
        # Ligatures (decompose)
        "\ufb01": "fi",
        "\ufb02": "fl",
        "\ufb03": "ffi",
        "\ufb04": "ffl",
        "\u00e6": "ae",  # æ
        "\u00c6": "AE",  # Æ
        "\u0153": "oe",  # œ
        "\u0152": "OE",  # Œ
        # Mathematical operators
        "\u00d7": "x",  # Multiplication sign
        "\u00f7": "/",  # Division sign
        "\u00b1": "+/-",  # Plus-minus
        # Currency (normalize for analysis)
        "\u00a3": "GBP",  # Pound
        "\u00a5": "JPY",  # Yen
        "\u20ac": "EUR",  # Euro
        # Fractions
        "\u00bc": "1/4",
        "\u00bd": "1/2",
        "\u00be": "3/4",
        "\u2153": "1/3",
        "\u2154": "2/3",
    }
)

# Multi-character patterns (regex-based)
_MULTI_CHAR_PATTERNS = [
    # Multiple dashes to single dash
    (re.compile(r"[-\u2013\u2014]{2,}"), "-"),
    # Multiple dots (not ellipsis)
    (re.compile(r"\.{4,}"), "..."),
    # Zero-width characters
    (re.compile(r"[\u200b-\u200d\ufeff]"), ""),
    # Control characters except newline/tab
    (re.compile(r"[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f]"), ""),
    # Multiple spaces/tabs to single space
    (re.compile(r"[ \t]+"), " "),
    # HTML entities (common ones)
    (re.compile(r"&nbsp;"), " "),
    (re.compile(r"&quot;"), '"'),
    (re.compile(r"&#39;"), "'"),
    (re.compile(r"&amp;"), "&"),
    (re.compile(r"&lt;"), "<"),
    (re.compile(r"&gt;"), ">"),
]


# ===== Text Cleaning Patterns =====


def _remove_italics_markers(text: str) -> str:
    """Remove markdown/formatting italics markers."""
    # Remove asterisk/underscore pairs around words
    text = re.sub(r"\*([^\*]+)\*", r"\1", text)
    text = re.sub(r"_([^_]+)_", r"\1", text)
    return text


def _remove_brackets(text: str) -> str:
    """Remove bracketed content [like this] and {like this}."""
    text = re.sub(r"\[([^\]]+)\]", r"\1", text)
    text = re.sub(r"\{([^\}]+)\}", r"\1", text)
    return text


def _remove_line_break_hyphens(text: str) -> str:
    """Remove hyphens at line breaks (word-\nbreak -> wordbreak)."""
    return re.sub(r"(\w+)-\s*\n\s*(\w+)", r"\1\2", text)


def _remove_page_markers(text: str) -> str:
    """Remove page numbers and headers like [Page 123] or --- Page 45 ---."""
    text = re.sub(r"\[Page\s+\d+\]", "", text, flags=re.IGNORECASE)
    text = re.sub(r"[-=]{2,}\s*Page\s+\d+\s*[-=]{2,}", "", text, flags=re.IGNORECASE)
    return text


def _normalize_whitespace(text: str) -> str:
    """Normalize all whitespace to single spaces."""
    # Collapse multiple newlines
    text = re.sub(r"\n{3,}", "\n\n", text)
    # Normalize spaces
    text = re.sub(r"[ \t]+", " ", text)
    return text.strip()


# ===== Token Pattern =====

# Comprehensive token pattern with priority-ordered alternations
_TOKEN_PATTERN = re.compile(
    r"""
    # URLs (highest priority to avoid splitting)
    (?P<url>https?://\S+)|

    # Email addresses
    (?P<email>\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b)|

    # Hashtags and mentions (social media)
    (?P<hashtag>\#\w+)|
    (?P<mention>@\w+)|

    # Time expressions (3:45pm, 10:30:15)
    (?P<time>\d{1,2}:\d{2}(?::\d{2})?(?:[ap]m)?)|

    # Dates (ISO format: 2024-01-15)
    (?P<date>\d{4}-\d{2}-\d{2})|

    # Acronyms with periods (U.S.A., Ph.D.)
    (?P<acronym>(?:[A-Z]\.){2,})|

    # Contractions and possessives (complex patterns)
    (?P<contraction_start>
        '(?:tis|twas|twere|twould|twill|em|gainst|cause|bout|til|way)(?![a-z])
    )|
    (?P<internal_elision>
        \w+[''](?:er|re|ve|ll|d|m|s|t|clock)(?![a-z])
    )|
    (?P<hyphen_possessive>
        (?:\w+(?:-\w+)+)['']s?
    )|
    (?P<standard_contraction>
        \w+[''][a-z]{1,3}(?![a-z])
    )|
    (?P<possessive>
        \w+['']s?(?![a-z])
    )|

    # Roman numerals
    (?P<roman>\b[IVXLCDM]+\b)|

    # Ordinals (1st, 2nd, 3rd, 4th, etc.)
    (?P<ordinal>\d+(?:st|nd|rd|th))|

    # Numbers with commas and decimals ($1,234.56)
    (?P<number_currency>\$?\d{1,3}(?:,\d{3})*(?:\.\d+)?)|

    # Abbreviations (Dr., Mr., Mrs., etc.)
    (?P<abbreviation>(?:Dr|Mr|Mrs|Ms|Prof|Sr|Jr|vs|etc|e\.g|i\.e)\.)|

    # G-dropping (singin', dancin')
    (?P<g_drop>\w+in[''])|

    # Hyphenated compounds (mother-in-law, well-known)
    (?P<hyphenated>(?:\w+-)+\w+)|

    # Regular words (including internal apostrophes like "o'clock")
    (?P<word>\w+(?:[']\w+)*)|

    # Ellipsis
    (?P<ellipsis>\.{3}|…)|

    # Individual punctuation
    (?P<punct>[^\w\s])
    """,
    re.VERBOSE | re.IGNORECASE | re.UNICODE,
)


# ===== Common Abbreviations =====

_COMMON_ABBREV = {
    # Titles
    "Dr.": "Doctor",
    "Mr.": "Mister",
    "Mrs.": "Misses",
    "Ms.": "Miss",
    "Prof.": "Professor",
    "Sr.": "Senior",
    "Jr.": "Junior",
    # Latin
    "e.g.": "for example",
    "i.e.": "that is",
    "etc.": "et cetera",
    "vs.": "versus",
    # Time
    "a.m.": "AM",
    "p.m.": "PM",
}

# Contraction expansions
_CONTRACTIONS = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "won't": "will not",
    "shan't": "shall not",
    "couldn't": "could not",
    "shouldn't": "should not",
    "wouldn't": "would not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "i'd": "I would",
    "i'll": "I will",
    "i'm": "I am",
    "i've": "I have",
    "isn't": "is not",
    "it's": "it is",
    "let's": "let us",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "that's": "that is",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what's": "what is",
    "where's": "where is",
    "who's": "who is",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have",
    "'tis": "it is",
    "'twas": "it was",
    "'em": "them",
}


@dataclass
class TokenMetadata:
    """Metadata about a token."""

    token: str
    start: int
    end: int
    token_type: str  # word, url, email, number, punctuation, etc.


@dataclass
class TokenizationStats:
    """Statistics from tokenization."""

    total_tokens: int
    unique_tokens: int
    word_tokens: int
    number_tokens: int
    punctuation_tokens: int
    url_tokens: int
    email_tokens: int
    hashtag_tokens: int
    mention_tokens: int
    average_token_length: float
    min_token_length: int
    max_token_length: int


class Tokenizer:
    """
    Advanced tokenizer for stylometric analysis.

    Features:
    - Comprehensive unicode normalization
    - Text cleaning (italics, brackets, page markers)
    - Sophisticated token pattern matching
    - Configurable filtering options
    - Token metadata tracking
    - Memory-efficient iteration

    Args:
        lowercase: Convert tokens to lowercase (default: True)
        min_length: Minimum token length (default: 1)
        max_length: Maximum token length (default: None)
        strip_numbers: Remove numeric tokens (default: False)
        strip_punctuation: Remove pure punctuation tokens (default: True)
        preserve_urls: Keep URL tokens (default: False)
        preserve_emails: Keep email tokens (default: False)
        preserve_hashtags: Keep hashtag tokens (default: False)
        preserve_mentions: Keep @mention tokens (default: False)
        expand_contractions: Expand contractions to full words (default: False)
        expand_abbreviations: Expand common abbreviations (default: False)
        strip_accents: Remove accents from characters (default: False)
        normalize_unicode: Apply unicode normalization (default: True)
        clean_text: Apply text cleaning (default: True)

    Example:
        >>> tokenizer = Tokenizer(lowercase=True, strip_punctuation=True)
        >>> tokens = tokenizer.tokenize("Hello, world! It's a test.")
        >>> print(tokens)
        ['hello', 'world', "it's", 'a', 'test']

        >>> # With metadata
        >>> metadata = tokenizer.tokenize_with_metadata("Test text")
        >>> for item in metadata:
        ...     print(f"{item.token} [{item.token_type}] at {item.start}-{item.end}")
    """

    def __init__(
        self,
        lowercase: bool = True,
        min_length: int = 1,
        max_length: int | None = None,
        strip_numbers: bool = False,
        strip_punctuation: bool = True,
        preserve_urls: bool = False,
        preserve_emails: bool = False,
        preserve_hashtags: bool = False,
        preserve_mentions: bool = False,
        expand_contractions: bool = False,
        expand_abbreviations: bool = False,
        strip_accents: bool = False,
        normalize_unicode: bool = True,
        clean_text: bool = True,
    ):
        self.lowercase = lowercase
        self.min_length = min_length
        self.max_length = max_length
        self.strip_numbers = strip_numbers
        self.strip_punctuation = strip_punctuation
        self.preserve_urls = preserve_urls
        self.preserve_emails = preserve_emails
        self.preserve_hashtags = preserve_hashtags
        self.preserve_mentions = preserve_mentions
        self.expand_contractions = expand_contractions
        self.expand_abbreviations = expand_abbreviations
        self.strip_accents = strip_accents
        self.normalize_unicode = normalize_unicode
        self.clean_text = clean_text

    def _preprocess_text(self, text: str) -> str:
        """Apply unicode normalization and text cleaning."""
        if not text:
            return ""

        # Unicode normalization
        if self.normalize_unicode:
            # Apply character replacements
            text = text.translate(_UNICODE_REPLACEMENTS)

            # Apply multi-character patterns
            for pattern, replacement in _MULTI_CHAR_PATTERNS:
                text = pattern.sub(replacement, text)

        # Text cleaning
        if self.clean_text:
            text = _remove_italics_markers(text)
            text = _remove_brackets(text)
            text = _remove_line_break_hyphens(text)
            text = _remove_page_markers(text)
            text = _normalize_whitespace(text)

        # Strip accents if requested
        if self.strip_accents:
            # NFD decomposition then filter out combining marks
            text = unicodedata.normalize("NFD", text)
            text = "".join(c for c in text if unicodedata.category(c) != "Mn")

        return text

    def _expand_token(self, token: str) -> str:
        """Expand contractions and abbreviations if configured."""
        if self.expand_contractions:
            lower_token = token.lower()
            if lower_token in _CONTRACTIONS:
                expanded = _CONTRACTIONS[lower_token]
                # Preserve case for first character
                if token[0].isupper():
                    expanded = expanded.capitalize()
                return expanded

        if self.expand_abbreviations:
            if token in _COMMON_ABBREV:
                return _COMMON_ABBREV[token]

        return token

    def _should_keep_token(self, token: str, token_type: str) -> bool:
        """Determine if token should be kept based on filters."""
        # Length filters
        if len(token) < self.min_length:
            return False
        if self.max_length and len(token) > self.max_length:
            return False

        # Type-based filters
        if token_type == "url" and not self.preserve_urls:
            return False
        if token_type == "email" and not self.preserve_emails:
            return False
        if token_type == "hashtag" and not self.preserve_hashtags:
            return False
        if token_type == "mention" and not self.preserve_mentions:
            return False
        if token_type == "punct" and self.strip_punctuation:
            return False
        if self.strip_numbers and token_type in ("number_currency", "ordinal", "time", "date"):
            return False

        return True

    def tokenize(self, text: str) -> list[str]:
        """
        Tokenize text into a list of tokens.

        Args:
            text: Input text to tokenize

        Returns:
            List of tokens
        """
        return list(self.tokenize_iter(text))

    def tokenize_iter(self, text: str) -> Iterator[str]:
        """
        Tokenize text and return an iterator (memory efficient).

        Args:
            text: Input text to tokenize

        Yields:
            Individual tokens
        """
        text = self._preprocess_text(text)

        for match in _TOKEN_PATTERN.finditer(text):
            # Determine token type
            token_type = match.lastgroup or "unknown"
            token = match.group(0)

            # Expand if needed
            token = self._expand_token(token)

            # Apply case transformation
            if self.lowercase:
                token = token.lower()

            # Check filters
            if self._should_keep_token(token, token_type):
                # Handle expanded tokens (may contain spaces)
                if " " in token:
                    yield from token.split()
                else:
                    yield token

    def tokenize_with_metadata(self, text: str) -> list[TokenMetadata]:
        """
        Tokenize text and return tokens with metadata.

        Args:
            text: Input text to tokenize

        Returns:
            List of TokenMetadata objects
        """
        preprocessed = self._preprocess_text(text)
        result = []

        for match in _TOKEN_PATTERN.finditer(preprocessed):
            token_type = match.lastgroup or "unknown"
            token = match.group(0)

            # Expand if needed
            token = self._expand_token(token)

            # Apply case transformation
            if self.lowercase:
                token = token.lower()

            # Check filters
            if self._should_keep_token(token, token_type):
                result.append(
                    TokenMetadata(
                        token=token, start=match.start(), end=match.end(), token_type=token_type
                    )
                )

        return result

    def get_statistics(self, text: str) -> TokenizationStats:
        """
        Get tokenization statistics.

        Args:
            text: Input text to analyze

        Returns:
            TokenizationStats object
        """
        metadata = self.tokenize_with_metadata(text)

        if not metadata:
            return TokenizationStats(
                total_tokens=0,
                unique_tokens=0,
                word_tokens=0,
                number_tokens=0,
                punctuation_tokens=0,
                url_tokens=0,
                email_tokens=0,
                hashtag_tokens=0,
                mention_tokens=0,
                average_token_length=0.0,
                min_token_length=0,
                max_token_length=0,
            )

        tokens = [m.token for m in metadata]
        unique_tokens = set(tokens)

        # Count by type
        type_counts = {
            "word": 0,
            "number": 0,
            "punct": 0,
            "url": 0,
            "email": 0,
            "hashtag": 0,
            "mention": 0,
        }

        for item in metadata:
            if item.token_type in type_counts:
                type_counts[item.token_type] += 1
            elif item.token_type in (
                "word",
                "contraction_start",
                "internal_elision",
                "standard_contraction",
                "possessive",
                "hyphenated",
                "g_drop",
                "roman",
                "abbreviation",
            ):
                type_counts["word"] += 1
            elif item.token_type in ("number_currency", "ordinal", "time", "date", "acronym"):
                type_counts["number"] += 1

        lengths = [len(t) for t in tokens]

        return TokenizationStats(
            total_tokens=len(tokens),
            unique_tokens=len(unique_tokens),
            word_tokens=type_counts["word"],
            number_tokens=type_counts["number"],
            punctuation_tokens=type_counts["punct"],
            url_tokens=type_counts["url"],
            email_tokens=type_counts["email"],
            hashtag_tokens=type_counts["hashtag"],
            mention_tokens=type_counts["mention"],
            average_token_length=sum(lengths) / len(lengths) if lengths else 0.0,
            min_token_length=min(lengths) if lengths else 0,
            max_token_length=max(lengths) if lengths else 0,
        )
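
For orientation only (not part of the published diff), here is a minimal usage sketch based on the Tokenizer API shown above. It assumes the wheel is installed and that the class is importable from pystylometry.tokenizer, matching the file list; the sample strings are illustrative.

from pystylometry.tokenizer import Tokenizer

# Constructor flags mirror the documented defaults; a few are overridden here.
tokenizer = Tokenizer(lowercase=True, strip_punctuation=True, expand_contractions=True)

# Plain token list ("It's" is split into "it" / "is" because expand_contractions=True).
print(tokenizer.tokenize("Hello, world! It's a test."))

# Tokens with character offsets and the named pattern group that matched.
for item in tokenizer.tokenize_with_metadata("Dr. Smith arrived at 3:45pm."):
    print(item.token, item.token_type, item.start, item.end)

# Aggregate counts returned as a TokenizationStats dataclass.
stats = tokenizer.get_statistics("The quick brown fox jumps over the lazy dog.")
print(stats.total_tokens, stats.unique_tokens, stats.average_token_length)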