pystylometry-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,598 @@
+ """Advanced tokenizer for stylometric analysis."""
+
+ from __future__ import annotations
+
+ import re
+ import unicodedata
+ from dataclasses import dataclass
+ from typing import Iterator
+
+ # ===== Unicode Normalization Tables =====
+
+ # Single-character replacements (fast lookup with str.maketrans)
+ _UNICODE_REPLACEMENTS = str.maketrans(
+     {
+         # Smart quotes
+         "\u2018": "'",  # Left single quote
+         "\u2019": "'",  # Right single quote
+         "\u201a": "'",  # Single low-9 quote
+         "\u201b": "'",  # Single high-reversed-9 quote
+         "\u201c": '"',  # Left double quote
+         "\u201d": '"',  # Right double quote
+         "\u201e": '"',  # Double low-9 quote
+         "\u201f": '"',  # Double high-reversed-9 quote
+         # Dashes
+         "\u2013": "-",  # En dash
+         "\u2014": "-",  # Em dash
+         "\u2015": "-",  # Horizontal bar
+         "\u2212": "-",  # Minus sign
+         # Spaces
+         "\u00a0": " ",  # Non-breaking space
+         "\u2002": " ",  # En space
+         "\u2003": " ",  # Em space
+         "\u2009": " ",  # Thin space
+         "\u200a": " ",  # Hair space
+         # Apostrophes and primes
+         "\u02bc": "'",  # Modifier letter apostrophe
+         "\u2032": "'",  # Prime
+         "\u2033": '"',  # Double prime
+         # Ellipsis
+         "\u2026": "...",  # Horizontal ellipsis
+         # Ligatures (decompose)
+         "\ufb01": "fi",
+         "\ufb02": "fl",
+         "\ufb03": "ffi",
+         "\ufb04": "ffl",
+         "\u00e6": "ae",  # æ
+         "\u00c6": "AE",  # Æ
+         "\u0153": "oe",  # œ
+         "\u0152": "OE",  # Œ
+         # Mathematical operators
+         "\u00d7": "x",  # Multiplication sign
+         "\u00f7": "/",  # Division sign
+         "\u00b1": "+/-",  # Plus-minus
+         # Currency (normalize for analysis)
+         "\u00a3": "GBP",  # Pound
+         "\u00a5": "JPY",  # Yen
+         "\u20ac": "EUR",  # Euro
+         # Fractions
+         "\u00bc": "1/4",
+         "\u00bd": "1/2",
+         "\u00be": "3/4",
+         "\u2153": "1/3",
+         "\u2154": "2/3",
+     }
+ )
+
+ # Multi-character patterns (regex-based)
+ _MULTI_CHAR_PATTERNS = [
+     # Multiple dashes to single dash
+     (re.compile(r"[-\u2013\u2014]{2,}"), "-"),
+     # Multiple dots (not ellipsis)
+     (re.compile(r"\.{4,}"), "..."),
+     # Zero-width characters
+     (re.compile(r"[\u200b-\u200d\ufeff]"), ""),
+     # Control characters except newline/tab
+     (re.compile(r"[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f]"), ""),
+     # Multiple spaces/tabs to single space
+     (re.compile(r"[ \t]+"), " "),
+     # HTML entities (common ones)
+     (re.compile(r"&nbsp;"), " "),
+     (re.compile(r"&quot;"), '"'),
+     (re.compile(r"&#39;"), "'"),
+     (re.compile(r"&amp;"), "&"),
+     (re.compile(r"&lt;"), "<"),
+     (re.compile(r"&gt;"), ">"),
+ ]
+
+
+ # ===== Text Cleaning Patterns =====
+
+
+ def _remove_italics_markers(text: str) -> str:
+     """Remove markdown/formatting italics markers."""
+     # Remove asterisk/underscore pairs around words
+     text = re.sub(r"\*([^\*]+)\*", r"\1", text)
+     text = re.sub(r"_([^_]+)_", r"\1", text)
+     return text
+
+
+ def _remove_brackets(text: str) -> str:
+     """Remove bracketed content [like this] and {like this}."""
+     text = re.sub(r"\[([^\]]+)\]", r"\1", text)
+     text = re.sub(r"\{([^\}]+)\}", r"\1", text)
+     return text
+
+
+ def _remove_line_break_hyphens(text: str) -> str:
+     """Remove hyphens at line breaks (word-\nbreak -> wordbreak)."""
+     return re.sub(r"(\w+)-\s*\n\s*(\w+)", r"\1\2", text)
+
+
+ def _remove_page_markers(text: str) -> str:
+     """Remove page numbers and headers like [Page 123] or --- Page 45 ---."""
+     text = re.sub(r"\[Page\s+\d+\]", "", text, flags=re.IGNORECASE)
+     text = re.sub(r"[-=]{2,}\s*Page\s+\d+\s*[-=]{2,}", "", text, flags=re.IGNORECASE)
+     return text
+
+
+ def _normalize_whitespace(text: str) -> str:
+     """Normalize all whitespace to single spaces."""
+     # Collapse multiple newlines
+     text = re.sub(r"\n{3,}", "\n\n", text)
+     # Normalize spaces
+     text = re.sub(r"[ \t]+", " ", text)
+     return text.strip()
+
+
+ # ===== Token Pattern =====
+
+ # Comprehensive token pattern with priority-ordered alternations
+ _TOKEN_PATTERN = re.compile(
+     r"""
+     # URLs (highest priority to avoid splitting)
+     (?P<url>https?://\S+)|
+
+     # Email addresses
+     (?P<email>\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b)|
+
+     # Hashtags and mentions (social media)
+     (?P<hashtag>\#\w+)|
+     (?P<mention>@\w+)|
+
+     # Time expressions (3:45pm, 10:30:15)
+     (?P<time>\d{1,2}:\d{2}(?::\d{2})?(?:[ap]m)?)|
+
+     # Dates (ISO format: 2024-01-15)
+     (?P<date>\d{4}-\d{2}-\d{2})|
+
+     # Acronyms with periods (U.S.A., Ph.D.)
+     (?P<acronym>(?:[A-Z]\.){2,})|
+
+     # Contractions and possessives (complex patterns)
+     (?P<contraction_start>
+         '(?:tis|twas|twere|twould|twill|em|gainst|cause|bout|til|way)(?![a-z])
+     )|
+     (?P<internal_elision>
+         \w+[''](?:er|re|ve|ll|d|m|s|t|clock)(?![a-z])
+     )|
+     (?P<hyphen_possessive>
+         (?:\w+(?:-\w+)+)['']s?
+     )|
+     (?P<standard_contraction>
+         \w+[''][a-z]{1,3}(?![a-z])
+     )|
+     (?P<possessive>
+         \w+['']s?(?![a-z])
+     )|
+
+     # Roman numerals
+     (?P<roman>\b[IVXLCDM]+\b)|
+
+     # Ordinals (1st, 2nd, 3rd, 4th, etc.)
+     (?P<ordinal>\d+(?:st|nd|rd|th))|
+
+     # Numbers with commas and decimals ($1,234.56)
+     (?P<number_currency>\$?\d{1,3}(?:,\d{3})*(?:\.\d+)?)|
+
+     # Abbreviations (Dr., Mr., Mrs., etc.)
+     (?P<abbreviation>(?:Dr|Mr|Mrs|Ms|Prof|Sr|Jr|vs|etc|e\.g|i\.e)\.)|
+
+     # G-dropping (singin', dancin')
+     (?P<g_drop>\w+in[''])|
+
+     # Hyphenated compounds (mother-in-law, well-known)
+     (?P<hyphenated>(?:\w+-)+\w+)|
+
+     # Regular words (including internal apostrophes like "o'clock")
+     (?P<word>\w+(?:[']\w+)*)|
+
+     # Ellipsis
+     (?P<ellipsis>\.{3}|…)|
+
+     # Individual punctuation
+     (?P<punct>[^\w\s])
+     """,
+     re.VERBOSE | re.IGNORECASE | re.UNICODE,
+ )
+
+
+ # ===== Common Abbreviations =====
+
+ _COMMON_ABBREV = {
+     # Titles
+     "Dr.": "Doctor",
+     "Mr.": "Mister",
+     "Mrs.": "Misses",
+     "Ms.": "Miss",
+     "Prof.": "Professor",
+     "Sr.": "Senior",
+     "Jr.": "Junior",
+     # Latin
+     "e.g.": "for example",
+     "i.e.": "that is",
+     "etc.": "et cetera",
+     "vs.": "versus",
+     # Time
+     "a.m.": "AM",
+     "p.m.": "PM",
+ }
+
+ # Contraction expansions
+ _CONTRACTIONS = {
+     "ain't": "am not",
+     "aren't": "are not",
+     "can't": "cannot",
+     "won't": "will not",
+     "shan't": "shall not",
+     "couldn't": "could not",
+     "shouldn't": "should not",
+     "wouldn't": "would not",
+     "didn't": "did not",
+     "doesn't": "does not",
+     "don't": "do not",
+     "hadn't": "had not",
+     "hasn't": "has not",
+     "haven't": "have not",
+     "he'd": "he would",
+     "he'll": "he will",
+     "he's": "he is",
+     "i'd": "I would",
+     "i'll": "I will",
+     "i'm": "I am",
+     "i've": "I have",
+     "isn't": "is not",
+     "it's": "it is",
+     "let's": "let us",
+     "she'd": "she would",
+     "she'll": "she will",
+     "she's": "she is",
+     "that's": "that is",
+     "there's": "there is",
+     "they'd": "they would",
+     "they'll": "they will",
+     "they're": "they are",
+     "they've": "they have",
+     "we'd": "we would",
+     "we'll": "we will",
+     "we're": "we are",
+     "we've": "we have",
+     "weren't": "were not",
+     "what's": "what is",
+     "where's": "where is",
+     "who's": "who is",
+     "you'd": "you would",
+     "you'll": "you will",
+     "you're": "you are",
+     "you've": "you have",
+     "'tis": "it is",
+     "'twas": "it was",
+     "'em": "them",
+ }
+
+
+ @dataclass
+ class TokenMetadata:
+     """Metadata about a token."""
+
+     token: str
+     start: int
+     end: int
+     token_type: str  # word, url, email, number, punctuation, etc.
+
+
+ @dataclass
+ class TokenizationStats:
+     """Statistics from tokenization."""
+
+     total_tokens: int
+     unique_tokens: int
+     word_tokens: int
+     number_tokens: int
+     punctuation_tokens: int
+     url_tokens: int
+     email_tokens: int
+     hashtag_tokens: int
+     mention_tokens: int
+     average_token_length: float
+     min_token_length: int
+     max_token_length: int
+
+
+ class Tokenizer:
+     """
+     Advanced tokenizer for stylometric analysis.
+
+     Features:
+     - Comprehensive unicode normalization
+     - Text cleaning (italics, brackets, page markers)
+     - Sophisticated token pattern matching
+     - Configurable filtering options
+     - Token metadata tracking
+     - Memory-efficient iteration
+
+     Args:
+         lowercase: Convert tokens to lowercase (default: True)
+         min_length: Minimum token length (default: 1)
+         max_length: Maximum token length (default: None)
+         strip_numbers: Remove numeric tokens (default: False)
+         strip_punctuation: Remove pure punctuation tokens (default: True)
+         preserve_urls: Keep URL tokens (default: False)
+         preserve_emails: Keep email tokens (default: False)
+         preserve_hashtags: Keep hashtag tokens (default: False)
+         preserve_mentions: Keep @mention tokens (default: False)
+         expand_contractions: Expand contractions to full words (default: False)
+         expand_abbreviations: Expand common abbreviations (default: False)
+         strip_accents: Remove accents from characters (default: False)
+         normalize_unicode: Apply unicode normalization (default: True)
+         clean_text: Apply text cleaning (default: True)
+
+     Example:
+         >>> tokenizer = Tokenizer(lowercase=True, strip_punctuation=True)
+         >>> tokens = tokenizer.tokenize("Hello, world! It's a test.")
+         >>> print(tokens)
+         ['hello', 'world', "it's", 'a', 'test']
+
+         >>> # With metadata
+         >>> metadata = tokenizer.tokenize_with_metadata("Test text")
+         >>> for item in metadata:
+         ...     print(f"{item.token} [{item.token_type}] at {item.start}-{item.end}")
+     """
+
+     def __init__(
+         self,
+         lowercase: bool = True,
+         min_length: int = 1,
+         max_length: int | None = None,
+         strip_numbers: bool = False,
+         strip_punctuation: bool = True,
+         preserve_urls: bool = False,
+         preserve_emails: bool = False,
+         preserve_hashtags: bool = False,
+         preserve_mentions: bool = False,
+         expand_contractions: bool = False,
+         expand_abbreviations: bool = False,
+         strip_accents: bool = False,
+         normalize_unicode: bool = True,
+         clean_text: bool = True,
+     ):
+         self.lowercase = lowercase
+         self.min_length = min_length
+         self.max_length = max_length
+         self.strip_numbers = strip_numbers
+         self.strip_punctuation = strip_punctuation
+         self.preserve_urls = preserve_urls
+         self.preserve_emails = preserve_emails
+         self.preserve_hashtags = preserve_hashtags
+         self.preserve_mentions = preserve_mentions
+         self.expand_contractions = expand_contractions
+         self.expand_abbreviations = expand_abbreviations
+         self.strip_accents = strip_accents
+         self.normalize_unicode = normalize_unicode
+         self.clean_text = clean_text
+
+     def _preprocess_text(self, text: str) -> str:
+         """Apply unicode normalization and text cleaning."""
+         if not text:
+             return ""
+
+         # Unicode normalization
+         if self.normalize_unicode:
+             # Apply character replacements
+             text = text.translate(_UNICODE_REPLACEMENTS)
+
+             # Apply multi-character patterns
+             for pattern, replacement in _MULTI_CHAR_PATTERNS:
+                 text = pattern.sub(replacement, text)
+
+         # Text cleaning
+         if self.clean_text:
+             text = _remove_italics_markers(text)
+             text = _remove_brackets(text)
+             text = _remove_line_break_hyphens(text)
+             text = _remove_page_markers(text)
+             text = _normalize_whitespace(text)
+
+         # Strip accents if requested
+         if self.strip_accents:
+             # NFD decomposition then filter out combining marks
+             text = unicodedata.normalize("NFD", text)
+             text = "".join(c for c in text if unicodedata.category(c) != "Mn")
+
+         return text
+
+     def _expand_token(self, token: str) -> str:
+         """Expand contractions and abbreviations if configured."""
+         if self.expand_contractions:
+             lower_token = token.lower()
+             if lower_token in _CONTRACTIONS:
+                 expanded = _CONTRACTIONS[lower_token]
+                 # Preserve case for first character
+                 if token[0].isupper():
+                     expanded = expanded.capitalize()
+                 return expanded
+
+         if self.expand_abbreviations:
+             if token in _COMMON_ABBREV:
+                 return _COMMON_ABBREV[token]
+
+         return token
+
+     def _should_keep_token(self, token: str, token_type: str) -> bool:
+         """Determine if token should be kept based on filters."""
+         # Length filters
+         if len(token) < self.min_length:
+             return False
+         if self.max_length and len(token) > self.max_length:
+             return False
+
+         # Type-based filters
+         if token_type == "url" and not self.preserve_urls:
+             return False
+         if token_type == "email" and not self.preserve_emails:
+             return False
+         if token_type == "hashtag" and not self.preserve_hashtags:
+             return False
+         if token_type == "mention" and not self.preserve_mentions:
+             return False
+         if token_type == "punct" and self.strip_punctuation:
+             return False
+         if self.strip_numbers and token_type in ("number_currency", "ordinal", "time", "date"):
+             return False
+
+         return True
+
+     def tokenize(self, text: str) -> list[str]:
+         """
+         Tokenize text into a list of tokens.
+
+         Args:
+             text: Input text to tokenize
+
+         Returns:
+             List of tokens
+         """
+         return list(self.tokenize_iter(text))
+
+     def tokenize_iter(self, text: str) -> Iterator[str]:
+         """
+         Tokenize text and return an iterator (memory efficient).
+
+         Args:
+             text: Input text to tokenize
+
+         Yields:
+             Individual tokens
+         """
+         text = self._preprocess_text(text)
+
+         for match in _TOKEN_PATTERN.finditer(text):
+             # Determine token type
+             token_type = match.lastgroup or "unknown"
+             token = match.group(0)
+
+             # Expand if needed
+             token = self._expand_token(token)
+
+             # Apply case transformation
+             if self.lowercase:
+                 token = token.lower()
+
+             # Check filters
+             if self._should_keep_token(token, token_type):
+                 # Handle expanded tokens (may contain spaces)
+                 if " " in token:
+                     yield from token.split()
+                 else:
+                     yield token
+
+     def tokenize_with_metadata(self, text: str) -> list[TokenMetadata]:
+         """
+         Tokenize text and return tokens with metadata.
+
+         Args:
+             text: Input text to tokenize
+
+         Returns:
+             List of TokenMetadata objects
+         """
+         preprocessed = self._preprocess_text(text)
+         result = []
+
+         for match in _TOKEN_PATTERN.finditer(preprocessed):
+             token_type = match.lastgroup or "unknown"
+             token = match.group(0)
+
+             # Expand if needed
+             token = self._expand_token(token)
+
+             # Apply case transformation
+             if self.lowercase:
+                 token = token.lower()
+
+             # Check filters
+             if self._should_keep_token(token, token_type):
+                 result.append(
+                     TokenMetadata(
+                         token=token, start=match.start(), end=match.end(), token_type=token_type
+                     )
+                 )
+
+         return result
+
+     def get_statistics(self, text: str) -> TokenizationStats:
+         """
+         Get tokenization statistics.
+
+         Args:
+             text: Input text to analyze
+
+         Returns:
+             TokenizationStats object
+         """
+         metadata = self.tokenize_with_metadata(text)
+
+         if not metadata:
+             return TokenizationStats(
+                 total_tokens=0,
+                 unique_tokens=0,
+                 word_tokens=0,
+                 number_tokens=0,
+                 punctuation_tokens=0,
+                 url_tokens=0,
+                 email_tokens=0,
+                 hashtag_tokens=0,
+                 mention_tokens=0,
+                 average_token_length=0.0,
+                 min_token_length=0,
+                 max_token_length=0,
+             )
+
+         tokens = [m.token for m in metadata]
+         unique_tokens = set(tokens)
+
+         # Count by type
+         type_counts = {
+             "word": 0,
+             "number": 0,
+             "punct": 0,
+             "url": 0,
+             "email": 0,
+             "hashtag": 0,
+             "mention": 0,
+         }
+
+         for item in metadata:
+             if item.token_type in type_counts:
+                 type_counts[item.token_type] += 1
+             elif item.token_type in (
+                 "word",
+                 "contraction_start",
+                 "internal_elision",
+                 "standard_contraction",
+                 "possessive",
+                 "hyphenated",
+                 "g_drop",
+                 "roman",
+                 "abbreviation",
+             ):
+                 type_counts["word"] += 1
+             elif item.token_type in ("number_currency", "ordinal", "time", "date", "acronym"):
+                 type_counts["number"] += 1
+
+         lengths = [len(t) for t in tokens]
+
+         return TokenizationStats(
+             total_tokens=len(tokens),
+             unique_tokens=len(unique_tokens),
+             word_tokens=type_counts["word"],
+             number_tokens=type_counts["number"],
+             punctuation_tokens=type_counts["punct"],
+             url_tokens=type_counts["url"],
+             email_tokens=type_counts["email"],
+             hashtag_tokens=type_counts["hashtag"],
+             mention_tokens=type_counts["mention"],
+             average_token_length=sum(lengths) / len(lengths) if lengths else 0.0,
+             min_token_length=min(lengths) if lengths else 0,
+             max_token_length=max(lengths) if lengths else 0,
+         )