slithyt 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
slithyt/__init__.py ADDED
File without changes
slithyt/build.py ADDED
@@ -0,0 +1,61 @@
1
+ # src/slithyt/build.py
2
+
3
+ import pickle
4
+ import pathlib
5
+ from collections import defaultdict
6
+ from . import utils
7
+ import pronouncing
8
+
9
def build_phonetic_model(corpus_path: str, n: int = 3) -> dict:
    """Build a phoneme-level n-gram model from a one-word-per-line corpus.

    Args:
        corpus_path: Path handed to ``utils.open_any``.
        n: Order of the n-gram model; each key holds ``n - 1`` phonemes.

    Returns:
        A dict mapping a tuple of ``n - 1`` phonemes (padded with ``"^"`` at
        the start of a word) to the list of phonemes observed right after
        that tuple. ``"$"`` marks the end of a word. Repeated observations
        are kept as repeated list entries.
    """
    transitions = defaultdict(list)
    prefix_len = n - 1
    start_pad = ["^"] * prefix_len

    with utils.open_any(corpus_path) as corpus:
        for line_no, raw in enumerate(corpus, start=1):
            # Progress report every 20000 corpus lines.
            if line_no % 20000 == 0:
                print(f" ...processed {line_no} words for phonetic model...")

            word = raw.strip().lower()
            if not word:
                continue

            pronunciations = pronouncing.phones_for_word(word)
            if not pronunciations:
                continue

            # Only the first listed pronunciation is used.
            padded = start_pad + pronunciations[0].split() + ["$"]
            for offset in range(prefix_len, len(padded)):
                key = tuple(padded[offset - prefix_len:offset])
                transitions[key].append(padded[offset])

    return dict(transitions)
33
+
34
def build_transcription_model(corpus_path: str) -> dict:
    """Build a phoneme-to-letter lookup from a one-word-per-line corpus.

    Only words whose first pronunciation has exactly as many phonemes as the
    word has characters contribute; for those, phoneme ``k`` is paired with
    character ``k``.

    Args:
        corpus_path: Path handed to ``utils.open_any``.

    Returns:
        A dict mapping each phoneme (trailing stress digits ``0``/``1``/``2``
        stripped) to a list of at most three letters, most frequent first.
    """
    counts = defaultdict(lambda: defaultdict(int))

    with utils.open_any(corpus_path) as corpus:
        for line_no, raw in enumerate(corpus, start=1):
            # Progress report every 20000 corpus lines.
            if line_no % 20000 == 0:
                print(f" ...processed {line_no} words for transcription model...")

            word = raw.strip().lower()
            if not word:
                continue

            pronunciations = pronouncing.phones_for_word(word)
            if not pronunciations:
                continue

            phonemes = pronunciations[0].split()
            if len(phonemes) != len(word):
                continue

            for phoneme, letter in zip(phonemes, word):
                counts[phoneme.rstrip('012')][letter] += 1

    # Keep the three most frequent spellings per phoneme.
    return {
        phoneme: [
            letter
            for letter, _ in sorted(
                tally.items(), key=lambda item: item[1], reverse=True
            )[:3]
        ]
        for phoneme, tally in counts.items()
    }
slithyt/cli.py ADDED
@@ -0,0 +1,153 @@
1
+ # src/slithyt/cli.py
2
+
3
+ import argparse
4
+ import pathlib
5
+ import pickle
6
+ from . import generator, validator, sentiment, pronounce, rhyme, build
7
+
8
def main():
    """Run the SlithyT command-line interface.

    Parses the process arguments and dispatches to one sub-command:

    * ``generate``    - print new words, either trained from ``--corpus`` or
      built to rhyme with ``--rhymes-with``.
    * ``validate``    - report validity, sentiment and pronounceability of a
      word.
    * ``rhyme``       - print the phonetic breakdown and rhyme signature of a
      word.
    * ``build-cache`` - build the phonetic and transcription models and
      pickle them under ``~/.slithyt/data``.

    Returns:
        None. Results and errors are printed to stdout; argument errors go
        through ``parser.error``.
    """
    parser = argparse.ArgumentParser(description="SlithyT: A plausible word generation tool.")
    subparsers = parser.add_subparsers(dest="command", required=True)

    # --- Generate command ---
    gen_parser = subparsers.add_parser("generate", help="Generate new words.")
    gen_parser.add_argument("--corpus", help="Path to the corpus file for training. Required unless using --rhymes-with.")
    gen_parser.add_argument("--count", type=int, default=10)
    gen_parser.add_argument("--min-len", type=int, default=5)
    gen_parser.add_argument("--max-len", type=int, default=10)
    gen_parser.add_argument("--matches-regex")
    gen_parser.add_argument("--reject-regex")
    gen_parser.add_argument("--dictionary")
    gen_parser.add_argument("--blocklist")
    gen_parser.add_argument("--ngram-size", type=int, default=3)
    gen_parser.add_argument("--min-sentiment", type=float)
    gen_parser.add_argument("--max-sentiment", type=float)
    gen_parser.add_argument("--min-pronounceability", type=float)
    gen_parser.add_argument("--rhymes-with")
    gen_parser.add_argument("--allow-corpus-words", action="store_true")

    # --- Validate command ---
    val_parser = subparsers.add_parser("validate", help="Validate a potential word.")
    val_parser.add_argument("word")
    val_parser.add_argument("--dictionary")
    val_parser.add_argument("--blocklist")

    # --- Rhyme command ---
    rhyme_parser = subparsers.add_parser("rhyme", help="Get phonetic info for a word.")
    rhyme_parser.add_argument("word")

    # --- Build Cache command ---
    build_parser = subparsers.add_parser("build-cache", help="Build the phonetic and transcription models.")
    build_parser.add_argument("--corpus", help="Path to a custom corpus to build models from.")

    args = parser.parse_args()

    # --- Argument Validation ---
    if args.command == "generate" and not args.corpus and not args.rhymes_with:
        parser.error("--corpus is required unless --rhymes-with is used.")

    # --- Command Execution ---
    if args.command == "build-cache":
        module_path = pathlib.Path(__file__).parent
        default_dict_path = module_path / 'data' / 'cmu.txt.gz'
        corpus_to_use = args.corpus if args.corpus else str(default_dict_path)

        cache_dir = pathlib.Path.home() / '.slithyt' / 'data'
        cache_dir.mkdir(parents=True, exist_ok=True)

        # Build and save each model in turn. A missing corpus is reported
        # the same way the generate command reports it, not as a traceback.
        for label, builder, filename in (
            ("Phonetic", build.build_phonetic_model, 'phonetic-model.dat'),
            ("Transcription", build.build_transcription_model, 'transcription-model.dat'),
        ):
            try:
                built_model = builder(corpus_to_use)
            except FileNotFoundError:
                print(f"ERROR: Corpus file not found at {corpus_to_use}")
                return
            target = cache_dir / filename
            with open(target, "wb") as f:
                pickle.dump(built_model, f)
            print(f"{label} model saved to {target}")
        return

    if args.command == "generate" or args.command == "validate":
        # Both commands reject words found in the blocklist or dictionary.
        module_path = pathlib.Path(__file__).parent
        default_dict_path = module_path / 'data' / 'cmu.txt.gz'
        default_block_path = module_path / 'data' / 'en-block.txt.gz'
        block_to_load = args.blocklist if args.blocklist is not None else default_block_path
        blocklist_set = validator.load_word_set(str(block_to_load))
        dictionary_set = set()
        dict_to_load = args.dictionary if args.dictionary is not None else default_dict_path
        # Skip loading the dictionary when it is the very file used as the
        # training corpus.
        if not (args.command == "generate" and hasattr(args, 'corpus') and args.corpus and str(dict_to_load) == args.corpus):
            dictionary_set = validator.load_word_set(str(dict_to_load))

    if args.command == "generate":
        if args.rhymes_with:
            cache_dir = pathlib.Path.home() / '.slithyt' / 'data'
            phonetic_model_path = cache_dir / 'phonetic-model.dat'
            transcription_model_path = cache_dir / 'transcription-model.dat'
            phonetic_model = rhyme.load_phonetic_model(str(phonetic_model_path))
            transcription_model = rhyme.load_transcription_model(str(transcription_model_path))
            if not phonetic_model or not transcription_model: return

            target_phonemes = rhyme.get_phonetic_breakdown(args.rhymes_with)
            if not target_phonemes:
                print(f"ERROR: Cannot find '{args.rhymes_with}' in phonetic dictionary.")
                return
            signature = rhyme.get_rhyme_signature(target_phonemes)
            if not signature:
                print(f"ERROR: Cannot find a valid rhyme signature for '{args.rhymes_with}'.")
                return

            print(f"INFO: Generating words that rhyme with '{args.rhymes_with}'...")
            generated_words = []
            # Bounded number of attempts so impossible constraints terminate.
            for _ in range(args.count * 200):
                if len(generated_words) >= args.count: break
                new_phonemes = rhyme.generate_phonetic_word(phonetic_model, signature)
                if not new_phonemes: continue
                word = rhyme.transcribe_word(transcription_model, new_phonemes)
                if word and word not in generated_words and validator.validate_word(
                    word, args.matches_regex, args.reject_regex, dictionary_set, blocklist_set,
                    None, args.min_sentiment, args.max_sentiment, args.min_pronounceability
                ):
                    generated_words.append(word)
                    print(f" - {word}")
        else:
            print(f"INFO: Training model from '{args.corpus}'...")
            model, corpus_set = generator.train_from_corpus(args.corpus, n=args.ngram_size)
            if not model: return
            # Unless --allow-corpus-words is given, words from the corpus are rejected.
            corpus_rejection_set = None if args.allow_corpus_words else corpus_set

            print(f"INFO: Generating {args.count} words...")
            generated_words = []
            # Bounded number of attempts so impossible constraints terminate.
            for _ in range(args.count * 100):
                if len(generated_words) >= args.count: break
                word = generator.generate_word(model, args.min_len, args.max_len, n=args.ngram_size)
                if word and word not in generated_words and validator.validate_word(
                    word, args.matches_regex, args.reject_regex, dictionary_set, blocklist_set,
                    corpus_rejection_set, args.min_sentiment, args.max_sentiment, args.min_pronounceability
                ):
                    generated_words.append(word)
                    print(f" - {word}")

    elif args.command == "validate":
        is_valid = validator.validate_word(args.word, dictionary_set=dictionary_set, blocklist_set=blocklist_set)
        s_score = sentiment.analyze_word_sentiment(args.word)
        p_score = pronounce.score_pronounceability(args.word)
        print(f"Validating word: '{args.word}'")
        print(f" - Validation Result: {'Valid' if is_valid else 'Invalid'}")
        print(f" - Sentiment Score: {s_score:.3f}")
        print(f" - Pronounceability Score: {p_score:.3f}")

    elif args.command == "rhyme":
        print(f"Analyzing word: '{args.word}'")
        phonemes = rhyme.get_phonetic_breakdown(args.word)
        if not phonemes:
            print(" - Word not found in the phonetic dictionary.")
            return

        print(f" - Phonetic Breakdown: {' '.join(phonemes)}")
        signature = rhyme.get_rhyme_signature(phonemes)
        if signature:
            print(f" - Rhyme Signature: {' '.join(signature)}")


if __name__ == "__main__":
    main()
File without changes
slithyt/generator.py ADDED
@@ -0,0 +1,94 @@
1
+ # Contains the n-gram model training and word generation logic.
2
+
3
+ import random
4
+ from collections import defaultdict
5
+ from . import utils
6
+
7
def train_from_corpus(corpus_path: str, n: int = 3) -> tuple[dict, set]:
    """
    Train a character-level n-gram model in a single pass over a corpus.

    Each non-blank line is stripped and lower-cased, recorded in the word
    set, padded with ``n - 1`` start markers (``^``) and one end marker
    (``$``), and then scanned so that every run of ``n - 1`` characters maps
    to the characters seen immediately after it.

    Args:
        corpus_path: Path to the text file to train on (one word per line).
        n: The order of the n-gram model (e.g., 3 for trigrams).

    Returns:
        A tuple ``(model_dict, corpus_word_set)``. If the corpus file does
        not exist, an error is printed and ``({}, set())`` is returned.
    """
    transitions = defaultdict(list)
    seen_words = set()

    start_char, end_char = "^", "$"
    prefix_len = n - 1

    try:
        with utils.open_any(corpus_path) as handle:
            for raw in handle:
                word = raw.strip().lower()
                if not word:
                    continue
                seen_words.add(word)

                padded = start_char * prefix_len + word + end_char
                for offset in range(prefix_len, len(padded)):
                    key = padded[offset - prefix_len:offset]
                    transitions[key].append(padded[offset])
    except FileNotFoundError:
        print(f"ERROR: Corpus file not found at {corpus_path}")
        return {}, set()

    return dict(transitions), seen_words
51
+
52
def generate_word(model: dict, min_len: int = 5, max_len: int = 10, n: int = 3) -> str:
    """
    Generate a single word by walking a trained character n-gram model.

    Up to 100 candidates are attempted. Each candidate starts from the
    all-start-marker prefix and repeatedly picks a random successor until
    the end marker is drawn, an unseen prefix is reached, or ``max_len``
    characters have been produced. The first candidate whose length lies in
    ``[min_len, max_len]`` is returned.

    Args:
        model: The trained n-gram model from train_from_corpus().
        min_len: The minimum length of the generated word.
        max_len: The maximum length of the generated word.
        n: The order of the n-gram model used for generation; must match
            the order the model was trained with.

    Returns:
        A newly generated word as a string, or an empty string if no
        candidate of acceptable length was produced.
    """
    if not model:
        return ""

    start_char = "^"
    end_char = "$"
    prefix_len = n - 1

    for _ in range(100):  # Max attempts to prevent infinite loops
        word_chars = []
        current_prefix = start_char * prefix_len

        for _ in range(max_len):
            if current_prefix not in model:
                # This prefix was not seen during training, dead end.
                break

            next_char = random.choice(model[current_prefix])
            if next_char == end_char:
                break

            word_chars.append(next_char)
            # Keep exactly the last ``prefix_len`` characters. Slicing from
            # the end also works for n == 1, where the prefix must stay
            # empty (``prefix[1:] + next_char`` would grow it to one
            # character and dead-end every unigram walk).
            window = current_prefix + next_char
            current_prefix = window[len(window) - prefix_len:]

        final_word = "".join(word_chars)
        if min_len <= len(final_word) <= max_len:
            return final_word

    return ""  # Return empty if we couldn't generate a valid word
slithyt/pronounce.py ADDED
@@ -0,0 +1,60 @@
1
+ # slithyt/pronounce.py
2
+
3
def score_pronounceability(word: str) -> float:
    """
    Score how pronounceable a word looks, using three spelling heuristics.

    Starting from 1.0, penalties are subtracted for:
      * the longest run of non-vowel characters beyond 3 (0.3 per extra),
      * the longest run of vowels beyond 2 (0.4 per extra),
      * a vowel share outside the 35%-65% band (flat 0.3).

    Only ``a e i o u`` count as vowels; every other character, including
    ``y`` and non-letters, counts as a consonant.

    Args:
        word: The word to score.

    Returns:
        A float between 0.0 (less pronounceable) and 1.0 (more
        pronounceable). An empty word scores 0.0.
    """
    if not word:
        return 0.0

    letters = word.lower()
    vowels = "aeiou"

    def longest_run(want_vowel: bool) -> int:
        # Length of the longest unbroken stretch of vowels (or non-vowels).
        best = run = 0
        for ch in letters:
            if (ch in vowels) == want_vowel:
                run += 1
                if run > best:
                    best = run
            else:
                run = 0
        return best

    # A cluster of more than 3 consonants is difficult.
    consonant_penalty = max(0, longest_run(False) - 3) * 0.3
    # A cluster of more than 2 vowels is uncommon.
    vowel_penalty = max(0, longest_run(True) - 2) * 0.4

    # Ideal vowel share is 35%-65% of the word.
    vowel_ratio = sum(ch in vowels for ch in letters) / len(letters)
    ratio_penalty = 0 if 0.35 <= vowel_ratio <= 0.65 else 0.3

    return max(0.0, 1.0 - (consonant_penalty + vowel_penalty + ratio_penalty))
slithyt/rhyme.py ADDED
@@ -0,0 +1,87 @@
1
+ # src/slithyt/rhyme.py
2
+
3
+ import pronouncing
4
+ import pickle
5
+ import random
6
+ import pathlib
7
+ from . import build
8
+
9
def get_phonetic_breakdown(word: str) -> list[str] | None:
    """Return the phonemes of the first known pronunciation of ``word``.

    Returns:
        The first entry from ``pronouncing.phones_for_word`` split on
        whitespace, or ``None`` when no pronunciation is listed.
    """
    pronunciations = pronouncing.phones_for_word(word)
    return pronunciations[0].split() if pronunciations else None
15
+
16
+ def get_rhyme_signature(phonemes: list[str]) -> list[str] | None:
17
+ """Extracts the rhyming part of a word from its list of phonemes."""
18
+ last_stressed_vowel_index = -1
19
+ for i, p in enumerate(phonemes):
20
+ if p[-1] in ('1', '2'):
21
+ last_stressed_vowel_index = i
22
+ if last_stressed_vowel_index == -1:
23
+ return None
24
+ return phonemes[last_stressed_vowel_index:]
25
+
26
def load_phonetic_model(model_path: str) -> dict:
    """Load the cached phonetic model, building and caching it if absent.

    Args:
        model_path: Location of the pickled model.

    Returns:
        The phonetic model dict. If the cache is missing and the bundled
        default corpus (``data/cmu.txt.gz`` next to this module) cannot be
        found either, an error is printed and an empty dict is returned so
        callers can test the result for truthiness instead of crashing.
    """
    model_path = pathlib.Path(model_path)
    if model_path.exists():
        # NOTE(security): pickle.load can execute arbitrary code from the
        # file; the cache location must only be writable by trusted users.
        with open(model_path, "rb") as f:
            return pickle.load(f)

    print("First-time setup: Building phonetic model. This may take a moment...")
    default_dict_path = pathlib.Path(__file__).parent / 'data' / 'cmu.txt.gz'

    try:
        model = build.build_phonetic_model(str(default_dict_path))
    except FileNotFoundError:
        # Same wording as the corpus error reported by the generator.
        print(f"ERROR: Corpus file not found at {default_dict_path}")
        return {}

    model_path.parent.mkdir(parents=True, exist_ok=True)
    with open(model_path, "wb") as f:
        pickle.dump(model, f)
    print(f"Phonetic model saved to {model_path}")
    return model
44
+
45
def load_transcription_model(model_path: str) -> dict:
    """Load the cached transcription model, building and caching it if absent.

    Args:
        model_path: Location of the pickled model.

    Returns:
        The transcription model dict. If the cache is missing and the
        bundled default corpus (``data/cmu.txt.gz`` next to this module)
        cannot be found either, an error is printed and an empty dict is
        returned so callers can test the result for truthiness instead of
        crashing.
    """
    model_path = pathlib.Path(model_path)
    if model_path.exists():
        # NOTE(security): pickle.load can execute arbitrary code from the
        # file; the cache location must only be writable by trusted users.
        with open(model_path, "rb") as f:
            return pickle.load(f)

    print("First-time setup: Building transcription model. This may take a moment...")
    default_dict_path = pathlib.Path(__file__).parent / 'data' / 'cmu.txt.gz'

    try:
        model = build.build_transcription_model(str(default_dict_path))
    except FileNotFoundError:
        # Same wording as the corpus error reported by the generator.
        print(f"ERROR: Corpus file not found at {default_dict_path}")
        return {}

    model_path.parent.mkdir(parents=True, exist_ok=True)
    with open(model_path, "wb") as f:
        pickle.dump(model, f)
    print(f"Transcription model saved to {model_path}")
    return model
63
+
64
+ def generate_phonetic_word(model: dict, rhyme_signature: list[str], n: int = 3) -> list[str] | None:
65
+ """Generates a new sequence of phonemes that ends with the given rhyme signature."""
66
+ if not model: return None
67
+ prefix_len = n - 1
68
+ current_prefix = tuple(["^"] * prefix_len)
69
+ generated_phonemes = []
70
+ for _ in range(10):
71
+ if current_prefix not in model: return None
72
+ next_phoneme = random.choice(model[current_prefix])
73
+ if next_phoneme == "$": break
74
+ generated_phonemes.append(next_phoneme)
75
+ current_prefix = tuple(list(current_prefix[1:]) + [next_phoneme])
76
+ return generated_phonemes + rhyme_signature
77
+
78
def transcribe_word(transcription_model: dict, phonemes: list[str]) -> str:
    """Spell out a phoneme sequence using the transcription model.

    Each phoneme has its trailing stress digits (``0``/``1``/``2``) removed
    and is replaced by a randomly chosen spelling from the model. Phonemes
    with no (or an empty) entry are rendered as ``?``.
    """
    letters = []
    for phoneme in phonemes:
        candidates = transcription_model.get(phoneme.rstrip('012'))
        letters.append(random.choice(candidates) if candidates else '?')
    return "".join(letters)
slithyt/sentiment.py ADDED
@@ -0,0 +1,119 @@
1
+ from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
2
+
3
# Initialize the analyzer for its word lexicon.
# Only the analyzer's ``lexicon`` mapping is read in this module (see
# _WORD_LEXICON below).
_analyzer = SentimentIntensityAnalyzer()

# --- Structured Morpheme Lexicons ---

# Affixes that flip the stem's sentiment (1.0 - stem score) instead of being
# averaged with it.
# NOTE(review): "il" and "ir" have no entry in _PREFIXES, and prefix matching
# iterates _SORTED_PREFIXES (built from _PREFIXES), so they never match.
_INVERTING_PREFIXES = {"un", "in", "im", "il", "ir", "non", "dis", "mis", "dys", "anti"}
_INVERTING_SUFFIXES = {"less"}

# Raw sentiment score per prefix; negative values first, then positive.
_PREFIXES = {
    "mal": -4.0, "mis": -3.0, "dis": -2.0, "un": -1.0, "in": -1.0, "im": -1.0,
    "non": -1.0, "de": -1.0, "anti": -2.0, "contra": -2.0, "ob": -2.0,
    "pseudo": -2.0, "cata": -2.0, "dys": -2.2, "caco": -2.3,
    "bene": 3.0, "eu": 4.0, "pro": 2.0, "pre": 1.0, "con": 2.0, "com": 2.0,
    "sym": 2.0, "syn": 2.0,
}

# Raw sentiment score per suffix.
_SUFFIXES = {
    "less": -2.0, "cide": -4.0, "ful": 1.5, "able": 1.0, "ible": 1.0,
}

# Raw sentiment score per word-internal fragment, matched anywhere in a word.
# NOTE(review): the infix scan in analyze_word_sentiment only accepts
# substrings of length >= 3, so the two-letter entry "am" is never matched.
_INFIXES = {
    "mort": -3.0, "nec": -3.0, "necr": -3.0, "path": -3.0, "tox": -4.0,
    "pess": -3.0, "mor": -3.0, "vill": -3.0, "crim": -3.0, "rupt": -2.0,
    "fail": -3.0, "terr": -2.0, "horr": -4.0, "vuln": -2.0, "hostil": -3.0,
    "vex": -2.0, "trib": -2.0, "fall": -2.0, "err": -1.9,
    "am": 3.0, "amic": 3.0, "phil": 3.0, "pac": 4.0, "grat": 4.0,
    "felic": 4.0, "beat": 4.0, "sanct": 3.0, "salv": 3.0, "ver": 3.0,
    "honor": 3.0, "dign": 3.0, "fortun": 2.0, "optim": 4.0, "lucr": 2.0,
    "prosper": 4.0, "brill": 3.0, "clar": 2.0, "lumin": 3.0, "vital": 3.0,
    "viv": 3.0, "gen": 2.0, "cresc": 2.0, "cret": 2.0, "magn": 3.0,
    "grand": 3.0, "nobl": 3.0, "excell": 4.0, "laud": 4.0, "glor": 3.0,
    "merit": 3.0, "secure": 3.0, "firm": 2.0, "resolut": 2.0, "joy": 4.0,
    "happ": 4.0, "hope": 3.0, "vit": 2.0, "equi": 1.5, "amor": 2.8,
    "bon": 2.5, "luc": 1.8, "lum": 1.8, "cred": 1.7,
}

# Whole-word scores taken from the analyzer's lexicon; consulted before any
# affix or infix analysis.
_WORD_LEXICON = _analyzer.lexicon

# Affixes ordered longest first so the longest matching affix is tried first.
_SORTED_PREFIXES = sorted(_PREFIXES.keys(), key=len, reverse=True)
_SORTED_SUFFIXES = sorted(_SUFFIXES.keys(), key=len, reverse=True)
43
+
44
+ def _normalize_score(score: float) -> float:
45
+ """Normalizes a VADER score to a 0.0-1.0 scale."""
46
+ return (score + 4) / 8
47
+
48
def analyze_word_sentiment(word: str) -> float:
    """
    Estimate the sentiment of a (possibly invented) word on a 0.0-1.0 scale.

    The passes are tried in order and the first that applies decides:

    1. Empty word: neutral 0.5.
    2. Whole word present in the word lexicon: its normalized score.
    3. Longest matching prefix (at least 2 characters), combined with the
       recursively analysed remainder of the word.
    4. Longest matching suffix (at least 2 characters), combined the same way.
    5. Average of all infix fragments (at least 3 characters) found by a
       left-to-right, longest-match-first scan; 0.5 if none is found.
    """
    text = word.lower()

    if not text:
        return 0.5

    if text in _WORD_LEXICON:
        return _normalize_score(_WORD_LEXICON[text])

    def combine(affix_score: float, stem: str, inverts: bool) -> float:
        # A short stem carries no signal of its own: the affix decides.
        if len(stem) < 4:
            return _normalize_score(affix_score)
        stem_value = analyze_word_sentiment(stem)
        # If the stem is neutral, the affix's sentiment dominates.
        if stem_value == 0.5:
            return _normalize_score(affix_score)
        if inverts:
            return 1.0 - stem_value
        # Average on the raw -4..4 scale, then normalize again.
        return _normalize_score((affix_score + (stem_value * 8 - 4)) / 2)

    prefix = next(
        (p for p in _SORTED_PREFIXES if len(p) >= 2 and text.startswith(p)),
        None,
    )
    if prefix is not None:
        return combine(_PREFIXES[prefix], text[len(prefix):], prefix in _INVERTING_PREFIXES)

    suffix = next(
        (s for s in _SORTED_SUFFIXES if len(s) >= 2 and text.endswith(s)),
        None,
    )
    if suffix is not None:
        return combine(_SUFFIXES[suffix], text[:-len(suffix)], suffix in _INVERTING_SUFFIXES)

    # Infix scan: at each position take the longest known fragment and jump
    # past it; otherwise advance by one character.
    scores = []
    position, size = 0, len(text)
    while position < size:
        for end in range(size, position, -1):
            fragment = text[position:end]
            if len(fragment) >= 3 and fragment in _INFIXES:
                scores.append(_INFIXES[fragment])
                position = end
                break
        else:
            position += 1

    if not scores:
        return 0.5

    return _normalize_score(sum(scores) / len(scores))
slithyt/utils.py ADDED
@@ -0,0 +1,21 @@
1
+ import gzip
2
+
3
def open_any(file_path: str):
    """
    Open a text file for reading, whether or not it is gzip-compressed.

    The first two bytes are compared with the gzip magic number
    (``1f 8b``); the file is then reopened with the matching opener.

    Args:
        file_path: The path to the file to open.

    Returns:
        A file handle ready for reading in text mode (UTF-8).
    """
    gzip_magic = b'\x1f\x8b'
    with open(file_path, 'rb') as probe:
        header = probe.read(2)

    if header != gzip_magic:
        return open(file_path, 'r', encoding="utf-8")
    return gzip.open(file_path, 'rt', encoding="utf-8")
slithyt/validator.py ADDED
@@ -0,0 +1,58 @@
1
+ import re
2
+ from typing import Set
3
+ from . import sentiment
4
+ from . import pronounce
5
+ from . import utils
6
+
7
def load_word_set(file_path: str) -> Set[str]:
    """
    Read a one-word-per-line file (plain or gzipped) into a set.

    Entries are stripped and lower-cased; blank lines are skipped. A falsy
    path yields an empty set, and so does a missing file (after printing a
    warning).
    """
    if not file_path:
        return set()

    words = set()
    try:
        with utils.open_any(file_path) as handle:
            for raw in handle:
                entry = raw.strip()
                if entry:
                    words.add(entry.lower())
    except FileNotFoundError:
        print(f"WARNING: File not found at {file_path}. Skipping this check.")
        return set()
    return words
20
+
21
def validate_word(
    word: str,
    matches_regex: str = None,
    reject_regex: str = None,
    dictionary_set: set[str] = None,
    blocklist_set: set[str] = None,
    corpus_rejection_set: set[str] = None,
    min_sentiment: float = None,
    max_sentiment: float = None,
    min_pronounceability: float = None
) -> bool:
    """
    Check a candidate word against every constraint that was supplied.

    A word is rejected when it is empty, fails ``matches_regex``, hits
    ``reject_regex`` (both case-insensitive searches), appears lower-cased
    in any of the three word sets, or falls outside the requested sentiment
    or pronounceability bounds. Constraints left as ``None`` are skipped.

    Returns:
        True if the word passes every supplied constraint, else False.
    """
    if not word:
        return False

    lowered = word.lower()

    if matches_regex and re.search(matches_regex, word, re.IGNORECASE) is None:
        return False
    if reject_regex and re.search(reject_regex, word, re.IGNORECASE) is not None:
        return False

    # Dictionary, blocklist and corpus are all "must not contain" sets.
    for banned in (dictionary_set, blocklist_set, corpus_rejection_set):
        if banned and lowered in banned:
            return False

    # The sentiment score is only computed when a bound was requested.
    if not (min_sentiment is None and max_sentiment is None):
        sentiment_score = sentiment.analyze_word_sentiment(word)
        if min_sentiment is not None and sentiment_score < min_sentiment:
            return False
        if max_sentiment is not None and sentiment_score > max_sentiment:
            return False

    if min_pronounceability is not None:
        if pronounce.score_pronounceability(word) < min_pronounceability:
            return False

    return True
@@ -0,0 +1,103 @@
1
+ Metadata-Version: 2.4
2
+ Name: slithyt
3
+ Version: 1.0.0
4
+ Summary: A tool for generating novel, pronounceable words based on linguistic corpuses.
5
+ Author-email: Daniel Hardman <daniel.hardman@gmail.com>
6
+ License: MIT License
7
+ Project-URL: Homepage, https://github.com/dhh1128/slithyt
8
+ Project-URL: Bug Tracker, https://github.com/dhh1128/slithyt/issues
9
+ Keywords: word generation,procedural generation,nlp,linguistics,naming
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Operating System :: OS Independent
13
+ Classifier: Topic :: Text Processing :: Linguistic
14
+ Classifier: Development Status :: 4 - Beta
15
+ Requires-Python: >=3.8
16
+ Description-Content-Type: text/markdown
17
+ License-File: LICENSE
18
+ Requires-Dist: pronouncing
19
+ Requires-Dist: vaderSentiment
20
+ Dynamic: license-file
21
+
22
+ # SlithyT
23
+
24
+ A tool for generating novel, plausible, and pronounceable words based on linguistic corpuses.
25
+
26
+ The name is a reference to the "slithy toves" in Lewis Carroll's poem "Jabberwocky".
27
+
28
+ (Code was written substantially by AI, although I did a fair amount of reviewing, criticizing, revising
29
+ and debugging.)
30
+
31
+ ## Installation
32
+
33
+ ```bash
34
+ pip install .
35
+ ```
36
+
37
+ ## Usage
38
+
39
+ Generate a word that looks/sounds like it fits with other words in a given
40
+ corpus. Similarity is determined partly by ngram analysis and partly by
41
+ pronunciation.
42
+
43
+ You can make your own corpus, or use pregenerated ones (in the data folder
44
+ of the package):
45
+
46
+ * Astronomy names (stars, galaxies, planets)
47
+ * Transliterated Greek, Latin, Hebrew, Egyptian names
48
+ * Harry Potter or Star Wars names
49
+ * Drug names
50
+ * Latin words from biology taxonomy (genus, species)
51
+
52
+ You can also use the whole dictionary as your corpus, in which case you will
53
+ get words with no particular flavor to them. A good corpus has at least a
54
+ couple hundred words in it.
55
+
56
+ By default, generated words are *novel*, meaning they won't appear in the
57
+ corpus you reference. You can also add a blocklist to avoid generating curse
58
+ words, words that violate trademarks or spam filters, etc.
59
+
60
+ All corpora and dictionary/block list files used by this tool are text
61
+ files having a single word per line, and can optionally be gzipped.
62
+ Sentiment analysis, pronounceability, and rhyming are moderately English-
63
+ centric, though they tolerate Romance and Germanic languages a bit as well.
64
+ However, they could be made to reflect the sensibilities of other language
65
+ communities by running build_phonetic_model.py and build_transcription_model.py
66
+ in the package's scripts folder. These generate cached patterns in
67
+ ~/.slithyt/data.
68
+
69
+ ```bash
70
+ # Generate 10 realistic words that sound like they belong in corpus. Make
71
+ # the words have a length of at least 5 characters.
72
+ slithyt generate --corpus path/to/your/corpus.txt
73
+
74
+ # Generate words that have a positive connotation due to sound symbolism
75
+ # (see https://en.wikipedia.org/wiki/Sound_symbolism), and that use n=4
76
+ # for ngram analysis. (The --ngram-size argument is a tradeoff. Default is 3.
77
+ # Bigger values make the resonance with the corpus stronger, but also make it
78
+ # harder to be creative; it may be impossible to generate words if you go too
79
+ # high. Smaller values give the algorithm more freedom in both size and
80
+ # character sequence, but the output might sound less like the corpus.)
81
+ slithyt generate --corpus path/to/corpus.txt --min-sentiment 0.8 --ngram-size 4
82
+
83
+ # Generate words that are between 4 and 8 characters long, and that are at
84
+ # least moderately pronounceable. (Pronounceability depends partly on the
85
+ # speaker's judgment; slithyt uses a simple algorithm to predict scores from
86
+ # 0 (hardest) to 1 (easiest), but the corpus may affect how reasonable 0.5 is.
87
+ # Typically, the variety of generated word lengths matches the variety of
88
+ # word lengths in the corpus. These values constrain output but may make
89
+ # generation impossible, if nothing in the corpus is as small or as large as
90
+ # what was requested.)
91
+ slithyt generate --corpus path/to/corpus.txt --min-len 4 --max-len 8 --min-pronounceability 0.5
92
+
93
+ # Generate 5 words that rhyme with synergy
94
+ slithyt generate --count 5 --rhymes-with synergy
95
+
96
+ # Report the rhyming analysis for synergy. (Only known words are usable
97
+ # as a rhyming template; passing made-up words here will do nothing
98
+ # useful.)
99
+ slithyt rhyme synergy
100
+
101
+ # Check to see whether a particular made-up word would pass certain tests.
102
+ slithyt validate synerjee
103
+ ```
@@ -0,0 +1,16 @@
1
+ slithyt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ slithyt/build.py,sha256=kY33U9J-9VsqE7Yk7GmGU1suz4djF-WR5MCdpmb8rTg,2218
3
+ slithyt/cli.py,sha256=Iti7pwMBmme5rjX1I-8L5sCtbOZER3yjYAEYWhQa9h4,7780
4
+ slithyt/generator.py,sha256=a6rIySznbYE-CNU8SpRbEusPFQsKfWHE04J1kQRp6mU,3123
5
+ slithyt/pronounce.py,sha256=WJrnBqQhj23eg-ab7X2O18Pzkt8nWF9wgGCxVe4qj9A,1995
6
+ slithyt/rhyme.py,sha256=Ka-bbU_F6FZ9U_X6Blz_hCZE0f3bXRhFfFJ6HGwSHs8,3468
7
+ slithyt/sentiment.py,sha256=JRbJ2Etp9cUNw5h4BoN7XxiYT7dGE6eDhaqhixdvjY4,4228
8
+ slithyt/utils.py,sha256=_NrSALdA-tnnCmMXA3m60_Jpqf-lRRwku92LJ0zt414,595
9
+ slithyt/validator.py,sha256=eglvR9nKCh9oIylzgBc4Y2P5OheJzLtwdi-RqEF0RXQ,1922
10
+ slithyt/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
+ slithyt-1.0.0.dist-info/licenses/LICENSE,sha256=Hr7Rdl74t83L74WCKMfbp7vkpIdkrqaJ3uUjpnnse4w,1057
12
+ slithyt-1.0.0.dist-info/METADATA,sha256=pdRjy34I6EB9OOuyxPzbEPkfAoePp1jGDnZgN2BrfqU,4381
13
+ slithyt-1.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
14
+ slithyt-1.0.0.dist-info/entry_points.txt,sha256=EPXqPZYpJ7a1DV3x3AfJLi8uniQjoPMsHy5dQOoGNOE,45
15
+ slithyt-1.0.0.dist-info/top_level.txt,sha256=0SZUC3JVinOynykEdmR_YnhhBIcBRciEA7kkcTWjRms,8
16
+ slithyt-1.0.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.9.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ slithyt = slithyt.cli:main
@@ -0,0 +1,7 @@
1
+ Copyright (c) 2025 Daniel Hardman
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4
+
5
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6
+
7
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1 @@
1
+ slithyt