slithyt 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- slithyt-1.0.0/LICENSE +7 -0
- slithyt-1.0.0/PKG-INFO +103 -0
- slithyt-1.0.0/README.md +82 -0
- slithyt-1.0.0/pyproject.toml +40 -0
- slithyt-1.0.0/setup.cfg +4 -0
- slithyt-1.0.0/src/slithyt/__init__.py +0 -0
- slithyt-1.0.0/src/slithyt/build.py +61 -0
- slithyt-1.0.0/src/slithyt/cli.py +153 -0
- slithyt-1.0.0/src/slithyt/data/__init__.py +0 -0
- slithyt-1.0.0/src/slithyt/generator.py +94 -0
- slithyt-1.0.0/src/slithyt/pronounce.py +60 -0
- slithyt-1.0.0/src/slithyt/rhyme.py +87 -0
- slithyt-1.0.0/src/slithyt/sentiment.py +119 -0
- slithyt-1.0.0/src/slithyt/utils.py +21 -0
- slithyt-1.0.0/src/slithyt/validator.py +58 -0
- slithyt-1.0.0/src/slithyt.egg-info/PKG-INFO +103 -0
- slithyt-1.0.0/src/slithyt.egg-info/SOURCES.txt +24 -0
- slithyt-1.0.0/src/slithyt.egg-info/dependency_links.txt +1 -0
- slithyt-1.0.0/src/slithyt.egg-info/entry_points.txt +2 -0
- slithyt-1.0.0/src/slithyt.egg-info/requires.txt +2 -0
- slithyt-1.0.0/src/slithyt.egg-info/top_level.txt +1 -0
- slithyt-1.0.0/tests/test_generator.py +45 -0
- slithyt-1.0.0/tests/test_pronounce.py +33 -0
- slithyt-1.0.0/tests/test_rhyme.py +28 -0
- slithyt-1.0.0/tests/test_sentiment.py +51 -0
- slithyt-1.0.0/tests/test_validator.py +60 -0
slithyt-1.0.0/LICENSE
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
Copyright (c) 2025 Daniel Hardman
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
4
|
+
|
|
5
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
6
|
+
|
|
7
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
slithyt-1.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: slithyt
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: A tool for generating novel, pronounceable words based on linguistic corpuses.
|
|
5
|
+
Author-email: Daniel Hardman <daniel.hardman@gmail.com>
|
|
6
|
+
License: MIT License
|
|
7
|
+
Project-URL: Homepage, https://github.com/dhh1128/slithyt
|
|
8
|
+
Project-URL: Bug Tracker, https://github.com/dhh1128/slithyt/issues
|
|
9
|
+
Keywords: word generation,procedural generation,nlp,linguistics,naming
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
14
|
+
Classifier: Development Status :: 4 - Beta
|
|
15
|
+
Requires-Python: >=3.8
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
License-File: LICENSE
|
|
18
|
+
Requires-Dist: pronouncing
|
|
19
|
+
Requires-Dist: vaderSentiment
|
|
20
|
+
Dynamic: license-file
|
|
21
|
+
|
|
22
|
+
# SlithyT
|
|
23
|
+
|
|
24
|
+
A tool for generating novel, plausible, and pronounceable words based on linguistic corpuses.
|
|
25
|
+
|
|
26
|
+
The name is a reference to the "slithy toves" in Lewis Carroll's poem "Jabberwocky".
|
|
27
|
+
|
|
28
|
+
(Code was written substantially by AI, although I did a fair amount of reviewing, criticizing, revising
|
|
29
|
+
and debugging.)
|
|
30
|
+
|
|
31
|
+
## Installation
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
pip install .
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Usage
|
|
38
|
+
|
|
39
|
+
Generate a word that looks/sounds like it fits with other words in a given
|
|
40
|
+
corpus. Similarity is determined partly by ngram analysis and partly by
|
|
41
|
+
pronunciation.
|
|
42
|
+
|
|
43
|
+
You can make your own corpus, or use pregenerated ones (in the data folder
|
|
44
|
+
of the package):
|
|
45
|
+
|
|
46
|
+
* Astronomy names (stars, galaxies, planets)
|
|
47
|
+
* Transliterated Greek, Latin, Hebrew, Egyptian names
|
|
48
|
+
* Harry Potter or Star Wars names
|
|
49
|
+
* Drug names
|
|
50
|
+
* Latin words from biology taxonomy (genus, species)
|
|
51
|
+
|
|
52
|
+
You can also use the whole dictionary as your corpus, in which case you will
|
|
53
|
+
get words with no particular flavor to them. A good corpus has at least a
|
|
54
|
+
couple hundred words in it.
|
|
55
|
+
|
|
56
|
+
By default, generated words are *novel*, meaning they won't appear in the
|
|
57
|
+
corpus you reference. You can also add a blocklist to avoid generating curse
|
|
58
|
+
words, words that violate trademarks or spam filters, etc.
|
|
59
|
+
|
|
60
|
+
All corpora and dictionary/block list files used by this tool are text
|
|
61
|
+
files having a single word per line, and can optionally be gzipped.
|
|
62
|
+
Sentiment analysis, pronounceability, and rhyming are moderately English-
|
|
63
|
+
centric, though they tolerate Romance and Germanic languages a bit as well.
|
|
64
|
+
However, they could be made to reflect the sensibilities of other language
|
|
65
|
+
communities by running build_phonetic_model.py and build_transcription_model.py
|
|
66
|
+
in the package's scripts folder. These generate cached patterns in
|
|
67
|
+
~/.slithyt/data.
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
# Generate 10 realistic words that sound like they belong in corpus. Make
|
|
71
|
+
# the words have a length of at least 5 characters.
|
|
72
|
+
slithyt generate --corpus path/to/your/corpus.txt
|
|
73
|
+
|
|
74
|
+
# Generate words that have a positive connotation due to sound symbolism
|
|
75
|
+
# (see https://en.wikipedia.org/wiki/Sound_symbolism), and that use n=4
|
|
76
|
+
# for ngram analysis. (The --ngram-size argument is a tradeoff. Default is 3.
|
|
77
|
+
# Bigger values make the resonance with the corpus stronger, but also make it
|
|
78
|
+
# harder to be creative; it may be impossible to generate words if you go too
|
|
79
|
+
# high. Smaller values give the algorithm more freedom in both size and
|
|
80
|
+
# character sequence, but the output might sound less like the corpus.)
|
|
81
|
+
slithyt generate --corpus path/to/corpus.txt --min-sentiment 0.8 --ngram-size 4
|
|
82
|
+
|
|
83
|
+
# Generate words that are between 4 and 8 characters long, and that are at
|
|
84
|
+
# least moderately pronounceable. (Pronounceability depends partly on the
|
|
85
|
+
# speaker's judgment; slithyt uses a simple algorithm to predict scores from
|
|
86
|
+
# 0 (hardest) to 1 (easiest), but the corpus may affect how reasonable 0.5 is.
|
|
87
|
+
# Typically, the variety of generated word lengths matches the variety of
|
|
88
|
+
# word lengths in the corpus. These values constrain output but may make
|
|
89
|
+
# generation impossible, if nothing in the corpus is as small or as large as
|
|
90
|
+
# what was requested.)
|
|
91
|
+
slithyt generate --corpus path/to/corpus.txt --min-length 4 --max-length 8 --min-pronounceability 0.5
|
|
92
|
+
|
|
93
|
+
# Generate 5 words that rhyme with synergy
|
|
94
|
+
slithyt generate --count 5 --rhymes-with synergy
|
|
95
|
+
|
|
96
|
+
# Report the rhyming analysis for synergy. (Only known words are usable
|
|
97
|
+
# as a rhyming template; passing made-up words here will do nothing
|
|
98
|
+
# useful.)
|
|
99
|
+
slithyt rhyme synergy
|
|
100
|
+
|
|
101
|
+
# Check to see whether a particular made-up word would pass certain tests.
|
|
102
|
+
slithyt validate synerjee
|
|
103
|
+
```
|
slithyt-1.0.0/README.md
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
# SlithyT
|
|
2
|
+
|
|
3
|
+
A tool for generating novel, plausible, and pronounceable words based on linguistic corpuses.
|
|
4
|
+
|
|
5
|
+
The name is a reference to the "slithy toves" in Lewis Carroll's poem "Jabberwocky".
|
|
6
|
+
|
|
7
|
+
(Code was written substantially by AI, although I did a fair amount of reviewing, criticizing, revising
|
|
8
|
+
and debugging.)
|
|
9
|
+
|
|
10
|
+
## Installation
|
|
11
|
+
|
|
12
|
+
```bash
|
|
13
|
+
pip install .
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
## Usage
|
|
17
|
+
|
|
18
|
+
Generate a word that looks/sounds like it fits with other words in a given
|
|
19
|
+
corpus. Similarity is determined partly by ngram analysis and partly by
|
|
20
|
+
pronunciation.
|
|
21
|
+
|
|
22
|
+
You can make your own corpus, or use pregenerated ones (in the data folder
|
|
23
|
+
of the package):
|
|
24
|
+
|
|
25
|
+
* Astronomy names (stars, galaxies, planets)
|
|
26
|
+
* Transliterated Greek, Latin, Hebrew, Egyptian names
|
|
27
|
+
* Harry Potter or Star Wars names
|
|
28
|
+
* Drug names
|
|
29
|
+
* Latin words from biology taxonomy (genus, species)
|
|
30
|
+
|
|
31
|
+
You can also use the whole dictionary as your corpus, in which case you will
|
|
32
|
+
get words with no particular flavor to them. A good corpus has at least a
|
|
33
|
+
couple hundred words in it.
|
|
34
|
+
|
|
35
|
+
By default, generated words are *novel*, meaning they won't appear in the
|
|
36
|
+
corpus you reference. You can also add a blocklist to avoid generating curse
|
|
37
|
+
words, words that violate trademarks or spam filters, etc.
|
|
38
|
+
|
|
39
|
+
All corpora and dictionary/block list files used by this tool are text
|
|
40
|
+
files having a single word per line, and can optionally be gzipped.
|
|
41
|
+
Sentiment analysis, pronounceability, and rhyming are moderately English-
|
|
42
|
+
centric, though they tolerate Romance and Germanic languages a bit as well.
|
|
43
|
+
However, they could be made to reflect the sensibilities of other language
|
|
44
|
+
communities by running build_phonetic_model.py and build_transcription_model.py
|
|
45
|
+
in the package's scripts folder. These generate cached patterns in
|
|
46
|
+
~/.slithyt/data.
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
# Generate 10 realistic words that sound like they belong in corpus. Make
|
|
50
|
+
# the words have a length of at least 5 characters.
|
|
51
|
+
slithyt generate --corpus path/to/your/corpus.txt
|
|
52
|
+
|
|
53
|
+
# Generate words that have a positive connotation due to sound symbolism
|
|
54
|
+
# (see https://en.wikipedia.org/wiki/Sound_symbolism), and that use n=4
|
|
55
|
+
# for ngram analysis. (The --ngram-size argument is a tradeoff. Default is 3.
|
|
56
|
+
# Bigger values make the resonance with the corpus stronger, but also make it
|
|
57
|
+
# harder to be creative; it may be impossible to generate words if you go too
|
|
58
|
+
# high. Smaller values give the algorithm more freedom in both size and
|
|
59
|
+
# character sequence, but the output might sound less like the corpus.)
|
|
60
|
+
slithyt generate --corpus path/to/corpus.txt --min-sentiment 0.8 --ngram-size 4
|
|
61
|
+
|
|
62
|
+
# Generate words that are between 4 and 8 characters long, and that are at
|
|
63
|
+
# least moderately pronounceable. (Pronounceability depends partly on the
|
|
64
|
+
# speaker's judgment; slithyt uses a simple algorithm to predict scores from
|
|
65
|
+
# 0 (hardest) to 1 (easiest), but the corpus may affect how reasonable 0.5 is.
|
|
66
|
+
# Typically, the variety of generated word lengths matches the variety of
|
|
67
|
+
# word lengths in the corpus. These values constrain output but may make
|
|
68
|
+
# generation impossible, if nothing in the corpus is as small or as large as
|
|
69
|
+
# what was requested.)
|
|
70
|
+
slithyt generate --corpus path/to/corpus.txt --min-length 4 --max-length 8 --min-pronounceability 0.5
|
|
71
|
+
|
|
72
|
+
# Generate 5 words that rhyme with synergy
|
|
73
|
+
slithyt generate --count 5 --rhymes-with synergy
|
|
74
|
+
|
|
75
|
+
# Report the rhyming analysis for synergy. (Only known words are usable
|
|
76
|
+
# as a rhyming template; passing made-up words here will do nothing
|
|
77
|
+
# useful.)
|
|
78
|
+
slithyt rhyme synergy
|
|
79
|
+
|
|
80
|
+
# Check to see whether a particular made-up word would pass certain tests.
|
|
81
|
+
slithyt validate synerjee
|
|
82
|
+
```
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "slithyt"
|
|
7
|
+
version = "1.0.0"
|
|
8
|
+
authors = [
|
|
9
|
+
{ name="Daniel Hardman", email="daniel.hardman@gmail.com" },
|
|
10
|
+
]
|
|
11
|
+
description = "A tool for generating novel, pronounceable words based on linguistic corpuses."
|
|
12
|
+
readme = "README.md"
|
|
13
|
+
license = { text = "MIT License" }
|
|
14
|
+
requires-python = ">=3.10"
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Programming Language :: Python :: 3",
|
|
17
|
+
"License :: OSI Approved :: MIT License",
|
|
18
|
+
"Operating System :: OS Independent",
|
|
19
|
+
"Topic :: Text Processing :: Linguistic",
|
|
20
|
+
"Development Status :: 4 - Beta",
|
|
21
|
+
]
|
|
22
|
+
keywords = ["word generation", "procedural generation", "nlp", "linguistics", "naming"]
|
|
23
|
+
dependencies = [
|
|
24
|
+
"pronouncing",
|
|
25
|
+
"vaderSentiment",
|
|
26
|
+
]
|
|
27
|
+
|
|
28
|
+
[project.urls]
|
|
29
|
+
Homepage = "https://github.com/dhh1128/slithyt"
|
|
30
|
+
"Bug Tracker" = "https://github.com/dhh1128/slithyt/issues"
|
|
31
|
+
|
|
32
|
+
[project.scripts]
|
|
33
|
+
slithyt = "slithyt.cli:main"
|
|
34
|
+
|
|
35
|
+
[tool.setuptools]
|
|
36
|
+
include-package-data = true
|
|
37
|
+
package-dir = {"" = "src"}
|
|
38
|
+
|
|
39
|
+
[tool.setuptools.package-data]
|
|
40
|
+
slithyt = ["data/*.dat", "data/*.gz"]
|
slithyt-1.0.0/setup.cfg
ADDED
|
File without changes
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# src/slithyt/build.py
|
|
2
|
+
|
|
3
|
+
import pickle
|
|
4
|
+
import pathlib
|
|
5
|
+
from collections import defaultdict
|
|
6
|
+
from . import utils
|
|
7
|
+
import pronouncing
|
|
8
|
+
|
|
9
|
+
def build_phonetic_model(corpus_path: str, n: int = 3) -> dict:
    """Build a phoneme-level n-gram model from a one-word-per-line corpus.

    Each corpus word is looked up with ``pronouncing.phones_for_word``; words
    without a known pronunciation are skipped, and only the first listed
    pronunciation is used.

    Args:
        corpus_path: Path handed to ``utils.open_any``.
        n: Order of the n-gram model; each key holds ``n - 1`` phonemes.

    Returns:
        A dict mapping a tuple of ``n - 1`` phonemes (padded with ``"^"`` at
        the start of a word) to the list of phonemes observed after it, with
        ``"$"`` marking the end of a word. Repeats are kept, so list
        multiplicity reflects observed frequency.
    """
    context_size = n - 1
    transitions = defaultdict(list)

    with utils.open_any(corpus_path) as corpus:
        for line_number, raw_line in enumerate(corpus, start=1):
            # Progress feedback for large corpora.
            if line_number % 20000 == 0:
                print(f" ...processed {line_number} words for phonetic model...")

            entry = raw_line.strip().lower()
            if not entry:
                continue

            pronunciations = pronouncing.phones_for_word(entry)
            if not pronunciations:
                continue

            # Frame the phoneme sequence with start padding and an end marker.
            framed = ["^"] * context_size + pronunciations[0].split() + ["$"]

            for offset in range(len(framed) - context_size):
                split_at = offset + context_size
                context = tuple(framed[offset:split_at])
                transitions[context].append(framed[split_at])

    return dict(transitions)
|
|
33
|
+
|
|
34
|
+
def build_transcription_model(corpus_path: str) -> dict:
    """Build a phoneme-to-letter spelling table from a word corpus.

    Only words whose first pronunciation has exactly as many phonemes as the
    word has letters contribute; for those, phoneme ``k`` is paired with
    letter ``k``. Stress digits (``0``/``1``/``2``) are stripped from the
    phoneme before counting.

    Args:
        corpus_path: Path handed to ``utils.open_any`` (one word per line).

    Returns:
        A dict mapping each stress-free phoneme to a list of up to three
        letters, ordered from most to least frequently paired with it.
    """
    spelling_counts = defaultdict(lambda: defaultdict(int))

    with utils.open_any(corpus_path) as corpus:
        for line_number, raw_line in enumerate(corpus, start=1):
            # Progress feedback for large corpora.
            if line_number % 20000 == 0:
                print(f" ...processed {line_number} words for transcription model...")

            entry = raw_line.strip().lower()
            if not entry:
                continue

            pronunciations = pronouncing.phones_for_word(entry)
            if not pronunciations:
                continue

            phonemes = pronunciations[0].split()
            # A one-to-one letter/phoneme alignment is only attempted when
            # the two sequences have equal length.
            if len(phonemes) != len(entry):
                continue

            for phoneme, letter in zip(phonemes, entry):
                spelling_counts[phoneme.rstrip('012')][letter] += 1

    top_spellings = {}
    for phoneme, tally in spelling_counts.items():
        ranked = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
        top_spellings[phoneme] = [letter for letter, _ in ranked[:3]]

    return top_spellings
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
# src/slithyt/cli.py
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import pathlib
|
|
5
|
+
import pickle
|
|
6
|
+
from . import generator, validator, sentiment, pronounce, rhyme, build
|
|
7
|
+
|
|
8
|
+
def main():
    """Entry point for the ``slithyt`` command-line interface.

    Parses the command line and dispatches to one of four sub-commands:

    * ``generate``    -- print newly generated words, either trained on
      ``--corpus`` or built phonetically to rhyme with ``--rhymes-with``.
    * ``validate``    -- report validity, sentiment and pronounceability
      for a single word.
    * ``rhyme``       -- print the phonetic breakdown and rhyme signature
      of a known word.
    * ``build-cache`` -- (re)build the phonetic and transcription models and
      pickle them under ``~/.slithyt/data``.

    The length options accept both ``--min-len``/``--max-len`` and the
    ``--min-length``/``--max-length`` spellings used in the README.
    """
    parser = argparse.ArgumentParser(description="SlithyT: A plausible word generation tool.")
    subparsers = parser.add_subparsers(dest="command", required=True)

    # --- Generate command ---
    gen_parser = subparsers.add_parser("generate", help="Generate new words.")
    gen_parser.add_argument("--corpus", help="Path to the corpus file for training. Required unless using --rhymes-with.")
    gen_parser.add_argument("--count", type=int, default=10)
    # Both spellings map to the same destination so that the documented
    # ``--min-length`` / ``--max-length`` flags work alongside the short ones.
    gen_parser.add_argument("--min-len", "--min-length", dest="min_len", type=int, default=5)
    gen_parser.add_argument("--max-len", "--max-length", dest="max_len", type=int, default=10)
    gen_parser.add_argument("--matches-regex")
    gen_parser.add_argument("--reject-regex")
    gen_parser.add_argument("--dictionary")
    gen_parser.add_argument("--blocklist")
    gen_parser.add_argument("--ngram-size", type=int, default=3)
    gen_parser.add_argument("--min-sentiment", type=float)
    gen_parser.add_argument("--max-sentiment", type=float)
    gen_parser.add_argument("--min-pronounceability", type=float)
    gen_parser.add_argument("--rhymes-with")
    gen_parser.add_argument("--allow-corpus-words", action="store_true")

    # --- Validate command ---
    val_parser = subparsers.add_parser("validate", help="Validate a potential word.")
    val_parser.add_argument("word")
    val_parser.add_argument("--dictionary")
    val_parser.add_argument("--blocklist")

    # --- Rhyme command ---
    rhyme_parser = subparsers.add_parser("rhyme", help="Get phonetic info for a word.")
    rhyme_parser.add_argument("word")

    # --- Build Cache command ---
    build_parser = subparsers.add_parser("build-cache", help="Build the phonetic and transcription models.")
    build_parser.add_argument("--corpus", help="Path to a custom corpus to build models from.")

    args = parser.parse_args()

    # --- Argument Validation ---
    if args.command == "generate" and not args.corpus and not args.rhymes_with:
        parser.error("--corpus is required unless --rhymes-with is used.")

    # --- Command Execution ---
    if args.command == "build-cache":
        module_path = pathlib.Path(__file__).parent
        default_dict_path = module_path / 'data' / 'cmu.txt.gz'
        corpus_to_use = args.corpus if args.corpus else str(default_dict_path)

        cache_dir = pathlib.Path.home() / '.slithyt' / 'data'
        cache_dir.mkdir(parents=True, exist_ok=True)

        phonetic_model = build.build_phonetic_model(corpus_to_use)
        with open(cache_dir / 'phonetic-model.dat', "wb") as f:
            pickle.dump(phonetic_model, f)
        print(f"Phonetic model saved to {cache_dir / 'phonetic-model.dat'}")

        transcription_model = build.build_transcription_model(corpus_to_use)
        with open(cache_dir / 'transcription-model.dat', "wb") as f:
            pickle.dump(transcription_model, f)
        print(f"Transcription model saved to {cache_dir / 'transcription-model.dat'}")
        return

    if args.command == "generate" or args.command == "validate":
        # Word lists shared by both commands; explicit options override the
        # files bundled in the package's data directory.
        module_path = pathlib.Path(__file__).parent
        default_dict_path = module_path / 'data' / 'cmu.txt.gz'
        default_block_path = module_path / 'data' / 'en-block.txt.gz'
        block_to_load = args.blocklist if args.blocklist is not None else default_block_path
        blocklist_set = validator.load_word_set(str(block_to_load))
        dictionary_set = set()
        dict_to_load = args.dictionary if args.dictionary is not None else default_dict_path
        # The dictionary is left empty when it is the very file being used as
        # the generation corpus (presumably so the corpus set alone governs
        # novelty in that case -- confirm against validator.validate_word).
        if not (args.command == "generate" and hasattr(args, 'corpus') and args.corpus and str(dict_to_load) == args.corpus):
            dictionary_set = validator.load_word_set(str(dict_to_load))

    if args.command == "generate":
        if args.rhymes_with:
            cache_dir = pathlib.Path.home() / '.slithyt' / 'data'
            phonetic_model_path = cache_dir / 'phonetic-model.dat'
            transcription_model_path = cache_dir / 'transcription-model.dat'
            phonetic_model = rhyme.load_phonetic_model(str(phonetic_model_path))
            transcription_model = rhyme.load_transcription_model(str(transcription_model_path))
            if not phonetic_model or not transcription_model:
                return

            target_phonemes = rhyme.get_phonetic_breakdown(args.rhymes_with)
            if not target_phonemes:
                print(f"ERROR: Cannot find '{args.rhymes_with}' in phonetic dictionary.")
                return
            signature = rhyme.get_rhyme_signature(target_phonemes)
            if not signature:
                print(f"ERROR: Cannot find a valid rhyme signature for '{args.rhymes_with}'.")
                return

            print(f"INFO: Generating words that rhyme with '{args.rhymes_with}'...")
            generated_words = []
            # Bounded number of attempts so an over-constrained request
            # terminates instead of looping forever.
            for _ in range(args.count * 200):
                if len(generated_words) >= args.count:
                    break
                new_phonemes = rhyme.generate_phonetic_word(phonetic_model, signature)
                if not new_phonemes:
                    continue
                word = rhyme.transcribe_word(transcription_model, new_phonemes)
                if word and word not in generated_words and validator.validate_word(
                    word, args.matches_regex, args.reject_regex, dictionary_set, blocklist_set,
                    None, args.min_sentiment, args.max_sentiment, args.min_pronounceability
                ):
                    generated_words.append(word)
                    print(f" - {word}")
        else:
            print(f"INFO: Training model from '{args.corpus}'...")
            model, corpus_set = generator.train_from_corpus(args.corpus, n=args.ngram_size)
            if not model:
                return
            # Corpus words are rejected as non-novel unless explicitly allowed.
            corpus_rejection_set = None if args.allow_corpus_words else corpus_set

            print(f"INFO: Generating {args.count} words...")
            generated_words = []
            # Bounded number of attempts, as above.
            for _ in range(args.count * 100):
                if len(generated_words) >= args.count:
                    break
                word = generator.generate_word(model, args.min_len, args.max_len, n=args.ngram_size)
                if word and word not in generated_words and validator.validate_word(
                    word, args.matches_regex, args.reject_regex, dictionary_set, blocklist_set,
                    corpus_rejection_set, args.min_sentiment, args.max_sentiment, args.min_pronounceability
                ):
                    generated_words.append(word)
                    print(f" - {word}")

    elif args.command == "validate":
        is_valid = validator.validate_word(args.word, dictionary_set=dictionary_set, blocklist_set=blocklist_set)
        s_score = sentiment.analyze_word_sentiment(args.word)
        p_score = pronounce.score_pronounceability(args.word)
        print(f"Validating word: '{args.word}'")
        print(f" - Validation Result: {'Valid' if is_valid else 'Invalid'}")
        print(f" - Sentiment Score: {s_score:.3f}")
        print(f" - Pronounceability Score: {p_score:.3f}")

    elif args.command == "rhyme":
        print(f"Analyzing word: '{args.word}'")
        phonemes = rhyme.get_phonetic_breakdown(args.word)
        if not phonemes:
            print(" - Word not found in the phonetic dictionary.")
            return

        print(f" - Phonetic Breakdown: {' '.join(phonemes)}")
        signature = rhyme.get_rhyme_signature(phonemes)
        if signature:
            print(f" - Rhyme Signature: {' '.join(signature)}")
|
|
151
|
+
|
|
152
|
+
if __name__ == "__main__":
|
|
153
|
+
main()
|
|
File without changes
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# Contains the n-gram model training and word generation logic.
|
|
2
|
+
|
|
3
|
+
import random
|
|
4
|
+
from collections import defaultdict
|
|
5
|
+
from . import utils
|
|
6
|
+
|
|
7
|
+
def train_from_corpus(corpus_path: str, n: int = 3) -> tuple[dict, set]:
    """Train a character-level n-gram model from a corpus in a single pass.

    Every non-blank line is stripped and lower-cased, recorded in the corpus
    word set, then framed with ``n - 1`` leading ``"^"`` markers and one
    trailing ``"$"`` marker before its character transitions are collected.

    Args:
        corpus_path: Path handed to ``utils.open_any`` (one word per line).
        n: Order of the n-gram model (e.g. 3 for trigrams).

    Returns:
        ``(model, corpus_words)``. ``model`` maps each prefix of length
        ``n - 1`` to the list of characters seen after it, with repeats kept.
        ``corpus_words`` is the set of all words read. If the corpus file is
        missing, an error is printed and ``({}, set())`` is returned.
    """
    context_size = n - 1
    lead_in = "^" * context_size  # start-of-word padding
    terminator = "$"              # end-of-word marker

    followers = defaultdict(list)
    seen_words = set()

    try:
        with utils.open_any(corpus_path) as corpus:
            for raw_line in corpus:
                entry = raw_line.strip().lower()
                if entry:
                    seen_words.add(entry)
                    framed = lead_in + entry + terminator
                    for offset in range(len(framed) - context_size):
                        split_at = offset + context_size
                        followers[framed[offset:split_at]].append(framed[split_at])
    except FileNotFoundError:
        print(f"ERROR: Corpus file not found at {corpus_path}")
        return {}, set()

    return dict(followers), seen_words
|
|
51
|
+
|
|
52
|
+
def generate_word(model: dict, min_len: int = 5, max_len: int = 10, n: int = 3) -> str:
    """Generate a single word by walking the trained n-gram model.

    Starting from the all-``"^"`` prefix, a next character is drawn at random
    from the model until the end marker ``"$"`` is drawn, an unseen prefix is
    reached, or ``max_len`` characters have been produced. A word that
    reaches ``max_len`` without drawing the end marker is kept as is. Up to
    100 candidates are tried.

    Args:
        model: The trained n-gram model from ``train_from_corpus()``.
        min_len: The minimum length of the generated word.
        max_len: The maximum length of the generated word.
        n: The order of the n-gram model; must match the order used for
            training, otherwise the start prefix will not be found.

    Returns:
        A newly generated word, or an empty string if no candidate within
        the length bounds was produced (or the model is empty).
    """
    if not model:
        return ""

    start_char = "^"
    end_char = "$"
    prefix_len = n - 1

    for _ in range(100):  # Max attempts to prevent infinite loops
        word_chars = []
        current_prefix = start_char * prefix_len

        for _ in range(max_len):
            if current_prefix not in model:
                # This prefix was not seen during training, dead end.
                break

            next_char = random.choice(model[current_prefix])
            if next_char == end_char:
                break

            word_chars.append(next_char)
            # Slide the context window. A unigram model (n == 1) has an
            # empty context, so the prefix must stay empty; the plain
            # ``prefix[1:] + next_char`` would produce a one-character
            # prefix that is never a model key.
            current_prefix = (current_prefix + next_char)[1:] if prefix_len > 0 else ""

        final_word = "".join(word_chars)
        if min_len <= len(final_word) <= max_len:
            return final_word

    return ""  # Return empty if we couldn't generate a valid word
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# slithyt/pronounce.py
|
|
2
|
+
|
|
3
|
+
def score_pronounceability(word: str) -> float:
    """Score how pronounceable a word looks, using three spelling heuristics.

    The score starts at 1.0 and is reduced by:

    * 0.3 for every consonant beyond three in the longest consonant run,
    * 0.4 for every vowel beyond two in the longest vowel run,
    * a flat 0.3 when the share of vowels falls outside 35%-65%.

    Only ``a``, ``e``, ``i``, ``o`` and ``u`` count as vowels; every other
    character (including ``y``) counts as a consonant.

    Args:
        word: The word to score.

    Returns:
        A float between 0.0 (less pronounceable) and 1.0 (more
        pronounceable); 0.0 for an empty word.
    """
    if not word:
        return 0.0

    vowels = "aeiou"
    letters = word.lower()

    # One pass tracks the longest run of each kind (True = vowel run,
    # False = consonant run) together with the total vowel count.
    longest_run = {True: 0, False: 0}
    run_kind = None
    run_length = 0
    vowel_total = 0

    for letter in letters:
        is_vowel = letter in vowels
        vowel_total += is_vowel
        run_length = run_length + 1 if is_vowel == run_kind else 1
        run_kind = is_vowel
        if run_length > longest_run[is_vowel]:
            longest_run[is_vowel] = run_length

    # A cluster of more than 3 consonants is difficult.
    consonant_penalty = max(0, longest_run[False] - 3) * 0.3
    # A cluster of more than 2 vowels is uncommon.
    vowel_penalty = max(0, longest_run[True] - 2) * 0.4

    vowel_ratio = vowel_total / len(letters) if len(letters) > 0 else 0
    ratio_penalty = 0 if 0.35 <= vowel_ratio <= 0.65 else 0.3

    total_penalty = consonant_penalty + vowel_penalty + ratio_penalty
    return max(0.0, 1.0 - total_penalty)
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
# src/slithyt/rhyme.py
|
|
2
|
+
|
|
3
|
+
import pronouncing
|
|
4
|
+
import pickle
|
|
5
|
+
import random
|
|
6
|
+
import pathlib
|
|
7
|
+
from . import build
|
|
8
|
+
|
|
9
|
+
def get_phonetic_breakdown(word: str) -> list[str] | None:
    """Return the phonemes of a word's first listed pronunciation.

    Args:
        word: The word to look up with ``pronouncing.phones_for_word``.

    Returns:
        The first pronunciation split into a list of phoneme strings, or
        ``None`` when the lookup yields no pronunciation.
    """
    candidates = pronouncing.phones_for_word(word)
    return candidates[0].split() if candidates else None
|
|
15
|
+
|
|
16
|
+
def get_rhyme_signature(phonemes: list[str]) -> list[str] | None:
|
|
17
|
+
"""Extracts the rhyming part of a word from its list of phonemes."""
|
|
18
|
+
last_stressed_vowel_index = -1
|
|
19
|
+
for i, p in enumerate(phonemes):
|
|
20
|
+
if p[-1] in ('1', '2'):
|
|
21
|
+
last_stressed_vowel_index = i
|
|
22
|
+
if last_stressed_vowel_index == -1:
|
|
23
|
+
return None
|
|
24
|
+
return phonemes[last_stressed_vowel_index:]
|
|
25
|
+
|
|
26
|
+
def _load_or_build_model(model_path, label: str, builder) -> dict:
    """Load a pickled model from *model_path*, building and caching it if needed.

    Args:
        model_path: Location of the cached pickle file (str or Path).
        label: Lower-case model name used in the progress messages.
        builder: Callable that takes the dictionary path (str) and returns
            the freshly built model.

    Returns:
        The model read from the cache, or the newly built model.
    """
    model_path = pathlib.Path(model_path)
    if model_path.exists():
        try:
            with open(model_path, "rb") as f:
                # NOTE(security): pickle.load can execute arbitrary code, so
                # the cache file must only ever come from a trusted location.
                return pickle.load(f)
        except (pickle.UnpicklingError, EOFError):
            # A truncated or corrupt cache would otherwise fail on every
            # run; fall through and regenerate it instead.
            print(f"WARNING: Cached {label} model at {model_path} is unreadable. Rebuilding...")
    else:
        print(f"First-time setup: Building {label} model. This may take a moment...")

    default_dict_path = pathlib.Path(__file__).parent / 'data' / 'cmu.txt.gz'
    model = builder(str(default_dict_path))

    model_path.parent.mkdir(parents=True, exist_ok=True)
    # Write to a sibling temp file and rename it into place so an
    # interrupted run never leaves a half-written cache behind.
    tmp_path = model_path.with_name(model_path.name + ".tmp")
    with open(tmp_path, "wb") as f:
        pickle.dump(model, f)
    tmp_path.replace(model_path)
    print(f"{label.capitalize()} model saved to {model_path}")
    return model


def load_phonetic_model(model_path: str) -> dict:
    """Loads a pre-computed phonetic model, building it if it doesn't exist.

    Args:
        model_path: Path of the pickle cache for the phonetic model.

    Returns:
        The model loaded from the cache, or built with
        ``build.build_phonetic_model`` from the bundled ``data/cmu.txt.gz``
        and then written to *model_path*.
    """
    # The lambda defers the lookup of ``build`` until a build is really needed.
    return _load_or_build_model(
        model_path, "phonetic", lambda path: build.build_phonetic_model(path)
    )


def load_transcription_model(model_path: str) -> dict:
    """Loads a pre-computed transcription model, building it if it doesn't exist.

    Args:
        model_path: Path of the pickle cache for the transcription model.

    Returns:
        The model loaded from the cache, or built with
        ``build.build_transcription_model`` from the bundled
        ``data/cmu.txt.gz`` and then written to *model_path*.
    """
    return _load_or_build_model(
        model_path, "transcription", lambda path: build.build_transcription_model(path)
    )
|
|
63
|
+
|
|
64
|
+
def generate_phonetic_word(model: dict, rhyme_signature: list[str], n: int = 3) -> list[str] | None:
|
|
65
|
+
"""Generates a new sequence of phonemes that ends with the given rhyme signature."""
|
|
66
|
+
if not model: return None
|
|
67
|
+
prefix_len = n - 1
|
|
68
|
+
current_prefix = tuple(["^"] * prefix_len)
|
|
69
|
+
generated_phonemes = []
|
|
70
|
+
for _ in range(10):
|
|
71
|
+
if current_prefix not in model: return None
|
|
72
|
+
next_phoneme = random.choice(model[current_prefix])
|
|
73
|
+
if next_phoneme == "$": break
|
|
74
|
+
generated_phonemes.append(next_phoneme)
|
|
75
|
+
current_prefix = tuple(list(current_prefix[1:]) + [next_phoneme])
|
|
76
|
+
return generated_phonemes + rhyme_signature
|
|
77
|
+
|
|
78
|
+
def transcribe_word(transcription_model: dict, phonemes: list[str]) -> str:
    """Spell out a phoneme sequence as a plausible written word.

    For each phoneme, trailing stress digits (``0``, ``1``, ``2``) are
    stripped and one spelling is picked at random from the model's
    candidates for the resulting base phoneme. A phoneme with no entry,
    or with an empty candidate list, is rendered as ``'?'``.
    """
    letters = []
    for phoneme in phonemes:
        spellings = transcription_model.get(phoneme.rstrip('012'))
        letters.append(random.choice(spellings) if spellings else '?')
    return "".join(letters)
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
|
2
|
+
|
|
3
|
+
# Initialize the analyzer for its word lexicon.
_analyzer = SentimentIntensityAnalyzer()

# --- Structured Morpheme Lexicons ---
# Raw morpheme scores below are passed through _normalize_score, which maps
# -4.0 to 0.0 and +4.0 to 1.0.

# Affixes that flip the stem's sentiment (1.0 - stem) instead of being
# averaged with it.
# NOTE(review): "il" and "ir" are not keys of _PREFIXES, and the prefix loop
# only iterates _PREFIXES keys, so these two entries look unreachable.
_INVERTING_PREFIXES = {"un", "in", "im", "il", "ir", "non", "dis", "mis", "dys", "anti"}
_INVERTING_SUFFIXES = {"less"}

# Raw score for each recognized word-initial morpheme.
_PREFIXES = {
    "mal": -4.0, "mis": -3.0, "dis": -2.0, "un": -1.0, "in": -1.0, "im": -1.0,
    "non": -1.0, "de": -1.0, "anti": -2.0, "contra": -2.0, "ob": -2.0,
    "pseudo": -2.0, "cata": -2.0, "dys": -2.2, "caco": -2.3,
    "bene": 3.0, "eu": 4.0, "pro": 2.0, "pre": 1.0, "con": 2.0, "com": 2.0,
    "sym": 2.0, "syn": 2.0,
}

# Raw score for each recognized word-final morpheme.
_SUFFIXES = {
    "less": -2.0, "cide": -4.0, "ful": 1.5, "able": 1.0, "ible": 1.0,
}

# Raw score for morphemes searched for anywhere inside a word.
# NOTE(review): the infix scan in analyze_word_sentiment only accepts
# substrings of length >= 3, so the two-letter entry "am" can never match.
_INFIXES = {
    "mort": -3.0, "nec": -3.0, "necr": -3.0, "path": -3.0, "tox": -4.0,
    "pess": -3.0, "mor": -3.0, "vill": -3.0, "crim": -3.0, "rupt": -2.0,
    "fail": -3.0, "terr": -2.0, "horr": -4.0, "vuln": -2.0, "hostil": -3.0,
    "vex": -2.0, "trib": -2.0, "fall": -2.0, "err": -1.9,
    "am": 3.0, "amic": 3.0, "phil": 3.0, "pac": 4.0, "grat": 4.0,
    "felic": 4.0, "beat": 4.0, "sanct": 3.0, "salv": 3.0, "ver": 3.0,
    "honor": 3.0, "dign": 3.0, "fortun": 2.0, "optim": 4.0, "lucr": 2.0,
    "prosper": 4.0, "brill": 3.0, "clar": 2.0, "lumin": 3.0, "vital": 3.0,
    "viv": 3.0, "gen": 2.0, "cresc": 2.0, "cret": 2.0, "magn": 3.0,
    "grand": 3.0, "nobl": 3.0, "excell": 4.0, "laud": 4.0, "glor": 3.0,
    "merit": 3.0, "secure": 3.0, "firm": 2.0, "resolut": 2.0, "joy": 4.0,
    "happ": 4.0, "hope": 3.0, "vit": 2.0, "equi": 1.5, "amor": 2.8,
    "bon": 2.5, "luc": 1.8, "lum": 1.8, "cred": 1.7,
}

# Whole-word scores taken from the analyzer's lexicon; checked before any
# morpheme analysis.
_WORD_LEXICON = _analyzer.lexicon

# Longest affixes first, so a longer affix is tried before a shorter one.
_SORTED_PREFIXES = sorted(_PREFIXES.keys(), key=len, reverse=True)
_SORTED_SUFFIXES = sorted(_SUFFIXES.keys(), key=len, reverse=True)
|
|
43
|
+
|
|
44
|
+
def _normalize_score(score: float) -> float:
    """Rescale a raw VADER-style score to 0.0-1.0 (``-4`` -> 0.0, ``+4`` -> 1.0)."""
    shifted = score + 4
    return shifted / 8
|
|
47
|
+
|
|
48
|
+
def _blend_affix_with_stem(affix_score: float, stem: str, inverts: bool) -> float:
    """Combine an affix's raw score with the sentiment of the remaining stem."""
    affix_only = _normalize_score(affix_score)

    # Stems shorter than four letters are not analyzed; the affix decides.
    if len(stem) < 4:
        return affix_only

    stem_sentiment = analyze_word_sentiment(stem)

    # A neutral stem lets the affix's own sentiment dominate.
    if stem_sentiment == 0.5:
        return affix_only

    if inverts:
        return 1.0 - stem_sentiment

    # Average on the raw scale, then map back to 0.0-1.0.
    return _normalize_score((affix_score + (stem_sentiment * 8 - 4)) / 2)


def analyze_word_sentiment(word: str) -> float:
    """
    Score the sentiment of *word* on a 0.0 (negative) to 1.0 (positive) scale.

    Checks are made in this order, and the first that applies decides:

    1. An empty word is neutral (0.5).
    2. A word found in the whole-word lexicon gets its normalized score.
    3. The longest matching prefix is split off and combined with the
       recursively analyzed stem.
    4. Otherwise the longest matching suffix is handled the same way.
    5. Otherwise the word is scanned left to right for known infixes of at
       least three letters and their scores are averaged; with no match the
       word is neutral (0.5).
    """
    lowered = word.lower()

    if not lowered:
        return 0.5

    if lowered in _WORD_LEXICON:
        return _normalize_score(_WORD_LEXICON[lowered])

    prefix = next(
        (p for p in _SORTED_PREFIXES if len(p) >= 2 and lowered.startswith(p)),
        None,
    )
    if prefix is not None:
        return _blend_affix_with_stem(
            _PREFIXES[prefix], lowered[len(prefix):], prefix in _INVERTING_PREFIXES
        )

    suffix = next(
        (s for s in _SORTED_SUFFIXES if len(s) >= 2 and lowered.endswith(s)),
        None,
    )
    if suffix is not None:
        return _blend_affix_with_stem(
            _SUFFIXES[suffix], lowered[:-len(suffix)], suffix in _INVERTING_SUFFIXES
        )

    infix_scores = []
    total = len(lowered)
    pos = 0
    while pos < total:
        # Try the longest candidate first; ends below pos + 3 would give
        # substrings shorter than three letters.
        hit = next(
            (
                lowered[pos:end]
                for end in range(total, pos + 2, -1)
                if lowered[pos:end] in _INFIXES
            ),
            "",
        )
        if hit:
            infix_scores.append(_INFIXES[hit])
            pos += len(hit)
        else:
            pos += 1

    if not infix_scores:
        return 0.5

    return _normalize_score(sum(infix_scores) / len(infix_scores))
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import gzip
|
|
2
|
+
|
|
3
|
+
def open_any(file_path: str):
    """
    Open a text file for reading, whether or not it is gzip-compressed.

    The first two bytes of the file are compared with the gzip magic
    number to decide how to open it.

    Args:
        file_path: The path to the file to open.

    Returns:
        A file handle opened for reading UTF-8 text.
    """
    gzip_magic = b'\x1f\x8b'
    with open(file_path, 'rb') as probe:
        header = probe.read(2)

    if header == gzip_magic:
        return gzip.open(file_path, 'rt', encoding="utf-8")
    return open(file_path, 'r', encoding="utf-8")
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from typing import Set
|
|
3
|
+
from . import sentiment
|
|
4
|
+
from . import pronounce
|
|
5
|
+
from . import utils
|
|
6
|
+
|
|
7
|
+
def load_word_set(file_path: str) -> Set[str]:
    """
    Read a one-word-per-line file (plain or gzipped) into a set.

    Each line is stripped and lower-cased; blank lines are dropped. A falsy
    *file_path* gives an empty set. A missing file prints a warning and
    also gives an empty set.
    """
    if not file_path:
        return set()

    try:
        with utils.open_any(file_path) as f:
            words = set()
            for raw_line in f:
                entry = raw_line.strip()
                if entry:
                    words.add(entry.lower())
            return words
    except FileNotFoundError:
        print(f"WARNING: File not found at {file_path}. Skipping this check.")
        return set()
|
|
20
|
+
|
|
21
|
+
def validate_word(
    word: str,
    matches_regex: str = None,
    reject_regex: str = None,
    dictionary_set: Set[str] = None,
    blocklist_set: Set[str] = None,
    corpus_rejection_set: Set[str] = None,
    min_sentiment: float = None,
    max_sentiment: float = None,
    min_pronounceability: float = None
) -> bool:
    """
    Validates a word against a set of constraints.

    Every constraint is optional; one left as ``None`` (or an empty set) is
    skipped. Set lookups use the lower-cased word; both regexes are applied
    case-insensitively with ``re.search``.

    Args:
        word: Candidate word. An empty word is always invalid.
        matches_regex: Pattern the word must match.
        reject_regex: Pattern the word must not match.
        dictionary_set: Words to reject (existing dictionary words).
        blocklist_set: Words to reject (blocked words).
        corpus_rejection_set: Words to reject (words from the corpus).
        min_sentiment: Lowest accepted sentiment score.
        max_sentiment: Highest accepted sentiment score.
        min_pronounceability: Lowest accepted pronounceability score.

    Returns:
        True if the word passes every supplied constraint, else False.
    """
    # The set annotations use typing.Set rather than the builtin set[str]:
    # annotations are evaluated when the function is defined, and
    # subscripting the builtin fails on Python 3.8.
    if not word:
        return False
    word_lower = word.lower()

    if matches_regex and not re.search(matches_regex, word, re.IGNORECASE):
        return False
    if reject_regex and re.search(reject_regex, word, re.IGNORECASE):
        return False

    for rejected_words in (dictionary_set, blocklist_set, corpus_rejection_set):
        if rejected_words and word_lower in rejected_words:
            return False

    if min_sentiment is not None or max_sentiment is not None:
        sentiment_score = sentiment.analyze_word_sentiment(word)
        if min_sentiment is not None and sentiment_score < min_sentiment:
            return False
        if max_sentiment is not None and sentiment_score > max_sentiment:
            return False

    if min_pronounceability is not None:
        pronounce_score = pronounce.score_pronounceability(word)
        if pronounce_score < min_pronounceability:
            return False

    return True
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: slithyt
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: A tool for generating novel, pronounceable words based on linguistic corpuses.
|
|
5
|
+
Author-email: Daniel Hardman <daniel.hardman@gmail.com>
|
|
6
|
+
License: MIT License
|
|
7
|
+
Project-URL: Homepage, https://github.com/dhh1128/slithyt
|
|
8
|
+
Project-URL: Bug Tracker, https://github.com/dhh1128/slithyt/issues
|
|
9
|
+
Keywords: word generation,procedural generation,nlp,linguistics,naming
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
14
|
+
Classifier: Development Status :: 4 - Beta
|
|
15
|
+
Requires-Python: >=3.8
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
License-File: LICENSE
|
|
18
|
+
Requires-Dist: pronouncing
|
|
19
|
+
Requires-Dist: vaderSentiment
|
|
20
|
+
Dynamic: license-file
|
|
21
|
+
|
|
22
|
+
# SlithyT
|
|
23
|
+
|
|
24
|
+
A tool for generating novel, plausible, and pronounceable words based on linguistic corpuses.
|
|
25
|
+
|
|
26
|
+
The name is a reference to the "slithy toves" in Lewis Carroll's poem "Jabberwocky".
|
|
27
|
+
|
|
28
|
+
(Code was written substantially by AI, although I did a fair amount of reviewing, criticizing, revising
|
|
29
|
+
and debugging.)
|
|
30
|
+
|
|
31
|
+
## Installation
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
pip install .
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Usage
|
|
38
|
+
|
|
39
|
+
Generate a word that looks/sounds like it fits with other words in a given
|
|
40
|
+
corpus. Similarity is determined partly by ngram analysis and partly by
|
|
41
|
+
pronunciation.
|
|
42
|
+
|
|
43
|
+
You can make your own corpus, or use pregenerated ones (in the data folder
|
|
44
|
+
of the package):
|
|
45
|
+
|
|
46
|
+
* Astronomy names (stars, galaxies, planets)
|
|
47
|
+
* Transliterated Greek, Latin, Hebrew, Egyptian names
|
|
48
|
+
* Harry Potter or Star Wars names
|
|
49
|
+
* Drug names
|
|
50
|
+
* Latin words from biology taxonomy (genus, species)
|
|
51
|
+
|
|
52
|
+
You can also use the whole dictionary as your corpus, in which case you will
|
|
53
|
+
get words with no particular flavor to them. A good corpus has at least a
|
|
54
|
+
couple hundred words in it.
|
|
55
|
+
|
|
56
|
+
By default, generated words are *novel*, meaning they won't appear in the
|
|
57
|
+
corpus you reference. You can also add a blocklist to avoid generating curse
|
|
58
|
+
words, words that violate trademarks or spam filters, etc.
|
|
59
|
+
|
|
60
|
+
All corpora and dictionary/block list files used by this tool are text
|
|
61
|
+
files having a single word per line, and can optionally be gzipped.
|
|
62
|
+
Sentiment analysis, pronounceability, and rhyming are moderately English-
|
|
63
|
+
centric, though they tolerate Romance and Germanic languages a bit as well.
|
|
64
|
+
However, they could be made to reflect the sensibilities of other language
|
|
65
|
+
communities by running build_phonetic_model.py and build_transcription_model.py
|
|
66
|
+
in the package's scripts folder. These generate cached patterns in
|
|
67
|
+
~/.slithyt/data.
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
# Generate 10 realistic words that sound like they belong in corpus. Make
|
|
71
|
+
# the words have a length of at least 5 characters.
|
|
72
|
+
slithyt generate --corpus path/to/your/corpus.txt
|
|
73
|
+
|
|
74
|
+
# Generate words that have a positive connotation due to sound symbolism
|
|
75
|
+
# (see https://en.wikipedia.org/wiki/Sound_symbolism), and that use n=4
|
|
76
|
+
# for ngram analysis. (The --ngram-size argument is a tradeoff. Default is 3.
|
|
77
|
+
# Bigger values make the resonance with the corpus stronger, but also make it
|
|
78
|
+
# harder to be creative; it may be impossible to generate words if you go too
|
|
79
|
+
# high. Smaller values give the algorithm more freedom in both size and
|
|
80
|
+
# character sequence, but the output might sound less like the corpus.)
|
|
81
|
+
slithyt generate --corpus path/to/corpus.txt --min-sentiment 0.8 --ngram-size 4
|
|
82
|
+
|
|
83
|
+
# Generate words that are between 4 and 8 characters long, and that are at
|
|
84
|
+
# least moderately pronounceable. (Pronounceability depends partly on the
|
|
85
|
+
# speaker's judgment; slithyt uses a simple algorithm to predict scores from
|
|
86
|
+
# 0 (hardest) to 1 (easiest), but the corpus may affect how reasonable 0.5 is.
|
|
87
|
+
# Typically, the variety of generated word lengths matches the variety of
|
|
88
|
+
# word lengths in the corpus. These values constrain output but may make
|
|
89
|
+
# generation impossible, if nothing in the corpus is as small or as large as
|
|
90
|
+
# what was requested.)
|
|
91
|
+
slithyt generate --corpus path/to/corpus.txt --min-length 4 --max-length 8 --min-pronounceability 0.5
|
|
92
|
+
|
|
93
|
+
# Generate 5 words that rhyme with synergy
|
|
94
|
+
slithyt generate --count 5 --rhymes-with synergy
|
|
95
|
+
|
|
96
|
+
# Report the rhyming analysis for synergy. (Only known words are usable
|
|
97
|
+
# as a rhyming template; passing made-up words here will do nothing
|
|
98
|
+
# useful.)
|
|
99
|
+
slithyt rhyme synergy
|
|
100
|
+
|
|
101
|
+
# Check to see whether a particular made-up word would pass certain tests.
|
|
102
|
+
slithyt validate synerjee
|
|
103
|
+
```
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
src/slithyt/__init__.py
|
|
5
|
+
src/slithyt/build.py
|
|
6
|
+
src/slithyt/cli.py
|
|
7
|
+
src/slithyt/generator.py
|
|
8
|
+
src/slithyt/pronounce.py
|
|
9
|
+
src/slithyt/rhyme.py
|
|
10
|
+
src/slithyt/sentiment.py
|
|
11
|
+
src/slithyt/utils.py
|
|
12
|
+
src/slithyt/validator.py
|
|
13
|
+
src/slithyt.egg-info/PKG-INFO
|
|
14
|
+
src/slithyt.egg-info/SOURCES.txt
|
|
15
|
+
src/slithyt.egg-info/dependency_links.txt
|
|
16
|
+
src/slithyt.egg-info/entry_points.txt
|
|
17
|
+
src/slithyt.egg-info/requires.txt
|
|
18
|
+
src/slithyt.egg-info/top_level.txt
|
|
19
|
+
src/slithyt/data/__init__.py
|
|
20
|
+
tests/test_generator.py
|
|
21
|
+
tests/test_pronounce.py
|
|
22
|
+
tests/test_rhyme.py
|
|
23
|
+
tests/test_sentiment.py
|
|
24
|
+
tests/test_validator.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
slithyt
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# Tests for the generator module.
|
|
2
|
+
import tempfile
|
|
3
|
+
import os
|
|
4
|
+
from slithyt import generator
|
|
5
|
+
|
|
6
|
+
def test_train_and_generate():
    """
    Train on a small temporary corpus file and generate one word from it.

    Checks the shape of the trained model and corpus set, then checks that
    a generated word is a lowercase string of at least four characters
    built only from letters that occur in the corpus.
    """
    corpus_words = ["slithy", "autonomer", "pythonic"]

    # Write the corpus, one word per line, to a file that outlives the
    # ``with`` block so the generator can reopen it by name.
    with tempfile.NamedTemporaryFile(mode='w', delete=False, encoding='utf-8') as tmp:
        tmp.write("".join(entry + "\n" for entry in corpus_words))
        corpus_path = tmp.name

    try:
        # --- Training ---
        model, corpus_set = generator.train_from_corpus(corpus_path, n=3)
        assert isinstance(model, dict)
        assert len(model) > 0
        assert isinstance(corpus_set, set)
        assert "pythonic" in corpus_set

        # With n=3 the keys are two-character prefixes and each value holds
        # the characters seen after that prefix.
        assert '^^' in model
        assert 's' in model['^^']
        assert "th" in model
        assert "y" in model["th"]

        # --- Generation ---
        word = generator.generate_word(model, min_len=4, max_len=10, n=3)
        assert isinstance(word, str)
        assert len(word) >= 4
        assert word.islower()  # Check that the word is lowercase

        # Only letters present in the corpus may appear in the result.
        allowed_chars = set("".join(corpus_words))
        assert set(word.lower()).issubset(allowed_chars)

    finally:
        # Remove the temporary corpus file.
        os.remove(corpus_path)
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
from slithyt import pronounce
|
|
2
|
+
|
|
3
|
+
def test_pronounceability_scorer():
    """Tests the pronounceability scoring logic."""

    # 1. Good, pronounceable words should score high
    # These have good vowel/consonant alternation.
    assert pronounce.score_pronounceability("veridian") > 0.9
    assert pronounce.score_pronounceability("solara") > 0.9
    assert pronounce.score_pronounceability("kalani") > 0.9

    # 2. Words with long consonant clusters should be penalized
    # "rhythmsk" has a long consonant cluster.
    # NOTE(review): the exact cluster length depends on whether the scorer
    # counts 'y' as a vowel -- confirm against pronounce.py.
    assert pronounce.score_pronounceability("rhythmsk") < 0.5
    # "schtroumpf" ("smurf" in German) has a 5-consonant cluster.
    assert pronounce.score_pronounceability("schtroumpf") < 0.7

    # 3. Words with long vowel clusters should be penalized
    # "aeioua" has a 6-vowel cluster.
    assert pronounce.score_pronounceability("aeioua") < 0.5
    # "eunoia" has 5 vowels in 6 letters; its longest vowel run is 3 ("oia").
    assert pronounce.score_pronounceability("eunoia") < 0.7

    # 4. Words with bad vowel/consonant ratios should be penalized
    # "strength" has a low vowel ratio (1/8 = 12.5%).
    assert pronounce.score_pronounceability("strength") < 0.8
    # "aeia" has a high vowel ratio (4/4 = 100%).
    assert pronounce.score_pronounceability("aeia") < 0.8

    # 5. Edge cases should not cause errors
    assert pronounce.score_pronounceability("") == 0.0
    assert pronounce.score_pronounceability("a") < 0.8 # Bad ratio
    assert pronounce.score_pronounceability("b") == 0.7 # Bad ratio
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
from slithyt import rhyme
|
|
2
|
+
|
|
3
|
+
def test_get_phonetic_breakdown():
    """Check phoneme lookup for a known word and for an unknown one."""
    # A word expected to be in the pronunciation dictionary.
    expected = ['L', 'EH1', 'G', 'AH0', 'S', 'IY0']
    assert rhyme.get_phonetic_breakdown("legacy") == expected

    # A made-up word expected to be absent from the dictionary.
    missing = rhyme.get_phonetic_breakdown("brillig")
    assert missing is None
|
|
11
|
+
|
|
12
|
+
def test_get_rhyme_signature():
    """Check that the rhyming tail is taken from the last stressed vowel."""
    # Last stressed vowel in the middle of the word ("generation").
    generation = ['JH', 'EH2', 'N', 'ER0', 'EY1', 'SH', 'AH0', 'N']
    assert rhyme.get_rhyme_signature(generation) == ['EY1', 'SH', 'AH0', 'N']

    # Last stressed vowel near the beginning of the word ("synergy").
    synergy = ['S', 'IH1', 'N', 'ER0', 'JH', 'IY0']
    assert rhyme.get_rhyme_signature(synergy) == ['IH1', 'N', 'ER0', 'JH', 'IY0']

    # No stress markers at all (rare, but possible): there is nothing to
    # anchor the rhyme on, so no signature is returned.
    unstressed = ['AH', 'B', 'AW', 'T']  # a pronunciation of "about"
    assert rhyme.get_rhyme_signature(unstressed) is None
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
from slithyt import sentiment
|
|
2
|
+
|
|
3
|
+
def test_sentiment_inversion():
    """
    Inverting prefixes and suffixes should flip the sentiment of the stem.
    """
    score = sentiment.analyze_word_sentiment

    assert score("unhelpful") < 0.4
    assert score("impossible") < 0.4
    assert score("fearless") > 0.6
    assert score("hopeless") < 0.4
|
|
11
|
+
|
|
12
|
+
def test_recursive_affixes():
    """
    Stacked prefixes and stacked suffixes should be resolved recursively.
    """
    score = sentiment.analyze_word_sentiment

    # "happy" is positive and "unhappy" is negative, so a second negating
    # prefix ("disunhappy") should bring the result back to positive.
    assert score("happy") > 0.7
    assert score("unhappy") < 0.3
    assert score("disunhappy") > 0.7

    # "fear" is negative and "fearless" is positive, so a second "less"
    # ("fearlessless") should make it negative again.
    assert score("fearlessless") < 0.4
|
|
25
|
+
|
|
26
|
+
def test_morphemes_with_neutral_stem():
    """
    With a neutral stem, the affix's own sentiment should decide the result.
    """
    score = sentiment.analyze_word_sentiment

    # Positive prefix "bene" + neutral "zyx" -> positive.
    assert score("benezyx") > 0.6
    # Negative prefix "mal" + neutral "zyx" -> negative.
    assert score("malzyx") < 0.4
    # Neutral "zyx" + inverting suffix "less" -> negative.
    assert score("zyxless") < 0.4
|
|
34
|
+
# "zyx" is neutral, "less" is an inverting suffix. Result should be negative.
|
|
35
|
+
assert sentiment.analyze_word_sentiment("zyxless") < 0.4
|
|
36
|
+
|
|
37
|
+
def test_positional_morphemes():
    """
    Prefixes, suffixes, and infixes should each contribute to the score.
    """
    score = sentiment.analyze_word_sentiment

    assert score("benefactor") > 0.65
    assert score("malrupt") < 0.2
    assert score("proactive") > 0.6
    assert score("euamor") > 0.8
|
|
45
|
+
|
|
46
|
+
def test_whole_word_lookup():
    """
    Whole words known to the VADER lexicon should be scored from it directly.
    """
    score = sentiment.analyze_word_sentiment

    assert score("disaster") < 0.2
    assert score("love") > 0.8
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# Tests for the validator module.
|
|
2
|
+
import os
|
|
3
|
+
from slithyt import validator
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
from slithyt import validator
|
|
7
|
+
|
|
8
|
+
def test_validator_with_sets():
    """
    Exercise set-based and regex-based validation with in-memory data.
    """
    # Keyword arguments shared by the set-based checks.
    word_sets = {
        "dictionary_set": {"common", "ordinary"},
        "blocklist_set": {"blocked", "forbidden"},
    }

    # A novel word passes.
    assert validator.validate_word("zentoria", **word_sets) is True

    # A dictionary word is rejected.
    assert validator.validate_word("common", **word_sets) is False

    # A blocklisted word is rejected.
    assert validator.validate_word("forbidden", **word_sets) is False

    # Regex constraints work without any sets.
    assert validator.validate_word("startgood", matches_regex="^start") is True
    assert validator.validate_word("startbad", matches_regex="^wrong") is False
    assert validator.validate_word("endgood", reject_regex="bad$") is True
    assert validator.validate_word("endbad", reject_regex="bad$") is False
|
|
30
|
+
|
|
31
|
+
def test_sentiment_validator():
    """Check the min/max sentiment bounds applied by the validator."""
    check = validator.validate_word

    # Made-up words built to lean clearly one way: a scored prefix joined
    # to a word from the VADER lexicon (e.g. 'love', 'doom').
    positive_word = "eulove"
    negative_word = "maldoom"
    neutral_word = "zxyabc"  # No morphemes in VADER lexicon

    # Lower bound: the positive word passes, the negative word fails.
    assert check(positive_word, min_sentiment=0.75) is True
    assert check(negative_word, min_sentiment=0.75) is False

    # Upper bound: the negative word passes, the positive word fails.
    assert check(negative_word, max_sentiment=0.35) is True
    assert check(positive_word, max_sentiment=0.35) is False

    # A neutral word sits inside a neutral band.
    assert check(neutral_word, min_sentiment=0.4, max_sentiment=0.6) is True

    # A positive word falls outside that band.
    assert check(positive_word, min_sentiment=0.4, max_sentiment=0.6) is False
|
|
52
|
+
|
|
53
|
+
def test_corpus_rejection():
    """Words in the corpus rejection set must be rejected; others accepted."""
    known_words = {"brillig", "slithy", "toves"}

    # Already in the corpus -> rejected.
    assert validator.validate_word("slithy", corpus_rejection_set=known_words) is False
    # Not in the corpus -> accepted.
    assert validator.validate_word("gimble", corpus_rejection_set=known_words) is True
|