medieval_latina 3.1.2 → 3.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/tests.yml +1 -0
- data/.tool-versions +1 -1
- data/AGENTS.md +60 -0
- data/Gemfile +5 -0
- data/Gemfile.lock +4 -2
- data/README.md +18 -1
- data/bin/sample_audio +76 -0
- data/data/dictionary.json +615 -7564
- data/lexicons/Latin00.pls +416 -576
- data/lexicons/Latin01.pls +512 -672
- data/lexicons/Latin02.pls +512 -672
- data/lexicons/Latin03.pls +512 -672
- data/lexicons/Latin04.pls +512 -672
- data/lexicons/Latin05.pls +512 -672
- data/lexicons/Latin06.pls +512 -672
- data/lexicons/Latin07.pls +512 -672
- data/lexicons/Latin08.pls +0 -160
- data/lexicons/Latin09.pls +40 -0
- data/lib/medieval_latina/lexicon_builder.rb +5 -5
- data/lib/medieval_latina/version.rb +1 -1
- data/lib/medieval_latina.rb +24 -12
- data/medieval_latina.gemspec +1 -1
- metadata +5 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 47f39ae4c5ce8d0b6b624beae267c9a8e00b2ac44e6c09686e34e825d8671dcd
|
|
4
|
+
data.tar.gz: a8f8ca9f9357a0453429104a1d8e8423c63dbd3ab4360277993c29c46cb7573f
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: d1e79d2981bdc926e0640de05a3d870ea907075f6c3c822be135c89f2060b1abaa386fe50fb4e61325e7d799ab7d216b33329ec58b35b6e9520bc4c1b7367300
|
|
7
|
+
data.tar.gz: 64dd8015e355efc41bdc754f3720c204dda502d64c0665dfed7830063f8ececb09f5eefefcfd9e520ee105676695f525a6fc1752e4be61e41a2e16f0c1b2eac9
|
data/.github/workflows/tests.yml
CHANGED
data/.tool-versions
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
nodejs 24.2.0
|
|
2
|
-
ruby
|
|
2
|
+
ruby 4.0.5
|
data/AGENTS.md
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# AGENTS.md
|
|
2
|
+
|
|
3
|
+
This file provides guidance to Codex (Codex.ai/code) when working with code in this repository.
|
|
4
|
+
|
|
5
|
+
## Development Commands
|
|
6
|
+
|
|
7
|
+
- **Setup**: `bin/setup` - Install dependencies with bundle install
|
|
8
|
+
- **Tests**: `rake spec` or `bundle exec rspec` - Run the full test suite
|
|
9
|
+
- **Linting**: `bin/lint` - Run StandardRB linter and jsonlint on dictionary.json
|
|
10
|
+
- **Build lexicons**: `bin/build` - Regenerate PLS lexicon files from dictionary.json
|
|
11
|
+
- **Console**: `bin/console` - Interactive prompt for experimentation
|
|
12
|
+
- **Install gem locally**: `bundle exec rake install`
|
|
13
|
+
- **Release**: `bundle exec rake release` (after updating version.rb)
|
|
14
|
+
|
|
15
|
+
## Architecture Overview
|
|
16
|
+
|
|
17
|
+
This is a Ruby gem that converts medieval Latin text to phonetic English for text-to-speech engines. The architecture consists of:
|
|
18
|
+
|
|
19
|
+
### Core Components
|
|
20
|
+
|
|
21
|
+
- **MedievalLatina class** (`lib/medieval_latina.rb`): Main interface with class methods for text conversion and linguistic analysis
|
|
22
|
+
- `MedievalLatina[text]` - Primary conversion method
|
|
23
|
+
- Part-of-speech helpers: `verb?`, `noun?`, `adjective?`, `adverb?`
|
|
24
|
+
- `pronunciations_for(words)` - Extract IPA pronunciations for lexicon building
|
|
25
|
+
|
|
26
|
+
- **Dictionary system** (`data/dictionary.json`): Large JSON file containing Latin words with metadata including:
|
|
27
|
+
- IPA pronunciations
|
|
28
|
+
- Part of speech classifications
|
|
29
|
+
- Custom pronunciation overrides
|
|
30
|
+
|
|
31
|
+
- **Lexicon generation** (`lib/medieval_latina/lexicon_builder.rb`, `lib/medieval_latina/lexicon.rb`): Creates PLS (Pronunciation Lexicon Specification) files for AWS Polly and other TTS engines
|
|
32
|
+
|
|
33
|
+
### Phonetic Conversion Logic
|
|
34
|
+
|
|
35
|
+
The main conversion algorithm handles:
|
|
36
|
+
- Vowel teams: ae→ay, oe→ay, au→ou
|
|
37
|
+
- Consonant transformations: c→ch/k (soft/hard), g→j/g, j→y, t→ts/t, x→ks
|
|
38
|
+
- Consonant teams: gn→n-y, qu→kw
|
|
39
|
+
- Text preprocessing with I18n transliteration
|
|
40
|
+
|
|
41
|
+
### Data Flow
|
|
42
|
+
|
|
43
|
+
1. Text input → word tokenization → dictionary lookup
|
|
44
|
+
2. If word has custom pronunciation → use it
|
|
45
|
+
3. Otherwise → apply phonetic transformation rules
|
|
46
|
+
4. Rejoin with proper punctuation spacing
|
|
47
|
+
|
|
48
|
+
## Key Files
|
|
49
|
+
|
|
50
|
+
- `lib/medieval_latina.rb` - Main conversion logic and API
|
|
51
|
+
- `data/dictionary.json` - Latin word database (400KB+)
|
|
52
|
+
- `bin/build` - Splits dictionary into multiple PLS files in lexicons/ directory
|
|
53
|
+
- `.standard.yml` - StandardRB configuration (Ruby 3.2, parallel linting)
|
|
54
|
+
- `medieval_latina.gemspec` - Gem specification (requires Ruby >= 3.2.0)
|
|
55
|
+
|
|
56
|
+
## Testing
|
|
57
|
+
|
|
58
|
+
- RSpec test suite in `spec/`
|
|
59
|
+
- Configuration in `.rspec` with documentation format
|
|
60
|
+
- Run specific tests: `bundle exec rspec spec/specific_spec.rb`
|
data/Gemfile
CHANGED
|
@@ -2,3 +2,8 @@ source "https://rubygems.org"
|
|
|
2
2
|
|
|
3
3
|
# Specify your gem's dependencies in medieval_latina.gemspec
|
|
4
4
|
gemspec
|
|
5
|
+
|
|
6
|
+
# `logger` stopped being autoloaded under Bundler once it became a bundled gem
|
|
7
|
+
# (Ruby 3.4+) and is no longer in Ruby 4.0's default set. jsonlint requires it
|
|
8
|
+
# without declaring the dependency, so declare it here for `bundle exec jsonlint`.
|
|
9
|
+
gem "logger"
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
medieval_latina (3.1.2)
|
|
4
|
+
medieval_latina (3.2.0)
|
|
5
5
|
i18n
|
|
6
6
|
|
|
7
7
|
GEM
|
|
@@ -19,8 +19,9 @@ GEM
|
|
|
19
19
|
optimist (~> 3)
|
|
20
20
|
language_server-protocol (3.17.0.3)
|
|
21
21
|
lint_roller (1.1.0)
|
|
22
|
+
logger (1.7.0)
|
|
22
23
|
mini_portile2 (2.8.9)
|
|
23
|
-
nokogiri (1.
|
|
24
|
+
nokogiri (1.19.3)
|
|
24
25
|
mini_portile2 (~> 2.8.2)
|
|
25
26
|
racc (~> 1.4)
|
|
26
27
|
oj (3.16.5)
|
|
@@ -88,6 +89,7 @@ PLATFORMS
|
|
|
88
89
|
|
|
89
90
|
DEPENDENCIES
|
|
90
91
|
jsonlint
|
|
92
|
+
logger
|
|
91
93
|
medieval_latina!
|
|
92
94
|
nokogiri
|
|
93
95
|
rake (~> 12.0)
|
data/README.md
CHANGED
|
@@ -3,6 +3,22 @@
|
|
|
3
3
|
There are good text-to-speech engines for English and classical Latin, but none for medieval Latin.
|
|
4
4
|
`MedievalLatina` converts Latin text to a kind of phonetic spelling that can be read by English language text-to-speech engines.
|
|
5
5
|
|
|
6
|
+
## Hear it
|
|
7
|
+
|
|
8
|
+
A line of the Lord's Prayer — *Pater noster qui es in caelis* — spoken by Amazon Polly using this gem's IPA pronunciation lexicon, so you hear MedievalLatina's pronunciation rather than raw Latin:
|
|
9
|
+
|
|
10
|
+
▶️ **[Play the sample](https://github.com/jaysonvirissimo/medieval_latina/raw/master/audio/pater-noster.mp3)** (`audio/pater-noster.mp3`)
|
|
11
|
+
|
|
12
|
+
<!-- Inline player: drag-and-drop audio/pater-noster.mp3 into a GitHub PR/comment composer to
|
|
13
|
+
mint a https://github.com/user-attachments/assets/<id> URL, then paste that URL on its own
|
|
14
|
+
line directly below to render an inline audio player in the rendered README. -->
|
|
15
|
+
|
|
16
|
+
- **Text:** *Pater noster qui es in caelis* — "Our Father, who art in heaven", the traditional Latin Lord's Prayer (public domain).
|
|
17
|
+
- **Gem output:** `MedievalLatina["Pater noster qui es in caelis"]` → `"pah-tare nohstayr kwee es een chaylees"`.
|
|
18
|
+
- **Voice:** Amazon Polly **Bianca** (Italian, `it-IT`, neural engine), driven by MedievalLatina's IPA via a PLS lexicon.
|
|
19
|
+
- **Reproduce:** `ruby -Ilib bin/sample_audio Bianca` (requires AWS credentials in the environment and the `aws-sdk-polly` gem).
|
|
20
|
+
- **Provenance:** audio generated with Amazon Polly on 2026-06-07; Polly output may be used and redistributed under the [AWS Service Terms](https://aws.amazon.com/service-terms/).
|
|
21
|
+
|
|
6
22
|
## Installation
|
|
7
23
|
|
|
8
24
|
Add this line to your application's Gemfile:
|
|
@@ -40,7 +56,8 @@ responsiveVoice.speak(sentence, "UK English Female");
|
|
|
40
56
|
polly = Aws::Polly::Client.new
|
|
41
57
|
s3 = Aws::S3::Client.new
|
|
42
58
|
|
|
43
|
-
|
|
59
|
+
# Lowercase so the text matches the lexicon's (case-sensitive) lowercase graphemes.
|
|
60
|
+
sentence = "pater noster qui es in caelis"
|
|
44
61
|
|
|
45
62
|
words = sentence.split(" ")
|
|
46
63
|
pronunciations = MedievalLatina.pronunciations_for(words)
|
data/bin/sample_audio
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
# Generates the README's spoken sample with Amazon Polly, using MedievalLatina's
|
|
5
|
+
# IPA pronunciations via a PLS lexicon so Polly pronounces the Latin our way.
|
|
6
|
+
#
|
|
7
|
+
# Usage:
|
|
8
|
+
# AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY must be in the environment.
|
|
9
|
+
# aws-sdk-polly must be available, e.g. run through a bundle that provides it:
|
|
10
|
+
# BUNDLE_GEMFILE=/path/to/aws/Gemfile bundle exec ruby -Ilib bin/sample_audio Joanna
|
|
11
|
+
# Output: tmp/sample-<voice>.mp3
|
|
12
|
+
#
|
|
13
|
+
# Credentials are read from the AWS SDK default chain only; they are never echoed,
|
|
14
|
+
# logged, or written to disk.
|
|
15
|
+
|
|
16
|
+
require "medieval_latina"
|
|
17
|
+
require "aws-sdk-polly"
|
|
18
|
+
require "fileutils"
|
|
19
|
+
|
|
20
|
+
# us-east-1 is where the voices/engines below are available; override via env if needed.
|
|
21
|
+
REGION = ENV["AWS_REGION"] || ENV["AWS_DEFAULT_REGION"] || "us-east-1"
|
|
22
|
+
# Polly matches lexicon graphemes case-sensitively and our graphemes are lowercase,
|
|
23
|
+
# so synthesize from lowercased text (the README still shows it capitalized).
|
|
24
|
+
SENTENCE = "pater noster qui es in caelis"
|
|
25
|
+
|
|
26
|
+
# Polly applies a lexicon only when its xml:lang matches the voice's language, and
|
|
27
|
+
# lexicons work on the standard/neural engines (not generative). Each voice below is
|
|
28
|
+
# paired with the matching lexicon language and a lexicon-capable engine.
|
|
29
|
+
VOICES = {
|
|
30
|
+
"Joanna" => {lang: "en-US", engine: "neural"},
|
|
31
|
+
"Matthew" => {lang: "en-US", engine: "neural"},
|
|
32
|
+
"Danielle" => {lang: "en-US", engine: "neural"},
|
|
33
|
+
"Stephen" => {lang: "en-US", engine: "neural"},
|
|
34
|
+
"Bianca" => {lang: "it-IT", engine: "neural"},
|
|
35
|
+
"Carla" => {lang: "it-IT", engine: "standard"},
|
|
36
|
+
"Giorgio" => {lang: "it-IT", engine: "standard"}
|
|
37
|
+
}.freeze
|
|
38
|
+
|
|
39
|
+
LEXICON_NAMES = {"en-US" => "MedievalLatinaEnUs", "it-IT" => "MedievalLatinaItIt"}.freeze
|
|
40
|
+
|
|
41
|
+
abort "Usage: #{$PROGRAM_NAME} <voice>\nChoices: #{VOICES.keys.join(", ")}" if ARGV.empty?
|
|
42
|
+
|
|
43
|
+
voice = ARGV[0]
|
|
44
|
+
config = VOICES[voice]
|
|
45
|
+
abort "Unknown voice #{voice.inspect}. Choose one of: #{VOICES.keys.join(", ")}" unless config
|
|
46
|
+
|
|
47
|
+
# Build the {word => IPA} map from the gem's dictionary for every word in the sentence.
|
|
48
|
+
pronunciations = MedievalLatina.pronunciations_for(SENTENCE.split)
|
|
49
|
+
|
|
50
|
+
# Build a lexicon whose xml:lang matches the voice's language (Polly applies a lexicon
|
|
51
|
+
# only when the languages match).
|
|
52
|
+
lang = config[:lang]
|
|
53
|
+
lexicon_name = LEXICON_NAMES[lang]
|
|
54
|
+
lexicon = MedievalLatina::LexiconBuilder.new(pronunciations, lang: lang).call.to_s
|
|
55
|
+
|
|
56
|
+
polly = Aws::Polly::Client.new(region: REGION)
|
|
57
|
+
polly.put_lexicon(name: lexicon_name, content: lexicon) # idempotent; overwrites
|
|
58
|
+
|
|
59
|
+
FileUtils.mkdir_p("tmp")
|
|
60
|
+
output_path = "tmp/sample-#{voice}.mp3"
|
|
61
|
+
|
|
62
|
+
# Always remove the lexicon from the AWS account, even if synthesis/write raises.
|
|
63
|
+
begin
|
|
64
|
+
response = polly.synthesize_speech(
|
|
65
|
+
text: SENTENCE,
|
|
66
|
+
lexicon_names: [lexicon_name],
|
|
67
|
+
voice_id: voice,
|
|
68
|
+
engine: config[:engine],
|
|
69
|
+
output_format: "mp3"
|
|
70
|
+
)
|
|
71
|
+
File.binwrite(output_path, response.audio_stream.read)
|
|
72
|
+
ensure
|
|
73
|
+
polly.delete_lexicon(name: lexicon_name)
|
|
74
|
+
end
|
|
75
|
+
|
|
76
|
+
puts "#{voice} (#{lang}, #{config[:engine]}) -> #{output_path} (#{File.size(output_path)} bytes)"
|