lda-ruby 0.5.0-x64-mingw-ucrt
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +69 -0
- data/Gemfile +9 -0
- data/README.md +160 -0
- data/VERSION.yml +5 -0
- data/docs/modernization-handoff.md +233 -0
- data/docs/porting-strategy.md +148 -0
- data/docs/precompiled-platform-policy.md +81 -0
- data/docs/precompiled-target-evaluation.md +67 -0
- data/docs/release-runbook.md +192 -0
- data/docs/rust-orchestration-guardrails.md +50 -0
- data/ext/lda-ruby/cokus.c +144 -0
- data/ext/lda-ruby/cokus.h +27 -0
- data/ext/lda-ruby/extconf.rb +13 -0
- data/ext/lda-ruby/lda-alpha.c +96 -0
- data/ext/lda-ruby/lda-alpha.h +21 -0
- data/ext/lda-ruby/lda-data.c +67 -0
- data/ext/lda-ruby/lda-data.h +14 -0
- data/ext/lda-ruby/lda-inference.c +1023 -0
- data/ext/lda-ruby/lda-inference.h +63 -0
- data/ext/lda-ruby/lda-model.c +345 -0
- data/ext/lda-ruby/lda-model.h +31 -0
- data/ext/lda-ruby/lda-x64-mingw-ucrt.def +2 -0
- data/ext/lda-ruby/lda.h +54 -0
- data/ext/lda-ruby/utils.c +119 -0
- data/ext/lda-ruby/utils.h +18 -0
- data/ext/lda-ruby-rust/Cargo.toml +12 -0
- data/ext/lda-ruby-rust/README.md +73 -0
- data/ext/lda-ruby-rust/extconf.rb +135 -0
- data/ext/lda-ruby-rust/include/strings.h +35 -0
- data/ext/lda-ruby-rust/src/lib.rs +1263 -0
- data/lda-ruby.gemspec +78 -0
- data/lib/lda-ruby/backends/base.rb +133 -0
- data/lib/lda-ruby/backends/native.rb +158 -0
- data/lib/lda-ruby/backends/pure_ruby.rb +675 -0
- data/lib/lda-ruby/backends/rust.rb +607 -0
- data/lib/lda-ruby/backends.rb +58 -0
- data/lib/lda-ruby/config/stopwords.yml +571 -0
- data/lib/lda-ruby/corpus/corpus.rb +45 -0
- data/lib/lda-ruby/corpus/data_corpus.rb +22 -0
- data/lib/lda-ruby/corpus/directory_corpus.rb +25 -0
- data/lib/lda-ruby/corpus/text_corpus.rb +27 -0
- data/lib/lda-ruby/document/data_document.rb +30 -0
- data/lib/lda-ruby/document/document.rb +40 -0
- data/lib/lda-ruby/document/text_document.rb +39 -0
- data/lib/lda-ruby/lda.so +0 -0
- data/lib/lda-ruby/rust_build_policy.rb +21 -0
- data/lib/lda-ruby/version.rb +5 -0
- data/lib/lda-ruby/vocabulary.rb +46 -0
- data/lib/lda-ruby.rb +413 -0
- data/lib/lda_ruby_rust.so +0 -0
- data/license.txt +504 -0
- data/test/backend_compatibility_test.rb +146 -0
- data/test/backends_selection_test.rb +100 -0
- data/test/benchmark_scripts_test.rb +23 -0
- data/test/data/docs.dat +46 -0
- data/test/data/sample.rb +20 -0
- data/test/data/wiki-test-docs.yml +123 -0
- data/test/gemspec_test.rb +27 -0
- data/test/lda_ruby_test.rb +319 -0
- data/test/packaged_gem_smoke_test.rb +33 -0
- data/test/pure_ruby_orchestration_test.rb +109 -0
- data/test/release_scripts_test.rb +93 -0
- data/test/rust_build_policy_test.rb +23 -0
- data/test/rust_orchestration_test.rb +911 -0
- data/test/simple_pipeline_test.rb +22 -0
- data/test/simple_yaml.rb +17 -0
- data/test/test_helper.rb +10 -0
- metadata +118 -0
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
# Precompiled Platform Gem Policy (Phase 5B)
|
|
2
|
+
|
|
3
|
+
This document defines the publish strategy and compatibility policy for `lda-ruby` precompiled gems.
|
|
4
|
+
|
|
5
|
+
## Artifact Strategy
|
|
6
|
+
|
|
7
|
+
Each release version publishes a split package set:
|
|
8
|
+
|
|
9
|
+
- Source gem: `lda-ruby-<version>.gem`
|
|
10
|
+
- Precompiled platform gems:
|
|
11
|
+
- `lda-ruby-<version>-x86_64-linux.gem`
|
|
12
|
+
- `lda-ruby-<version>-x86_64-darwin.gem`
|
|
13
|
+
- `lda-ruby-<version>-arm64-darwin.gem`
|
|
14
|
+
- `lda-ruby-<version>-x64-mingw-ucrt.gem`
|
|
15
|
+
- `lda-ruby-<version>-x86_64-linux-musl.gem`
|
|
16
|
+
|
|
17
|
+
The source gem remains the universal fallback. Platform gems are additive and are expected to install without local build tools.
|
|
18
|
+
Precompiled artifacts are built on matching host runners (no cross-compilation in current workflow).
|
|
19
|
+
|
|
20
|
+
## Compatibility Policy
|
|
21
|
+
|
|
22
|
+
- Supported Ruby versions: 3.2 and 3.3 (plus future versions validated by CI).
|
|
23
|
+
- Release-blocking precompiled targets:
|
|
24
|
+
- Linux `x86_64-linux`
|
|
25
|
+
- Linux musl `x86_64-linux-musl`
|
|
26
|
+
- macOS Intel `x86_64-darwin`
|
|
27
|
+
- macOS Apple Silicon `arm64-darwin`
|
|
28
|
+
- Windows `x64-mingw-ucrt`
|
|
29
|
+
- Other platforms:
|
|
30
|
+
- Install from source gem.
|
|
31
|
+
- Runtime remains supported through native/pure fallback paths.
|
|
32
|
+
|
|
33
|
+
Backend behavior expectations:
|
|
34
|
+
|
|
35
|
+
- Platform gem install:
|
|
36
|
+
- `auto` backend resolves to `rust` by default.
|
|
37
|
+
- `native` and `pure` overrides continue to work.
|
|
38
|
+
- Source gem install:
|
|
39
|
+
- Rust build policy is controlled by `LDA_RUBY_RUST_BUILD=auto|always|never`.
|
|
40
|
+
- If Rust build is skipped/unavailable, `auto` falls back to `native`, then `pure_ruby`.
|
|
41
|
+
|
|
42
|
+
## Guardrails
|
|
43
|
+
|
|
44
|
+
Validation must pass before publish:
|
|
45
|
+
|
|
46
|
+
- `./bin/release-preflight` (source-gem checks).
|
|
47
|
+
- `./bin/release-precompiled-artifacts --platform <target>` for each release-blocking platform.
|
|
48
|
+
|
|
49
|
+
Release automation requirements:
|
|
50
|
+
|
|
51
|
+
- `.github/workflows/release.yml` builds source + precompiled artifacts.
|
|
52
|
+
- Release workflow matrix must include all release-blocking precompiled targets.
|
|
53
|
+
- Publish jobs push all built gems and attach checksums to GitHub releases.
|
|
54
|
+
- Post-publish verification job must validate RubyGems entries and GitHub release assets for the tagged version.
|
|
55
|
+
|
|
56
|
+
Continuous integration guardrail:
|
|
57
|
+
|
|
58
|
+
- `.github/workflows/ci.yml` runs `release-precompiled-artifacts` for the full release-blocking precompiled matrix (Linux, Linux musl, macOS Intel, macOS Apple Silicon, Windows) on every branch/PR.
|
|
59
|
+
- macOS precompiled lanes pin Homebrew `llvm@18` (falling back to `llvm` if unavailable) and export `LIBCLANG_PATH` from the selected prefix to keep bindgen stable across Homebrew formula updates.
|
|
60
|
+
- `.github/workflows/precompiled-candidate-evaluation.yml` is used for additional platform candidate checks.
|
|
61
|
+
- `.github/workflows/release.yml` dry-run validates the full release-blocking matrix before publish.
|
|
62
|
+
|
|
63
|
+
Latest release-matrix validation:
|
|
64
|
+
|
|
65
|
+
- [release dry-run 22556487788](https://github.com/ealdent/lda-ruby/actions/runs/22556487788) succeeded for Linux, Linux musl, macOS Intel, macOS Apple Silicon, and Windows targets.
|
|
66
|
+
|
|
67
|
+
## Rollout / Expansion Rules
|
|
68
|
+
|
|
69
|
+
When adding a new precompiled platform:
|
|
70
|
+
|
|
71
|
+
1. Add target to release workflow matrix.
|
|
72
|
+
2. Add or update CI coverage for that platform family.
|
|
73
|
+
3. Update this policy and the release runbook support matrix.
|
|
74
|
+
4. Record feasibility evidence and rollout notes in `docs/precompiled-target-evaluation.md`.
|
|
75
|
+
5. Validate a dry-run release with `workflow_dispatch` before shipping.
|
|
76
|
+
|
|
77
|
+
When deprecating a precompiled platform:
|
|
78
|
+
|
|
79
|
+
1. Remove platform from release matrix.
|
|
80
|
+
2. Keep source-gem path available unless the overall platform support policy changes.
|
|
81
|
+
3. Document deprecation in `CHANGELOG.md` and release notes.
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# Precompiled Target Evaluation (Priority 2)
|
|
2
|
+
|
|
3
|
+
This document tracks current feasibility for expanding precompiled gem targets beyond the Phase 5B baseline.
|
|
4
|
+
|
|
5
|
+
Current release-blocking precompiled targets:
|
|
6
|
+
|
|
7
|
+
- `x86_64-linux`
|
|
8
|
+
- `x86_64-darwin`
|
|
9
|
+
- `arm64-darwin`
|
|
10
|
+
- `x64-mingw-ucrt`
|
|
11
|
+
- `x86_64-linux-musl`
|
|
12
|
+
|
|
13
|
+
Reference implementation constraints:
|
|
14
|
+
|
|
15
|
+
- `bin/release-precompiled-artifacts` only supports host-matching platform builds (no cross-compilation).
|
|
16
|
+
- Release workflow currently uses matching host runners for each precompiled target.
|
|
17
|
+
|
|
18
|
+
## Candidate: Windows (`x64-mingw-ucrt`)
|
|
19
|
+
|
|
20
|
+
Status: promoted to release-blocking after release dry-run matrix success.
|
|
21
|
+
|
|
22
|
+
Feasibility notes:
|
|
23
|
+
|
|
24
|
+
- GitHub Actions provides Windows runners, so host-matching builds are possible in principle.
|
|
25
|
+
- Existing release tooling is bash-first and assumes POSIX shell ergonomics throughout.
|
|
26
|
+
- Runtime smoke and packaged-gem checks were validated in candidate runs before promotion.
|
|
27
|
+
- Candidate runs:
|
|
28
|
+
- [run 22555475302](https://github.com/ealdent/lda-ruby/actions/runs/22555475302): failed in native extension compile (`rake compile`) with `cokus.h` macro collision and `time_t` mismatch.
|
|
29
|
+
- [run 22555550326](https://github.com/ealdent/lda-ruby/actions/runs/22555550326): progressed further, failed on `utils.c` `mkdir(name, mode)` mismatch (Windows `_mkdir` required).
|
|
30
|
+
- [run 22556009214](https://github.com/ealdent/lda-ruby/actions/runs/22556009214): Rust bindgen/toolchain parsing fixed; build then failed on Windows DLL name staging expectation.
|
|
31
|
+
- [run 22556129503](https://github.com/ealdent/lda-ruby/actions/runs/22556129503): Windows candidate build + artifact upload succeeded after GNU toolchain alignment, bindgen header/sysroot setup, and dual DLL name staging support.
|
|
32
|
+
- [run 22556206925](https://github.com/ealdent/lda-ruby/actions/runs/22556206925): Windows candidate remained green with packaged-gem runtime smoke checks enabled.
|
|
33
|
+
- [run 22556487788](https://github.com/ealdent/lda-ruby/actions/runs/22556487788): release workflow dry-run succeeded with `windows-x64-mingw-ucrt` included in release matrix.
|
|
34
|
+
|
|
35
|
+
Required validation to promote:
|
|
36
|
+
|
|
37
|
+
1. Completed: release dry-run matrix validation passed.
|
|
38
|
+
|
|
39
|
+
## Candidate: musl Linux (`x86_64-linux-musl`)
|
|
40
|
+
|
|
41
|
+
Status: promoted to release-blocking after release dry-run matrix success.
|
|
42
|
+
|
|
43
|
+
Feasibility notes:
|
|
44
|
+
|
|
45
|
+
- Current workflow uses `ubuntu-latest` (glibc), not musl.
|
|
46
|
+
- Current artifact script rejects cross-platform builds, so a musl artifact requires either:
|
|
47
|
+
- a musl-hosted builder, or
|
|
48
|
+
- a dedicated musl-native build container/workflow path treated as host-equivalent for packaging.
|
|
49
|
+
- Local validation signal (2026-03-01): Alpine container dry-run succeeded for host-matching `aarch64-linux-musl` with:
|
|
50
|
+
- `./bin/release-precompiled-artifacts --platform <detected-musl-platform> --skip-preflight --skip-runtime-checks`
|
|
51
|
+
- Candidate workflow runs (2026-03-01):
|
|
52
|
+
- [run 22555475302](https://github.com/ealdent/lda-ruby/actions/runs/22555475302): built `x86_64-linux-musl` successfully but artifact upload path was misconfigured.
|
|
53
|
+
- [run 22555550326](https://github.com/ealdent/lda-ruby/actions/runs/22555550326): musl candidate built and uploaded artifacts successfully with corrected glob path (`pkg/lda-ruby-*-linux-musl.gem*`).
|
|
54
|
+
- [run 22556129503](https://github.com/ealdent/lda-ruby/actions/runs/22556129503): musl candidate build + artifact upload remained green alongside the fixed Windows lane.
|
|
55
|
+
- [run 22556206925](https://github.com/ealdent/lda-ruby/actions/runs/22556206925): musl candidate remained green with packaged-gem runtime smoke checks enabled.
|
|
56
|
+
- [run 22556487788](https://github.com/ealdent/lda-ruby/actions/runs/22556487788): release workflow dry-run succeeded with `linux-musl-x86_64` included in release matrix.
|
|
57
|
+
|
|
58
|
+
Required validation to promote:
|
|
59
|
+
|
|
60
|
+
1. Completed: release dry-run matrix validation passed.
|
|
61
|
+
|
|
62
|
+
## Recommendation
|
|
63
|
+
|
|
64
|
+
Current expansion step is complete for Windows and musl. Any additional target should follow the same sequence:
|
|
65
|
+
1. Add candidate workflow coverage.
|
|
66
|
+
2. Verify candidate runtime checks.
|
|
67
|
+
3. Validate one release dry-run with the new matrix lane before promotion.
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
# Release Runbook (Phase 5A + 5B)
|
|
2
|
+
|
|
3
|
+
This runbook defines the maintainer workflow for shipping `lda-ruby` source and precompiled platform gem releases.
|
|
4
|
+
|
|
5
|
+
Authoritative platform/support policy is maintained in `docs/precompiled-platform-policy.md`; expansion feasibility notes live in `docs/precompiled-target-evaluation.md`.
|
|
6
|
+
|
|
7
|
+
## Scope
|
|
8
|
+
|
|
9
|
+
- Release artifact types:
|
|
10
|
+
- source gem: `pkg/lda-ruby-<version>.gem`
|
|
11
|
+
- precompiled gems (current targets are defined in `docs/precompiled-platform-policy.md`)
|
|
12
|
+
- Release trigger: git tag (`vX.Y.Z`) with matching version files
|
|
13
|
+
- Publish targets:
|
|
14
|
+
- RubyGems (`gem push`)
|
|
15
|
+
- GitHub Releases (gem + checksum attachment)
|
|
16
|
+
|
|
17
|
+
## Prerequisites
|
|
18
|
+
|
|
19
|
+
1. Access:
|
|
20
|
+
- push/tag rights on `master`
|
|
21
|
+
- access to GitHub Actions environments for release approvals
|
|
22
|
+
- RubyGems owner access for `lda-ruby`
|
|
23
|
+
2. Local tooling:
|
|
24
|
+
- Ruby 3.2+ with Bundler
|
|
25
|
+
- Rust toolchain (`cargo`) for local precompiled-gem build checks
|
|
26
|
+
- `libclang` available to Rust bindgen
|
|
27
|
+
- Docker (recommended for reproducible checks)
|
|
28
|
+
3. Repository state:
|
|
29
|
+
- release commit merged to `master`
|
|
30
|
+
- clean working tree
|
|
31
|
+
- version files in sync
|
|
32
|
+
|
|
33
|
+
## Required Secrets and Environments
|
|
34
|
+
|
|
35
|
+
GitHub repository secret:
|
|
36
|
+
|
|
37
|
+
- `RUBYGEMS_API_KEY`: API key with push rights for `lda-ruby` and non-interactive publish support (no OTP prompt during `gem push`).
|
|
38
|
+
|
|
39
|
+
GitHub Actions environment:
|
|
40
|
+
|
|
41
|
+
- `release`: protect this environment with required reviewer approval.
|
|
42
|
+
- Both publish jobs in `.github/workflows/release.yml` are bound to `release`.
|
|
43
|
+
|
|
44
|
+
## Release Preparation
|
|
45
|
+
|
|
46
|
+
1. Prepare and update release files:
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
./bin/release-prepare 0.4.0
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
2. Review changes:
|
|
53
|
+
- `VERSION.yml`
|
|
54
|
+
- `lib/lda-ruby/version.rb`
|
|
55
|
+
- `CHANGELOG.md`
|
|
56
|
+
|
|
57
|
+
3. Validate full release checks locally:
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
SKIP_DOCKER=1 ./bin/release-preflight
|
|
61
|
+
./bin/test-packaged-gem-manifest
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
4. Validate local precompiled gem flow for your current host platform:
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
./bin/release-precompiled-artifacts --tag v0.4.0 --skip-preflight
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
Note: `release-precompiled-artifacts` only supports building for the current host platform (no cross-compilation).
|
|
71
|
+
|
|
72
|
+
5. Verify RubyGems API key behavior before tagging:
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
./bin/verify-rubygems-api-key
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
This check intentionally attempts a duplicate push of an existing gem version. A duplicate-rejected response is expected and confirms non-interactive auth works.
|
|
79
|
+
|
|
80
|
+
6. Commit and merge to `master`.
|
|
81
|
+
|
|
82
|
+
## Dry-Run Path (No Publish)
|
|
83
|
+
|
|
84
|
+
Use `workflow_dispatch` with `publish=false`.
|
|
85
|
+
|
|
86
|
+
Behavior:
|
|
87
|
+
|
|
88
|
+
- runs release validation and artifact build
|
|
89
|
+
- uploads source + precompiled `pkg/lda-ruby-*.gem` and checksum files as workflow artifacts
|
|
90
|
+
- does not push to RubyGems
|
|
91
|
+
- does not create a GitHub release
|
|
92
|
+
|
|
93
|
+
Latest verified dry-run reference:
|
|
94
|
+
|
|
95
|
+
- date: 2026-03-02
|
|
96
|
+
- workflow run: `https://github.com/ealdent/lda-ruby/actions/runs/22556487788`
|
|
97
|
+
- dispatch parameters: `release_tag=v0.4.0`, `publish=false`
|
|
98
|
+
- result: success across `validate`, `build_artifacts`, and full `build_precompiled_artifacts` matrix
|
|
99
|
+
- verified precompiled lanes:
|
|
100
|
+
- `linux-x86_64`
|
|
101
|
+
- `linux-musl-x86_64`
|
|
102
|
+
- `macos-x86_64`
|
|
103
|
+
- `macos-arm64`
|
|
104
|
+
- `windows-x64-mingw-ucrt`
|
|
105
|
+
|
|
106
|
+
Optional local dry-run equivalent:
|
|
107
|
+
|
|
108
|
+
```bash
|
|
109
|
+
./bin/release-artifacts --tag v0.4.0
|
|
110
|
+
./bin/release-precompiled-artifacts --tag v0.4.0 --skip-preflight
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
Candidate expansion workflow:
|
|
114
|
+
|
|
115
|
+
- For future platform evaluation beyond current release-blocking targets, run `.github/workflows/precompiled-candidate-evaluation.yml` via `workflow_dispatch`.
|
|
116
|
+
- Record outcome artifacts/logs in `docs/precompiled-target-evaluation.md`.
|
|
117
|
+
|
|
118
|
+
## Known Publish Incident (`v0.4.0`)
|
|
119
|
+
|
|
120
|
+
- date: 2026-02-25
|
|
121
|
+
- release runs:
|
|
122
|
+
- `https://github.com/ealdent/lda-ruby/actions/runs/22383716372`
|
|
123
|
+
- `https://github.com/ealdent/lda-ruby/actions/runs/22383849236` (attempt 1 + rerun attempt 2 + rerun attempt 3)
|
|
124
|
+
- result: artifact build stages passed, `publish to RubyGems` failed with OTP-required auth (`You have enabled multifactor authentication but no OTP code provided.`)
|
|
125
|
+
- recovery action: rotated `release` environment secret `RUBYGEMS_API_KEY` to a CI-safe key and reran run `22383849236`.
|
|
126
|
+
- recovery result: rerun attempt 3 succeeded; RubyGems `0.4.0` and GitHub release `v0.4.0` published.
|
|
127
|
+
|
|
128
|
+
## Publish Path (Tag-Driven)
|
|
129
|
+
|
|
130
|
+
1. Ensure the release commit is on `master`.
|
|
131
|
+
2. Create and push the release tag:
|
|
132
|
+
|
|
133
|
+
```bash
|
|
134
|
+
git checkout master
|
|
135
|
+
git pull --ff-only
|
|
136
|
+
git tag -a v0.4.0 -m "Release v0.4.0"
|
|
137
|
+
git push origin v0.4.0
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
3. Monitor `.github/workflows/release.yml`:
|
|
141
|
+
- `validate`
|
|
142
|
+
- `build_artifacts`
|
|
143
|
+
- `build_precompiled_artifacts` (linux + linux-musl + macOS + windows matrix)
|
|
144
|
+
- environment-gated `publish_rubygems`
|
|
145
|
+
- environment-gated `publish_github_release`
|
|
146
|
+
- `verify_published_artifacts`
|
|
147
|
+
- on failed tag-triggered `release.yml` runs, `.github/workflows/release-failure-alert.yml` opens a triage issue with failed job links
|
|
148
|
+
- if the same release run later succeeds (for example via rerun), the alert issue is auto-closed by `.github/workflows/release-failure-alert.yml`
|
|
149
|
+
4. Approve the protected `release` environment when prompted.
|
|
150
|
+
5. Confirm published outputs:
|
|
151
|
+
- RubyGems shows `lda-ruby` `0.4.0` source gem and platform gems
|
|
152
|
+
- GitHub release `v0.4.0` exists with all gem and `.sha256` attachments
|
|
153
|
+
- workflow job `verify_published_artifacts` succeeds
|
|
154
|
+
|
|
155
|
+
## Rollback and Recovery
|
|
156
|
+
|
|
157
|
+
If publish fails before RubyGems push:
|
|
158
|
+
|
|
159
|
+
1. Fix issue on `master`.
|
|
160
|
+
2. Delete and recreate the tag only if the broken tag did not produce public artifacts:
|
|
161
|
+
- `git tag -d vX.Y.Z`
|
|
162
|
+
- `git push origin :refs/tags/vX.Y.Z`
|
|
163
|
+
3. Re-tag and re-run release.
|
|
164
|
+
|
|
165
|
+
If RubyGems push succeeds but GitHub release fails:
|
|
166
|
+
|
|
167
|
+
1. Re-run only the GitHub release path by re-running the workflow job after fix.
|
|
168
|
+
2. Do not re-push gem for the same version.
|
|
169
|
+
|
|
170
|
+
If an incorrect gem is published:
|
|
171
|
+
|
|
172
|
+
1. Yank from RubyGems:
|
|
173
|
+
|
|
174
|
+
```bash
|
|
175
|
+
gem yank lda-ruby -v X.Y.Z
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
2. Publish a corrective version (for example `X.Y.(Z+1)`); do not reuse yanked version numbers.
|
|
179
|
+
3. Update `CHANGELOG.md` and release notes to document the correction.
|
|
180
|
+
|
|
181
|
+
## Troubleshooting
|
|
182
|
+
|
|
183
|
+
- `Could not find 'bundler'`: install the Bundler version pinned in `Gemfile.lock`.
|
|
184
|
+
- `cargo not found` in rust-enabled checks: ensure Rust toolchain is installed or run in Docker.
|
|
185
|
+
- `libclang` not found while building precompiled gems: install LLVM/libclang and set `LIBCLANG_PATH` if needed.
|
|
186
|
+
- Linux `Install Rust bindgen dependencies` can take several minutes on fresh runners due to apt package index updates and package installs.
|
|
187
|
+
- RubyGems publish asks for OTP (`You have enabled multifactor authentication but no OTP code provided.`): run `./bin/verify-rubygems-api-key`, then rotate `RUBYGEMS_API_KEY` to a CI-safe key if OTP is requested.
|
|
188
|
+
- Post-publish verification fails: run `./bin/verify-release-publish --tag vX.Y.Z` and fix missing RubyGems entries or GitHub release assets before considering the release complete.
|
|
189
|
+
- macOS Rust link errors (`symbol(s) not found` for Ruby APIs): ensure build path preserves `-C link-arg=-Wl,-undefined,dynamic_lookup` in `RUSTFLAGS`.
|
|
190
|
+
- Tag/version mismatch: run `./bin/check-version-sync --tag vX.Y.Z`.
|
|
191
|
+
- Artifact mismatch during release: rebuild with `./bin/release-artifacts --tag vX.Y.Z`.
|
|
192
|
+
- Precompiled artifact mismatch: rebuild with `./bin/release-precompiled-artifacts --tag vX.Y.Z --skip-preflight`.
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# Rust Orchestration Guardrails
|
|
2
|
+
|
|
3
|
+
This document defines the minimum parity and performance gates for deeper Rust orchestration refactors.
|
|
4
|
+
|
|
5
|
+
## Numeric parity guardrails
|
|
6
|
+
|
|
7
|
+
Required tests:
|
|
8
|
+
|
|
9
|
+
- `bundle exec ruby -Ilib:test test/backend_compatibility_test.rb`
|
|
10
|
+
- `bundle exec ruby -Ilib:test test/rust_orchestration_test.rb`
|
|
11
|
+
|
|
12
|
+
Current parity expectations:
|
|
13
|
+
|
|
14
|
+
- Rust vs pure backend fixture parity remains exact within existing tolerances used by tests.
|
|
15
|
+
- Session-based orchestration paths (`run_em_on_session`, `run_em_on_session_with_start_seed`, `run_em_on_session_start`, `run_em_on_session_with_corpus`) must match direct non-session orchestration for equivalent settings/seeds.
|
|
16
|
+
- `Lda::Backends::Rust` cached-corpus EM should prefer the managed Rust session entrypoint (`run_em_on_session_with_corpus`) even when no active session id is cached locally, rather than branching in Ruby between session-only, recovery, and direct paths.
|
|
17
|
+
- `Lda::Backends::Rust` non-session fallback should prefer Rust start-aware orchestration (`run_em_with_start_seed`) before legacy beta-input orchestration (`run_em`).
|
|
18
|
+
- Direct non-session fallback should reuse the backend's cached Rust corpus snapshot rather than rebuilding corpus arrays from `@corpus` for each invocation.
|
|
19
|
+
- Legacy beta-input compatibility fallback should also reuse the backend's cached Rust corpus snapshot rather than rebuilding full EM corpus input in Ruby.
|
|
20
|
+
- Rust backend corpus/session lifecycle must not leak session count across corpus replacement.
|
|
21
|
+
- Missing-session recovery in managed session orchestration (`run_em_on_session_with_corpus`) must recreate a usable session and keep parity with direct orchestration.
|
|
22
|
+
- Managed Rust corpus orchestration (`run_em_on_session_with_corpus`) must keep parity with direct orchestration even when it falls back internally from session-backed execution to start-seeded array execution.
|
|
23
|
+
- Corpus reassignment through Rust session replacement lifecycle (`replace_corpus_session`) must preserve stable session count and route subsequent EM runs over updated corpus data.
|
|
24
|
+
- Unknown start-mode handling in seed-aware Rust orchestration must match Ruby's non-seeded fallback behavior when given the same explicit seed.
|
|
25
|
+
|
|
26
|
+
## Benchmark guardrail
|
|
27
|
+
|
|
28
|
+
Run:
|
|
29
|
+
|
|
30
|
+
- `./bin/check-rust-benchmark`
|
|
31
|
+
|
|
32
|
+
Default benchmark policy:
|
|
33
|
+
|
|
34
|
+
- `BENCH_RUST_TO_PURE_MAX_RATIO=0.045`
|
|
35
|
+
- i.e., Rust mean runtime must be at most 4.5% of pure mean runtime on the benchmark fixture/config.
|
|
36
|
+
- CI benchmark guardrail job enforces the same ratio with `BENCH_RUNS=1` for runtime stability.
|
|
37
|
+
- latest tightening evidence (2026-03-05): local Docker guardrail check with `BENCH_RUNS=3` observed Rust/Pure ratio `0.0368` (`rust=0.0758s`, `pure=2.0569s`), and prior CI streak data on `codex/rust-orchestration-phase8` (`22555725309` .. `22557953998`) observed `[0.0252, 0.0288]`, supporting a tighter `0.045` threshold with headroom.
|
|
38
|
+
|
|
39
|
+
Configurable environment knobs:
|
|
40
|
+
|
|
41
|
+
- `BENCH_RUNS` (default `5`)
|
|
42
|
+
- `BENCH_START` (default `seeded`)
|
|
43
|
+
- `BENCH_TOPICS` (default `8`)
|
|
44
|
+
- `BENCH_MAX_ITER` (default `20`)
|
|
45
|
+
- `BENCH_EM_MAX_ITER` (default `40`)
|
|
46
|
+
- `BENCH_RUST_TO_PURE_MAX_RATIO` (default `0.045`)
|
|
47
|
+
|
|
48
|
+
## When to tighten thresholds
|
|
49
|
+
|
|
50
|
+
Tighten benchmark thresholds only after collecting multiple stable runs on the same host/environment and updating this document with the new target ratio.
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
// This is the ``Mersenne Twister'' random number generator MT19937, which
|
|
2
|
+
// generates pseudorandom integers uniformly distributed in 0..(2^32 - 1)
|
|
3
|
+
// starting from any odd seed in 0..(2^32 - 1). This version is a recode
|
|
4
|
+
// by Shawn Cokus (Cokus@math.washington.edu) on March 8, 1998 of a version by
|
|
5
|
+
// Takuji Nishimura (who had suggestions from Topher Cooper and Marc Rieffel in
|
|
6
|
+
// July-August 1997).
|
|
7
|
+
//
|
|
8
|
+
// Effectiveness of the recoding (on Goedel2.math.washington.edu, a DEC Alpha
|
|
9
|
+
// running OSF/1) using GCC -O3 as a compiler: before recoding: 51.6 sec. to
|
|
10
|
+
// generate 300 million random numbers; after recoding: 24.0 sec. for the same
|
|
11
|
+
// (i.e., 46.5% of original time), so speed is now about 12.5 million random
|
|
12
|
+
// number generations per second on this machine.
|
|
13
|
+
//
|
|
14
|
+
// According to the URL <http://www.math.keio.ac.jp/~matumoto/emt.html>
|
|
15
|
+
// (and paraphrasing a bit in places), the Mersenne Twister is ``designed
|
|
16
|
+
// with consideration of the flaws of various existing generators,'' has
|
|
17
|
+
// a period of 2^19937 - 1, gives a sequence that is 623-dimensionally
|
|
18
|
+
// equidistributed, and ``has passed many stringent tests, including the
|
|
19
|
+
// die-hard test of G. Marsaglia and the load test of P. Hellekalek and
|
|
20
|
+
// S. Wegenkittl.'' It is efficient in memory usage (typically using 2506
|
|
21
|
+
// to 5012 bytes of static data, depending on data type sizes, and the code
|
|
22
|
+
// is quite short as well). It generates random numbers in batches of 624
|
|
23
|
+
// at a time, so the caching and pipelining of modern systems is exploited.
|
|
24
|
+
// It is also divide- and mod-free.
|
|
25
|
+
//
|
|
26
|
+
// This library is free software; you can redistribute it and/or modify it
|
|
27
|
+
// under the terms of the GNU Library General Public License as published by
|
|
28
|
+
// the Free Software Foundation (either version 2 of the License or, at your
|
|
29
|
+
// option, any later version). This library is distributed in the hope that
|
|
30
|
+
// it will be useful, but WITHOUT ANY WARRANTY, without even the implied
|
|
31
|
+
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
|
|
32
|
+
// the GNU Library General Public License for more details. You should have
|
|
33
|
+
// received a copy of the GNU Library General Public License along with this
|
|
34
|
+
// library; if not, write to the Free Software Foundation, Inc., 59 Temple
|
|
35
|
+
// Place, Suite 330, Boston, MA 02111-1307, USA.
|
|
36
|
+
//
|
|
37
|
+
// The code as Shawn received it included the following notice:
|
|
38
|
+
//
|
|
39
|
+
// Copyright (C) 1997 Makoto Matsumoto and Takuji Nishimura. When
|
|
40
|
+
// you use this, send an e-mail to <matumoto@math.keio.ac.jp> with
|
|
41
|
+
// an appropriate reference to your work.
|
|
42
|
+
//
|
|
43
|
+
// It would be nice to CC: <Cokus@math.washington.edu> when you write.
|
|
44
|
+
//
|
|
45
|
+
|
|
46
|
+
#include "cokus.h"
|
|
47
|
+
|
|
48
|
+
// Generator state shared by seedMT(), reloadMT() and randomMT().

// The state vector. One spare element is kept past the COKUS_N working words
// so that a pointer walking the vector never has to read outside the array
// (keeps the code within ANSI C rules).
static uint32 state[COKUS_N+1];

// Where the next raw (not yet scrambled) value will be read from.
static uint32 *next;

// How many more values can be taken through `next` before the state vector
// must be regenerated. Starts at -1, meaning "never seeded": the first
// randomMT() call decrements it to -2, which reloadMT() treats as the signal
// to seed with its built-in default.
static int left = -1;
|
|
51
|
+
|
|
52
|
+
//
// seedMT: fill state[0..(COKUS_N-1)] from a single seed value.
//
// The words are produced by the linear congruential generator
//
//     x_new = (69069 * x_old) mod 2^32
//
// from Line 15 of Table 1, p. 106, Sec. 3.3.4 of Knuth's
// _The Art of Computer Programming_, Volume 2, 3rd ed.
//
// Notes (after SJC's original remarks):
//
//  * The seed is forced to be odd. This generator only reaches its maximum
//    period (2^30) when x_initial is odd (p. 20-21, Sec. 3.2.1.2, Knuth).
//    Even starting values give degenerate sequences such as 0, 0, 0, ...;
//    2^31, 2^31, ...; 2^30, 2^30, ...; or 2^29, 2^29 + 2^31, 2^29, ... .
//
//  * Even with an odd seed the low bits are weak. If x_initial is 1 mod 4:
//      the lowest bit is always 1,
//      the next-to-lowest bit is always 0,
//      the 2nd-from-lowest bit alternates     ... 0 1 0 1 0 1 0 1 ... ,
//      the 3rd-from-lowest bit 4-cycles       ... 0 1 1 0 0 1 1 0 ... ,
//      the 4th-from-lowest bit has an 8-cycle ... 0 0 0 1 1 1 1 0 ... ,
//      ...
//    and if x_initial is 3 mod 4:
//      the lowest bit is always 1,
//      the next-to-lowest bit is always 1,
//      the 2nd-from-lowest bit alternates     ... 0 1 0 1 0 1 0 1 ... ,
//      the 3rd-from-lowest bit 4-cycles       ... 0 0 1 1 0 0 1 1 ... ,
//      the 4th-from-lowest bit has an 8-cycle ... 0 0 1 1 1 1 0 0 ... ,
//      ...
//
//  * The generator's potency (min. s>=0 with (69069-1)^s = 0 mod 2^32) is
//    16, which seems acceptable by p. 25, Sec. 3.2.1.3 of Knuth. It does
//    well in the dimension 2..5 spectral tests but could be better in
//    dimension 6 (Line 15, Table 1, p. 106, Sec. 3.3.4, Knuth).
//
//  * Callers never see these words directly, because reloadMT() always
//    transforms the state before any output is produced, so maybe none of
//    this matters. The seeding scheme was left as received apart from the
//    restriction to odd seeds.
//
// Side effects: overwrites state[0..(COKUS_N-1)] and sets `left` to 0, so the
// next randomMT() call regenerates the state before returning a value.
//
void seedMT(uint32 seed)
{
    // uint32 may be wider than 32 bits, hence the explicit masks.
    uint32 x = (seed | 1U) & 0xFFFFFFFFU;
    int i;

    left = 0;
    state[0] = x;

    for (i = 1; i < COKUS_N; i++) {
        // x itself is left unmasked; only its low 32 bits are stored.
        x *= 69069U;
        state[i] = x & 0xFFFFFFFFU;
    }
}
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
uint32 reloadMT(void)
|
|
109
|
+
{
|
|
110
|
+
register uint32 *p0=state, *p2=state+2, *pM=state+COKUS_M, s0, s1;
|
|
111
|
+
register int j;
|
|
112
|
+
|
|
113
|
+
if(left < -1)
|
|
114
|
+
seedMT(4357U);
|
|
115
|
+
|
|
116
|
+
left=COKUS_N-1, next=state+1;
|
|
117
|
+
|
|
118
|
+
for(s0=state[0], s1=state[1], j=COKUS_N-COKUS_M+1; --j; s0=s1, s1=*p2++)
|
|
119
|
+
*p0++ = *pM++ ^ (mixBits(s0, s1) >> 1) ^ (loBit(s1) ? COKUS_K : 0U);
|
|
120
|
+
|
|
121
|
+
for(pM=state, j=COKUS_M; --j; s0=s1, s1=*p2++)
|
|
122
|
+
*p0++ = *pM++ ^ (mixBits(s0, s1) >> 1) ^ (loBit(s1) ? COKUS_K : 0U);
|
|
123
|
+
|
|
124
|
+
s1=state[0], *p0 = *pM ^ (mixBits(s0, s1) >> 1) ^ (loBit(s1) ? COKUS_K : 0U);
|
|
125
|
+
s1 ^= (s1 >> 11);
|
|
126
|
+
s1 ^= (s1 << 7) & 0x9D2C5680U;
|
|
127
|
+
s1 ^= (s1 << 15) & 0xEFC60000U;
|
|
128
|
+
return(s1 ^ (s1 >> 18));
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
uint32 randomMT(void)
|
|
132
|
+
{
|
|
133
|
+
uint32 y;
|
|
134
|
+
|
|
135
|
+
if(--left < 0)
|
|
136
|
+
return(reloadMT());
|
|
137
|
+
|
|
138
|
+
y = *next++;
|
|
139
|
+
y ^= (y >> 11);
|
|
140
|
+
y ^= (y << 7) & 0x9D2C5680U;
|
|
141
|
+
y ^= (y << 15) & 0xEFC60000U;
|
|
142
|
+
y ^= (y >> 18);
|
|
143
|
+
return(y);
|
|
144
|
+
}
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
#ifndef COKUS_H
#define COKUS_H

#include <stdio.h>
#include <stdlib.h>

/*
 * uint32 must be an unsigned integer type capable of holding at least 32
 * bits.  Exactly 32 bits should be fastest, but 64 is better on an Alpha
 * with GCC at -O3 optimization, so try your options and see what works
 * best for you.
 */
typedef unsigned long uint32;

/* Generator parameters. */
#define COKUS_N (624)          /* length of the state vector */
#define COKUS_M (397)          /* a period parameter         */
#define COKUS_K (0x9908B0DFU)  /* a magic constant           */

/* Bit helpers; each one works on the low 32 bits of its argument. */
#define hiBit(u)      ((u) & 0x80000000U)      /* keep only bit 31 of u          */
#define loBit(u)      ((u) & 0x00000001U)      /* keep only bit 0 of u           */
#define loBits(u)     ((u) & 0x7FFFFFFFU)      /* keep bits 0..30 of u           */
#define mixBits(u, v) (hiBit(u)|loBits(v))     /* bit 31 of u, bits 0..30 of v   */

/* Public generator interface. */
void seedMT(uint32 seed);
uint32 reloadMT(void);
uint32 randomMT(void);

#endif
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# frozen_string_literal: true

# Generates the Makefile for the native "lda-ruby/lda" extension.
require "mkmf"

target = "lda-ruby/lda"
dir_config(target)

# Compile the C sources with USE_RUBY defined.
$defs.push("-DUSE_RUBY")

# Request extra diagnostics, one flag per append_cflags call.
%w[-Wall -Wextra -Wno-unused-parameter].each do |flag|
  append_cflags(flag)
end

create_makefile(target)
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
// (C) Copyright 2004, David M. Blei (blei [at] cs [dot] cmu [dot] edu)
|
|
2
|
+
|
|
3
|
+
// This file is part of LDA-C.
|
|
4
|
+
|
|
5
|
+
// LDA-C is free software; you can redistribute it and/or modify it under
|
|
6
|
+
// the terms of the GNU General Public License as published by the Free
|
|
7
|
+
// Software Foundation; either version 2 of the License, or (at your
|
|
8
|
+
// option) any later version.
|
|
9
|
+
|
|
10
|
+
// LDA-C is distributed in the hope that it will be useful, but WITHOUT
|
|
11
|
+
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
|
12
|
+
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
13
|
+
// for more details.
|
|
14
|
+
|
|
15
|
+
// You should have received a copy of the GNU General Public License
|
|
16
|
+
// along with this program; if not, write to the Free Software
|
|
17
|
+
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
|
|
18
|
+
// USA
|
|
19
|
+
|
|
20
|
+
#include "lda-alpha.h"
|
|
21
|
+
|
|
22
|
+
/*
|
|
23
|
+
* objective function and its derivatives
|
|
24
|
+
*
|
|
25
|
+
*/
|
|
26
|
+
|
|
27
|
+
double alhood(double a, double ss, int D, int K)
|
|
28
|
+
{ return(D * (lgamma(K * a) - K * lgamma(a)) + (a - 1) * ss); }
|
|
29
|
+
|
|
30
|
+
/*
 * First derivative of alhood() with respect to a:
 *   D * (K * digamma(K * a) - K * digamma(a)) + ss
 */
double d_alhood(double a, double ss, int D, int K)
{
    double d_log_norm = K * digamma(K * a) - K * digamma(a);

    return D * d_log_norm + ss;
}
|
|
32
|
+
|
|
33
|
+
/*
 * Second derivative of alhood() with respect to a:
 *   D * (K * K * trigamma(K * a) - K * trigamma(a))
 */
double d2_alhood(double a, int D, int K)
{
    double d2_log_norm = (K * K) * trigamma(K * a) - K * trigamma(a);

    return D * d2_log_norm;
}
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
/*
|
|
38
|
+
* newtons method
|
|
39
|
+
*
|
|
40
|
+
*/
|
|
41
|
+
|
|
42
|
+
double opt_alpha(double ss, int D, int K)
|
|
43
|
+
{
|
|
44
|
+
double a, log_a, init_a = 100;
|
|
45
|
+
double f, df, d2f;
|
|
46
|
+
int iter = 0;
|
|
47
|
+
|
|
48
|
+
log_a = log(init_a);
|
|
49
|
+
do
|
|
50
|
+
{
|
|
51
|
+
iter++;
|
|
52
|
+
a = exp(log_a);
|
|
53
|
+
if (isnan(a))
|
|
54
|
+
{
|
|
55
|
+
init_a = init_a * 10;
|
|
56
|
+
printf("warning : alpha is nan; new init = %5.5f\n", init_a);
|
|
57
|
+
a = init_a;
|
|
58
|
+
log_a = log(a);
|
|
59
|
+
}
|
|
60
|
+
f = alhood(a, ss, D, K);
|
|
61
|
+
df = d_alhood(a, ss, D, K);
|
|
62
|
+
d2f = d2_alhood(a, D, K);
|
|
63
|
+
log_a = log_a - df/(d2f * a + df);
|
|
64
|
+
printf("alpha maximization : %5.5f %5.5f\n", f, df);
|
|
65
|
+
}
|
|
66
|
+
while ((fabs(df) > NEWTON_THRESH) && (iter < MAX_ALPHA_ITER));
|
|
67
|
+
return(exp(log_a));
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
double quiet_opt_alpha(double ss, int D, int K)
|
|
71
|
+
{
|
|
72
|
+
double a, log_a, init_a = 100;
|
|
73
|
+
double f, df, d2f;
|
|
74
|
+
int iter = 0;
|
|
75
|
+
|
|
76
|
+
log_a = log(init_a);
|
|
77
|
+
do
|
|
78
|
+
{
|
|
79
|
+
iter++;
|
|
80
|
+
a = exp(log_a);
|
|
81
|
+
if (isnan(a))
|
|
82
|
+
{
|
|
83
|
+
init_a = init_a * 10;
|
|
84
|
+
//printf("warning : alpha is nan; new init = %5.5f\n", init_a);
|
|
85
|
+
a = init_a;
|
|
86
|
+
log_a = log(a);
|
|
87
|
+
}
|
|
88
|
+
f = alhood(a, ss, D, K);
|
|
89
|
+
df = d_alhood(a, ss, D, K);
|
|
90
|
+
d2f = d2_alhood(a, D, K);
|
|
91
|
+
log_a = log_a - df/(d2f * a + df);
|
|
92
|
+
//printf("alpha maximization : %5.5f %5.5f\n", f, df);
|
|
93
|
+
}
|
|
94
|
+
while ((fabs(df) > NEWTON_THRESH) && (iter < MAX_ALPHA_ITER));
|
|
95
|
+
return(exp(log_a));
|
|
96
|
+
}
|