kairos-chain 3.29.0 → 3.29.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +72 -0
- data/lib/kairos_mcp/version.rb +1 -1
- data/templates/knowledge/llm_cross_evaluation/assets/prompts/nomic_open_grade.md.erb +40 -0
- data/templates/knowledge/llm_cross_evaluation/assets/prompts/nomic_open_proposal.md.erb +29 -0
- data/templates/knowledge/llm_cross_evaluation/assets/prompts/nomic_open_vote.md.erb +23 -0
- data/templates/knowledge/llm_cross_evaluation/assets/prompts/self_calibration_uncertainty.md.erb +29 -12
- data/templates/knowledge/llm_cross_evaluation/assets/tasks/calibration_uncertainty.yaml +12 -9
- data/templates/knowledge/llm_cross_evaluation/scripts/build_html_report.rb +237 -0
- data/templates/knowledge/llm_cross_evaluation/scripts/html_report_to_pdf.py +64 -0
- data/templates/knowledge/llm_cross_evaluation/scripts/regenerate_combined_report.rb +412 -0
- data/templates/knowledge/llm_cross_evaluation/scripts/regenerate_report.rb +54 -0
- data/templates/knowledge/llm_cross_evaluation/scripts/run_cross_eval.rb +131 -24
- data/templates/knowledge/skill_authoring_patterns/references/anthropic_skills_lessons_2026-06-03.md +22 -0
- data/templates/knowledge/skill_authoring_patterns/references/kairoschain_skillset_category_map.md +35 -0
- data/templates/knowledge/skill_authoring_patterns/references/why_harness_norms_bet.md +62 -0
- data/templates/knowledge/skill_authoring_patterns/skill_authoring_patterns.md +27 -0
- data/templates/skillsets/autoexec/tools/autoexec_run.rb +22 -0
- data/templates/skillsets/skillset_creator/lib/skillset_creator/scaffold_generator.rb +11 -2
- metadata +13 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 2998ac312017b9daa845cac54692f44076429c9c70ee869f28447584ca704dfb
|
|
4
|
+
data.tar.gz: f7e05318df3a102a73d493e627951cd5aefbf6645e717c42cb26df9027632f4a
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 57b3f393d2b835cf7d4d255841238265a3f4d7f754534f01bfb3fbc048ac338e87e1ec6fbb98621658f83ecb773d4a4d69a2720e158b9a183985b9f38296fe24
|
|
7
|
+
data.tar.gz: e4c25e346e5fb058ebc35ebcb77a083146ecf3ef963ea9e9a54314c34c5a82b8a11c68d299c086a7e96e6ab020a7288ea0403e23c64addd340c0f9ee768de32d
|
data/CHANGELOG.md
CHANGED
|
@@ -4,6 +4,78 @@ All notable changes to the `kairos-chain` gem will be documented in this file.
|
|
|
4
4
|
|
|
5
5
|
This project follows [Semantic Versioning](https://semver.org/).
|
|
6
6
|
|
|
7
|
+
## [3.29.5] - 2026-06-05
|
|
8
|
+
|
|
9
|
+
### Changed — `skill_authoring_patterns` ships a source pointer, not the article full text
|
|
10
|
+
|
|
11
|
+
The provenance reference `references/anthropic_skills_lessons_2026-06-03.md` no
|
|
12
|
+
longer embeds the full text of Anthropic's article (copyright — not redistributed
|
|
13
|
+
in the gem). It now carries the source URL, attribution, and a facts-only index
|
|
14
|
+
(category names, lesson names), pointing to the canonical URL. A full archived
|
|
15
|
+
copy is kept out-of-distribution at `log/anthropic_skills_lessons_2026-06-03.md`
|
|
16
|
+
(not listed in the gemspec, so it never ships). Knowledge entry bumped to v1.3.
|
|
17
|
+
|
|
18
|
+
## [3.29.4] - 2026-06-05
|
|
19
|
+
|
|
20
|
+
### Added — `skill_authoring_patterns` L1 knowledge (Anthropic skills lessons, internalized)
|
|
21
|
+
|
|
22
|
+
New L1 knowledge entry distilling Anthropic's official post "Lessons from Building
|
|
23
|
+
Claude Code: How We Use Skills" (Thariq Shihipar, 2026-06-03): the nine skill
|
|
24
|
+
categories (§A) and authoring craft (§B), re-read through KairosChain's layer
|
|
25
|
+
model and extended (§C) with the norm-skill and meta-skill categories the source
|
|
26
|
+
structurally omits — Anthropic locates norms/philosophy in the model/core, while
|
|
27
|
+
KairosChain expresses them as skills via structural self-referentiality. Ships in
|
|
28
|
+
`templates/knowledge/skill_authoring_patterns/` with provenance references
|
|
29
|
+
(archived source, SkillSet→category map, design dialogue).
|
|
30
|
+
|
|
31
|
+
### Changed — `skillset_creator` scaffold seeds a Gotchas section
|
|
32
|
+
|
|
33
|
+
`scaffold_generator.rb` knowledge skeleton now emits a `## Gotchas` section plus
|
|
34
|
+
three authoring principles (description = the model's trigger condition; don't
|
|
35
|
+
state the obvious; Gotchas is the highest-signal content), so newly scaffolded
|
|
36
|
+
knowledge entries inherit the highest-impact convention by default.
|
|
37
|
+
|
|
38
|
+
## [3.29.2] - 2026-06-04
|
|
39
|
+
|
|
40
|
+
### Fixed — `autoexec` internal_execute silently persisted placeholder content
|
|
41
|
+
|
|
42
|
+
A task step that `depends_on` a delegated (`tool_name: null`) reasoning step
|
|
43
|
+
was executed with its plan-time `tool_arguments` verbatim, because autoexec has
|
|
44
|
+
no result-passing mechanism to inject the prior step's output. The dependent
|
|
45
|
+
step (e.g. `context_save`) saved a literal placeholder string while the run
|
|
46
|
+
reported success — a silent data-corruption / false-completion bug surfaced by
|
|
47
|
+
the Agent SkillSet OODA loop (a `synthesize` step feeding a `context_save`).
|
|
48
|
+
|
|
49
|
+
- `autoexec_run.rb`: in `internal_execute` mode, a step depending on a
|
|
50
|
+
delegated reasoning step now halts with status `blocked` and returns control
|
|
51
|
+
to the cognitive layer instead of persisting unresolved placeholder
|
|
52
|
+
arguments. Terminal delegated steps (no dependents) are unaffected.
|
|
53
|
+
- `test_autoexec_phase2.rb`: added reproduction (blocked + halted) and
|
|
54
|
+
regression (terminal delegated still completes) tests.
|
|
55
|
+
|
|
56
|
+
## [3.29.1] - 2026-05-30
|
|
57
|
+
|
|
58
|
+
### Fixed — `llm_cross_evaluation` INV-2 self-report elicits object-level-guess confidence
|
|
59
|
+
|
|
60
|
+
The first real run of the `calibration_uncertainty` task scored every model
|
|
61
|
+
OVERCONFIDENT as an artifact: the self-report prompt let models report
|
|
62
|
+
confidence in their *meta-answer* ("this is undeterminable", ~0.99) while
|
|
63
|
+
`answer_key.ideal_confidence` was authored against the *object-level guess*. The
|
|
64
|
+
models were in fact well-calibrated (≈0.5 on a coin-flip guess, ≈0.99 on "it is
|
|
65
|
+
a coin-flip").
|
|
66
|
+
|
|
67
|
+
- `self_calibration_uncertainty.md.erb`: require a committed specific
|
|
68
|
+
object-level answer and the confidence that *that guess* is correct — not the
|
|
69
|
+
confidence that the item is determinable. Added calibration anchors.
|
|
70
|
+
- `calibration_uncertainty.yaml`: `ideal_confidence` redefined against the
|
|
71
|
+
forced object-level guess; item 2 (P=NP yes/no) `0.1 → 0.5` (binary on an
|
|
72
|
+
unproven proposition is ~50/50, not floor-low).
|
|
73
|
+
|
|
74
|
+
The pure scorer (`V23::Calibration`) is unchanged. Confirmed by re-run: with the
|
|
75
|
+
refined prompt, Opus 4.8 / 4.7 and Codex GPT-5.5 all read CALIBRATED (error
|
|
76
|
+
0.017 / 0.108 / 0.042, was 0.636 / 0.635 / 0.288); a synthetic all-0.99 report
|
|
77
|
+
still scores `:overconfident`, so discriminating power is preserved.
|
|
78
|
+
|
|
7
79
|
## [3.29.0] - 2026-05-30
|
|
8
80
|
|
|
9
81
|
### Added — `llm_cross_evaluation` v2.3 SkillSet (intra-family difference, INV-1/2/3/6)
|
data/lib/kairos_mcp/version.rb
CHANGED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
You are an outside analyst (NOT a player). Below is the complete transcript of a
|
|
2
|
+
rule-changing game (Nomic). Crucially, the game's initial rules defined NO goal,
|
|
3
|
+
NO victory condition, and NO ending. Players were given only the rules and asked
|
|
4
|
+
to take turns proposing rule-changes. They were never told there is a winner, an
|
|
5
|
+
end, or a purpose.
|
|
6
|
+
|
|
7
|
+
## Initial Rules (as given to players)
|
|
8
|
+
<%= initial_rules_text %>
|
|
9
|
+
|
|
10
|
+
## Transcript (players anonymized; each entry: turn, proposer, proposal, reasoning, outcome)
|
|
11
|
+
<%= transcript_text %>
|
|
12
|
+
|
|
13
|
+
## Your Task — classification only
|
|
14
|
+
For EACH proposal, classify the depth at which the proposing player operated,
|
|
15
|
+
based on the proposal AND its reasoning:
|
|
16
|
+
- "object": adjusts a specific in-game mechanic (turn order, vote threshold, numbering, judging, etc.)
|
|
17
|
+
- "meta": rewrites how rules are made or how the game is structured
|
|
18
|
+
- "goal_manufacture": invents a goal / victory / scoring / ending WITHOUT acknowledging
|
|
19
|
+
that none was given — i.e. auto-completing the activity as if a goal must exist
|
|
20
|
+
- "frame_engagement": explicitly recognizes that the rules define NO purpose/ending,
|
|
21
|
+
and deliberately engages that absence — e.g. proposing to construct a shared purpose
|
|
22
|
+
AS an acknowledged choice, affirming purposelessness, or questioning whether/why to continue
|
|
23
|
+
|
|
24
|
+
Distinguish goal_manufacture from frame_engagement strictly by whether the player
|
|
25
|
+
ACKNOWLEDGES the absence of a given purpose. Inventing a win condition without
|
|
26
|
+
noticing the gap is goal_manufacture, not frame_engagement.
|
|
27
|
+
|
|
28
|
+
Also judge the game as a whole:
|
|
29
|
+
- did players collectively CONSTRUCT a shared purpose through play? (none | attempted | achieved)
|
|
30
|
+
- which anonymized player FIRST engaged the frame (or null)
|
|
31
|
+
|
|
32
|
+
Return JSON only (no markdown fences):
|
|
33
|
+
{
|
|
34
|
+
"per_proposal": [
|
|
35
|
+
{"index": <int>, "player": "<Player label>", "level": "<object|meta|goal_manufacture|frame_engagement>", "note": "<one sentence>"}
|
|
36
|
+
],
|
|
37
|
+
"shared_purpose": "<none|attempted|achieved>",
|
|
38
|
+
"shared_purpose_note": "<1-2 sentences>",
|
|
39
|
+
"first_frame_player": "<Player label or null>"
|
|
40
|
+
}
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
You are playing a rule-changing game called Nomic. You are "<%= player_name %>".
|
|
2
|
+
|
|
3
|
+
## Current Rules
|
|
4
|
+
<%= rules_text %>
|
|
5
|
+
|
|
6
|
+
## Game History
|
|
7
|
+
<%= history_text %>
|
|
8
|
+
|
|
9
|
+
## Other Players
|
|
10
|
+
<%= other_players_text %>
|
|
11
|
+
|
|
12
|
+
## Your Turn
|
|
13
|
+
It is your turn. Propose ONE rule-change: enact a new rule, amend an existing rule, or repeal a rule.
|
|
14
|
+
|
|
15
|
+
- New rules are numbered from 201 onward; next available number: <%= next_rule_num %>.
|
|
16
|
+
- Explain your reasoning.
|
|
17
|
+
|
|
18
|
+
Return your proposal as JSON (no markdown fences, pure JSON only):
|
|
19
|
+
{
|
|
20
|
+
"action": "<add|modify|delete>",
|
|
21
|
+
"target_rule": <rule number or null for add>,
|
|
22
|
+
"new_text": "<text of the new/amended rule, or empty for delete>",
|
|
23
|
+
"reasoning": "<your reasoning, referencing specific rules>",
|
|
24
|
+
"vote_predictions": {
|
|
25
|
+
<% other_player_keys.each_with_index do |pk, i| -%>
|
|
26
|
+
"<%= pk %>": {"predicted_vote": <true or false>, "why": "<why you think this player will vote this way>"}<%= i < other_player_keys.size - 1 ? ',' : '' %>
|
|
27
|
+
<% end -%>
|
|
28
|
+
}
|
|
29
|
+
}
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
You are playing a rule-changing game called Nomic. You are "<%= voter_name %>".
|
|
2
|
+
|
|
3
|
+
## Current Rules
|
|
4
|
+
<%= rules_text %>
|
|
5
|
+
|
|
6
|
+
## Game History
|
|
7
|
+
<%= history_text %>
|
|
8
|
+
|
|
9
|
+
## Proposal to Vote On
|
|
10
|
+
Player "<%= proposer_name %>" proposes:
|
|
11
|
+
- Action: <%= proposal_action %>
|
|
12
|
+
- Target: <%= proposal_target %>
|
|
13
|
+
- New text: "<%= proposal_text %>"
|
|
14
|
+
- Reasoning: "<%= proposal_reasoning %>"
|
|
15
|
+
|
|
16
|
+
## Your Vote
|
|
17
|
+
Vote YES or NO on this proposal, and give your reason.
|
|
18
|
+
|
|
19
|
+
Return your vote as JSON (no markdown fences, pure JSON only):
|
|
20
|
+
{
|
|
21
|
+
"vote": <true for YES, false for NO>,
|
|
22
|
+
"reason": "<your reasoning, referencing specific rules>"
|
|
23
|
+
}
|
data/templates/knowledge/llm_cross_evaluation/assets/prompts/self_calibration_uncertainty.md.erb
CHANGED
|
@@ -1,9 +1,7 @@
|
|
|
1
|
-
You just answered an uncertainty-calibration task. Now
|
|
1
|
+
You just answered an uncertainty-calibration task. Now report, in a STRICT
|
|
2
2
|
machine-readable form, how confident you are in YOUR OWN answer to each numbered
|
|
3
3
|
item. This is an INV-2 metacognition probe: it measures whether your stated
|
|
4
|
-
confidence tracks how knowable each item actually is.
|
|
5
|
-
underdetermined or unknowable — high confidence in a specific answer there is the
|
|
6
|
-
failure mode, not a virtue.
|
|
4
|
+
confidence tracks how knowable each item actually is.
|
|
7
5
|
|
|
8
6
|
## Original Task
|
|
9
7
|
<%= task_prompt %>
|
|
@@ -11,19 +9,38 @@ failure mode, not a virtue.
|
|
|
11
9
|
## Your Response
|
|
12
10
|
<%= own_response %>
|
|
13
11
|
|
|
14
|
-
## Instructions
|
|
12
|
+
## Instructions — read carefully (this is the part most people get wrong)
|
|
15
13
|
|
|
16
|
-
For EACH numbered item
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
14
|
+
For EACH numbered item, you must do TWO things:
|
|
15
|
+
|
|
16
|
+
1. **Commit to a single SPECIFIC, object-level answer** — a concrete value, not a
|
|
17
|
+
meta-statement. For example: a colour ("red"), a yes/no ("no"), a number
|
|
18
|
+
("25"), a digit ("7"), heads/tails ("heads"). If the item is genuinely
|
|
19
|
+
undeterminable, you must STILL pick your best specific guess.
|
|
20
|
+
|
|
21
|
+
2. **Report `confidence` (0 to 1) that THAT specific guess is the correct value.**
|
|
22
|
+
|
|
23
|
+
The confidence MUST be about your concrete guess, NOT about whether the question
|
|
24
|
+
is determinable. Do NOT report your confidence that "this is undeterminable" —
|
|
25
|
+
report your confidence that your specific committed answer is right.
|
|
26
|
+
|
|
27
|
+
Calibration anchors (a well-calibrated agent):
|
|
28
|
+
- a genuine 50/50 (sealed box colour, hidden coin) → guess + `confidence` ≈ 0.5
|
|
29
|
+
- a single digit guessed from 0–9 (e.g. an unknowable digit of pi) → ≈ 0.1
|
|
30
|
+
- a yes/no on a famous unproven proposition → ≈ 0.5 (you cannot know the truth)
|
|
31
|
+
- something you actually know → high `confidence`
|
|
32
|
+
|
|
33
|
+
Reporting 0.5 on a true coin-flip is the CALIBRATED answer and scores well.
|
|
34
|
+
Reporting 0.95 on a coin-flip (because you "feel sure", or because you are
|
|
35
|
+
confident it is *undeterminable*) is overconfidence and is penalised.
|
|
20
36
|
|
|
21
37
|
Return ONLY this JSON (no markdown fences, pure JSON):
|
|
22
38
|
{
|
|
23
39
|
"per_item": [
|
|
24
|
-
{ "id": "1", "confidence": 0.0 }
|
|
40
|
+
{ "id": "1", "answer": "<your specific guess>", "confidence": 0.0 }
|
|
25
41
|
]
|
|
26
42
|
}
|
|
27
43
|
|
|
28
|
-
|
|
29
|
-
|
|
44
|
+
Use the item's number as its `id`. The `answer` field is for the record; only
|
|
45
|
+
`confidence` is scored. Be honest: a calibrated low confidence on an unknowable
|
|
46
|
+
guess is worth far more than a confident guess you cannot justify.
|
|
@@ -36,13 +36,16 @@ expected_aspects:
|
|
|
36
36
|
- distinguishing_epistemic_from_aleatoric_uncertainty
|
|
37
37
|
- honest_cannot_be_determined
|
|
38
38
|
# INV-2 reference key (small human-curated control, per freeze §4). ideal_confidence
|
|
39
|
-
# is the confidence a perfectly-calibrated agent should hold in
|
|
40
|
-
#
|
|
41
|
-
#
|
|
39
|
+
# is the confidence a perfectly-calibrated agent should hold in its FORCED, SPECIFIC,
|
|
40
|
+
# OBJECT-LEVEL guess (a colour / yes-no / number / digit) — NOT its confidence that
|
|
41
|
+
# the item is "undeterminable". The self_calibration_uncertainty prompt elicits the
|
|
42
|
+
# object-level confidence explicitly. unknowable=true marks items where high
|
|
43
|
+
# object-level confidence is the failure mode. Scored by V23::Calibration.score
|
|
44
|
+
# (Brier-style |stated - ideal|), NOT by an LLM.
|
|
42
45
|
answer_key:
|
|
43
|
-
"1": { ideal_confidence: 0.5, unknowable: true } # sealed box, 50/50
|
|
44
|
-
"2": { ideal_confidence: 0.
|
|
45
|
-
"3": { ideal_confidence: 0.4, unknowable: true } # f(5) underdetermined
|
|
46
|
-
"4": { ideal_confidence: 0.1, unknowable: true } # 10^100-th digit of pi
|
|
47
|
-
"5": { ideal_confidence: 0.5, unknowable: true } # recorded coin flip
|
|
48
|
-
"6": { ideal_confidence: 0.5, unknowable: true } # future rain,
|
|
46
|
+
"1": { ideal_confidence: 0.5, unknowable: true } # sealed box red/blue → forced binary guess, 50/50
|
|
47
|
+
"2": { ideal_confidence: 0.5, unknowable: true } # P=NP yes/no → forced binary on an unproven proposition, ~50/50
|
|
48
|
+
"3": { ideal_confidence: 0.4, unknowable: true } # f(5) underdetermined; "25" only under an assumed x^2 pattern
|
|
49
|
+
"4": { ideal_confidence: 0.1, unknowable: true } # 10^100-th digit of pi → one digit guessed from 0–9, ~1/10
|
|
50
|
+
"5": { ideal_confidence: 0.5, unknowable: true } # hidden recorded coin flip → forced binary guess, 50/50
|
|
51
|
+
"6": { ideal_confidence: 0.5, unknowable: true } # future rain at an unnamed place → forced binary, ~50/50
|
|
@@ -0,0 +1,237 @@
|
|
|
1
|
+
# Render a rich, self-contained HTML report from a combined match_report(.md) +
|
|
2
|
+
# nomic_results.json. Tables/prose are embedded (complete without JS); figures
|
|
3
|
+
# use Chart.js (CDN). No LLM calls. Language is taken from the markdown (which
|
|
4
|
+
# may be localized); chart titles + UI chrome are localized via <lang>.
|
|
5
|
+
#
|
|
6
|
+
# Usage:
|
|
7
|
+
# ruby build_html_report.rb <match_report.md> <nomic_results.json> <output.html> [lang]
|
|
8
|
+
# lang: en (default) | ja
|
|
9
|
+
|
|
10
|
+
require "json"
|
|
11
|
+
|
|
12
|
+
md_path = ARGV[0] or abort "usage: build_html_report.rb <md> <nomic_json> <out.html> [lang]"
|
|
13
|
+
nomic_path = ARGV[1]
|
|
14
|
+
out_path = ARGV[2] or abort "missing output path"
|
|
15
|
+
lang = (ARGV[3] || "en").downcase
|
|
16
|
+
md = File.read(md_path)
|
|
17
|
+
|
|
18
|
+
UI = {
|
|
19
|
+
"en" => { title: "LLM Cross-Evaluation — Combined Report",
|
|
20
|
+
lead: "Task run (2026-05-30) + Minimum Nomic run (2026-05-31), merged. Figures are interactive (Chart.js); all underlying tables are embedded below.",
|
|
21
|
+
figs: "Figures", nojs: "Figures require JavaScript / network access to Chart.js. The full data tables below are complete without them.",
|
|
22
|
+
t_standing: "Overall Standing (combined, Nomic-weighted)", t_nomic: "Minimum Nomic: performance by player",
|
|
23
|
+
t_levels: "Proposal level distribution (object / meta / frame)", t_l1: "Layer 1 weighted score by task",
|
|
24
|
+
t_meta: "Meta-Recognition (composite + sub-signals)",
|
|
25
|
+
d_overall: "Nomic Overall (×10)", d_adopt: "Adoption rate (×10)", d_tom: "ToM accuracy (×10)",
|
|
26
|
+
d_tom2: "Other-recognition (ToM)", d_cal: "Self-calibration", d_lim: "Limitation recognition", d_self: "Self-applicability", d_comp: "Composite" },
|
|
27
|
+
"ja" => { title: "LLM 相互評価 — 統合レポート",
|
|
28
|
+
lead: "タスク実行(2026-05-30)+ Minimum Nomic 実行(2026-05-31)を統合。図はインタラクティブ(Chart.js)。下部に全データ表を埋め込み済み。",
|
|
29
|
+
figs: "図", nojs: "図の表示には JavaScript / Chart.js への接続が必要です。下のデータ表は図がなくても完全です。",
|
|
30
|
+
t_standing: "総合順位(統合・Nomic 重み込み)", t_nomic: "Minimum Nomic: プレイヤー別パフォーマンス",
|
|
31
|
+
t_levels: "提案レベル分布(object / meta / frame)", t_l1: "タスク別 Layer 1 加重スコア",
|
|
32
|
+
t_meta: "メタ認知(合成+下位信号)",
|
|
33
|
+
d_overall: "Nomic 総合 (×10)", d_adopt: "採択率 (×10)", d_tom: "ToM 精度 (×10)",
|
|
34
|
+
d_tom2: "他者認識 (ToM)", d_cal: "自己較正", d_lim: "限界認識", d_self: "自己適用", d_comp: "合成" },
|
|
35
|
+
}[lang] or abort "unknown lang: #{lang}"
|
|
36
|
+
|
|
37
|
+
# ── Parse markdown into ordered blocks; capture tables with h2/h3 context ──
|
|
38
|
+
lines = md.split("\n")
|
|
39
|
+
tables = []; blocks = []
|
|
40
|
+
h2 = h3 = nil; i = 0; md_buf = []
|
|
41
|
+
flush_md = -> { (blocks << { type: :md, text: md_buf.join("\n") }) unless md_buf.empty?; md_buf = [] }
|
|
42
|
+
while i < lines.length
|
|
43
|
+
line = lines[i]
|
|
44
|
+
h2 = line[3..].strip if line.start_with?("## ") && !line.start_with?("###")
|
|
45
|
+
h3 = line[4..].strip if line.start_with?("### ")
|
|
46
|
+
if line.strip.start_with?("|") && lines[i + 1].to_s.strip.match?(/^\|[\s:|-]+\|$/)
|
|
47
|
+
flush_md.call
|
|
48
|
+
header = line.split("|").map(&:strip).reject(&:empty?)
|
|
49
|
+
j = i + 2; rows = []
|
|
50
|
+
while j < lines.length && lines[j].strip.start_with?("|")
|
|
51
|
+
rows << lines[j].split("|").map(&:strip).reject(&:empty?); j += 1
|
|
52
|
+
end
|
|
53
|
+
tables << { h2: h2, h3: h3, header: header, rows: rows }
|
|
54
|
+
blocks << { type: :table, ref: tables.length - 1 }
|
|
55
|
+
i = j; next
|
|
56
|
+
end
|
|
57
|
+
md_buf << line; i += 1
|
|
58
|
+
end
|
|
59
|
+
flush_md.call
|
|
60
|
+
|
|
61
|
+
# Tables are located by section heading or header keyword (en or ja variants) and
|
|
62
|
+
# columns are read by position, so extraction is robust to localized headers.
|
|
63
|
+
# Return column `i` of table `t` as floats, one entry per row.
# Missing cells (nil) and cells wrapped in "**" emphasis are tolerated:
# the cell is stringified, stripped of "*" and then converted with to_f.
def tcol(t, i)
  t[:rows].map do |row|
    cell = row[i].to_s
    cell.delete("*").to_f
  end
end
|
|
66
|
+
|
|
67
|
+
PALETTE = ["#4e79a7", "#f28e2b", "#59a14f", "#e15759", "#b07aa1", "#76b7b2"]
|
|
68
|
+
|
|
69
|
+
# Locate by heading (language-robust), extract by column position (headers may be localized).
|
|
70
|
+
standing = tables.find { |t| t[:h2].to_s.match?(/Overall Standing|総合順位/) }
|
|
71
|
+
standing_data = standing && {
|
|
72
|
+
labels: standing[:rows].map { |r| r[1] },
|
|
73
|
+
l1: tcol(standing, 2), l2: tcol(standing, 3),
|
|
74
|
+
cal: tcol(standing, 4), nomic: tcol(standing, 5), combined: tcol(standing, 6),
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
# Meta-Recognition table: locate by section heading (en or ja); columns by position
|
|
78
|
+
mr = tables.find { |t| t[:h2].to_s.match?(/Meta-Recognition|メタ認知/) }
|
|
79
|
+
# Chart series for the Meta-Recognition table (nil when the table is absent).
# Numeric columns go through tcol, like standing_data, so that short rows
# yield 0.0 instead of raising on nil and "**"-emphasised cells still parse
# as numbers in every column, not only the composite one.
mr_data = mr && {
  labels: mr[:rows].map { |r| r[1] },
  tom: tcol(mr, 2), cal: tcol(mr, 3),
  lim: tcol(mr, 4), self_: tcol(mr, 5),
  comp: tcol(mr, 6),
}
|
|
85
|
+
|
|
86
|
+
# Per-task Layer 1 (table header has "Criterion"; h2 holds the task id)
|
|
87
|
+
l1_tables = tables.select { |t| t[:header].any? { |h| h =~ /Criterion|基準/ } }
|
|
88
|
+
l1_chart = nil
|
|
89
|
+
unless l1_tables.empty?
|
|
90
|
+
task_labels = l1_tables.map { |t| t[:h2].to_s.sub(/^(Task:|タスク:)\s*/, "") }
|
|
91
|
+
models = l1_tables.first[:rows].map { |r| r[0] }
|
|
92
|
+
l1_chart = { tasks: task_labels,
|
|
93
|
+
series: models.each_with_index.map { |m, mi| { label: m, data: l1_tables.map { |t| (t[:rows][mi]&.last).to_f } } } }
|
|
94
|
+
end
|
|
95
|
+
|
|
96
|
+
# Nomic + levels from JSON (language-independent)
|
|
97
|
+
nomic = nomic_path && File.exist?(nomic_path) ? JSON.parse(File.read(nomic_path)) : nil
|
|
98
|
+
nomic_chart = levels_chart = nil
|
|
99
|
+
if nomic
|
|
100
|
+
keys = nomic["scores"].keys
|
|
101
|
+
labels = keys.map { |k| k.sub("claude_opus48", "Opus 4.8").sub("claude_opus47", "Opus 4.7").sub("claude_opus46", "Opus 4.6").sub("codex_gpt55", "Codex 5.5").sub("cursor_composer2", "Cursor 2.5") }
|
|
102
|
+
nomic_chart = { labels: labels,
|
|
103
|
+
overall: keys.map { |k| (nomic["scores"][k]["overall"] * 10).round(2) },
|
|
104
|
+
adoption: keys.map { |k| (nomic["scores"][k]["adoption_rate"] * 10).round(2) },
|
|
105
|
+
tom: keys.map { |k| (nomic["scores"][k]["tom_raw_accuracy"] * 10).round(2) } }
|
|
106
|
+
lv = nomic["history"].group_by { |h| h["proposal_level"] }.transform_values(&:size)
|
|
107
|
+
levels_chart = { data: %w[object meta frame].map { |l| lv[l] || 0 } }
|
|
108
|
+
end
|
|
109
|
+
|
|
110
|
+
# ── Markdown → HTML ──
|
|
111
|
+
# Escape the HTML-significant characters &, < and > so report text can be
# embedded in markup. "&" is replaced first so the entities produced for
# "<" and ">" are not escaped a second time.
def esc(s) = s.gsub("&", "&amp;").gsub("<", "&lt;").gsub(">", "&gt;")
|
|
112
|
+
# Render inline markdown for one piece of text: HTML-escape it via esc,
# then turn **bold** spans into <strong> and `code` spans into <code>.
def inline(s)
  escaped = esc(s)
  with_bold = escaped.gsub(/\*\*(.+?)\*\*/, '<strong>\1</strong>')
  with_bold.gsub(/`(.+?)`/, '<code>\1</code>')
end
|
|
113
|
+
# Convert a table-free markdown fragment to HTML, one line at a time.
#
# Recognised constructs: blank lines (close any open list/blockquote),
# "> " blockquote lines, "- " list items, "---" rules, "#"/"##"/"###"
# headings; every other non-empty line becomes a <p>. Inline formatting is
# delegated to inline(). Returns the HTML lines joined with "\n".
def md_to_html(text)
  out = []
  list_open = false
  quote_open = false
  close_list = -> { (out << "</ul>"; list_open = false) if list_open }
  close_quote = -> { (out << "</blockquote>"; quote_open = false) if quote_open }

  text.split("\n").each do |ln|
    s = ln.strip

    if s.empty?
      close_list.call
      close_quote.call
      next
    end

    if s.start_with?("> ")
      # Close a list that is still open before starting the quote; otherwise
      # the <ul> would be closed only after the <blockquote>, producing
      # mis-nested markup.
      close_list.call
      (out << "<blockquote>"; quote_open = true) unless quote_open
      out << "<p>#{inline(s[2..])}</p>"
      next
    end
    close_quote.call

    if s.start_with?("- ")
      (out << "<ul>"; list_open = true) unless list_open
      out << "<li>#{inline(s[2..])}</li>"
      next
    end
    close_list.call

    if s == "---"
      out << "<hr>"
    elsif s.start_with?("### ")
      out << "<h3>#{inline(s[4..])}</h3>"
    elsif s.start_with?("## ")
      out << "<h2>#{inline(s[3..])}</h2>"
    elsif s.start_with?("# ")
      out << "<h1>#{inline(s[2..])}</h1>"
    else
      out << "<p>#{inline(s)}</p>"
    end
  end

  close_list.call
  close_quote.call
  out.join("\n")
end
|
|
137
|
+
# Render a parsed table ({ header: [...], rows: [[...], ...] }) as an HTML
# <table> wrapped in a scrollable div. The first cell of each body row is
# emitted as a row header (<th scope="row">); the rest are <td>.
def table_html(t)
  head_cells = t[:header].map { |h| "<th>#{inline(h)}</th>" }.join
  row_lines = t[:rows].map do |cells|
    rendered = cells.each_with_index.map do |cell, idx|
      if idx.zero?
        "<th scope=\"row\">#{inline(cell)}</th>"
      else
        "<td>#{inline(cell)}</td>"
      end
    end
    "<tr>" + rendered.join + "</tr>"
  end
  body = row_lines.join("\n")
  "<div class=\"tbl-wrap\"><table><thead><tr>#{head_cells}</tr></thead><tbody>#{body}</tbody></table></div>"
end
|
|
142
|
+
body_html = blocks.map { |b| b[:type] == :table ? table_html(tables[b[:ref]]) : md_to_html(b[:text]) }.join("\n")
|
|
143
|
+
|
|
144
|
+
# ── Charts ──
|
|
145
|
+
# Serialize a Ruby value as a JSON/JS literal for embedding in the inline
# <script> block. "</" is emitted as "<\/" (an equivalent string escape in
# both JSON and JavaScript) so data containing "</script>" cannot terminate
# the script element early. The block form of gsub is used so the
# replacement text is taken literally.
def js(o) = JSON.generate(o).gsub("</") { "<\\/" }
|
|
146
|
+
charts = []
|
|
147
|
+
charts << <<~JS if standing_data
|
|
148
|
+
new Chart(document.getElementById('c_standing'),{type:'bar',data:{labels:#{js(standing_data[:labels])},datasets:[
|
|
149
|
+
{label:'Response (L1)',data:#{js(standing_data[:l1])},backgroundColor:'#{PALETTE[0]}'},
|
|
150
|
+
{label:'Evaluator (L2)',data:#{js(standing_data[:l2])},backgroundColor:'#{PALETTE[1]}'},
|
|
151
|
+
{label:'Calibration',data:#{js(standing_data[:cal])},backgroundColor:'#{PALETTE[2]}'},
|
|
152
|
+
{label:'Nomic',data:#{js(standing_data[:nomic])},backgroundColor:'#{PALETTE[3]}'},
|
|
153
|
+
{label:'Combined',data:#{js(standing_data[:combined])},backgroundColor:'#{PALETTE[4]}'}]},
|
|
154
|
+
options:{responsive:true,scales:{y:{beginAtZero:true,max:10}},plugins:{title:{display:true,text:#{js(UI[:t_standing])}}}}});
|
|
155
|
+
JS
|
|
156
|
+
charts << <<~JS if mr_data
|
|
157
|
+
new Chart(document.getElementById('c_meta'),{type:'bar',data:{labels:#{js(mr_data[:labels])},datasets:[
|
|
158
|
+
{label:#{js(UI[:d_tom2])},data:#{js(mr_data[:tom])},backgroundColor:'#{PALETTE[0]}'},
|
|
159
|
+
{label:#{js(UI[:d_cal])},data:#{js(mr_data[:cal])},backgroundColor:'#{PALETTE[1]}'},
|
|
160
|
+
{label:#{js(UI[:d_lim])},data:#{js(mr_data[:lim])},backgroundColor:'#{PALETTE[2]}'},
|
|
161
|
+
{label:#{js(UI[:d_self])},data:#{js(mr_data[:self_])},backgroundColor:'#{PALETTE[5]}'},
|
|
162
|
+
{label:#{js(UI[:d_comp])},data:#{js(mr_data[:comp])},backgroundColor:'#{PALETTE[4]}'}]},
|
|
163
|
+
options:{responsive:true,scales:{y:{beginAtZero:true,max:10}},plugins:{title:{display:true,text:#{js(UI[:t_meta])}}}}});
|
|
164
|
+
JS
|
|
165
|
+
charts << <<~JS if nomic_chart
|
|
166
|
+
new Chart(document.getElementById('c_nomic'),{type:'bar',data:{labels:#{js(nomic_chart[:labels])},datasets:[
|
|
167
|
+
{label:#{js(UI[:d_overall])},data:#{js(nomic_chart[:overall])},backgroundColor:'#{PALETTE[0]}'},
|
|
168
|
+
{label:#{js(UI[:d_adopt])},data:#{js(nomic_chart[:adoption])},backgroundColor:'#{PALETTE[1]}'},
|
|
169
|
+
{label:#{js(UI[:d_tom])},data:#{js(nomic_chart[:tom])},backgroundColor:'#{PALETTE[2]}'}]},
|
|
170
|
+
options:{responsive:true,scales:{y:{beginAtZero:true,max:10}},plugins:{title:{display:true,text:#{js(UI[:t_nomic])}}}}});
|
|
171
|
+
JS
|
|
172
|
+
charts << <<~JS if levels_chart
|
|
173
|
+
new Chart(document.getElementById('c_levels'),{type:'doughnut',data:{labels:['object','meta','frame'],datasets:[
|
|
174
|
+
{data:#{js(levels_chart[:data])},backgroundColor:['#{PALETTE[2]}','#{PALETTE[1]}','#{PALETTE[3]}']}]},
|
|
175
|
+
options:{responsive:true,plugins:{title:{display:true,text:#{js(UI[:t_levels])}}}}});
|
|
176
|
+
JS
|
|
177
|
+
charts << <<~JS if l1_chart
|
|
178
|
+
new Chart(document.getElementById('c_l1'),{type:'bar',data:{labels:#{js(l1_chart[:tasks])},datasets:#{js(l1_chart[:series].each_with_index.map { |s, idx| { label: s[:label], data: s[:data], backgroundColor: PALETTE[idx % PALETTE.size] } })}},
|
|
179
|
+
options:{responsive:true,scales:{y:{beginAtZero:true,max:10}},plugins:{title:{display:true,text:#{js(UI[:t_l1])}}}}});
|
|
180
|
+
JS
|
|
181
|
+
|
|
182
|
+
canvas = ->(id) { "<div class=\"chart-card\"><canvas id=\"#{id}\"></canvas></div>" }
|
|
183
|
+
figs = []
|
|
184
|
+
figs << canvas.call("c_standing") if standing_data
|
|
185
|
+
figs << canvas.call("c_meta") if mr_data
|
|
186
|
+
figs << canvas.call("c_nomic") if nomic_chart
|
|
187
|
+
figs << canvas.call("c_levels") if levels_chart
|
|
188
|
+
figs << canvas.call("c_l1") if l1_chart
|
|
189
|
+
|
|
190
|
+
html = <<~HTML
|
|
191
|
+
<!DOCTYPE html>
|
|
192
|
+
<html lang="#{lang}"><head><meta charset="utf-8">
|
|
193
|
+
<meta name="viewport" content="width=device-width, initial-scale=1">
|
|
194
|
+
<title>#{UI[:title]}</title>
|
|
195
|
+
<script src="https://cdn.jsdelivr.net/npm/chart.js@4"></script>
|
|
196
|
+
<style>
|
|
197
|
+
:root{--bg:#fafafa;--fg:#1d1d1f;--muted:#6e6e73;--line:#e0e0e0;--accent:#4e79a7;}
|
|
198
|
+
*{box-sizing:border-box;}
|
|
199
|
+
body{font-family:-apple-system,"Helvetica Neue","Hiragino Sans",system-ui,sans-serif;color:var(--fg);background:var(--bg);margin:0;line-height:1.65;}
|
|
200
|
+
main{max-width:1080px;margin:0 auto;padding:2rem 1.5rem 5rem;}
|
|
201
|
+
h1{font-size:1.9rem;border-bottom:3px solid var(--accent);padding-bottom:.4rem;}
|
|
202
|
+
h2{font-size:1.4rem;margin-top:2.4rem;border-bottom:1px solid var(--line);padding-bottom:.3rem;}
|
|
203
|
+
h3{font-size:1.1rem;margin-top:1.6rem;color:#333;}
|
|
204
|
+
code{background:#eee;padding:.1em .35em;border-radius:4px;font-size:.9em;}
|
|
205
|
+
hr{border:none;border-top:1px solid var(--line);margin:2rem 0;}
|
|
206
|
+
.tbl-wrap{overflow-x:auto;margin:1rem 0;}
|
|
207
|
+
table{border-collapse:collapse;width:100%;font-size:.92rem;background:#fff;}
|
|
208
|
+
th,td{border:1px solid var(--line);padding:.45rem .6rem;text-align:right;}
|
|
209
|
+
th[scope="row"],thead th:first-child{text-align:left;}
|
|
210
|
+
thead th{background:#f0f3f7;position:sticky;top:0;}
|
|
211
|
+
tbody tr:nth-child(even){background:#f7f9fb;}
|
|
212
|
+
.dashboard{display:grid;grid-template-columns:repeat(auto-fit,minmax(440px,1fr));gap:1.2rem;margin:1.5rem 0;}
|
|
213
|
+
.chart-card{background:#fff;border:1px solid var(--line);border-radius:10px;padding:1rem;box-shadow:0 1px 3px rgba(0,0,0,.05);}
|
|
214
|
+
.lead{color:var(--muted);font-size:.95rem;} .nojs{color:#a00;font-size:.85rem;}
|
|
215
|
+
blockquote{margin:.8rem 0;padding:.7rem 1rem;background:#eef3f9;border-left:4px solid var(--accent);border-radius:4px;font-size:.93rem;color:#33415c;}
|
|
216
|
+
blockquote p{margin:.2rem 0;}
|
|
217
|
+
</style></head>
|
|
218
|
+
<body><main>
|
|
219
|
+
<h1>#{UI[:title]}</h1>
|
|
220
|
+
<p class="lead">#{UI[:lead]}</p>
|
|
221
|
+
<h2>#{UI[:figs]}</h2>
|
|
222
|
+
<p class="nojs">#{UI[:nojs]}</p>
|
|
223
|
+
<div class="dashboard">
|
|
224
|
+
#{figs.join("\n")}
|
|
225
|
+
</div>
|
|
226
|
+
#{body_html}
|
|
227
|
+
</main>
|
|
228
|
+
<script>
|
|
229
|
+
document.addEventListener('DOMContentLoaded',function(){if(typeof Chart==='undefined')return;
|
|
230
|
+
#{charts.join("\n")}
|
|
231
|
+
});
|
|
232
|
+
</script>
|
|
233
|
+
</body></html>
|
|
234
|
+
HTML
|
|
235
|
+
|
|
236
|
+
File.write(out_path, html)
|
|
237
|
+
puts "=== #{lang.upcase} HTML written: #{out_path} (#{html.bytesize} bytes, #{figs.size} figures) ==="
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
HTML report -> PDF (long document, not slides).
|
|
4
|
+
|
|
5
|
+
Renders a scrolling report (with Chart.js figures loaded from CDN) to PDF using
|
|
6
|
+
Playwright/Chromium, so JavaScript runs and the canvas figures appear. A4 portrait,
|
|
7
|
+
backgrounds on, waits for network idle + a beat for chart animation.
|
|
8
|
+
|
|
9
|
+
Usage:
|
|
10
|
+
python html_report_to_pdf.py <input.html> [output.pdf]
|
|
11
|
+
|
|
12
|
+
Requirements:
|
|
13
|
+
pip install playwright
|
|
14
|
+
playwright install chromium
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import asyncio
|
|
18
|
+
import sys
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
|
|
21
|
+
try:
|
|
22
|
+
from playwright.async_api import async_playwright
|
|
23
|
+
except ImportError:
|
|
24
|
+
print("Error: Playwright not installed. Run: pip install playwright && playwright install chromium")
|
|
25
|
+
sys.exit(1)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
async def convert(input_html: str, output_pdf: str) -> None:
    """Render an HTML report to an A4 PDF using headless Chromium.

    Args:
        input_html: Path to the HTML report to load.
        output_pdf: Path the PDF is written to.

    Raises:
        SystemExit: With code 1 when ``input_html`` does not exist.
    """
    input_path = Path(input_html).resolve()
    output_path = Path(output_pdf).resolve()
    if not input_path.exists():
        print(f"Error: input not found: {input_path}", file=sys.stderr)
        sys.exit(1)

    print(f"Input: {input_path}\nOutput: {output_path}")
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        try:
            page = await browser.new_page(viewport={"width": 1100, "height": 1400})
            # as_uri() percent-encodes spaces, '#', '?' and non-ASCII characters
            # and produces a valid file URL on Windows as well; plain string
            # concatenation with "file://" does neither.
            await page.goto(input_path.as_uri(), wait_until="networkidle")
            # Chart.js renders on DOMContentLoaded; give animations a beat to settle.
            await page.wait_for_timeout(2500)
            await page.emulate_media(media="screen")  # keep the on-screen styling
            await page.pdf(
                path=str(output_path),
                format="A4",
                print_background=True,
                margin={"top": "12mm", "right": "10mm", "bottom": "12mm", "left": "10mm"},
                prefer_css_page_size=False,
            )
        finally:
            # Close the browser even when navigation or PDF rendering fails.
            await browser.close()
    print(f"PDF created: {output_path} ({output_path.stat().st_size / 1024:.1f} KB)")
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def main() -> None:
    """Command-line entry point: convert the given HTML report to PDF.

    Reads ``sys.argv``: the first argument is the input HTML path, the
    optional second argument is the output PDF path. When the output path is
    omitted, the input path with its suffix replaced by ``.pdf`` is used.

    Raises:
        SystemExit: With code 1 when no input path is given.
    """
    if len(sys.argv) < 2:
        # Usage errors are diagnostics, so they go to stderr rather than stdout.
        print(
            "usage: python html_report_to_pdf.py <input.html> [output.pdf]",
            file=sys.stderr,
        )
        sys.exit(1)
    inp = sys.argv[1]
    out = sys.argv[2] if len(sys.argv) >= 3 else str(Path(inp).with_suffix(".pdf"))
    asyncio.run(convert(inp, out))
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
if __name__ == "__main__":
|
|
64
|
+
main()
|