medsci-skills 4.8.0 → 4.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -1
- package/metadata/distribution_files.json +86 -36
- package/metadata/distribution_manifest.json +1 -1
- package/package.json +1 -1
- package/skills/MAINTENANCE.md +68 -0
- package/skills/analyze-stats/SKILL.md +2 -0
- package/skills/check-reporting/SKILL.md +33 -1
- package/skills/check-reporting/references/genai_image_study_object_decision_aid.md +60 -0
- package/skills/check-reporting/tests/fixtures/prisma_body.md +7 -0
- package/skills/check-reporting/tests/fixtures/prisma_fig_clean.md +10 -0
- package/skills/check-reporting/tests/fixtures/prisma_fig_mismatch.md +10 -0
- package/skills/check-reporting/tests/test_prisma_figure.sh +50 -0
- package/skills/design-ai-benchmarking/SKILL.md +16 -0
- package/skills/design-ai-benchmarking/references/anchor_rotate_reader_allocation.md +92 -0
- package/skills/find-journal/references/journal_profiles/KJR.md +1 -1
- package/skills/manage-project/SKILL.md +1 -1
- package/skills/manage-refs/SKILL.md +3 -0
- package/skills/manage-refs/citation_styles/README.md +1 -0
- package/skills/manage-refs/citation_styles/liver-international.csl +535 -0
- package/skills/manage-refs/scripts/check_csl_render.py +85 -22
- package/skills/manage-refs/scripts/check_reference_duplication.py +245 -0
- package/skills/manage-refs/tests/fixtures/csl_render_sample.bib +19 -0
- package/skills/manage-refs/tests/fixtures/refclean_text.md +11 -0
- package/skills/manage-refs/tests/fixtures/refdup_text.md +19 -0
- package/skills/manage-refs/tests/test_csl_render.sh +60 -0
- package/skills/manage-refs/tests/test_reference_duplication.sh +47 -0
- package/skills/meta-analysis/SKILL.md +13 -42
- package/skills/meta-analysis/references/empirical_lessons.md +53 -0
- package/skills/revise/SKILL.md +2 -0
- package/skills/self-review/SKILL.md +56 -0
- package/skills/self-review/scripts/check_binning_consistency.py +502 -0
- package/skills/self-review/scripts/check_citation_order.py +204 -0
- package/skills/self-review/scripts/check_classical_style.py +22 -0
- package/skills/self-review/tests/fixtures/binning_clean/primary.R +2 -0
- package/skills/self-review/tests/fixtures/binning_clean/sensitivity.R +2 -0
- package/skills/self-review/tests/fixtures/binning_drift/primary.R +5 -0
- package/skills/self-review/tests/fixtures/binning_drift/sensitivity.R +5 -0
- package/skills/self-review/tests/fixtures/citation_order_bad.md +31 -0
- package/skills/self-review/tests/fixtures/citation_order_good.md +30 -0
- package/skills/self-review/tests/fixtures/derived_clean/canonical.R +6 -0
- package/skills/self-review/tests/fixtures/derived_clean/shared.R +6 -0
- package/skills/self-review/tests/fixtures/derived_drift/canonical.R +7 -0
- package/skills/self-review/tests/fixtures/derived_drift/reanalysis.R +6 -0
- package/skills/self-review/tests/fixtures/style_bad.md +2 -1
- package/skills/self-review/tests/test_binning_consistency.sh +67 -0
- package/skills/self-review/tests/test_citation_order.sh +48 -0
- package/skills/self-review/tests/test_classical_style.sh +9 -1
- package/skills/sync-submission/SKILL.md +5 -2
- package/skills/sync-submission/scripts/_yaml_frontmatter.py +35 -0
- package/skills/sync-submission/scripts/check_checklist_dump_leak.py +228 -0
- package/skills/sync-submission/scripts/check_wordcount_cap.py +4 -12
- package/skills/sync-submission/scripts/cover_letter_drift_check.py +4 -18
- package/skills/sync-submission/scripts/preflight_gate.py +17 -2
- package/skills/sync-submission/tests/test_checklist_dump_leak.sh +89 -0
- package/skills/write-paper/references/journal_profiles/KJR.md +18 -0
- package/skills/write-paper/references/journal_profiles/Liver_International.md +23 -1
package/README.md
CHANGED
|
@@ -268,6 +268,11 @@ The E2E pipeline (`orchestrate --e2e`) produces everything up to `qc/`. The `sub
|
|
|
268
268
|
|
|
269
269
|
## What's New
|
|
270
270
|
|
|
271
|
+
**v4.9** — analysis-integrity hardening promoted from real review cycles, plus journal-mechanics additions. Additive and backward-compatible; still 45 skills / 36 guidelines, analysis-integrity detectors **32 → 36**:
|
|
272
|
+
|
|
273
|
+
- **Four new gates** — a **duplicate-bibliography** check (`check_reference_duplication.py`) for the hybrid `[@key]` + hand-typed `## References` build that renders the list twice; a **cross-script binning / composite-indicator** consistency check (`check_binning_consistency.py`, `BINNING_DRIFT` / `DERIVED_DEF_DRIFT`) for a derived categorical or composite indicator defined inconsistently across analysis scripts; a **float citation-order** check (`check_citation_order.py`) for numbered Tables/Figures not first cited in ascending order per series; and an **audit-dump leak** gate (`/sync-submission`) that blocks a `/check-reporting` output mistakenly attached as a submission file.
|
|
274
|
+
- **KJR technical-check conventions + percentage-decimal style**, reader-allocation-under-burden and generative-image-as-study-object reporting (`/design-ai-benchmarking`, `/check-reporting`), and a **Liver International** CSL with that journal's submission mechanics (`/manage-refs`).
|
|
275
|
+
|
|
271
276
|
**v4.8** is the **review-harvest batch** — deterministic detector hardening promoted from real-manuscript review cycles. Additive and backward-compatible; still 45 skills / 36 guidelines, analysis-integrity detectors **30 → 32**:
|
|
272
277
|
|
|
273
278
|
- **Two new gates** — `check_supplement_hygiene.py` lints the rendered supplement / tables / caption files (not just the manuscript) for §-labels, placeholders, build markers, response-letter framing, and unresolved body↔supplement cross-references; `check_null_calibration.py` flags a headline negative/equivalence claim made without a minimum-detectable-effect / power / equivalence statement.
|
|
@@ -356,7 +361,7 @@ Earlier in this series: analysis-integrity guards (confounding completeness, cla
|
|
|
356
361
|
| **Battle-tested** | Used on real manuscript submissions by a practicing physician-researcher | Unknown provenance and validation |
|
|
357
362
|
| **Depth per skill** | 150-600 lines of documentation + bundled reference files (curated journal profile library, checklists, formula sheets, code templates) | Typically thin SKILL.md templates |
|
|
358
363
|
|
|
359
|
-
**MedSci-Audit** — the verification edge in the first rows above is a named suite of **
|
|
364
|
+
**MedSci-Audit** — the verification edge in the first rows above is a named suite of **36 deterministic detectors** (citation & reference integrity, cohort & pool arithmetic, scope/estimand contracts, reporting compliance, and more) that catch fabricated or drifted content before a manuscript reaches a reviewer. See **[`MEDSCI_AUDIT.md`](MEDSCI_AUDIT.md)** for the suite, its six families, and its evaluation evidence.
|
|
360
365
|
|
|
361
366
|
---
|
|
362
367
|
|
|
@@ -601,6 +606,17 @@ Projects declare their source-of-truth layout in `SSOT.yaml`, and a `qc/migratio
|
|
|
601
606
|
### Skills Work Together
|
|
602
607
|
Skills call each other. `check-reporting` invokes `make-figures` for PRISMA diagrams. `write-paper` calls `search-lit` for citation verification. `self-review` delegates reporting compliance to `check-reporting`. `calc-sample-size` output feeds directly into `write-protocol`'s IRB justification section.
|
|
603
608
|
|
|
609
|
+
### Skill boundaries — which to use, and in what order
|
|
610
|
+
The skill set is deliberately *specialized, not consolidated* — each skill owns a distinct artifact or lifecycle step, so the routing stays precise. The boundaries that are easy to confuse:
|
|
611
|
+
|
|
612
|
+
- **Reference pipeline** — `search-lit` (discover candidates) → `lit-sync` (sole writer of `refs.bib`, syncs Zotero/Obsidian) → `manage-refs` (render CSL / inject CWYW / cross-ref QC, sole writer of the rendered DOCX) → `verify-refs` (read-only audit; never edits `refs.bib`). They are one pipeline, not four overlapping tools.
|
|
613
|
+
- **Language passes run in order** — `humanize` (remove AI-writing tells) → `polish-language` (deterministic ESL/house-style consistency: abbreviations, spelling, en-dashes, p-value case) → `academic-aio` (AI-search/GEO visibility). Three sequential passes with non-overlapping jobs.
|
|
614
|
+
- **Manuscript type picks the skill** — `write-paper` (original/IMRAD articles, case reports, meta-analyses) vs `review-paper` (narrative / scoping / systematic literature reviews) vs `revise` (reviewer-response + tracked changes). Different structures and reporting guidelines.
|
|
615
|
+
- **Author vs external reviewer** — `self-review` is your own pre-submission check (anticipated comments); `peer-review` drafts a journal-facing review as an external reviewer. Same domain probes, different user and output.
|
|
616
|
+
- **Project entry** — `intake-project` classifies and scaffolds a *new or messy folder*; `orchestrate` routes a *goal or task* ("help me write a paper"). Start with `intake-project` when you have files but no structure, `orchestrate` when you have a task but no plan.
|
|
617
|
+
- **Study design** — `design-study` covers general validity (analysis unit, leakage, comparator, validation) **and** carries a design-stage ceiling gate for perceptual / observer / reader / visual-Turing-test / image-provenance studies; `design-ai-benchmarking` specializes in AI-vs-human-expert evaluation (rubrics, calibration probes, LLM-as-judge).
|
|
618
|
+
- **Content vs template** — `write-protocol` drafts IRB/ethics scientific content; `fill-protocol` renders that content into an institutional Word template without breaking its formatting.
|
|
619
|
+
|
|
604
620
|
### Validation status — available vs CI-gated vs evaluated
|
|
605
621
|
Be precise about what "validated" means here — the three tiers are different facts:
|
|
606
622
|
- **Available** — every bundled skill and deterministic detector. The current totals are the single source of truth in [`metadata/catalog_counts.json`](metadata/catalog_counts.json) and [`MEDSCI_AUDIT.md`](MEDSCI_AUDIT.md).
|
|
@@ -51,6 +51,11 @@
|
|
|
51
51
|
"size": 25500,
|
|
52
52
|
"sha256": "6a632a88617889a1ac36418822b8af3f2bcab75bfa28169e99ae4fdf0b810365"
|
|
53
53
|
},
|
|
54
|
+
{
|
|
55
|
+
"path": "skills/MAINTENANCE.md",
|
|
56
|
+
"size": 4061,
|
|
57
|
+
"sha256": "a4eaa6062e7d5879afcdac3bd954fcb783282707eea22b815d5a6f794d5a5217"
|
|
58
|
+
},
|
|
54
59
|
{
|
|
55
60
|
"path": "skills/academic-aio/SKILL.md",
|
|
56
61
|
"size": 31396,
|
|
@@ -148,8 +153,8 @@
|
|
|
148
153
|
},
|
|
149
154
|
{
|
|
150
155
|
"path": "skills/analyze-stats/SKILL.md",
|
|
151
|
-
"size":
|
|
152
|
-
"sha256": "
|
|
156
|
+
"size": 47388,
|
|
157
|
+
"sha256": "12121ea6224d8c75d4aa98a6e2ee2947c95cfc17a3902780e7bb8d7ddb0be052"
|
|
153
158
|
},
|
|
154
159
|
{
|
|
155
160
|
"path": "skills/analyze-stats/references/analysis_guides/mediation.md",
|
|
@@ -468,8 +473,8 @@
|
|
|
468
473
|
},
|
|
469
474
|
{
|
|
470
475
|
"path": "skills/check-reporting/SKILL.md",
|
|
471
|
-
"size":
|
|
472
|
-
"sha256": "
|
|
476
|
+
"size": 35835,
|
|
477
|
+
"sha256": "a11617fb2bcf03b63a788638ad68ab9dac8623281e8b58428706b7c43a02e8c3"
|
|
473
478
|
},
|
|
474
479
|
{
|
|
475
480
|
"path": "skills/check-reporting/references/LICENSES.md",
|
|
@@ -666,6 +671,11 @@
|
|
|
666
671
|
"size": 4565,
|
|
667
672
|
"sha256": "f955a0479da6474e43ece05361838f8db95923ec9f7dc56863afbf4cba66174d"
|
|
668
673
|
},
|
|
674
|
+
{
|
|
675
|
+
"path": "skills/check-reporting/references/genai_image_study_object_decision_aid.md",
|
|
676
|
+
"size": 4287,
|
|
677
|
+
"sha256": "34f79571566ef06eee0fc4c8c646be530806fca658720902d16642faadc8844b"
|
|
678
|
+
},
|
|
669
679
|
{
|
|
670
680
|
"path": "skills/check-reporting/references/step4c_registration_timing.md",
|
|
671
681
|
"size": 4197,
|
|
@@ -853,8 +863,13 @@
|
|
|
853
863
|
},
|
|
854
864
|
{
|
|
855
865
|
"path": "skills/design-ai-benchmarking/SKILL.md",
|
|
856
|
-
"size":
|
|
857
|
-
"sha256": "
|
|
866
|
+
"size": 12094,
|
|
867
|
+
"sha256": "b8f794a1f6c800d821305a4df8a797bea61cf34a602e0dc0dbea8f2c0c458ca5"
|
|
868
|
+
},
|
|
869
|
+
{
|
|
870
|
+
"path": "skills/design-ai-benchmarking/references/anchor_rotate_reader_allocation.md",
|
|
871
|
+
"size": 4585,
|
|
872
|
+
"sha256": "a763572efd764118e6ee57c950268c175cfeeecca00a43be53412e97c053421d"
|
|
858
873
|
},
|
|
859
874
|
{
|
|
860
875
|
"path": "skills/design-ai-benchmarking/references/benchmark_export_schema.json",
|
|
@@ -1198,8 +1213,8 @@
|
|
|
1198
1213
|
},
|
|
1199
1214
|
{
|
|
1200
1215
|
"path": "skills/find-journal/references/journal_profiles/KJR.md",
|
|
1201
|
-
"size":
|
|
1202
|
-
"sha256": "
|
|
1216
|
+
"size": 3036,
|
|
1217
|
+
"sha256": "a0814e6d62288389db7528b73a25db870ab91635dc4b946fb0c8bf8af47150a3"
|
|
1203
1218
|
},
|
|
1204
1219
|
{
|
|
1205
1220
|
"path": "skills/find-journal/references/journal_profiles/Korean_Circulation_Journal.md",
|
|
@@ -1968,8 +1983,8 @@
|
|
|
1968
1983
|
},
|
|
1969
1984
|
{
|
|
1970
1985
|
"path": "skills/manage-project/SKILL.md",
|
|
1971
|
-
"size":
|
|
1972
|
-
"sha256": "
|
|
1986
|
+
"size": 12315,
|
|
1987
|
+
"sha256": "40c3a0098a3729b839e132db3987ad0f3fc5f3eeaf1c5a56dd77673cffdb5dbd"
|
|
1973
1988
|
},
|
|
1974
1989
|
{
|
|
1975
1990
|
"path": "skills/manage-project/references/pre_submission_checklist.md",
|
|
@@ -2018,13 +2033,13 @@
|
|
|
2018
2033
|
},
|
|
2019
2034
|
{
|
|
2020
2035
|
"path": "skills/manage-refs/SKILL.md",
|
|
2021
|
-
"size":
|
|
2022
|
-
"sha256": "
|
|
2036
|
+
"size": 18165,
|
|
2037
|
+
"sha256": "49adc82dea2b5d7eb93946b2cd8143d66d50b53169d3ed18f4bd738bfe3af39f"
|
|
2023
2038
|
},
|
|
2024
2039
|
{
|
|
2025
2040
|
"path": "skills/manage-refs/citation_styles/README.md",
|
|
2026
|
-
"size":
|
|
2027
|
-
"sha256": "
|
|
2041
|
+
"size": 2205,
|
|
2042
|
+
"sha256": "d957bbdd13df10884fd54f9eb4efb096a73824c69167f872b0cb9819be031cdf"
|
|
2028
2043
|
},
|
|
2029
2044
|
{
|
|
2030
2045
|
"path": "skills/manage-refs/citation_styles/american-journal-of-roentgenology.csl",
|
|
@@ -2061,6 +2076,11 @@
|
|
|
2061
2076
|
"size": 5849,
|
|
2062
2077
|
"sha256": "edde670da20212820d54649dcb96594db835eb55498e88c7de41891dfb370114"
|
|
2063
2078
|
},
|
|
2079
|
+
{
|
|
2080
|
+
"path": "skills/manage-refs/citation_styles/liver-international.csl",
|
|
2081
|
+
"size": 18264,
|
|
2082
|
+
"sha256": "c7c144ff5df948fc09c9604bf9f8269c6cd427c29bd043da6ead24e75c80971f"
|
|
2083
|
+
},
|
|
2064
2084
|
{
|
|
2065
2085
|
"path": "skills/manage-refs/citation_styles/nature.csl",
|
|
2066
2086
|
"size": 6444,
|
|
@@ -2118,8 +2138,13 @@
|
|
|
2118
2138
|
},
|
|
2119
2139
|
{
|
|
2120
2140
|
"path": "skills/manage-refs/scripts/check_csl_render.py",
|
|
2121
|
-
"size":
|
|
2122
|
-
"sha256": "
|
|
2141
|
+
"size": 7718,
|
|
2142
|
+
"sha256": "a1848c33e945024719fa1b7cc996d37555801e011c6280be6644ae4f01642601"
|
|
2143
|
+
},
|
|
2144
|
+
{
|
|
2145
|
+
"path": "skills/manage-refs/scripts/check_reference_duplication.py",
|
|
2146
|
+
"size": 10210,
|
|
2147
|
+
"sha256": "439f02252338e204dadf24dc4de13e38ab3ce7b6ea394e8dee38b8ee1cf92524"
|
|
2123
2148
|
},
|
|
2124
2149
|
{
|
|
2125
2150
|
"path": "skills/manage-refs/scripts/check_xref.py",
|
|
@@ -2158,8 +2183,8 @@
|
|
|
2158
2183
|
},
|
|
2159
2184
|
{
|
|
2160
2185
|
"path": "skills/meta-analysis/SKILL.md",
|
|
2161
|
-
"size":
|
|
2162
|
-
"sha256": "
|
|
2186
|
+
"size": 49604,
|
|
2187
|
+
"sha256": "4947eae188dc2fcfba68ed991ba126da8315dc79deac0649d2beea558e73025e"
|
|
2163
2188
|
},
|
|
2164
2189
|
{
|
|
2165
2190
|
"path": "skills/meta-analysis/references/LICENSES.md",
|
|
@@ -2216,6 +2241,11 @@
|
|
|
2216
2241
|
"size": 5538,
|
|
2217
2242
|
"sha256": "9b2dc03572cb066528e1ef19b0699ec9434ec29cbad8b84f5fbab1492ead5480"
|
|
2218
2243
|
},
|
|
2244
|
+
{
|
|
2245
|
+
"path": "skills/meta-analysis/references/empirical_lessons.md",
|
|
2246
|
+
"size": 7616,
|
|
2247
|
+
"sha256": "f49ebc21369095d19661d39186d1c41368811fbe689c77762daf60c74cd73ee8"
|
|
2248
|
+
},
|
|
2219
2249
|
{
|
|
2220
2250
|
"path": "skills/meta-analysis/references/icmje_coi_guide.md",
|
|
2221
2251
|
"size": 6043,
|
|
@@ -2773,8 +2803,8 @@
|
|
|
2773
2803
|
},
|
|
2774
2804
|
{
|
|
2775
2805
|
"path": "skills/revise/SKILL.md",
|
|
2776
|
-
"size":
|
|
2777
|
-
"sha256": "
|
|
2806
|
+
"size": 27775,
|
|
2807
|
+
"sha256": "2da4f80e879c2d3ff31d2af435cb27ecae0ba09f1014b8ebbd799ac2472ff1ea"
|
|
2778
2808
|
},
|
|
2779
2809
|
{
|
|
2780
2810
|
"path": "skills/revise/references/r2r_voice.md",
|
|
@@ -2848,8 +2878,8 @@
|
|
|
2848
2878
|
},
|
|
2849
2879
|
{
|
|
2850
2880
|
"path": "skills/self-review/SKILL.md",
|
|
2851
|
-
"size":
|
|
2852
|
-
"sha256": "
|
|
2881
|
+
"size": 93517,
|
|
2882
|
+
"sha256": "92b6e1c0e6cdaa27d5f033ce28e58a210208d619793d221f9ffa944aa1055bba"
|
|
2853
2883
|
},
|
|
2854
2884
|
{
|
|
2855
2885
|
"path": "skills/self-review/references/domain-probes/ai_overclaiming.md",
|
|
@@ -2961,6 +2991,16 @@
|
|
|
2961
2991
|
"size": 17113,
|
|
2962
2992
|
"sha256": "56096c39ddb0083c04a1254f06bafa6fac9fc8a136c9246f68773f0ba5da96d4"
|
|
2963
2993
|
},
|
|
2994
|
+
{
|
|
2995
|
+
"path": "skills/self-review/scripts/check_binning_consistency.py",
|
|
2996
|
+
"size": 19541,
|
|
2997
|
+
"sha256": "e3bf7dd2e0871ce6905abc1d33a26c7afac76a93d184bfe2d431af97d0622f74"
|
|
2998
|
+
},
|
|
2999
|
+
{
|
|
3000
|
+
"path": "skills/self-review/scripts/check_citation_order.py",
|
|
3001
|
+
"size": 8705,
|
|
3002
|
+
"sha256": "38525b4dd3ca8c9d99f090e4d42b65f10baf442f56fc4eac5174fb6ba13d90bb"
|
|
3003
|
+
},
|
|
2964
3004
|
{
|
|
2965
3005
|
"path": "skills/self-review/scripts/check_claim_artifact.py",
|
|
2966
3006
|
"size": 10757,
|
|
@@ -2968,8 +3008,8 @@
|
|
|
2968
3008
|
},
|
|
2969
3009
|
{
|
|
2970
3010
|
"path": "skills/self-review/scripts/check_classical_style.py",
|
|
2971
|
-
"size":
|
|
2972
|
-
"sha256": "
|
|
3011
|
+
"size": 12210,
|
|
3012
|
+
"sha256": "c973ee8b776f28515439fb185e1254e08e62c2e1410e260f18a824241a331af0"
|
|
2973
3013
|
},
|
|
2974
3014
|
{
|
|
2975
3015
|
"path": "skills/self-review/scripts/check_cohort_arithmetic.py",
|
|
@@ -3038,14 +3078,19 @@
|
|
|
3038
3078
|
},
|
|
3039
3079
|
{
|
|
3040
3080
|
"path": "skills/sync-submission/SKILL.md",
|
|
3041
|
-
"size":
|
|
3042
|
-
"sha256": "
|
|
3081
|
+
"size": 27787,
|
|
3082
|
+
"sha256": "4da14c76c6c9326d31ee93e9515854291cba2c48692eb85cf5d9f6f4301ce465"
|
|
3043
3083
|
},
|
|
3044
3084
|
{
|
|
3045
3085
|
"path": "skills/sync-submission/references/journal_availability_policy.json",
|
|
3046
3086
|
"size": 1257,
|
|
3047
3087
|
"sha256": "6d278675d7c734aa3589165817f5413cc46c44402ea15039e51052ab2f52c0a8"
|
|
3048
3088
|
},
|
|
3089
|
+
{
|
|
3090
|
+
"path": "skills/sync-submission/scripts/_yaml_frontmatter.py",
|
|
3091
|
+
"size": 1669,
|
|
3092
|
+
"sha256": "028fa8c4f7a4440c72d693a2ba6d4799410de0c565c61bd30d68eb0e7c208c78"
|
|
3093
|
+
},
|
|
3049
3094
|
{
|
|
3050
3095
|
"path": "skills/sync-submission/scripts/assemble_supplement.py",
|
|
3051
3096
|
"size": 8979,
|
|
@@ -3066,6 +3111,11 @@
|
|
|
3066
3111
|
"size": 13869,
|
|
3067
3112
|
"sha256": "caba039c6cfbfa09aec681a9840c7e0b5650cccdf9e00ddfd869557b0fec57c8"
|
|
3068
3113
|
},
|
|
3114
|
+
{
|
|
3115
|
+
"path": "skills/sync-submission/scripts/check_checklist_dump_leak.py",
|
|
3116
|
+
"size": 8745,
|
|
3117
|
+
"sha256": "320765b9e975601fc2d73ce15a65b1419668982630c3b7546d0909158e5a5374"
|
|
3118
|
+
},
|
|
3069
3119
|
{
|
|
3070
3120
|
"path": "skills/sync-submission/scripts/check_cross_artifact_stale.py",
|
|
3071
3121
|
"size": 8286,
|
|
@@ -3078,13 +3128,13 @@
|
|
|
3078
3128
|
},
|
|
3079
3129
|
{
|
|
3080
3130
|
"path": "skills/sync-submission/scripts/check_wordcount_cap.py",
|
|
3081
|
-
"size":
|
|
3082
|
-
"sha256": "
|
|
3131
|
+
"size": 9788,
|
|
3132
|
+
"sha256": "16fecbceae672e4192a138a0509321ea367f61079ca4ef4d630667b1e64eda58"
|
|
3083
3133
|
},
|
|
3084
3134
|
{
|
|
3085
3135
|
"path": "skills/sync-submission/scripts/cover_letter_drift_check.py",
|
|
3086
|
-
"size":
|
|
3087
|
-
"sha256": "
|
|
3136
|
+
"size": 16001,
|
|
3137
|
+
"sha256": "347c5b702fbe9375899795791a34e8a60e246253bafb53b15ebb59d51dd45e7d"
|
|
3088
3138
|
},
|
|
3089
3139
|
{
|
|
3090
3140
|
"path": "skills/sync-submission/scripts/cross_document_n_check.py",
|
|
@@ -3098,8 +3148,8 @@
|
|
|
3098
3148
|
},
|
|
3099
3149
|
{
|
|
3100
3150
|
"path": "skills/sync-submission/scripts/preflight_gate.py",
|
|
3101
|
-
"size":
|
|
3102
|
-
"sha256": "
|
|
3151
|
+
"size": 20954,
|
|
3152
|
+
"sha256": "f4be9edf587ec5ea2b7fb782e4912173b78a4fe2035720e6d0681b3d8f36340f"
|
|
3103
3153
|
},
|
|
3104
3154
|
{
|
|
3105
3155
|
"path": "skills/sync-submission/scripts/scope_drift_check.py",
|
|
@@ -3418,8 +3468,8 @@
|
|
|
3418
3468
|
},
|
|
3419
3469
|
{
|
|
3420
3470
|
"path": "skills/write-paper/references/journal_profiles/KJR.md",
|
|
3421
|
-
"size":
|
|
3422
|
-
"sha256": "
|
|
3471
|
+
"size": 12737,
|
|
3472
|
+
"sha256": "ea71f1be90ff7088ba8931c97515f221486ca7ac9c7079fefba25417e7a0e932"
|
|
3423
3473
|
},
|
|
3424
3474
|
{
|
|
3425
3475
|
"path": "skills/write-paper/references/journal_profiles/Korean_Circulation_Journal.md",
|
|
@@ -3438,8 +3488,8 @@
|
|
|
3438
3488
|
},
|
|
3439
3489
|
{
|
|
3440
3490
|
"path": "skills/write-paper/references/journal_profiles/Liver_International.md",
|
|
3441
|
-
"size":
|
|
3442
|
-
"sha256": "
|
|
3491
|
+
"size": 12174,
|
|
3492
|
+
"sha256": "4a12d53605045b20aabc827e4b803edd73f603b6112ced34ec8acfef965950aa"
|
|
3443
3493
|
},
|
|
3444
3494
|
{
|
|
3445
3495
|
"path": "skills/write-paper/references/journal_profiles/Medical_Image_Analysis.md",
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "medsci-skills",
|
|
3
|
-
"version": "4.
|
|
3
|
+
"version": "4.9.0",
|
|
4
4
|
"description": "MedSci Skills — a medical/scientific research skill suite for AI coding agents (Claude Code, Codex, Cursor, Copilot). The npm package is a terminal-friendly installer shortcut; the canonical distribution remains the GitHub repository and the Claude Code plugin marketplace.",
|
|
5
5
|
"license": "SEE LICENSE IN LICENSE",
|
|
6
6
|
"homepage": "https://github.com/Aperivue/medsci-skills#readme",
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
# Skill Script Maintenance — taxonomy & wiring rules
|
|
2
|
+
|
|
3
|
+
Every `.py`/`.sh` under `skills/*/scripts/` and `skills/*/tests/` falls into one of
|
|
4
|
+
four categories. Misclassifying one is how a detector goes "dormant" (counted in the
|
|
5
|
+
catalog but never invoked) or how a regression test gives false coverage (exists but
|
|
6
|
+
never runs in CI). This file is the source of truth for which is which and what each
|
|
7
|
+
category must satisfy.
|
|
8
|
+
|
|
9
|
+
## 1. Counted analysis-integrity detector
|
|
10
|
+
|
|
11
|
+
A script whose **filename** matches the catalog glob — `check_*.py`, `detect_*.py`,
|
|
12
|
+
`derive_*.py`, or `verify_refs.py` — under `skills/*/scripts/`. The glob is the SSOT:
|
|
13
|
+
`scripts/gen_detectors_catalog_json.py` and `scripts/validate_catalog_consistency.py`
|
|
14
|
+
both count these and they must agree with `metadata/catalog_counts.json`
|
|
15
|
+
(`integrity_detectors`).
|
|
16
|
+
|
|
17
|
+
A counted detector MUST:
|
|
18
|
+
- be registered in `scripts/gen_detectors_catalog_json.py` `FAMILY_BY_ID` (an unmapped id
|
|
19
|
+
fails generation), and bump `metadata/catalog_counts.json` + `MEDSCI_AUDIT.md` when added;
|
|
20
|
+
- be **invoked** from its skill's `SKILL.md` (a named workflow step) — otherwise it is
|
|
21
|
+
dormant (counted but never run on a real manuscript);
|
|
22
|
+
- have a **CI-wired** regression test (a `tests/test_*.sh` step in
|
|
23
|
+
`.github/workflows/validate.yml`) with PII-free synthetic fixtures.
|
|
24
|
+
|
|
25
|
+
> Naming trap: a reusable helper must NOT be named `check_*`/`detect_*`/`derive_*` or it inflates the
|
|
26
|
+
> detector count. Prefix helpers with `_` (see category 2).
|
|
27
|
+
|
|
28
|
+
## 2. Helper / library module
|
|
29
|
+
|
|
30
|
+
Shared logic imported by other scripts in the **same** skill (skills are self-contained —
|
|
31
|
+
no cross-skill imports). Name it with a leading underscore (`_yaml_frontmatter.py`) or a
|
|
32
|
+
plain verb (`fill_journal_abbrev.py`) so the detector glob never counts it. Helpers do not
|
|
33
|
+
need their own SKILL.md step, but if a user runs them directly they should be listed in the
|
|
34
|
+
skill's tool table (e.g. `manage-refs` documents `fill_journal_abbrev.py`).
|
|
35
|
+
|
|
36
|
+
## 3. Run-once authoring tool
|
|
37
|
+
|
|
38
|
+
A generator a maintainer runs by hand to (re)build a committed asset — NOT invoked at skill
|
|
39
|
+
invocation. These are intentionally not wired into any SKILL.md step. Keep them; document
|
|
40
|
+
their purpose in their own docstring. Current run-once tools:
|
|
41
|
+
|
|
42
|
+
- `skills/make-figures/scripts/build_jacc_template.py` — rebuilds the committed JACC Central
|
|
43
|
+
Illustration PPTX template (`references/visual_abstract_templates/jacc_central_illustration.pptx`).
|
|
44
|
+
- `skills/make-figures/scripts/extract_exemplar_from_pdf.py` — extracts a figure region from a
|
|
45
|
+
PDF page to grow the make-figures Critic-Loop exemplar reference set.
|
|
46
|
+
|
|
47
|
+
## 4. Test fixture / regression test
|
|
48
|
+
|
|
49
|
+
Lives under `skills/<skill>/tests/`. A `test_*.sh`/`test_*.py` is only real coverage if it
|
|
50
|
+
is wired into `.github/workflows/validate.yml` as a `run:` step. **Adding a test file is not
|
|
51
|
+
enough** — if it is not listed in `validate.yml` it never runs and gives false confidence.
|
|
52
|
+
When you add a detector and its test in the same PR, add the `validate.yml` step in that PR.
|
|
53
|
+
|
|
54
|
+
## When you touch a skill script — checklist
|
|
55
|
+
|
|
56
|
+
1. New `check_*`/`detect_*`/`derive_*` detector → register in `gen_detectors_catalog_json.py`
|
|
57
|
+
(`FAMILY_BY_ID`) + bump `catalog_counts.json` + `MEDSCI_AUDIT.md` + wire into the skill's
|
|
58
|
+
`SKILL.md` + add a CI-wired test. Then run all three generators in `--check` mode.
|
|
59
|
+
2. New helper → underscore/plain name (never `check_*`/`detect_*`/`derive_*`), import only within the same skill.
|
|
60
|
+
3. New asset/fixture file → re-run `python3 scripts/gen_distribution_manifest.py` (it tracks
|
|
61
|
+
payload files and hashes; tests are excluded from the distributed payload but the manifest
|
|
62
|
+
`--check` still gates on edited payload scripts).
|
|
63
|
+
4. New/edited test → add its `run:` step to `.github/workflows/validate.yml`.
|
|
64
|
+
|
|
65
|
+
Run the full local CI-mirror before pushing (see the repo `CONTRIBUTING.md` / `validate.yml`
|
|
66
|
+
gates): `validate_skills.sh`, the three `gen_*.py --check`, `validate_catalog_consistency.py`,
|
|
67
|
+
`check_version_consistency.py`, `gen_skill_docs.py --check`, `check_locale_inventory.py`,
|
|
68
|
+
`validate_routing_assets.py --strict`, and the installer tests.
|
|
@@ -55,6 +55,8 @@ from `analysis_guides/` to ensure correct methodology and reporting.
|
|
|
55
55
|
|
|
56
56
|
### Phase 2: Analysis Plan
|
|
57
57
|
|
|
58
|
+
**Precondition (observational studies).** Before proposing an analysis plan for an observational design (cohort, case-control, cross-sectional, registry, or survey), confirm that a literature-grounded variable operationalization exists — a `variable_operationalization.md` from `/define-variables`, or an equivalent codebook-backed definition table. If none exists, **warn** the user and recommend running `/define-variables` first, so exposure / outcome / covariate definitions and cutoffs are citation-backed rather than invented ad hoc from the data dictionary (ad-hoc phenotype/cutoff definitions are a common reviewer-rejection trigger for observational work — see the dictionary-first discipline). This is a WARN, not a hard block: proceed on explicit user confirmation, recording that the operationalization artifact was not available. For stricter projects, treat the missing artifact as a hard stop until `/define-variables` has run. (This mirrors the same precondition already enforced in `/write-protocol` before drafting Methods.)
|
|
59
|
+
|
|
58
60
|
Based on the data structure and research question, propose an analysis plan:
|
|
59
61
|
|
|
60
62
|
1. **Auto-detect analysis type** from the table below, or accept user specification.
|
|
@@ -115,13 +115,15 @@ user specification.
|
|
|
115
115
|
| Quality of systematic reviews | AMSTAR 2 | ROBIS |
|
|
116
116
|
| Radiomics study | CLEAR | CLAIM 2024 (if deep learning component) |
|
|
117
117
|
| Educational / QI study | SQUIRE 2.0 | -- |
|
|
118
|
+
| Generative AI **images ARE the study object** (realism / real-vs-synthetic reader study / model-vs-model quality) | (no single guideline -- assemble) | see decision aid below |
|
|
118
119
|
|
|
119
120
|
**Rules:**
|
|
120
121
|
- If the study involves AI/ML, always apply the AI extension in addition to the base guideline.
|
|
121
122
|
- **Exception — TRIPOD**: TRIPOD+AI 2024 (Collins et al., BMJ 2024) is a complete rewrite, not an addendum to TRIPOD 2015 (Moons et al., Ann Intern Med 2015). For non-AI prediction models, use TRIPOD 2015 only. For AI/ML prediction models, use TRIPOD+AI 2024 only. Do NOT apply both simultaneously.
|
|
122
123
|
- **STARD-AI** (Sounderajah et al., Nat Med 2025) extends STARD 2015 with 14 new and 4 modified items (40 total). For AI diagnostic accuracy studies, use STARD-AI (which incorporates all STARD 2015 items). Do NOT apply both STARD 2015 and STARD-AI simultaneously — STARD-AI supersedes STARD 2015 for AI studies.
|
|
123
124
|
- **TRIPOD-LLM** (Gallifant et al., Nat Med 2025) is the reporting guideline for studies that develop, fine-tune, prompt, or evaluate a large language model for a clinical/biomedical task. It extends the TRIPOD family (TRIPOD 2015 → TRIPOD+AI 2024 → TRIPOD-LLM 2025); name the base instrument and the extension and cite each. It is modular — task-specific items (Annotation, Prompting, Summarization, Instruction-tuning) are N/A when that component is absent. Use TRIPOD-LLM for LLM studies in place of TRIPOD+AI; pair with MI-CLEAR-LLM when LLM accuracy is an evaluated outcome. The vendored checklist is an educational summary (own-words paraphrase of item intent); complete the official instrument for a submission checklist.
|
|
124
|
-
- **MI-CLEAR-LLM** is a supplementary checklist (6 items), not a standalone reporting guideline. Always pair it with the study's primary guideline (e.g., STARD-AI for AI diagnostic accuracy, CLAIM for imaging AI). Apply MI-CLEAR-LLM whenever the study evaluates LLM accuracy as an outcome — do NOT apply it merely because the manuscript was written with LLM assistance.
|
|
125
|
+
- **MI-CLEAR-LLM** is a supplementary checklist (6 items), not a standalone reporting guideline. Always pair it with the study's primary guideline (e.g., STARD-AI for AI diagnostic accuracy, CLAIM for imaging AI). Apply MI-CLEAR-LLM whenever the study evaluates LLM accuracy as an outcome — do NOT apply it merely because the manuscript was written with LLM assistance. Its scope is **LLM accuracy** studies (including VLMs interpreting images); it does **not** apply at study level to studies where a generative model *produces* the images under study (see next bullet).
|
|
126
|
+
- **Generative-AI images as the study object** (a generative model synthesizes images and the study evaluates their realism, controllability, real-vs-synthetic distinguishability, or model-vs-model quality) has **no single dominant checklist**. Assemble: CLAIM 2024 (imaging-AI umbrella; model-development items N/A when commercial models are used as-is) + FUTURE-AI traceability + MI-CLEAR-LLM **transparency items only** (prompt/model/version/params/runs — for generation provenance, not study-level compliance) on the generator side; STARD-AI (for real-vs-synthetic detection) + GRRAS (reader reliability) + MRMC reporting on the evaluation side. Map applicable items and cite base + extension; never claim wholesale compliance. Full decision aid: `${CLAUDE_SKILL_DIR}/references/genai_image_study_object_decision_aid.md`.
|
|
125
127
|
- If multiple guidelines apply (e.g., a diagnostic accuracy study that is also an AI study), check against all relevant guidelines and merge into one report.
|
|
126
128
|
- If the user requests a specific guideline, use that one regardless of auto-detection.
|
|
127
129
|
|
|
@@ -246,6 +248,23 @@ study's data integrity immediately.
|
|
|
246
248
|
- Reasons for exclusion (Methods + Figure legend) agree on counts and category names.
|
|
247
249
|
|
|
248
250
|
**Procedure:**
|
|
251
|
+
|
|
252
|
+
Run the deterministic implementation first — it performs steps 1, 4, 5, and 6 below
|
|
253
|
+
automatically (same keyword regex, the four arithmetic equations, the body↔figure
|
|
254
|
+
cross-reference) and writes `qc/prisma_figure_audit.json`:
|
|
255
|
+
|
|
256
|
+
```bash
|
|
257
|
+
python3 ${CLAUDE_SKILL_DIR}/scripts/check_prisma_figure.py \
|
|
258
|
+
--md <manuscript.md> --figure <Figure 1 source: .md manifest / caption / text export> \
|
|
259
|
+
--out qc/prisma_figure_audit.json
|
|
260
|
+
```
|
|
261
|
+
|
|
262
|
+
Exit `0` = all checks consistent (`audit_safe` is true in the JSON); exit `1` = an arithmetic or cross-reference MISMATCH (log a Part C Action Item labelled
|
|
263
|
+
`[PRISMA-FIGURE]`, `fixable_by_ai: false` — the author must reconcile the numbers); exit
|
|
264
|
+
`2` = missing/unparsable input. The manual algorithm below documents exactly what the
|
|
265
|
+
script checks and is the fallback when Figure 1 numbers live only in a PNG/SVG that must
|
|
266
|
+
be transcribed by hand:
|
|
267
|
+
|
|
249
268
|
1. Extract numbers from manuscript Results / PRISMA flow paragraph (regex: integers near
|
|
250
269
|
keywords `identified`, `duplicates`, `screened`, `excluded`, `sought`, `retrieved`,
|
|
251
270
|
`assessed`, `included`).
|
|
@@ -323,9 +342,22 @@ critical item and the journal's own required elements.
|
|
|
323
342
|
|
|
324
343
|
Produce a structured compliance report in two parts.
|
|
325
344
|
|
|
345
|
+
This report is an **internal working audit** — it carries auto-fix annotations, a
|
|
346
|
+
machine-readable JSON block (`compliance_pct`, `fixable_by_ai`, …), and Action
|
|
347
|
+
Items. It is **NOT** the official reporting checklist a journal expects (that is
|
|
348
|
+
the blank guideline form with `Item | Recommendation | Reported in page/section`,
|
|
349
|
+
which the authors fill in). Never submit this report as the submission checklist.
|
|
350
|
+
To make the file self-identifying so it cannot be reused by filename into a later
|
|
351
|
+
submission package, **the report MUST begin with the NOT-FOR-SUBMISSION banner
|
|
352
|
+
below** as its very first line. (`/sync-submission`'s `check_checklist_dump_leak`
|
|
353
|
+
gate also catches this dump if it ever lands in a submission directory.)
|
|
354
|
+
|
|
326
355
|
#### Part A: Summary
|
|
327
356
|
|
|
328
357
|
```
|
|
358
|
+
<!-- INTERNAL AUDIT — NOT FOR SUBMISSION. This is the /check-reporting working
|
|
359
|
+
report, not the official journal checklist. Do not upload to a submission portal. -->
|
|
360
|
+
|
|
329
361
|
## Reporting Guideline Compliance Report
|
|
330
362
|
|
|
331
363
|
Manuscript: {title}
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# Decision aid — reporting studies where generative AI images ARE the study object
|
|
2
|
+
|
|
3
|
+
**When this applies:** the study evaluates **images that a generative AI model synthesized**
|
|
4
|
+
(realism, controllability/steerability, whether human readers can distinguish synthetic from
|
|
5
|
+
real, or model-vs-model quality). The generative model's *output* is the object under study.
|
|
6
|
+
|
|
7
|
+
**When this does NOT apply:** a model (incl. a vision-language model) *interprets* images and
|
|
8
|
+
you measure its diagnostic accuracy — that is an AI-accuracy study; use the relevant
|
|
9
|
+
accuracy guideline directly (e.g., STARD-AI, CLAIM, TRIPOD+AI, MI-CLEAR-LLM).
|
|
10
|
+
|
|
11
|
+
## There is no single dominant checklist for this study type
|
|
12
|
+
Generative-image-as-study-object work (e.g., RSNA reader studies on AI-synthesized or
|
|
13
|
+
"deepfake" medical images) is reported by **assembling** existing guidelines plus a precedent
|
|
14
|
+
bar. Do not claim wholesale compliance with any one checklist; map applicable items and cite
|
|
15
|
+
the base guideline together with any AI extension (verify each item against the published
|
|
16
|
+
source — never invent items).
|
|
17
|
+
|
|
18
|
+
### Generator / provenance side
|
|
19
|
+
- **CLAIM 2024** — medical-imaging-AI umbrella; the 2024 revision covers generative/foundation
|
|
20
|
+
models. If commercial models are used **as-is** (no training/fine-tuning by the authors),
|
|
21
|
+
the model-development / training / validation-split items are **N/A**; report data sources,
|
|
22
|
+
reference/real comparators, evaluation, transparency, and limitations.
|
|
23
|
+
- **FUTURE-AI** — use the **Traceability** principle: persist verbatim prompts, a generation
|
|
24
|
+
manifest, model + version + access date, and parameters (a prompt/generation registry).
|
|
25
|
+
- **MI-CLEAR-LLM — transparency *items* only, not study-level compliance.** MI-CLEAR-LLM is
|
|
26
|
+
scoped to **LLM *accuracy* studies in healthcare** (including VLMs interpreting images); it
|
|
27
|
+
is **not** a guideline for generative-output studies. Borrow its reporting *items* for
|
|
28
|
+
prompt-driven foundation models — verbatim prompt(s), model name + version + access date,
|
|
29
|
+
access channel/API, sampling parameters, number of runs, handling of non-determinism,
|
|
30
|
+
responsible party — to document generation provenance. Cite it as the basis for prompt
|
|
31
|
+
logging, not as the study's reporting guideline.
|
|
32
|
+
|
|
33
|
+
### Reader / evaluation side
|
|
34
|
+
- **STARD 2015 + STARD-AI** — if the reader task is **real-vs-synthetic discrimination**, that
|
|
35
|
+
is a diagnostic-accuracy structure: report the reference standard (what counts as
|
|
36
|
+
"real"/"synthetic"), reader blinding, flow, and accuracy with intervals. Cite base STARD
|
|
37
|
+
**and** the STARD-AI extension.
|
|
38
|
+
- **GRRAS** (Guidelines for Reporting Reliability and Agreement Studies) — for inter-reader
|
|
39
|
+
feature/quality ratings: number and qualification of readers, blinding, the agreement
|
|
40
|
+
statistic (ICC / weighted kappa) with 95% CI, and separate reporting of any anchor/control
|
|
41
|
+
items.
|
|
42
|
+
- **MRMC reporting** — for multi-reader multi-case designs: a-priori power, per-reader
|
|
43
|
+
randomization/seed, and a real-control arm matched on non-content attributes (resolution,
|
|
44
|
+
cropping, compression) so a format-only classifier cannot rival the readers.
|
|
45
|
+
|
|
46
|
+
### Precedent bar (de-facto standard for this study type)
|
|
47
|
+
Match the methodological bar set by published generative-image-as-study-object reader studies
|
|
48
|
+
in high-impact radiology venues: a-priori power, MRMC reader platform with per-reader seeds,
|
|
49
|
+
real-control matching on non-content attributes, and **explicit, pre-specified handling of
|
|
50
|
+
failed / low-quality generations** (count them rather than silently excluding survivors).
|
|
51
|
+
|
|
52
|
+
## Cross-cutting cautions
|
|
53
|
+
- **No overclaim:** state which items of which guideline the study satisfies, verified against
|
|
54
|
+
the published checklist; do not assert blanket "reported per [guideline]".
|
|
55
|
+
- **Manuscript's own AI-use disclosure** (writing assistance) is separate from the study-object
|
|
56
|
+
reporting above — see ICMJE/COPE and the write-paper LLM-disclosure feature.
|
|
57
|
+
- **Pre-registration** of the primary estimand, frequency/realism references, and the
|
|
58
|
+
fresh-only firewall (pilot/calibration images excluded from the confirmatory set) belongs in
|
|
59
|
+
a study registry (e.g., OSF) for non-clinical reader studies — not PROSPERO (systematic
|
|
60
|
+
reviews) or a clinical-trial registry (no health-outcome intervention).
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
## PRISMA flow
|
|
2
|
+
|
|
3
|
+
A total of 1000 records identified through database searching. After 200 duplicates
|
|
4
|
+
removed, 800 records screened. Of these, 600 records excluded at screening, leaving
|
|
5
|
+
200 reports sought for retrieval. 10 reports not retrieved. 190 reports retrieved and
|
|
6
|
+
190 reports assessed for eligibility. 40 records excluded with reasons. 150 studies
|
|
7
|
+
included in the synthesis.
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
1000 records identified
|
|
2
|
+
200 duplicates removed
|
|
3
|
+
800 records screened
|
|
4
|
+
600 records excluded at screening
|
|
5
|
+
200 reports sought for retrieval
|
|
6
|
+
10 reports not retrieved
|
|
7
|
+
190 reports retrieved
|
|
8
|
+
190 reports assessed for eligibility
|
|
9
|
+
40 records excluded with reasons
|
|
10
|
+
150 studies included
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
1000 records identified
|
|
2
|
+
200 duplicates removed
|
|
3
|
+
800 records screened
|
|
4
|
+
600 records excluded at screening
|
|
5
|
+
200 reports sought for retrieval
|
|
6
|
+
10 reports not retrieved
|
|
7
|
+
190 reports retrieved
|
|
8
|
+
190 reports assessed for eligibility
|
|
9
|
+
40 records excluded with reasons
|
|
10
|
+
149 studies included
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# Regression test for the PRISMA Figure 1 arithmetic + cross-reference audit
|
|
3
|
+
# (check-reporting Step 4d / check_prisma_figure.py). Synthetic, PII-free fixtures.
|
|
4
|
+
# Stdlib-only (python3); no network, no pandoc.
|
|
5
|
+
set -u
|
|
6
|
+
|
|
7
|
+
HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
8
|
+
SCRIPT="$HERE/../scripts/check_prisma_figure.py"
|
|
9
|
+
BODY="$HERE/fixtures/prisma_body.md"
|
|
10
|
+
CLEAN="$HERE/fixtures/prisma_fig_clean.md"
|
|
11
|
+
MM="$HERE/fixtures/prisma_fig_mismatch.md"
|
|
12
|
+
OUT="$(mktemp -t prisma_fig_XXXX).json"
|
|
13
|
+
trap 'rm -f "$OUT" "${OUT%.json}"' EXIT  # mktemp created the un-suffixed placeholder; OUT is that path + ".json", so remove both
|
|
14
|
+
|
|
15
|
+
for f in "$SCRIPT" "$BODY" "$CLEAN" "$MM"; do
|
|
16
|
+
[[ -f "$f" ]] || { echo "ENV-ERR: missing $f" >&2; exit 2; }
|
|
17
|
+
done
|
|
18
|
+
|
|
19
|
+
fail=0
|
|
20
|
+
pass() { local label=$1; printf ' PASS %s\n' "$label"; }  # report one passing check by name
|
|
21
|
+
bad() { local label=$1; printf ' FAIL %s\n' "$label"; fail=$((fail + 1)); }  # report a failing check and bump the global counter
|
|
22
|
+
|
|
23
|
+
echo "test_prisma_figure:"
|
|
24
|
+
|
|
25
|
+
# 1. Clean figure (numbers match body, arithmetic consistent) -> audit_safe, exit 0.
|
|
26
|
+
python3 "$SCRIPT" --md "$BODY" --figure "$CLEAN" --out "$OUT" >/dev/null 2>&1; rc=$?
|
|
27
|
+
if [[ $rc -eq 0 ]] && python3 -c "import json,sys; sys.exit(0 if json.load(open('$OUT'))['audit_safe'] else 1)"; then
|
|
28
|
+
pass "clean body/figure -> audit_safe, exit 0"
|
|
29
|
+
else
|
|
30
|
+
bad "clean case rc=$rc (expected 0 + audit_safe)"
|
|
31
|
+
fi
|
|
32
|
+
|
|
33
|
+
# 2. Mismatched figure (included 149 vs body 150) -> MISMATCH, exit 1, PRISMA-FIGURE flag.
|
|
34
|
+
out="$(python3 "$SCRIPT" --md "$BODY" --figure "$MM" --out "$OUT" 2>&1)"; rc=$?
|
|
35
|
+
if [[ $rc -eq 1 && "$out" == *"[PRISMA-FIGURE]"* ]] \
|
|
36
|
+
&& python3 -c "import json,sys; d=json.load(open('$OUT')); sys.exit(0 if (not d['audit_safe'] and d['action_items']) else 1)"; then
|
|
37
|
+
pass "mismatched figure -> MISMATCH flagged, exit 1"
|
|
38
|
+
else
|
|
39
|
+
bad "mismatch case rc=$rc (expected 1 + [PRISMA-FIGURE] + action_items)"
|
|
40
|
+
fi
|
|
41
|
+
|
|
42
|
+
# 3. Missing input -> clean error, exit 2 (no traceback).
|
|
43
|
+
err="$(python3 "$SCRIPT" --md /nonexistent_prisma.md --figure "$CLEAN" --out "$OUT" 2>&1)"; rc=$?
|
|
44
|
+
if [[ $rc -eq 2 && "$err" == *"not found"* && "$err" != *"Traceback"* ]]; then
|
|
45
|
+
pass "missing manuscript -> clean error, exit 2"
|
|
46
|
+
else
|
|
47
|
+
bad "missing-input case rc=$rc: $err"
|
|
48
|
+
fi
|
|
49
|
+
|
|
50
|
+
if [[ $fail -eq 0 ]]; then echo " OK"; exit 0; else echo " $fail check(s) failed"; exit 1; fi
|