medsci-skills 4.1.0

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (702)
  1. package/LICENSE +50 -0
  2. package/README.md +602 -0
  3. package/README_FIRST.md +27 -0
  4. package/bin/medsci-skills.js +159 -0
  5. package/installers/install-macos.command +19 -0
  6. package/installers/install-windows.cmd +26 -0
  7. package/installers/install-windows.ps1 +17 -0
  8. package/installers/install.py +218 -0
  9. package/metadata/skills_catalog.json +452 -0
  10. package/package.json +48 -0
  11. package/skills/academic-aio/SKILL.md +408 -0
  12. package/skills/academic-aio/references/case_studies/kjr_mllm_2025.md +82 -0
  13. package/skills/academic-aio/references/checklists/AIO_GENERAL.md +354 -0
  14. package/skills/academic-aio/references/journal_summarybox_templates.yaml +126 -0
  15. package/skills/academic-aio/references/oac_funding_checklist.yaml +129 -0
  16. package/skills/academic-aio/references/reporting_guideline_mapping.md +39 -0
  17. package/skills/academic-aio/references/schema_markup_templates/CodeRepository.jsonld +32 -0
  18. package/skills/academic-aio/references/schema_markup_templates/Dataset.jsonld +36 -0
  19. package/skills/academic-aio/references/schema_markup_templates/Person.jsonld +30 -0
  20. package/skills/academic-aio/references/schema_markup_templates/README.md +43 -0
  21. package/skills/academic-aio/references/schema_markup_templates/ScholarlyArticle.jsonld +55 -0
  22. package/skills/academic-aio/scripts/batch_metadata_audit.py +169 -0
  23. package/skills/academic-aio/scripts/validate_schema.py +118 -0
  24. package/skills/academic-aio/skill.yml +36 -0
  25. package/skills/academic-aio/templates/aio_audit_checklist.md.j2 +108 -0
  26. package/skills/add-journal/SKILL.md +482 -0
  27. package/skills/add-journal/skill.yml +33 -0
  28. package/skills/analyze-stats/SKILL.md +598 -0
  29. package/skills/analyze-stats/references/analysis_guides/missing_data.md +109 -0
  30. package/skills/analyze-stats/references/analysis_guides/nhis_icd10_mapping.md +247 -0
  31. package/skills/analyze-stats/references/analysis_guides/propensity_score.md +132 -0
  32. package/skills/analyze-stats/references/analysis_guides/regression.md +115 -0
  33. package/skills/analyze-stats/references/analysis_guides/repeated_measures.md +160 -0
  34. package/skills/analyze-stats/references/analysis_guides/survey_weighted.md +366 -0
  35. package/skills/analyze-stats/references/analysis_guides/test_selection.md +86 -0
  36. package/skills/analyze-stats/references/style/figure_style.mplstyle +69 -0
  37. package/skills/analyze-stats/references/style/theme_publication.R +147 -0
  38. package/skills/analyze-stats/references/table-standards/journal-profiles/ajr.yaml +51 -0
  39. package/skills/analyze-stats/references/table-standards/journal-profiles/european_radiology.yaml +55 -0
  40. package/skills/analyze-stats/references/table-standards/journal-profiles/jama.yaml +66 -0
  41. package/skills/analyze-stats/references/table-standards/journal-profiles/lancet.yaml +57 -0
  42. package/skills/analyze-stats/references/table-standards/journal-profiles/nejm.yaml +51 -0
  43. package/skills/analyze-stats/references/table-standards/journal-profiles/radiology.yaml +66 -0
  44. package/skills/analyze-stats/references/table-standards/table-standards.md +287 -0
  45. package/skills/analyze-stats/references/table-standards/table-types/diagnostic_accuracy.md +36 -0
  46. package/skills/analyze-stats/references/table-standards/table-types/meta_analysis.md +58 -0
  47. package/skills/analyze-stats/references/table-standards/table-types/model_comparison.md +36 -0
  48. package/skills/analyze-stats/references/table-standards/table-types/regression_results.md +50 -0
  49. package/skills/analyze-stats/references/table-standards/table-types/table1_demographics.md +51 -0
  50. package/skills/analyze-stats/references/table-standards/tool-comparison.md +79 -0
  51. package/skills/analyze-stats/references/templates/agreement_analysis.py +436 -0
  52. package/skills/analyze-stats/references/templates/dca_plot.R +237 -0
  53. package/skills/analyze-stats/references/templates/diagnostic_accuracy.py +401 -0
  54. package/skills/analyze-stats/references/templates/dta_meta_analysis.R +384 -0
  55. package/skills/analyze-stats/references/templates/forest_plot.py +412 -0
  56. package/skills/analyze-stats/references/templates/likert_summary.py +356 -0
  57. package/skills/analyze-stats/references/templates/meta_analysis.R +365 -0
  58. package/skills/analyze-stats/references/templates/propensity_score.py +478 -0
  59. package/skills/analyze-stats/references/templates/regression.py +425 -0
  60. package/skills/analyze-stats/references/templates/repeated_measures.py +434 -0
  61. package/skills/analyze-stats/references/templates/sample_size.R +382 -0
  62. package/skills/analyze-stats/references/templates/survey_weighted_analysis.py +411 -0
  63. package/skills/analyze-stats/references/templates/survival_analysis.py +325 -0
  64. package/skills/analyze-stats/references/templates/table1_demographics.py +287 -0
  65. package/skills/analyze-stats/scripts/check_generated_code.py +335 -0
  66. package/skills/analyze-stats/skill.yml +38 -0
  67. package/skills/analyze-stats/tests/fixtures/gen_bad.R +16 -0
  68. package/skills/analyze-stats/tests/fixtures/gen_bad.py +24 -0
  69. package/skills/analyze-stats/tests/fixtures/gen_clean.py +21 -0
  70. package/skills/analyze-stats/tests/test_generated_code.sh +59 -0
  71. package/skills/analyze-stats/tests/test_survival_template.sh +53 -0
  72. package/skills/author-strategy/SKILL.md +117 -0
  73. package/skills/author-strategy/analyze_patterns.py +303 -0
  74. package/skills/author-strategy/fetch_pubmed.py +374 -0
  75. package/skills/author-strategy/skill.yml +34 -0
  76. package/skills/batch-cohort/SKILL.md +223 -0
  77. package/skills/batch-cohort/references/base_template_knhanes.R +210 -0
  78. package/skills/batch-cohort/references/batch_template_generator.R +222 -0
  79. package/skills/batch-cohort/references/variable_coding_registry.md +136 -0
  80. package/skills/batch-cohort/skill.yml +35 -0
  81. package/skills/calc-sample-size/SKILL.md +491 -0
  82. package/skills/calc-sample-size/references/formulas.md +655 -0
  83. package/skills/calc-sample-size/references/observational_cohort.md +49 -0
  84. package/skills/calc-sample-size/skill.yml +51 -0
  85. package/skills/check-reporting/SKILL.md +534 -0
  86. package/skills/check-reporting/references/LICENSES.md +41 -0
  87. package/skills/check-reporting/references/checklists/AMSTAR2.md +54 -0
  88. package/skills/check-reporting/references/checklists/ARRIVE_2.md +234 -0
  89. package/skills/check-reporting/references/checklists/CARE.md +102 -0
  90. package/skills/check-reporting/references/checklists/CLAIM_2024.md +128 -0
  91. package/skills/check-reporting/references/checklists/CLEAR.md +113 -0
  92. package/skills/check-reporting/references/checklists/CONSORT.md +86 -0
  93. package/skills/check-reporting/references/checklists/COSMIN_RoB.md +136 -0
  94. package/skills/check-reporting/references/checklists/GRRAS.md +61 -0
  95. package/skills/check-reporting/references/checklists/MI_CLEAR_LLM.md +167 -0
  96. package/skills/check-reporting/references/checklists/MOOSE.md +85 -0
  97. package/skills/check-reporting/references/checklists/NOS.md +88 -0
  98. package/skills/check-reporting/references/checklists/PRISMA_2020.md +135 -0
  99. package/skills/check-reporting/references/checklists/PRISMA_DTA.md +36 -0
  100. package/skills/check-reporting/references/checklists/PRISMA_P.md +56 -0
  101. package/skills/check-reporting/references/checklists/PROBAST.md +75 -0
  102. package/skills/check-reporting/references/checklists/PROBAST_AI.md +130 -0
  103. package/skills/check-reporting/references/checklists/QUADAS2.md +77 -0
  104. package/skills/check-reporting/references/checklists/QUADAS_C.md +131 -0
  105. package/skills/check-reporting/references/checklists/ROBINS_E.md +179 -0
  106. package/skills/check-reporting/references/checklists/ROBINS_I.md +87 -0
  107. package/skills/check-reporting/references/checklists/ROBIS.md +114 -0
  108. package/skills/check-reporting/references/checklists/ROB_ME.md +126 -0
  109. package/skills/check-reporting/references/checklists/RoB2.md +79 -0
  110. package/skills/check-reporting/references/checklists/RoB_NMA.md +96 -0
  111. package/skills/check-reporting/references/checklists/SPIRIT.md +112 -0
  112. package/skills/check-reporting/references/checklists/SQUIRE_2.md +68 -0
  113. package/skills/check-reporting/references/checklists/STARD.md +129 -0
  114. package/skills/check-reporting/references/checklists/STARD_AI.md +211 -0
  115. package/skills/check-reporting/references/checklists/STROBE.md +80 -0
  116. package/skills/check-reporting/references/checklists/SWiM.md +33 -0
  117. package/skills/check-reporting/references/checklists/TRIPOD.md +157 -0
  118. package/skills/check-reporting/references/checklists/TRIPOD_AI.md +140 -0
  119. package/skills/check-reporting/references/step4c_registration_timing.md +93 -0
  120. package/skills/check-reporting/references/step4d_prisma_figure_audit.md +137 -0
  121. package/skills/check-reporting/scripts/check_checklist_exists.py +183 -0
  122. package/skills/check-reporting/scripts/check_checklist_version.py +168 -0
  123. package/skills/check-reporting/scripts/check_framework_naming.py +206 -0
  124. package/skills/check-reporting/scripts/check_prisma_figure.py +209 -0
  125. package/skills/check-reporting/scripts/prisma_cascade_check.py +274 -0
  126. package/skills/check-reporting/skill.yml +41 -0
  127. package/skills/check-reporting/tests/fixtures/framework_bad.md +8 -0
  128. package/skills/check-reporting/tests/fixtures/framework_clean.md +7 -0
  129. package/skills/check-reporting/tests/test_checklist_fail_fast.sh +77 -0
  130. package/skills/check-reporting/tests/test_checklist_version.sh +72 -0
  131. package/skills/check-reporting/tests/test_framework_naming.sh +45 -0
  132. package/skills/check-reporting/tests/test_prisma_cascade.sh +104 -0
  133. package/skills/clean-data/SKILL.md +180 -0
  134. package/skills/clean-data/references/cleaning_patterns.md +299 -0
  135. package/skills/clean-data/references/profiling_template.py +304 -0
  136. package/skills/clean-data/scripts/check_structural_zero.py +174 -0
  137. package/skills/clean-data/skill.yml +35 -0
  138. package/skills/clean-data/tests/fixtures/smoking.csv +8 -0
  139. package/skills/clean-data/tests/test_structural_zero.sh +49 -0
  140. package/skills/cross-national/SKILL.md +264 -0
  141. package/skills/cross-national/skill.yml +37 -0
  142. package/skills/define-variables/SKILL.md +146 -0
  143. package/skills/define-variables/references/common_definitions.md +190 -0
  144. package/skills/define-variables/skill.yml +34 -0
  145. package/skills/define-variables/templates/variable_operationalization.md +64 -0
  146. package/skills/deidentify/SKILL.md +203 -0
  147. package/skills/deidentify/deidentify.py +1224 -0
  148. package/skills/deidentify/locales/_template.json +45 -0
  149. package/skills/deidentify/locales/au.json +43 -0
  150. package/skills/deidentify/locales/ca.json +44 -0
  151. package/skills/deidentify/locales/cn.json +47 -0
  152. package/skills/deidentify/locales/de.json +48 -0
  153. package/skills/deidentify/locales/fr.json +48 -0
  154. package/skills/deidentify/locales/in.json +48 -0
  155. package/skills/deidentify/locales/jp.json +48 -0
  156. package/skills/deidentify/locales/kr.json +48 -0
  157. package/skills/deidentify/locales/uk.json +45 -0
  158. package/skills/deidentify/locales/us.json +43 -0
  159. package/skills/deidentify/references/date_shift_guide.md +82 -0
  160. package/skills/deidentify/references/hipaa_18_identifiers.md +48 -0
  161. package/skills/deidentify/references/korean_phi_patterns.md +135 -0
  162. package/skills/deidentify/skill.yml +43 -0
  163. package/skills/deidentify/tests/README.md +26 -0
  164. package/skills/deidentify/tests/test_clean.csv +16 -0
  165. package/skills/deidentify/tests/test_edge_cases.csv +11 -0
  166. package/skills/deidentify/tests/test_phi_korean.csv +11 -0
  167. package/skills/design-ai-benchmarking/SKILL.md +214 -0
  168. package/skills/design-ai-benchmarking/references/benchmark_export_schema.json +69 -0
  169. package/skills/design-ai-benchmarking/references/elicitation_rubric_template.md +37 -0
  170. package/skills/design-ai-benchmarking/skill.yml +38 -0
  171. package/skills/design-study/SKILL.md +298 -0
  172. package/skills/design-study/skill.yml +33 -0
  173. package/skills/fill-icmje-coi/SKILL.md +216 -0
  174. package/skills/fill-icmje-coi/scripts/fill_icmje_coi.py +140 -0
  175. package/skills/fill-icmje-coi/skill.yml +35 -0
  176. package/skills/fill-icmje-coi/templates/icmje_coi_seed_synthetic.docx +0 -0
  177. package/skills/fill-protocol/SKILL.md +248 -0
  178. package/skills/fill-protocol/examples/example_irb_template.yaml +53 -0
  179. package/skills/fill-protocol/references/best_practices.md +121 -0
  180. package/skills/fill-protocol/scripts/doc_to_docx.py +111 -0
  181. package/skills/fill-protocol/scripts/fill_form.py +611 -0
  182. package/skills/fill-protocol/scripts/inspect_template.py +61 -0
  183. package/skills/fill-protocol/setup.sh +162 -0
  184. package/skills/fill-protocol/skill.yml +37 -0
  185. package/skills/find-cohort-gap/SKILL.md +309 -0
  186. package/skills/find-cohort-gap/references/cohort_profile_template.md +93 -0
  187. package/skills/find-cohort-gap/references/onepager_template.md +84 -0
  188. package/skills/find-cohort-gap/references/pattern_scoring_rubric.md +169 -0
  189. package/skills/find-cohort-gap/references/saturation_query_templates.md +143 -0
  190. package/skills/find-cohort-gap/skill.yml +35 -0
  191. package/skills/find-journal/POLICY.md +87 -0
  192. package/skills/find-journal/SKILL.md +340 -0
  193. package/skills/find-journal/references/journal_profiles/AJNR.md +29 -0
  194. package/skills/find-journal/references/journal_profiles/AJR.md +30 -0
  195. package/skills/find-journal/references/journal_profiles/Abdominal_Radiology.md +30 -0
  196. package/skills/find-journal/references/journal_profiles/Academic_Radiology.md +30 -0
  197. package/skills/find-journal/references/journal_profiles/Annals_of_Internal_Medicine.md +33 -0
  198. package/skills/find-journal/references/journal_profiles/Artificial_Intelligence_in_Medicine.md +28 -0
  199. package/skills/find-journal/references/journal_profiles/BMC_Medicine.md +31 -0
  200. package/skills/find-journal/references/journal_profiles/British_Journal_of_Radiology.md +39 -0
  201. package/skills/find-journal/references/journal_profiles/CVIR.md +30 -0
  202. package/skills/find-journal/references/journal_profiles/Chest.md +39 -0
  203. package/skills/find-journal/references/journal_profiles/Clinical_Radiology.md +30 -0
  204. package/skills/find-journal/references/journal_profiles/Clinical_and_Molecular_Hepatology.md +32 -0
  205. package/skills/find-journal/references/journal_profiles/Diabetes_Metabolism_Journal.md +36 -0
  206. package/skills/find-journal/references/journal_profiles/Diagnostic_and_Interventional_Radiology.md +32 -0
  207. package/skills/find-journal/references/journal_profiles/Endocrinology_and_Metabolism.md +37 -0
  208. package/skills/find-journal/references/journal_profiles/European_Journal_of_Preventive_Cardiology.md +39 -0
  209. package/skills/find-journal/references/journal_profiles/European_Radiology.md +29 -0
  210. package/skills/find-journal/references/journal_profiles/Hepatology_Communications.md +40 -0
  211. package/skills/find-journal/references/journal_profiles/Hepatology_International.md +37 -0
  212. package/skills/find-journal/references/journal_profiles/IEEE_JBHI.md +28 -0
  213. package/skills/find-journal/references/journal_profiles/IEEE_TMI.md +28 -0
  214. package/skills/find-journal/references/journal_profiles/INSI.md +29 -0
  215. package/skills/find-journal/references/journal_profiles/Investigative_Radiology.md +25 -0
  216. package/skills/find-journal/references/journal_profiles/JACC_Advances.md +41 -0
  217. package/skills/find-journal/references/journal_profiles/JACC_Asia.md +30 -0
  218. package/skills/find-journal/references/journal_profiles/JACR.md +28 -0
  219. package/skills/find-journal/references/journal_profiles/JAMA.md +40 -0
  220. package/skills/find-journal/references/journal_profiles/JAMA_Network_Open.md +30 -0
  221. package/skills/find-journal/references/journal_profiles/JCSM.md +39 -0
  222. package/skills/find-journal/references/journal_profiles/JKMS.md +32 -0
  223. package/skills/find-journal/references/journal_profiles/JMIR.md +29 -0
  224. package/skills/find-journal/references/journal_profiles/JMIR_Medical_Education.md +29 -0
  225. package/skills/find-journal/references/journal_profiles/JNIS.md +35 -0
  226. package/skills/find-journal/references/journal_profiles/JVIR.md +31 -0
  227. package/skills/find-journal/references/journal_profiles/Journal_of_Biomedical_Informatics.md +29 -0
  228. package/skills/find-journal/references/journal_profiles/Journal_of_Clinical_Endocrinology_and_Metabolism.md +40 -0
  229. package/skills/find-journal/references/journal_profiles/Journal_of_Magnetic_Resonance_Imaging.md +30 -0
  230. package/skills/find-journal/references/journal_profiles/Journal_of_Nuclear_Medicine.md +31 -0
  231. package/skills/find-journal/references/journal_profiles/Journal_of_Stroke.md +32 -0
  232. package/skills/find-journal/references/journal_profiles/KJR.md +38 -0
  233. package/skills/find-journal/references/journal_profiles/Korean_Circulation_Journal.md +38 -0
  234. package/skills/find-journal/references/journal_profiles/Korean_Journal_of_Internal_Medicine.md +36 -0
  235. package/skills/find-journal/references/journal_profiles/Lancet_Diabetes_and_Endocrinology.md +40 -0
  236. package/skills/find-journal/references/journal_profiles/Lancet_Gastroenterology_and_Hepatology.md +49 -0
  237. package/skills/find-journal/references/journal_profiles/Lancet_Infectious_Diseases.md +38 -0
  238. package/skills/find-journal/references/journal_profiles/Lancet_Neurology.md +39 -0
  239. package/skills/find-journal/references/journal_profiles/Lancet_Oncology.md +40 -0
  240. package/skills/find-journal/references/journal_profiles/Lancet_Psychiatry.md +38 -0
  241. package/skills/find-journal/references/journal_profiles/Lancet_Public_Health.md +30 -0
  242. package/skills/find-journal/references/journal_profiles/Lancet_Respiratory_Medicine.md +39 -0
  243. package/skills/find-journal/references/journal_profiles/Liver_International.md +33 -0
  244. package/skills/find-journal/references/journal_profiles/Medical_Image_Analysis.md +28 -0
  245. package/skills/find-journal/references/journal_profiles/NEJM.md +33 -0
  246. package/skills/find-journal/references/journal_profiles/Nature_Machine_Intelligence.md +31 -0
  247. package/skills/find-journal/references/journal_profiles/Nature_Medicine.md +39 -0
  248. package/skills/find-journal/references/journal_profiles/Neuroradiology.md +31 -0
  249. package/skills/find-journal/references/journal_profiles/Nutrition_Metabolism_and_Cardiovascular_Diseases.md +39 -0
  250. package/skills/find-journal/references/journal_profiles/PLOS_Medicine.md +32 -0
  251. package/skills/find-journal/references/journal_profiles/RYAI.md +28 -0
  252. package/skills/find-journal/references/journal_profiles/Radiology.md +29 -0
  253. package/skills/find-journal/references/journal_profiles/Skeletal_Radiology.md +31 -0
  254. package/skills/find-journal/references/journal_profiles/Stroke.md +37 -0
  255. package/skills/find-journal/references/journal_profiles/The_BMJ.md +31 -0
  256. package/skills/find-journal/references/journal_profiles/The_Lancet.md +31 -0
  257. package/skills/find-journal/references/journal_profiles/The_Lancet_Digital_Health.md +29 -0
  258. package/skills/find-journal/references/journal_profiles/World_Journal_of_Hepatology.md +53 -0
  259. package/skills/find-journal/references/journal_profiles/npj_Digital_Medicine.md +29 -0
  260. package/skills/find-journal/skill.yml +34 -0
  261. package/skills/fulltext-retrieval/SKILL.md +174 -0
  262. package/skills/fulltext-retrieval/fetch_oa.py +433 -0
  263. package/skills/fulltext-retrieval/pdf_to_md.py +160 -0
  264. package/skills/fulltext-retrieval/skill.yml +41 -0
  265. package/skills/generate-codebook/SKILL.md +155 -0
  266. package/skills/generate-codebook/references/codebook_schema.md +76 -0
  267. package/skills/generate-codebook/scripts/generate_codebook.py +278 -0
  268. package/skills/generate-codebook/skill.yml +35 -0
  269. package/skills/generate-codebook/tests/test_generate_codebook.sh +76 -0
  270. package/skills/grant-builder/SKILL.md +251 -0
  271. package/skills/grant-builder/skill.yml +34 -0
  272. package/skills/humanize/SKILL.md +251 -0
  273. package/skills/humanize/references/ai_patterns.md +571 -0
  274. package/skills/humanize/skill.yml +33 -0
  275. package/skills/intake-project/SKILL.md +264 -0
  276. package/skills/intake-project/skill.yml +34 -0
  277. package/skills/lit-sync/SKILL.md +448 -0
  278. package/skills/lit-sync/references/locale/ko/note_templates.md +110 -0
  279. package/skills/lit-sync/skill.yml +52 -0
  280. package/skills/lit-sync/tests/test_poll_logic.sh +92 -0
  281. package/skills/ma-scout/SKILL.md +640 -0
  282. package/skills/ma-scout/references/project_readme_template.md +95 -0
  283. package/skills/ma-scout/references/project_readme_template_ko.md +82 -0
  284. package/skills/ma-scout/skill.yml +33 -0
  285. package/skills/make-figures/SKILL.md +957 -0
  286. package/skills/make-figures/references/critic_rubrics/data_plot.md +166 -0
  287. package/skills/make-figures/references/critic_rubrics/flow_diagram.md +169 -0
  288. package/skills/make-figures/references/design_principles.md +181 -0
  289. package/skills/make-figures/references/exemplar_diagrams/README.md +65 -0
  290. package/skills/make-figures/references/exemplar_diagrams/consort/README.md +15 -0
  291. package/skills/make-figures/references/exemplar_diagrams/consort/template_input.yaml +37 -0
  292. package/skills/make-figures/references/exemplar_diagrams/consort/template_output.pdf +0 -0
  293. package/skills/make-figures/references/exemplar_diagrams/consort/template_output.png +0 -0
  294. package/skills/make-figures/references/exemplar_diagrams/consort/template_output_600.png +0 -0
  295. package/skills/make-figures/references/exemplar_diagrams/other/other_02.meta.yaml +4 -0
  296. package/skills/make-figures/references/exemplar_diagrams/other/other_02.png +0 -0
  297. package/skills/make-figures/references/exemplar_diagrams/other/other_02_why.md +13 -0
  298. package/skills/make-figures/references/exemplar_diagrams/pipeline/README.md +15 -0
  299. package/skills/make-figures/references/exemplar_diagrams/pipeline/pipeline_01.meta.yaml +4 -0
  300. package/skills/make-figures/references/exemplar_diagrams/pipeline/pipeline_01.png +0 -0
  301. package/skills/make-figures/references/exemplar_diagrams/pipeline/pipeline_01_why.md +13 -0
  302. package/skills/make-figures/references/exemplar_diagrams/pipeline/pipeline_03.meta.yaml +4 -0
  303. package/skills/make-figures/references/exemplar_diagrams/pipeline/pipeline_03.png +0 -0
  304. package/skills/make-figures/references/exemplar_diagrams/pipeline/pipeline_03_why.md +13 -0
  305. package/skills/make-figures/references/exemplar_diagrams/pipeline/pipeline_04.meta.yaml +4 -0
  306. package/skills/make-figures/references/exemplar_diagrams/pipeline/pipeline_04.png +0 -0
  307. package/skills/make-figures/references/exemplar_diagrams/pipeline/pipeline_04_why.md +13 -0
  308. package/skills/make-figures/references/exemplar_diagrams/pipeline/pipeline_05.meta.yaml +4 -0
  309. package/skills/make-figures/references/exemplar_diagrams/pipeline/pipeline_05.png +0 -0
  310. package/skills/make-figures/references/exemplar_diagrams/pipeline/pipeline_05_why.md +13 -0
  311. package/skills/make-figures/references/exemplar_diagrams/pipeline/pipeline_06.meta.yaml +4 -0
  312. package/skills/make-figures/references/exemplar_diagrams/pipeline/pipeline_06.png +0 -0
  313. package/skills/make-figures/references/exemplar_diagrams/pipeline/pipeline_06_why.md +13 -0
  314. package/skills/make-figures/references/exemplar_diagrams/pipeline/pipeline_07.meta.yaml +4 -0
  315. package/skills/make-figures/references/exemplar_diagrams/pipeline/pipeline_07.png +0 -0
  316. package/skills/make-figures/references/exemplar_diagrams/pipeline/pipeline_07_why.md +13 -0
  317. package/skills/make-figures/references/exemplar_diagrams/pipeline/pipeline_08.meta.yaml +4 -0
  318. package/skills/make-figures/references/exemplar_diagrams/pipeline/pipeline_08.png +0 -0
  319. package/skills/make-figures/references/exemplar_diagrams/pipeline/pipeline_08_why.md +13 -0
  320. package/skills/make-figures/references/exemplar_diagrams/pipeline/pipeline_09.meta.yaml +4 -0
  321. package/skills/make-figures/references/exemplar_diagrams/pipeline/pipeline_09.png +0 -0
  322. package/skills/make-figures/references/exemplar_diagrams/pipeline/pipeline_09_why.md +13 -0
  323. package/skills/make-figures/references/exemplar_diagrams/pipeline/pipeline_10.meta.yaml +4 -0
  324. package/skills/make-figures/references/exemplar_diagrams/pipeline/pipeline_10.png +0 -0
  325. package/skills/make-figures/references/exemplar_diagrams/pipeline/pipeline_10_why.md +13 -0
  326. package/skills/make-figures/references/exemplar_diagrams/prisma/README.md +15 -0
  327. package/skills/make-figures/references/exemplar_diagrams/prisma/template_input.yaml +47 -0
  328. package/skills/make-figures/references/exemplar_diagrams/prisma/template_output.pdf +0 -0
  329. package/skills/make-figures/references/exemplar_diagrams/prisma/template_output.png +0 -0
  330. package/skills/make-figures/references/exemplar_diagrams/prisma/template_output_600.png +0 -0
  331. package/skills/make-figures/references/exemplar_diagrams/stard/README.md +15 -0
  332. package/skills/make-figures/references/exemplar_diagrams/stard/template_input.yaml +40 -0
  333. package/skills/make-figures/references/exemplar_diagrams/stard/template_output.pdf +0 -0
  334. package/skills/make-figures/references/exemplar_diagrams/stard/template_output.png +0 -0
  335. package/skills/make-figures/references/exemplar_diagrams/stard/template_output_600.png +0 -0
  336. package/skills/make-figures/references/exemplar_diagrams/strobe/template_input.yaml +43 -0
  337. package/skills/make-figures/references/exemplar_diagrams/strobe/template_input_pptx.yaml +43 -0
  338. package/skills/make-figures/references/exemplar_diagrams/strobe/template_output.pdf +0 -0
  339. package/skills/make-figures/references/exemplar_diagrams/strobe/template_output.png +0 -0
  340. package/skills/make-figures/references/exemplar_diagrams/strobe/template_output.pptx +0 -0
  341. package/skills/make-figures/references/exemplar_diagrams/strobe/template_output_600.png +0 -0
  342. package/skills/make-figures/references/figure_specs.md +291 -0
  343. package/skills/make-figures/references/flow_diagram_lessons.md +164 -0
  344. package/skills/make-figures/references/jacc_central_illustration_principles.md +91 -0
  345. package/skills/make-figures/references/medical_illustration_sources.md +98 -0
  346. package/skills/make-figures/references/pipeline_concepts_medical_ai.md +240 -0
  347. package/skills/make-figures/references/reporting_guideline_figure_map.md +104 -0
  348. package/skills/make-figures/references/visual_abstract_templates/european_radiology.pptx +0 -0
  349. package/skills/make-figures/references/visual_abstract_templates/jacc_central_illustration.pptx +0 -0
  350. package/skills/make-figures/references/visual_abstract_templates/medsci_default.pptx +0 -0
  351. package/skills/make-figures/references/visual_abstract_templates/template_guide.md +114 -0
  352. package/skills/make-figures/scripts/build_jacc_template.py +77 -0
  353. package/skills/make-figures/scripts/build_prisma2020_template.py +371 -0
  354. package/skills/make-figures/scripts/build_strobe_template.py +351 -0
  355. package/skills/make-figures/scripts/critic_figure.py +264 -0
  356. package/skills/make-figures/scripts/derive_figure_legend_counts.py +138 -0
  357. package/skills/make-figures/scripts/extract_exemplar_from_pdf.py +186 -0
  358. package/skills/make-figures/scripts/fetch_official_templates.sh +88 -0
  359. package/skills/make-figures/scripts/fill_prisma_template.py +142 -0
  360. package/skills/make-figures/scripts/generate_flow_diagram.R +133 -0
  361. package/skills/make-figures/scripts/generate_image.py +99 -0
  362. package/skills/make-figures/scripts/generate_visual_abstract.py +438 -0
  363. package/skills/make-figures/scripts/validate_pptx_mac_compat.py +233 -0
  364. package/skills/make-figures/skill.yml +52 -0
  365. package/skills/make-figures/templates/official/NOTES.md +62 -0
  366. package/skills/make-figures/templates/official/consort2010/CONSORT_2025_editable_checklist.docx +0 -0
  367. package/skills/make-figures/templates/official/consort2010/CONSORT_2025_flow_diagram.docx +0 -0
  368. package/skills/make-figures/templates/official/prisma2020/PRISMA_2020_flow_new_v1.pptx +0 -0
  369. package/skills/make-figures/templates/official/prisma2020/PRISMA_2020_flow_new_v2.pptx +0 -0
  370. package/skills/make-figures/templates/official/prisma2020/PRISMA_2020_flow_updated_v2.pptx +0 -0
  371. package/skills/make-figures/templates/official/spirit2013/SPIRIT_2025_editable_checklist.docx +0 -0
  372. package/skills/make-figures/templates/official/spirit2013/SPIRIT_2025_participant_timeline.docx +0 -0
  373. package/skills/make-figures/templates/official/stard2015/STARD_2015_checklist.docx +0 -0
  374. package/skills/make-figures/templates/official/stard2015/STARD_2015_flow_diagram.pdf +0 -0
  375. package/skills/make-figures/tests/fixtures/figure1_flow.yaml +8 -0
  376. package/skills/make-figures/tests/fixtures/manuscript_ok.md +9 -0
  377. package/skills/make-figures/tests/fixtures/manuscript_stale.md +4 -0
  378. package/skills/make-figures/tests/test_legend_reconcile.sh +36 -0
  379. package/skills/manage-project/SKILL.md +358 -0
  380. package/skills/manage-project/references/pre_submission_checklist.md +53 -0
  381. package/skills/manage-project/references/project_state_template.json +37 -0
  382. package/skills/manage-project/references/scaffold_templates.md +118 -0
  383. package/skills/manage-project/references/status_output_format.md +44 -0
  384. package/skills/manage-project/references/timeline_example.md +20 -0
  385. package/skills/manage-project/skill.yml +36 -0
  386. package/skills/manage-project/templates/SSOT.yaml.template +41 -0
  387. package/skills/manage-refs/LICENSE.zotero-mcp +21 -0
  388. package/skills/manage-refs/NOTICE.md +29 -0
  389. package/skills/manage-refs/SKILL.md +289 -0
  390. package/skills/manage-refs/citation_styles/README.md +40 -0
  391. package/skills/manage-refs/citation_styles/american-journal-of-roentgenology.csl +211 -0
  392. package/skills/manage-refs/citation_styles/cardiovascular-and-interventional-radiology.csl +19 -0
  393. package/skills/manage-refs/citation_styles/european-radiology.csl +19 -0
  394. package/skills/manage-refs/citation_styles/journal-of-cachexia-sarcopenia-and-muscle.csl +150 -0
  395. package/skills/manage-refs/citation_styles/journal-of-korean-medical-science-strict.csl +533 -0
  396. package/skills/manage-refs/citation_styles/journal-of-korean-medical-science.csl +16 -0
  397. package/skills/manage-refs/citation_styles/korean-journal-of-radiology.csl +155 -0
  398. package/skills/manage-refs/citation_styles/nature.csl +189 -0
  399. package/skills/manage-refs/citation_styles/nlm-citation-sequence.csl +535 -0
  400. package/skills/manage-refs/citation_styles/radiology.csl +228 -0
  401. package/skills/manage-refs/citation_styles/springer-basic-brackets.csl +187 -0
  402. package/skills/manage-refs/citation_styles/springer-vancouver-brackets.csl +276 -0
  403. package/skills/manage-refs/citation_styles/vancouver-superscript.csl +536 -0
  404. package/skills/manage-refs/citation_styles/vancouver.csl +535 -0
  405. package/skills/manage-refs/references/REFERENCE_STYLE_SPECS.md +59 -0
  406. package/skills/manage-refs/references/check_xref_symptoms.md +35 -0
  407. package/skills/manage-refs/scripts/_vendor_citation_writer.py +600 -0
  408. package/skills/manage-refs/scripts/check_citation_keys.py +112 -0
  409. package/skills/manage-refs/scripts/check_csl_render.py +102 -0
  410. package/skills/manage-refs/scripts/check_xref.py +633 -0
  411. package/skills/manage-refs/scripts/fill_journal_abbrev.py +104 -0
  412. package/skills/manage-refs/scripts/inject_zotero_cwyw.py +133 -0
  413. package/skills/manage-refs/scripts/md_marker_convert.py +193 -0
  414. package/skills/manage-refs/scripts/pre_submission_gate.sh +238 -0
  415. package/skills/manage-refs/scripts/render_pandoc.sh +88 -0
  416. package/skills/manage-refs/skill.yml +70 -0
  417. package/skills/manage-refs/tests/fixtures/pre_submission_gate/README.md +32 -0
  418. package/skills/manage-refs/tests/fixtures/pre_submission_gate/manuscript.md +10 -0
  419. package/skills/manage-refs/tests/fixtures/pre_submission_gate/refs.bib +34 -0
  420. package/skills/manage-refs/tests/fixtures/pre_submission_gate/run.sh +117 -0
  421. package/skills/manage-refs/tests/test_vN_docx_check.sh +145 -0
  422. package/skills/meta-analysis/SKILL.md +739 -0
  423. package/skills/meta-analysis/references/LICENSES.md +21 -0
  424. package/skills/meta-analysis/references/PROSPERO_template.md +221 -0
  425. package/skills/meta-analysis/references/ai_pre_screening_template.py +245 -0
  426. package/skills/meta-analysis/references/checklists/JBI_Case_Series.md +45 -0
  427. package/skills/meta-analysis/references/checklists/NOS.md +88 -0
  428. package/skills/meta-analysis/references/checklists/PRISMA_DTA.md +36 -0
  429. package/skills/meta-analysis/references/checklists/PROBAST.md +75 -0
  430. package/skills/meta-analysis/references/checklists/QUADAS2.md +77 -0
  431. package/skills/meta-analysis/references/checklists/ROBINS_I.md +87 -0
  432. package/skills/meta-analysis/references/checklists/RoB2.md +79 -0
  433. package/skills/meta-analysis/references/data_integrity_checklist.md +57 -0
  434. package/skills/meta-analysis/references/icmje_coi_guide.md +181 -0
  435. package/skills/meta-analysis/references/phase10_recovery.md +136 -0
  436. package/skills/meta-analysis/references/phase4_km_composite.md +58 -0
  437. package/skills/meta-analysis/references/phase6_statistical_synthesis.md +148 -0
  438. package/skills/meta-analysis/references/phase9_circulation.md +84 -0
  439. package/skills/meta-analysis/references/post_submission_release_ops.md +41 -0
  440. package/skills/meta-analysis/references/r_templates.md +132 -0
  441. package/skills/meta-analysis/references/review_orchestration.md +40 -0
  442. package/skills/meta-analysis/references/submission_package_drift.md +71 -0
  443. package/skills/meta-analysis/scripts/check_pool_consistency.py +201 -0
  444. package/skills/meta-analysis/scripts/cohort_overlap_check.py +242 -0
  445. package/skills/meta-analysis/scripts/dta_extraction_qc.py +137 -0
  446. package/skills/meta-analysis/scripts/screening_reconcile.py +160 -0
  447. package/skills/meta-analysis/skill.yml +47 -0
  448. package/skills/meta-analysis/templates/FINAL_POOL_LOCK.yaml.template +70 -0
  449. package/skills/meta-analysis/templates/extraction_form_v2.md +129 -0
  450. package/skills/meta-analysis/templates/supplementary_8file_checklist.md +94 -0
  451. package/skills/meta-analysis/tests/test_pool_consistency.sh +123 -0
  452. package/skills/orchestrate/SKILL.md +501 -0
  453. package/skills/orchestrate/references/dialogue_nodes.md +196 -0
  454. package/skills/orchestrate/references/report_template.md +109 -0
  455. package/skills/orchestrate/references/report_template_ko.md +88 -0
  456. package/skills/orchestrate/skill.yml +44 -0
  457. package/skills/peer-review/SKILL.md +381 -0
  458. package/skills/peer-review/references/aczel_2021_reviewer2_patterns.md +88 -0
  459. package/skills/peer-review/references/domain-probes/ai_overclaiming.md +47 -0
  460. package/skills/peer-review/references/domain-probes/narrative_review.md +44 -0
  461. package/skills/peer-review/references/domain-probes/observational_confounding.md +48 -0
  462. package/skills/peer-review/references/domain-probes/radiomics.md +38 -0
  463. package/skills/peer-review/references/domain-probes/sr_ma.md +87 -0
  464. package/skills/peer-review/references/domain-probes/survival_prognostic.md +68 -0
  465. package/skills/peer-review/references/exemplar_reviews/README.md +43 -0
  466. package/skills/peer-review/references/exemplar_reviews/ai_overclaiming.md +47 -0
  467. package/skills/peer-review/references/exemplar_reviews/calibration_missing.md +44 -0
  468. package/skills/peer-review/references/exemplar_reviews/data_leakage.md +48 -0
  469. package/skills/peer-review/references/exemplar_reviews/reference_standard_validity.md +45 -0
  470. package/skills/peer-review/references/narrative_review_audit.md +67 -0
  471. package/skills/peer-review/references/reviewer_calibration/README.md +34 -0
  472. package/skills/peer-review/references/reviewer_calibration/compliance_floor.md +52 -0
  473. package/skills/peer-review/references/reviewer_profiles/AJR.md +82 -0
  474. package/skills/peer-review/references/reviewer_profiles/EURE.md +64 -0
  475. package/skills/peer-review/references/reviewer_profiles/INSI.md +57 -0
  476. package/skills/peer-review/references/reviewer_profiles/KJR.md +100 -0
  477. package/skills/peer-review/references/reviewer_profiles/README.md +32 -0
  478. package/skills/peer-review/references/reviewer_profiles/RYAI.md +86 -0
  479. package/skills/peer-review/skill.yml +39 -0
  480. package/skills/present-paper/SKILL.md +675 -0
  481. package/skills/present-paper/references/critic_rubrics/slide.md +155 -0
  482. package/skills/present-paper/references/generate_pptx_templates.py +604 -0
  483. package/skills/present-paper/references/medical_presentation_templates.md +277 -0
  484. package/skills/present-paper/references/slide_design_principles.md +202 -0
  485. package/skills/present-paper/references/slide_visual_styles/nature_lancet.md +168 -0
  486. package/skills/present-paper/references/workflow-checklist.md +109 -0
  487. package/skills/present-paper/scripts/extract_pdf_figures.py +243 -0
  488. package/skills/present-paper/scripts/inject_pronunciation_notes.py +178 -0
  489. package/skills/present-paper/scripts/inject_speaker_notes.py +133 -0
  490. package/skills/present-paper/scripts/strip_notes_for_sharing.py +140 -0
  491. package/skills/present-paper/scripts/trim_caption.py +271 -0
  492. package/skills/present-paper/skill.yml +41 -0
  493. package/skills/present-paper/templates/build_pptx_nature_lancet.py +688 -0
  494. package/skills/publish-skill/SKILL.md +370 -0
  495. package/skills/publish-skill/references/license-compatibility-matrix.md +132 -0
  496. package/skills/publish-skill/references/pii-patterns.md +130 -0
  497. package/skills/publish-skill/scripts/audit_skill.sh +278 -0
  498. package/skills/publish-skill/skill.yml +35 -0
  499. package/skills/render-pdf-doc/SKILL.md +146 -0
  500. package/skills/render-pdf-doc/references/known_pitfalls.md +53 -0
  501. package/skills/render-pdf-doc/references/pandoc_korean_cheatsheet.md +77 -0
  502. package/skills/render-pdf-doc/scripts/check_deps.sh +42 -0
  503. package/skills/render-pdf-doc/scripts/infer_colwidths.py +164 -0
  504. package/skills/render-pdf-doc/scripts/render_pdf.sh +98 -0
  505. package/skills/render-pdf-doc/skill.yml +57 -0
  506. package/skills/render-pdf-doc/templates/anchor-doc.md +27 -0
  507. package/skills/render-pdf-doc/templates/anchor-doc_ko.md +25 -0
  508. package/skills/render-pdf-doc/templates/briefing-handout.md +33 -0
  509. package/skills/render-pdf-doc/templates/briefing-handout_ko.md +31 -0
  510. package/skills/render-pdf-doc/templates/proposal-cover.md +33 -0
  511. package/skills/render-pdf-doc/templates/proposal-cover_ko.md +31 -0
  512. package/skills/render-pdf-doc/templates/reference-table.md +22 -0
  513. package/skills/render-pdf-doc/templates/reference-table_ko.md +20 -0
  514. package/skills/replicate-study/SKILL.md +150 -0
  515. package/skills/replicate-study/references/harmonization_3country.csv +47 -0
  516. package/skills/replicate-study/references/harmonization_knhanes_nhanes.csv +68 -0
  517. package/skills/replicate-study/references/methodology_extraction_template.md +134 -0
  518. package/skills/replicate-study/skill.yml +37 -0
  519. package/skills/review-paper/SKILL.md +104 -0
  520. package/skills/review-paper/references/macro_skeleton.md +6 -0
  521. package/skills/review-paper/skill.yml +25 -0
  522. package/skills/revise/SKILL.md +515 -0
  523. package/skills/revise/references/r2r_voice.md +346 -0
  524. package/skills/revise/skill.yml +43 -0
  525. package/skills/search-lit/SKILL.md +443 -0
  526. package/skills/search-lit/references/parse_pubmed.py +326 -0
  527. package/skills/search-lit/references/pubmed_eutils.sh +111 -0
  528. package/skills/search-lit/skill.yml +46 -0
  529. package/skills/self-review/SKILL.md +1045 -0
  530. package/skills/self-review/references/domain-probes/ai_overclaiming.md +47 -0
  531. package/skills/self-review/references/domain-probes/narrative_review.md +44 -0
  532. package/skills/self-review/references/domain-probes/observational_confounding.md +48 -0
  533. package/skills/self-review/references/domain-probes/radiomics.md +38 -0
  534. package/skills/self-review/references/domain-probes/sr_ma.md +87 -0
  535. package/skills/self-review/references/domain-probes/survival_prognostic.md +68 -0
  536. package/skills/self-review/references/exemplar_findings/README.md +43 -0
  537. package/skills/self-review/references/exemplar_findings/cohort_arithmetic_mismatch.md +35 -0
  538. package/skills/self-review/references/exemplar_findings/estimand_drift_posthoc_primary.md +39 -0
  539. package/skills/self-review/references/exemplar_findings/scope_overreach_cross_sectional.md +35 -0
  540. package/skills/self-review/references/exemplar_findings/unadjusted_confounder.md +36 -0
  541. package/skills/self-review/references/panel_review_template.md +177 -0
  542. package/skills/self-review/scripts/check_artifact_coverage.py +301 -0
  543. package/skills/self-review/scripts/check_claim_artifact.py +248 -0
  544. package/skills/self-review/scripts/check_classical_style.py +185 -0
  545. package/skills/self-review/scripts/check_cohort_arithmetic.py +481 -0
  546. package/skills/self-review/scripts/check_confounding_completeness.py +287 -0
  547. package/skills/self-review/scripts/check_panel_diversity.py +336 -0
  548. package/skills/self-review/scripts/check_reference_adequacy.py +392 -0
  549. package/skills/self-review/scripts/check_reviewer_team_consistency.py +412 -0
  550. package/skills/self-review/scripts/check_scope_coherence.py +177 -0
  551. package/skills/self-review/skill.yml +47 -0
  552. package/skills/self-review/tests/fixtures/claim_manuscript.md +17 -0
  553. package/skills/self-review/tests/fixtures/claim_prereg.md +6 -0
  554. package/skills/self-review/tests/fixtures/cohort_bad.md +21 -0
  555. package/skills/self-review/tests/fixtures/cohort_clean.md +21 -0
  556. package/skills/self-review/tests/fixtures/cohort_partition.csv +5 -0
  557. package/skills/self-review/tests/fixtures/coverage_analysis/31_delong_nested_added_value.csv +3 -0
  558. package/skills/self-review/tests/fixtures/coverage_analysis/table1_demographics.csv +3 -0
  559. package/skills/self-review/tests/fixtures/coverage_clean.md +13 -0
  560. package/skills/self-review/tests/fixtures/coverage_manuscript.md +11 -0
  561. package/skills/self-review/tests/fixtures/panel_collapse.json +27 -0
  562. package/skills/self-review/tests/fixtures/panel_good.json +32 -0
  563. package/skills/self-review/tests/fixtures/panel_monoculture.json +32 -0
  564. package/skills/self-review/tests/fixtures/refadeq_letter.md +13 -0
  565. package/skills/self-review/tests/fixtures/refadeq_original_fixed.md +42 -0
  566. package/skills/self-review/tests/fixtures/refadeq_original_uncited.md +40 -0
  567. package/skills/self-review/tests/fixtures/scope_bad.md +9 -0
  568. package/skills/self-review/tests/fixtures/scope_clean.md +8 -0
  569. package/skills/self-review/tests/fixtures/scope_surrogate.md +8 -0
  570. package/skills/self-review/tests/fixtures/style_bad.md +13 -0
  571. package/skills/self-review/tests/fixtures/style_clean.md +11 -0
  572. package/skills/self-review/tests/fixtures/table1_by_exposure.csv +11 -0
  573. package/skills/self-review/tests/test_artifact_coverage.sh +44 -0
  574. package/skills/self-review/tests/test_claim_artifact.sh +50 -0
  575. package/skills/self-review/tests/test_classical_style.sh +44 -0
  576. package/skills/self-review/tests/test_cohort_arithmetic.sh +49 -0
  577. package/skills/self-review/tests/test_confounding_completeness.sh +66 -0
  578. package/skills/self-review/tests/test_panel_diversity.sh +55 -0
  579. package/skills/self-review/tests/test_panel_mode.sh +69 -0
  580. package/skills/self-review/tests/test_reference_adequacy.sh +68 -0
  581. package/skills/self-review/tests/test_reviewer_team_consistency.sh +138 -0
  582. package/skills/self-review/tests/test_scope_coherence.sh +46 -0
  583. package/skills/setup-medsci/SKILL.md +110 -0
  584. package/skills/setup-medsci/references/setup-checklist.md +51 -0
  585. package/skills/setup-medsci/skill.yml +30 -0
  586. package/skills/sync-submission/SKILL.md +382 -0
  587. package/skills/sync-submission/scripts/author_registry_example.yaml +36 -0
  588. package/skills/sync-submission/scripts/blind_sweep.py +203 -0
  589. package/skills/sync-submission/scripts/check_asset_anonymization.py +300 -0
  590. package/skills/sync-submission/scripts/check_cross_artifact_stale.py +211 -0
  591. package/skills/sync-submission/scripts/cover_letter_drift_check.py +451 -0
  592. package/skills/sync-submission/scripts/cross_document_n_check.py +486 -0
  593. package/skills/sync-submission/scripts/detect_copy_divergence.py +136 -0
  594. package/skills/sync-submission/scripts/preflight_gate.py +458 -0
  595. package/skills/sync-submission/scripts/scope_drift_check.py +362 -0
  596. package/skills/sync-submission/scripts/sync_submission.py +169 -0
  597. package/skills/sync-submission/skill.yml +43 -0
  598. package/skills/sync-submission/tests/fixtures/copy_ok.md +5 -0
  599. package/skills/sync-submission/tests/fixtures/copy_stale.md +5 -0
  600. package/skills/sync-submission/tests/fixtures/ssot.md +5 -0
  601. package/skills/sync-submission/tests/test_asset_anonymization.sh +99 -0
  602. package/skills/sync-submission/tests/test_copy_divergence.sh +44 -0
  603. package/skills/sync-submission/tests/test_cross_artifact_stale.sh +80 -0
  604. package/skills/sync-submission/tests/test_cross_document_n.sh +132 -0
  605. package/skills/sync-submission/tests/test_preflight_gate.sh +112 -0
  606. package/skills/sync-submission/tests/test_scope_drift.sh +122 -0
  607. package/skills/sync-submission/tests/test_vN_docx_assertion.sh +51 -0
  608. package/skills/verify-refs/SKILL.md +177 -0
  609. package/skills/verify-refs/references/manual_checkpoint_guide.md +100 -0
  610. package/skills/verify-refs/scripts/verify_cli.sh +62 -0
  611. package/skills/verify-refs/scripts/verify_refs.py +782 -0
  612. package/skills/verify-refs/skill.yml +44 -0
  613. package/skills/verify-refs/tests/fixtures/pagination_placeholder.bib +17 -0
  614. package/skills/verify-refs/tests/test_pagination_placeholder.sh +42 -0
  615. package/skills/version-dataset/SKILL.md +143 -0
  616. package/skills/version-dataset/references/manifest_schema.md +72 -0
  617. package/skills/version-dataset/scripts/version_dataset.py +242 -0
  618. package/skills/version-dataset/skill.yml +35 -0
  619. package/skills/version-dataset/tests/test_version_dataset.sh +52 -0
  620. package/skills/write-paper/SKILL.md +1148 -0
  621. package/skills/write-paper/references/exemplar_methods/README.md +38 -0
  622. package/skills/write-paper/references/exemplar_methods/ai_validation_tripod_claim.md +47 -0
  623. package/skills/write-paper/references/exemplar_methods/diagnostic_accuracy_stard.md +50 -0
  624. package/skills/write-paper/references/exemplar_methods/observational_cohort_strobe.md +43 -0
  625. package/skills/write-paper/references/journal_profiles/AJNR.md +185 -0
  626. package/skills/write-paper/references/journal_profiles/AJR.md +149 -0
  627. package/skills/write-paper/references/journal_profiles/Abdominal_Radiology.md +139 -0
  628. package/skills/write-paper/references/journal_profiles/Academic_Radiology.md +90 -0
  629. package/skills/write-paper/references/journal_profiles/Annals_of_Internal_Medicine.md +150 -0
  630. package/skills/write-paper/references/journal_profiles/Artificial_Intelligence_in_Medicine.md +82 -0
  631. package/skills/write-paper/references/journal_profiles/British_Journal_of_Radiology.md +161 -0
  632. package/skills/write-paper/references/journal_profiles/CVIR.md +157 -0
  633. package/skills/write-paper/references/journal_profiles/Chest.md +270 -0
  634. package/skills/write-paper/references/journal_profiles/Clinical_Radiology.md +160 -0
  635. package/skills/write-paper/references/journal_profiles/Clinical_and_Molecular_Hepatology.md +147 -0
  636. package/skills/write-paper/references/journal_profiles/Diabetes_Metabolism_Journal.md +163 -0
  637. package/skills/write-paper/references/journal_profiles/Diagnostic_and_Interventional_Radiology.md +216 -0
  638. package/skills/write-paper/references/journal_profiles/Endocrinology_and_Metabolism.md +167 -0
  639. package/skills/write-paper/references/journal_profiles/European_Journal_of_Preventive_Cardiology.md +192 -0
  640. package/skills/write-paper/references/journal_profiles/European_Radiology.md +159 -0
  641. package/skills/write-paper/references/journal_profiles/Hepatology_Communications.md +110 -0
  642. package/skills/write-paper/references/journal_profiles/Hepatology_International.md +106 -0
  643. package/skills/write-paper/references/journal_profiles/IEEE_TMI.md +180 -0
  644. package/skills/write-paper/references/journal_profiles/INSI.md +163 -0
  645. package/skills/write-paper/references/journal_profiles/Investigative_Radiology.md +86 -0
  646. package/skills/write-paper/references/journal_profiles/JACC_Advances.md +197 -0
  647. package/skills/write-paper/references/journal_profiles/JACC_Asia.md +168 -0
  648. package/skills/write-paper/references/journal_profiles/JACR.md +87 -0
  649. package/skills/write-paper/references/journal_profiles/JAMA.md +188 -0
  650. package/skills/write-paper/references/journal_profiles/JAMA_Network_Open.md +170 -0
  651. package/skills/write-paper/references/journal_profiles/JCSM.md +266 -0
  652. package/skills/write-paper/references/journal_profiles/JKMS.md +201 -0
  653. package/skills/write-paper/references/journal_profiles/JMIR.md +88 -0
  654. package/skills/write-paper/references/journal_profiles/JMIR_Medical_Education.md +86 -0
  655. package/skills/write-paper/references/journal_profiles/JNIS.md +227 -0
  656. package/skills/write-paper/references/journal_profiles/JVIR.md +158 -0
  657. package/skills/write-paper/references/journal_profiles/Journal_of_Clinical_Endocrinology_and_Metabolism.md +191 -0
  658. package/skills/write-paper/references/journal_profiles/Journal_of_Stroke.md +176 -0
  659. package/skills/write-paper/references/journal_profiles/KJR.md +185 -0
  660. package/skills/write-paper/references/journal_profiles/Korean_Circulation_Journal.md +184 -0
  661. package/skills/write-paper/references/journal_profiles/Korean_Journal_of_Internal_Medicine.md +178 -0
  662. package/skills/write-paper/references/journal_profiles/Lancet_Gastroenterology_and_Hepatology.md +127 -0
  663. package/skills/write-paper/references/journal_profiles/Liver_International.md +165 -0
  664. package/skills/write-paper/references/journal_profiles/Medical_Image_Analysis.md +147 -0
  665. package/skills/write-paper/references/journal_profiles/NEJM.md +147 -0
  666. package/skills/write-paper/references/journal_profiles/Nature_Medicine.md +181 -0
  667. package/skills/write-paper/references/journal_profiles/Neuroradiology.md +151 -0
  668. package/skills/write-paper/references/journal_profiles/Nutrition_Metabolism_and_Cardiovascular_Diseases.md +184 -0
  669. package/skills/write-paper/references/journal_profiles/PLOS_Medicine.md +166 -0
  670. package/skills/write-paper/references/journal_profiles/RYAI.md +124 -0
  671. package/skills/write-paper/references/journal_profiles/Radiology.md +173 -0
  672. package/skills/write-paper/references/journal_profiles/Skeletal_Radiology.md +135 -0
  673. package/skills/write-paper/references/journal_profiles/Stroke.md +210 -0
  674. package/skills/write-paper/references/journal_profiles/The_BMJ.md +121 -0
  675. package/skills/write-paper/references/journal_profiles/The_Lancet.md +112 -0
  676. package/skills/write-paper/references/journal_profiles/The_Lancet_Digital_Health.md +104 -0
  677. package/skills/write-paper/references/journal_profiles/World_Journal_of_Hepatology.md +106 -0
  678. package/skills/write-paper/references/journal_profiles/npj_Digital_Medicine.md +93 -0
  679. package/skills/write-paper/references/paper_types/ai_validation.md +270 -0
  680. package/skills/write-paper/references/paper_types/animal_study.md +194 -0
  681. package/skills/write-paper/references/paper_types/case_report.md +237 -0
  682. package/skills/write-paper/references/paper_types/cross_national.md +328 -0
  683. package/skills/write-paper/references/paper_types/letter.md +127 -0
  684. package/skills/write-paper/references/paper_types/meta_analysis.md +181 -0
  685. package/skills/write-paper/references/paper_types/nhis_cohort.md +297 -0
  686. package/skills/write-paper/references/paper_types/original_article.md +221 -0
  687. package/skills/write-paper/references/paper_types/technical_note.md +131 -0
  688. package/skills/write-paper/references/section_guides/discussion.md +155 -0
  689. package/skills/write-paper/references/section_guides/introduction.md +108 -0
  690. package/skills/write-paper/references/section_guides/methods.md +144 -0
  691. package/skills/write-paper/references/section_guides/results.md +113 -0
  692. package/skills/write-paper/references/section_guides/step7_1_classical_qc.md +67 -0
  693. package/skills/write-paper/references/section_guides/step7_4a_audit_recovery.md +74 -0
  694. package/skills/write-paper/references/section_guides/title_abstract.md +123 -0
  695. package/skills/write-paper/references/section_templates/methods_statistical.md +147 -0
  696. package/skills/write-paper/scripts/check_placeholders.py +182 -0
  697. package/skills/write-paper/skill.yml +48 -0
  698. package/skills/write-paper/tests/test_placeholders.sh +107 -0
  699. package/skills/write-protocol/SKILL.md +243 -0
  700. package/skills/write-protocol/references/ethics_checklist.md +150 -0
  701. package/skills/write-protocol/references/protocol_template.md +304 -0
  702. package/skills/write-protocol/skill.yml +34 -0
@@ -0,0 +1,1224 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Clinical research data de-identification (LLM-free).
4
+
5
+ Scans Excel/CSV files for Protected Health Information (PHI) using regex
6
+ and column-name heuristics, walks the researcher through an interactive
7
+ terminal review, then produces a de-identified copy with mapping and
8
+ audit trail.
9
+
10
+ Supports 10 country locales (kr, us, jp, cn, de, uk, fr, ca, au, in)
11
+ with country-specific PHI patterns. Custom locales via --locale-file.
12
+
13
+ Usage:
14
+ python deidentify.py scan input.xlsx [--locale kr]
15
+ python deidentify.py review scan_report.json
16
+ python deidentify.py apply reviewed_report.json [--hash-mapping]
17
+ python deidentify.py full input.xlsx [--locale kr] [--auto-accept-safe]
18
+ """
19
+
20
+ import argparse
21
+ import csv
22
+ import hashlib
23
+ import json
24
+ import logging
25
+ import os
26
+ import random
27
+ import re
28
+ import stat
29
+ import sys
30
+ from datetime import datetime, timedelta
31
+ from pathlib import Path
32
+
33
+ log = logging.getLogger("deidentify")
34
+
35
+ REPORT_VERSION = 1
36
+
37
+ # ================================================================
38
+ # Section 1: Constants + Locale Loading
39
+ # ================================================================
40
+
41
# Directory that holds the bundled locale definitions (one JSON file per
# locale), resolved relative to this script.
LOCALES_DIR = Path(__file__).parent / "locales"

# Universal column names (English — common across all research locales).
# Maps a column-header spelling to the PHI type assigned to that column.
# Locale-specific headers are layered on top of this table when the scanning
# patterns are built.
# NOTE(review): every key is lowercase snake_case, so headers are presumably
# normalized before lookup — confirm against the scanner.
# NOTE(review): "ssn"/"social_security" are filed under the "rrn" PHI type
# rather than a generic national-ID type — confirm this is intentional.
UNIVERSAL_COLUMN_NAMES: dict[str, str] = {
    "patient_name": "name", "patientname": "name", "pt_name": "name",
    "name": "name", "first_name": "name", "last_name": "name",
    "ssn": "rrn", "social_security": "rrn",
    "dob": "date", "date_of_birth": "date", "birth_date": "date",
    "birthdate": "date",
    "phone": "phone", "telephone": "phone", "mobile": "phone",
    "phone_number": "phone", "cell": "phone",
    "address": "address", "home_address": "address", "street": "address",
    "zip": "address", "zipcode": "address", "zip_code": "address",
    "email": "email", "email_address": "email",
    "mrn": "id", "medical_record": "id", "chart_no": "id",
    "patient_id": "id", "patientid": "id", "chart_number": "id",
    "record_number": "id", "hospital_id": "id",
    "insurance_no": "insurance", "insurance_number": "insurance",
}

# Universal value patterns (always active regardless of locale).
# Each entry is (compiled regex, PHI type); locale-specific patterns are
# appended after these when the scanning patterns are built.
UNIVERSAL_VALUE_PATTERNS: list[tuple[re.Pattern, str]] = [
    # Email
    (re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"), "email"),
    # ISO date YYYY-MM-DD or YYYY.MM.DD or YYYY/MM/DD
    # (years 1900-2099 only; the two separators are not required to match)
    (re.compile(r"\b(19|20)\d{2}[-/.](0[1-9]|1[0-2])[-/.](0[1-9]|[12]\d|3[01])\b"), "date"),
    # YYMMDD (6 digits that look like a birthdate, standalone)
    # The year part only accepts 50-99 and 00-04; 05-49 is deliberately not
    # matched by this expression.
    (re.compile(r"\b([5-9]\d|0[0-4])(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01])\b"), "date"),
]
70
+
71
+
72
def list_locales() -> list[dict]:
    """List available locales from the locales/ directory.

    Every ``*.json`` file in ``LOCALES_DIR`` whose name does not start with
    an underscore is read, in sorted filename order.  Files that cannot be
    used (unreadable, not valid UTF-8, not valid JSON, or whose top-level
    value is not a JSON object) are skipped so that one bad file never
    breaks the listing.

    Returns:
        One dict per usable locale with the keys ``code``, ``name``,
        ``native_name`` and ``path``.  ``code`` and ``name`` fall back to the
        file stem and ``native_name`` to ``""`` when absent from the file.
        An empty list is returned when the directory does not exist.
    """
    if not LOCALES_DIR.is_dir():
        return []

    found: list[dict] = []
    for locale_path in sorted(LOCALES_DIR.glob("*.json")):
        if locale_path.name.startswith("_"):
            continue
        try:
            data = json.loads(locale_path.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # ValueError covers both json.JSONDecodeError and
            # UnicodeDecodeError (file is not valid UTF-8).
            continue
        if not isinstance(data, dict):
            # Valid JSON but not an object: there are no fields to read.
            continue
        found.append({
            "code": data.get("code", locale_path.stem),
            "name": data.get("name", locale_path.stem),
            "native_name": data.get("native_name", ""),
            "path": str(locale_path),
        })
    return found
91
+
92
+
93
def load_locale(code: str) -> dict:
    """Return the bundled locale whose file is ``<code>.json`` (e.g. 'kr', 'us').

    The file is looked up in ``LOCALES_DIR`` and parsed as UTF-8 JSON.  When
    no such file exists the process exits with a message that lists the
    codes reported by ``list_locales()``.
    """
    locale_path = LOCALES_DIR / f"{code}.json"
    if locale_path.exists():
        return json.loads(locale_path.read_text(encoding="utf-8"))

    available = ", ".join(entry["code"] for entry in list_locales())
    sys.exit(f"Locale not found: {code}\n"
             f"Available: {available}\n"
             f"Or use --locale-file for a custom locale.")
101
+
102
+
103
def load_locale_file(path: str) -> dict:
    """Load a custom locale from an arbitrary JSON file.

    Args:
        path: Location of a UTF-8 encoded JSON file whose top-level value is
            an object.

    Returns:
        The parsed locale dict.

    Exits (via ``sys.exit`` with a message) when the file does not exist,
    cannot be read or decoded, is not valid JSON, or does not contain a JSON
    object.  The file is user-supplied, so these cases are reported the same
    way as a missing file instead of surfacing as a traceback here or as an
    ``AttributeError`` later in the pipeline.
    """
    locale_path = Path(path)
    if not locale_path.exists():
        sys.exit(f"Locale file not found: {path}")
    try:
        locale = json.loads(locale_path.read_text(encoding="utf-8"))
    except (OSError, ValueError) as err:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        sys.exit(f"Could not read locale file {path}: {err}")
    if not isinstance(locale, dict):
        sys.exit(f"Locale file must contain a JSON object: {path}")
    return locale
109
+
110
+
111
def select_locale_interactive() -> dict:
    """Prompt on the terminal until the user picks a locale, then return it.

    The menu lists every locale reported by ``list_locales()``.  The user may
    answer with the menu number, with a locale code (compared
    case-insensitively), or with ``0`` to supply the path of a custom locale
    JSON file.  Any other answer prints a hint and asks again.

    Returns:
        The loaded locale dict.

    Exits (via ``sys.exit``) when no bundled locale is available, or when the
    chosen locale file cannot be loaded.
    """
    locales = list_locales()
    if not locales:
        sys.exit("No locale files found in locales/ directory.")

    print(f"\n{_bold('Select country / 국가 선택:')}")
    for number, entry in enumerate(locales, 1):
        # Show the native name only when there is one and it adds information;
        # a missing native name must not render as an empty " ()".
        native_name = entry["native_name"]
        native = f" ({native_name})" if native_name and native_name != entry["name"] else ""
        print(f" {number:2d}. {entry['name']}{native}")
    print(" 0. Other (provide custom locale file)")

    while True:
        choice = input("\n> ").strip()
        if choice == "0":
            custom_path = input(" Path to custom locale JSON: ").strip()
            locale = load_locale_file(custom_path)
            print(f" Loaded custom locale: {locale.get('name', 'Custom')}")
            return locale

        selected = None
        try:
            index = int(choice)
        except ValueError:
            # Not a number: try the answer as a locale code.
            wanted = choice.lower()
            selected = next(
                (entry for entry in locales if str(entry["code"]).lower() == wanted),
                None,
            )
        else:
            if 1 <= index <= len(locales):
                selected = locales[index - 1]

        if selected is not None:
            # Load from the path recorded by list_locales().  The listed code
            # may come from the file's "code" field rather than its filename,
            # so rebuilding "<code>.json" could point at a file that does not
            # exist.
            locale = load_locale_file(selected["path"])
            print(f" Loading {locale.get('name', selected['name'])} patterns...")
            return locale

        print(f" Invalid choice. Enter 1-{len(locales)} or a country code.")
144
+
145
+
146
+ def build_locale_patterns(locale: dict) -> tuple[
147
+ dict[str, str],
148
+ list[tuple[re.Pattern, str]],
149
+ re.Pattern | None,
150
+ re.Pattern | None,
151
+ float,
152
+ list[str],
153
+ ]:
154
+ """Build scanning patterns from a locale dict.
155
+
156
+ Returns:
157
+ (column_names, value_patterns, address_re, name_re, name_min_ratio, name_columns)
158
+ """
159
+ # Column names: universal + locale-specific
160
+ column_names = dict(UNIVERSAL_COLUMN_NAMES)
161
+ column_names.update(locale.get("column_names", {}))
162
+
163
+ # Value patterns: universal + locale-specific
164
+ value_patterns = list(UNIVERSAL_VALUE_PATTERNS)
165
+
166
+ # National ID
167
+ nid = locale.get("national_id", {})
168
+ nid_type = nid.get("phi_type", "national_id")
169
+ for pat in nid.get("patterns", []):
170
+ value_patterns.append((re.compile(pat), nid_type))
171
+
172
+ # Phone
173
+ for phone in locale.get("phone", []):
174
+ value_patterns.append((re.compile(phone["pattern"]), "phone"))
175
+
176
+ # Extra date formats
177
+ for df in locale.get("date_formats", []):
178
+ value_patterns.append((re.compile(df["pattern"]), "date"))
179
+
180
+ # Address pattern
181
+ addr_cfg = locale.get("address", {})
182
+ address_re = None
183
+ if addr_cfg.get("type") == "suffix_regex" and addr_cfg.get("pattern"):
184
+ address_re = re.compile(addr_cfg["pattern"])
185
+ elif addr_cfg.get("type") == "keywords" and addr_cfg.get("keywords"):
186
+ # Build a regex from keywords (case-insensitive word boundary match)
187
+ escaped = [re.escape(kw) for kw in addr_cfg["keywords"]]
188
+ address_re = re.compile(r"(?:" + "|".join(escaped) + r")", re.IGNORECASE)
189
+ # Postcode pattern (if available, add to value_patterns as address type)
190
+ if addr_cfg.get("postcode_pattern"):
191
+ value_patterns.append((re.compile(addr_cfg["postcode_pattern"]), "address"))
192
+
193
+ # Name heuristic
194
+ name_cfg = locale.get("name_heuristic", {})
195
+ name_re = None
196
+ if name_cfg.get("type") == "regex" and name_cfg.get("pattern"):
197
+ name_re = re.compile(name_cfg["pattern"])
198
+ name_min_ratio = name_cfg.get("min_ratio", 0.3)
199
+
200
+ # Name columns (for restricting name heuristic)
201
+ name_columns = [k for k, v in column_names.items() if v == "name"]
202
+
203
+ return column_names, value_patterns, address_re, name_re, name_min_ratio, name_columns
204
+
205
# Confidence levels.  These are string labels ("high" / "medium" / "low"),
# not numeric thresholds.
CONF_HIGH = "high"
CONF_MEDIUM = "medium"
CONF_LOW = "low"
209
+
210
# ANSI helpers (respect NO_COLOR)
# Colour is disabled when the NO_COLOR environment variable is set to any
# non-empty value; the variable is read once, at import time.
_NO_COLOR = bool(os.environ.get("NO_COLOR"))


def _c(code: str, text: str) -> str:
    """Wrap *text* in the ANSI SGR sequence *code*, or return it as-is when colour is off."""
    return text if _NO_COLOR else f"\033[{code}m{text}\033[0m"


def _red(t: str) -> str:
    """Return *t* styled red (SGR 31)."""
    return _c("31", t)


def _green(t: str) -> str:
    """Return *t* styled green (SGR 32)."""
    return _c("32", t)


def _yellow(t: str) -> str:
    """Return *t* styled yellow (SGR 33)."""
    return _c("33", t)


def _bold(t: str) -> str:
    """Return *t* styled bold (SGR 1)."""
    return _c("1", t)


def _dim(t: str) -> str:
    """Return *t* styled dim (SGR 2)."""
    return _c("2", t)
225
+
226
+
227
+ # ================================================================
228
+ # Section 2: File I/O
229
+ # ================================================================
230
+
231
def detect_format(path: Path) -> str:
    """Map a file's extension to the loader format name.

    The comparison is case-insensitive. ``.xlsx`` -> ``"xlsx"``,
    ``.tsv`` -> ``"tsv"``; ``.csv``, ``.txt`` and a missing extension all
    map to ``"csv"``. Any other extension terminates the program via
    ``sys.exit``.
    """
    suffix = path.suffix.lower()
    by_suffix = {
        ".xlsx": "xlsx",
        ".tsv": "tsv",
        ".csv": "csv",
        ".txt": "csv",
        "": "csv",
    }
    fmt = by_suffix.get(suffix)
    if fmt is None:
        sys.exit(f"Unsupported file format: {suffix}")
    return fmt
241
+
242
+
243
def detect_encoding(path: Path) -> str:
    """Guess the text encoding of *path* by trial decoding.

    The whole file is read into memory. A UTF-8 byte-order mark yields
    ``"utf-8-sig"``. Otherwise the bytes are decoded strictly as UTF-8,
    then EUC-KR, then CP949, and the first codec that succeeds is
    returned. CP949 is tried last because it is a superset of EUC-KR, so
    files that use the extended Hangul range would otherwise fall through
    to the UTF-8 fallback, which is already known not to decode them.

    Returns ``"utf-8"`` as a last resort when no candidate decodes cleanly;
    the caller's subsequent read may then raise ``UnicodeDecodeError``.
    """
    raw = path.read_bytes()
    # UTF-8 BOM
    if raw.startswith(b"\xef\xbb\xbf"):
        return "utf-8-sig"
    for candidate in ("utf-8", "euc-kr", "cp949"):
        try:
            raw.decode(candidate)
        except UnicodeDecodeError:
            continue
        return candidate
    return "utf-8"  # best effort
260
+
261
+
262
def load_tabular(path: Path) -> tuple[list[dict], dict]:
    """Load CSV/TSV/XLSX into list of row-dicts + metadata dict.

    For XLSX only the first worksheet is read; the names of any other
    sheets are recorded in ``meta["sheets_skipped"]`` and a warning is
    logged. Cell values are converted with ``str`` and empty cells become
    ``""``. An entirely empty worksheet yields no headers and no rows.

    For CSV/TSV the encoding comes from ``detect_encoding`` and rows that
    are shorter than the header are padded with ``""`` so that every cell
    of a known column is a string.

    Returns:
        ``(rows, meta)`` where ``meta`` has the keys ``format``, ``path``,
        ``sheets_skipped``, ``rows``, ``columns`` and ``headers``.
        ``columns`` is reported as 0 when there are no data rows.

    Exits the program (``sys.exit``) when an .xlsx file is requested but
    openpyxl is not installed.
    """
    fmt = detect_format(path)
    meta = {"format": fmt, "path": str(path), "sheets_skipped": []}

    if fmt == "xlsx":
        try:
            import openpyxl
        except ImportError:
            sys.exit("openpyxl is required for .xlsx files. Install: pip install openpyxl")
        wb = openpyxl.load_workbook(path, read_only=True, data_only=True)
        try:
            if len(wb.sheetnames) > 1:
                meta["sheets_skipped"] = wb.sheetnames[1:]
                log.warning("Multiple sheets found. Processing '%s' only. Skipped: %s",
                            wb.sheetnames[0], ", ".join(wb.sheetnames[1:]))
            ws = wb[wb.sheetnames[0]]
            rows_iter = ws.iter_rows(values_only=True)
            # An empty sheet has no header row; next() without a default
            # would raise StopIteration here.
            header_row = next(rows_iter, None)
            if header_row is None:
                headers = []
                data = []
            else:
                headers = [str(h) if h is not None else f"col_{i}"
                           for i, h in enumerate(header_row)]
                data = [
                    {h: (str(v) if v is not None else "") for h, v in zip(headers, row)}
                    for row in rows_iter
                ]
        finally:
            # Release the file handle even when reading fails part-way.
            wb.close()
    else:
        delimiter = "\t" if fmt == "tsv" else ","
        enc = detect_encoding(path)
        with open(path, newline="", encoding=enc) as f:
            # restval="" keeps short rows from producing None cells, which
            # downstream code would call .strip() on.
            reader = csv.DictReader(f, delimiter=delimiter, restval="")
            headers = reader.fieldnames or []
            data = list(reader)

    meta["rows"] = len(data)
    meta["columns"] = len(headers) if data else 0
    meta["headers"] = headers
    return data, meta
296
+
297
+
298
def save_tabular(data: list[dict], path: Path, fmt: str) -> None:
    """Persist row-dicts to *path* as CSV, TSV or XLSX.

    Column order is taken from the keys of the first row. Nothing is
    written (a warning is logged) when *data* is empty. If XLSX output is
    requested but openpyxl cannot be imported, the data is written as CSV
    to the same path with a ``.csv`` suffix instead and a warning is logged.
    """
    if not data:
        log.warning("No data to write.")
        return
    headers = list(data[0].keys())

    workbook_lib = None
    if fmt == "xlsx":
        try:
            import openpyxl as workbook_lib
        except ImportError:
            fmt = "csv"
            path = path.with_suffix(".csv")
            log.warning("openpyxl not available; writing CSV instead: %s", path)

    if workbook_lib is not None:
        wb = workbook_lib.Workbook()
        sheet = wb.active
        sheet.append(headers)
        for record in data:
            sheet.append([record.get(h, "") for h in headers])
        wb.save(path)
        return

    separator = "\t" if fmt == "tsv" else ","
    with open(path, "w", newline="", encoding="utf-8") as handle:
        out = csv.DictWriter(handle, fieldnames=headers, delimiter=separator)
        out.writeheader()
        out.writerows(data)
327
+
328
+
329
+ # ================================================================
330
+ # Section 3: PHI Scanner
331
+ # ================================================================
332
+
333
def _normalize_col(name: str) -> str:
    """Canonicalise a column header for dictionary lookup.

    Surrounding whitespace is removed, the text is lower-cased, and every
    run of whitespace, underscores or hyphens becomes a single underscore.
    """
    lowered = name.strip().lower()
    return re.sub(r"[\s_\-]+", "_", lowered)
336
+
337
+
338
def _col_name_matches(norm_col: str, pattern: str) -> bool:
    """Decide whether a normalized column name matches one PHI dictionary key.

    A match is reported when any of the following holds:
    - the two strings are identical;
    - the column name (of any length) begins with the pattern, e.g.
      "전화번호_집" matches "전화번호";
    - the column name has at most 10 characters and either string contains
      the other.

    Names longer than 10 characters are never matched on a pattern that only
    appears in the middle or at the end, so a long clinical name ending in
    "cell" does not match "cell". Note that an empty column name is
    contained in every pattern and therefore matches.
    """
    if norm_col == pattern or norm_col.startswith(pattern):
        return True
    if len(norm_col) > 10:
        return False
    return norm_col in pattern or pattern in norm_col
361
+
362
+
363
def scan_column_names(headers: list[str],
                      column_names: dict[str, str] | None = None) -> dict[str, dict]:
    """Look up each header in the PHI column-name dictionary.

    Headers are normalized first; the first dictionary key (in the
    dictionary's iteration order) that matches decides the PHI type.
    When *column_names* is omitted, UNIVERSAL_COLUMN_NAMES is used.

    Returns {col: {"phi_type": str, "confidence": str, "source": "column_name"}}
    containing only the headers that matched.
    """
    if column_names is None:
        column_names = UNIVERSAL_COLUMN_NAMES

    hits: dict[str, dict] = {}
    for header in headers:
        normalized = _normalize_col(header)
        matched_key = next(
            (key for key in column_names if _col_name_matches(normalized, key)),
            None,
        )
        if matched_key is None:
            continue
        hits[header] = {
            "phi_type": column_names[matched_key],
            "confidence": CONF_HIGH,
            "source": "column_name",
        }
    return hits
383
+
384
+
385
def _sample_values(values: list[str], n: int = 500) -> list[str]:
    """Pick at most *n* non-blank values to scan.

    Blank and whitespace-only entries are discarded. If no more than *n*
    values remain they are returned in their original order; otherwise a
    random sample of size *n* is drawn with ``random.sample``.
    """
    candidates = [v for v in values if v and v.strip()]
    return candidates if len(candidates) <= n else random.sample(candidates, n)
391
+
392
+
393
def scan_column_values(col: str, values: list[str],
                       col_phi_hint: str | None = None,
                       value_patterns: list[tuple[re.Pattern, str]] | None = None,
                       address_re: re.Pattern | None = None,
                       name_re: re.Pattern | None = None,
                       name_min_ratio: float = 0.3,
                       name_columns: list[str] | None = None) -> dict | None:
    """Inspect a sample of a column's cells for PHI-looking content.

    Each sampled value is credited to the first entry of *value_patterns*
    whose regex is found in it. The name heuristic runs only when *name_re*
    is given and the column is hinted or listed as a name column; it and
    the address check each record a count only when more than the required
    share of the sample matches (*name_min_ratio* for names, 0.3 for
    addresses). The PHI type with the highest count wins.

    Defaults: UNIVERSAL_VALUE_PATTERNS for *value_patterns* and the "name"
    keys of UNIVERSAL_COLUMN_NAMES for *name_columns*.

    Returns a detection dict with the keys ``phi_type``, ``confidence``,
    ``source``, ``match_ratio`` and ``sample_size``, or None when the
    column has no non-blank values or nothing matched.
    """
    if value_patterns is None:
        value_patterns = UNIVERSAL_VALUE_PATTERNS
    if name_columns is None:
        name_columns = [k for k, v in UNIVERSAL_COLUMN_NAMES.items() if v == "name"]

    sample = _sample_values(values)
    if not sample:
        return None
    sample_size = len(sample)

    # Tally matches per PHI type; a value counts once, for its first hit.
    tally: dict[str, int] = {}
    for text in sample:
        first_kind = next(
            (kind for rx, kind in value_patterns if rx.search(text)),
            None,
        )
        if first_kind is not None:
            tally[first_kind] = tally.get(first_kind, 0) + 1

    # Name heuristic: restricted to columns whose header suggests a name.
    if name_re is not None and (
            col_phi_hint == "name" or _normalize_col(col) in name_columns):
        name_hits = sum(1 for text in sample if name_re.match(text.strip()))
        if name_hits > sample_size * name_min_ratio:
            tally["name"] = name_hits

    # Address check
    if address_re is not None:
        address_hits = sum(1 for text in sample if address_re.search(text))
        if address_hits > sample_size * 0.3:
            tally["address"] = address_hits

    if not tally:
        return None

    # The most frequent type wins; ties go to the type recorded first.
    winner = max(tally, key=tally.get)
    ratio = tally[winner] / sample_size
    if ratio > 0.5:
        confidence = CONF_HIGH
    elif ratio > 0.2:
        confidence = CONF_MEDIUM
    else:
        confidence = CONF_LOW

    return {
        "phi_type": winner,
        "confidence": confidence,
        "source": "value_pattern",
        "match_ratio": round(ratio, 3),
        "sample_size": sample_size,
    }
449
+
450
+
451
def is_high_cardinality_numeric(values: list[str], threshold: float = 0.9) -> bool:
    """Report whether a column resembles MRN/chart numbers.

    True only when there are at least 10 non-blank values, at least
    *threshold* of them are all-digit strings of 5 or more characters, and
    more than 80% of the non-blank values are distinct.
    """
    cleaned = [v.strip() for v in values if v and v.strip()]
    total = len(cleaned)
    if total < 10:
        return False

    long_numeric = [v for v in cleaned if v.isdigit() and len(v) >= 5]
    if len(long_numeric) / total < threshold:
        return False

    return len(set(cleaned)) / total > 0.8
462
+
463
+
464
def classify_columns(data: list[dict], headers: list[str],
                     locale: dict | None = None) -> list[dict]:
    """Classify every column as PHI, SAFE, or REVIEW_NEEDED.

    Each column is run through the following passes and the first one that
    fires decides the result:

    1. Column-name match -> PHI (a value scan is attached as
       ``value_scan`` when it also finds something).
    2. Value-pattern scan -> PHI when the scan is high confidence,
       otherwise REVIEW_NEEDED.
    3. High-cardinality numeric values (possible MRN) -> REVIEW_NEEDED,
       with up to five sorted sample values attached.
    4. Free text (average length above 50) containing a value-pattern hit
       -> REVIEW_NEEDED.

    Anything else is SAFE. When *locale* is None the universal dictionaries
    are used and the address/name heuristics are disabled.

    NOTE(review): ``sample_values`` holds raw cell values, and the
    classification dicts are what the scan report is built from, so these
    values are written to the report file.

    Returns a list of classification dicts (one per column, in header order).
    """
    # Build patterns from locale (or use universal defaults)
    if locale is not None:
        (col_names, val_patterns, addr_re,
         name_re, name_ratio, name_cols) = build_locale_patterns(locale)
    else:
        col_names = UNIVERSAL_COLUMN_NAMES
        val_patterns = UNIVERSAL_VALUE_PATTERNS
        addr_re = None
        name_re = None
        name_ratio = 0.3
        name_cols = [k for k, v in UNIVERSAL_COLUMN_NAMES.items() if v == "name"]

    # Pass 1: column name matching
    name_hits = scan_column_names(headers, col_names)

    classifications = []
    for col in headers:
        values = [row.get(col, "") for row in data]

        # Already matched by name?
        if col in name_hits:
            entry = {
                "column": col,
                "classification": "PHI",
                **name_hits[col],
            }
            # Refine with value scan
            val_hit = scan_column_values(
                col, values, name_hits[col]["phi_type"],
                val_patterns, addr_re, name_re, name_ratio, name_cols)
            if val_hit:
                entry["value_scan"] = val_hit
            classifications.append(entry)
            continue

        # Pass 2: value pattern scan
        val_hit = scan_column_values(
            col, values, None,
            val_patterns, addr_re, name_re, name_ratio, name_cols)
        if val_hit:
            classifications.append({
                "column": col,
                "classification": "PHI" if val_hit["confidence"] == CONF_HIGH else "REVIEW_NEEDED",
                **val_hit,
            })
            continue

        # Pass 3: high-cardinality numeric (possible MRN)
        if is_high_cardinality_numeric(values):
            # Show sample for user review. The "v and" guard skips None
            # cells, which the cardinality check above already tolerates.
            unique_sample = sorted({v.strip() for v in values if v and v.strip()})[:5]
            classifications.append({
                "column": col,
                "classification": "REVIEW_NEEDED",
                "phi_type": "id",
                "confidence": CONF_LOW,
                "source": "high_cardinality_numeric",
                "sample_values": unique_sample,
            })
            continue

        # Pass 4: free-text detection (long strings, mixed content)
        non_empty = [v for v in values if v and v.strip()]
        if non_empty:
            avg_len = sum(len(v) for v in non_empty) / len(non_empty)
            if avg_len > 50:
                # Scan for embedded PHI in free text; stops at the first hit.
                embedded_phi = any(
                    regex.search(val)
                    for val in _sample_values(non_empty, 100)
                    for regex, _ in val_patterns
                )
                if embedded_phi:
                    classifications.append({
                        "column": col,
                        "classification": "REVIEW_NEEDED",
                        "phi_type": "free_text",
                        "confidence": CONF_MEDIUM,
                        "source": "free_text_with_phi",
                    })
                    continue

        # Default: SAFE
        classifications.append({
            "column": col,
            "classification": "SAFE",
            "phi_type": None,
            "confidence": CONF_HIGH,
            "source": "no_match",
        })

    return classifications
565
+
566
+
567
def build_scan_report(input_path: Path, data: list[dict],
                      meta: dict, classifications: list[dict],
                      locale: dict | None = None) -> dict:
    """Assemble the scan report dictionary that is serialised to JSON.

    The report carries the report version, a local-time ISO timestamp, the
    input path as a string, the loader metadata and the per-column
    classifications. When a locale is supplied its code and name are
    included (defaulting to "custom" / "Custom"). *data* is accepted but
    not used.
    """
    report = dict(
        version=REPORT_VERSION,
        timestamp=datetime.now().isoformat(),
        input_file=str(input_path),
        meta=meta,
        classifications=classifications,
    )
    if locale is None:
        return report

    report["locale"] = {
        "code": locale.get("code", "custom"),
        "name": locale.get("name", "Custom"),
    }
    return report
584
+
585
+
586
+ # ================================================================
587
+ # Section 4: Interactive Reviewer
588
+ # ================================================================
589
+
590
def _format_classification(c: dict) -> str:
    """Render a classification dict as a coloured one-line label.

    PHI is shown in red and REVIEW_NEEDED in yellow, each followed by
    "(phi_type, confidence)"; every other classification is rendered as a
    green "SAFE".
    """
    label = c["classification"]
    if label not in ("PHI", "REVIEW_NEEDED"):
        return _green("SAFE")

    paint = _red if label == "PHI" else _yellow
    phi_type = c.get("phi_type", "")
    confidence = c.get("confidence", "")
    return f"{paint(label)} ({phi_type}, {confidence})"
599
+
600
+
601
def _show_sample_values(col: str, data: list[dict], n: int = 10) -> None:
    """Print up to n unique sample values for a column.

    Blank, whitespace-only, missing and None cells are ignored. Unique
    values are listed in the order they first appear in *data*, so the
    output is the same on every run. When more than *n* unique values
    exist, a second line reports how many were left out.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order; the
    # "cell and" guard also skips None cells, which have no .strip().
    unique = list(dict.fromkeys(
        cell for cell in (row.get(col) for row in data)
        if cell and cell.strip()
    ))
    sample = unique[:n]
    if sample:
        print(f" Sample values: {', '.join(repr(v) for v in sample)}")
    if len(unique) > n:
        print(f" ... and {len(unique) - n} more unique values")
609
+
610
+
611
def review_scan_report(report: dict, data: list[dict],
                       auto_accept_safe: bool = False) -> dict:
    """Interactive three-pass review. Mutates and returns the report.

    Pass 1 walks every column and asks for an action; pressing Enter takes
    the default shown in square brackets. With *auto_accept_safe*, SAFE
    columns are kept without prompting. Sample values are shown once for
    every non-SAFE column.
    Pass 2 revisits columns that were sent back for review and still have
    no decision.
    Pass 3 prints a summary and lets the user proceed, edit individual
    columns by name, or quit (``sys.exit``).

    Each classification dict receives an ``approved_action`` of
    "anonymize", "keep" or "flag"; the report gets ``reviewed`` = True and
    a ``review_timestamp``.
    """
    classifications = report["classifications"]
    total = len(classifications)

    # ---- Pass 1: Column-level review ----
    print(f"\n{_bold('=== Pass 1: Column Classification Review ===')}")
    print(f"Total columns: {total}\n")

    phi_count = sum(1 for c in classifications if c["classification"] == "PHI")
    review_count = sum(1 for c in classifications if c["classification"] == "REVIEW_NEEDED")
    safe_count = sum(1 for c in classifications if c["classification"] == "SAFE")
    print(f" {_red(f'PHI: {phi_count}')} | "
          f"{_yellow(f'REVIEW_NEEDED: {review_count}')} | "
          f"{_green(f'SAFE: {safe_count}')}\n")

    for i, c in enumerate(classifications):
        col = c["column"]
        cls = c["classification"]

        if cls == "SAFE" and auto_accept_safe:
            c["approved_action"] = "keep"
            continue

        print(f"[{i + 1}/{total}] {_bold(col)}: {_format_classification(c)}")
        if "sample_values" in c:
            print(f" Flagged samples: {c['sample_values']}")
        if cls != "SAFE":
            # Shown exactly once per column, for PHI and REVIEW_NEEDED alike.
            _show_sample_values(col, data)

        if cls == "SAFE":
            choice = input(" Action [K]eep / (r)eview_needed? ").strip().lower()
            if choice == "r":
                c["classification"] = "REVIEW_NEEDED"
                c["approved_action"] = None
            else:
                c["approved_action"] = "keep"
        elif cls == "PHI":
            choice = input(" Action [A]nonymize / (k)eep / (r)eview? ").strip().lower()
            if choice == "k":
                c["approved_action"] = "keep"
            elif choice == "r":
                c["classification"] = "REVIEW_NEEDED"
                c["approved_action"] = None
            else:
                c["approved_action"] = "anonymize"
        else:  # REVIEW_NEEDED
            choice = input(" Action (a)nonymize / [K]eep / (f)lag_free_text? ").strip().lower()
            if choice == "a":
                c["approved_action"] = "anonymize"
                c["classification"] = "PHI"
            elif choice == "f":
                c["approved_action"] = "flag"
            else:
                c["approved_action"] = "keep"

    # ---- Pass 2: Re-examine REVIEW_NEEDED items without decisions ----
    undecided = [c for c in classifications if c.get("approved_action") is None]
    if undecided:
        print(f"\n{_bold('=== Pass 2: Undecided Items ===')}")
        for c in undecided:
            col = c["column"]
            print(f"\n {_bold(col)}: {_format_classification(c)}")
            _show_sample_values(col, data, n=15)
            choice = input(" Action (a)nonymize / [K]eep? ").strip().lower()
            c["approved_action"] = "anonymize" if choice == "a" else "keep"

    # ---- Pass 3: Final summary ----
    print(f"\n{_bold('=== Pass 3: Final Summary ===')}")
    to_anonymize = [c for c in classifications if c.get("approved_action") == "anonymize"]
    to_keep = [c for c in classifications if c.get("approved_action") == "keep"]
    to_flag = [c for c in classifications if c.get("approved_action") == "flag"]

    if to_anonymize:
        print(f"\n Anonymize ({len(to_anonymize)}): "
              + ", ".join(c["column"] for c in to_anonymize))
    else:
        print(" Anonymize: none")
    if to_keep:
        print(f" Keep ({len(to_keep)}): "
              + ", ".join(c["column"] for c in to_keep))
    else:
        print(" Keep: none")
    if to_flag:
        print(f" {_yellow(f'Flagged ({len(to_flag)})')}: "
              + ", ".join(c["column"] for c in to_flag))

    print()
    confirm = input("Proceed with these actions? [Y]es / (e)dit / (q)uit: ").strip().lower()
    if confirm == "q":
        sys.exit("Aborted by user.")
    if confirm == "e":
        # Allow editing individual items until the user types 'done'.
        while True:
            col_name = input(" Column name to change (or 'done'): ").strip()
            if col_name.lower() == "done":
                break
            match = [c for c in classifications if c["column"] == col_name]
            if not match:
                print(f" Column '{col_name}' not found.")
                continue
            c = match[0]
            choice = input(f" New action for {col_name} — (a)nonymize / (k)eep / (f)lag: ").strip().lower()
            if choice == "a":
                c["approved_action"] = "anonymize"
            elif choice == "f":
                c["approved_action"] = "flag"
            else:
                c["approved_action"] = "keep"

    report["reviewed"] = True
    report["review_timestamp"] = datetime.now().isoformat()
    return report
720
+
721
+
722
+ # ================================================================
723
+ # Section 5: Anonymizers
724
+ # ================================================================
725
+
726
class PseudonymGenerator:
    """Hands out stable sequential pseudonyms (P0001, P0002, ...).

    The same original value always receives the same pseudonym; numbering
    follows the order in which new values are first requested and is
    zero-padded to at least four digits.
    """

    def __init__(self, prefix: str = "P"):
        self._prefix = prefix
        self._counter = 0
        self._map: dict[str, str] = {}

    def get(self, original: str) -> str:
        """Return the pseudonym for *original*, allocating one on first use."""
        try:
            return self._map[original]
        except KeyError:
            self._counter += 1
            label = f"{self._prefix}{self._counter:04d}"
            self._map[original] = label
            return label

    @property
    def mapping(self) -> dict[str, str]:
        """A copy of the original -> pseudonym table."""
        return self._map.copy()
743
+
744
+
745
class DateShifter:
    """Shifts dates by a consistent per-entity offset.

    The same entity (identified by entity_id) always gets the same offset,
    preserving relative time intervals between events for the same entity.
    Offsets are whole days drawn from ``[-max_days, max_days]`` with a
    generator seeded by *seed*. Zero is never used while *max_days* is
    positive, because a zero offset would leave real dates unchanged.
    """

    def __init__(self, seed: int, max_days: int = 365):
        self._rng = random.Random(seed)
        self._max_days = max_days
        self._offsets: dict[str, int] = {}
        self.seed = seed

    def _get_offset(self, entity_id: str) -> int:
        """Return the day offset for *entity_id*, drawing one on first use."""
        if entity_id not in self._offsets:
            offset = self._rng.randint(-self._max_days, self._max_days)
            # Redraw zeros: they would publish the entity's true dates.
            # With max_days == 0 only zero is possible, so it is accepted.
            while offset == 0 and self._max_days > 0:
                offset = self._rng.randint(-self._max_days, self._max_days)
            self._offsets[entity_id] = offset
        return self._offsets[entity_id]

    def shift(self, date_str: str, entity_id: str = "__default__") -> str:
        """Attempt to parse, shift, and re-format a date string.

        Recognised layouts are ``YYYY-MM-DD``, ``YYYY.MM.DD``,
        ``YYYY/MM/DD``, ``YYYYMMDD`` and the Korean ``YYYY년 M월 D일``; the
        result uses the same layout as the input. Surrounding whitespace is
        ignored. Input that cannot be parsed, or whose shifted date would
        fall outside the range ``datetime`` supports, is replaced by the
        placeholder ``"[DATE_SHIFTED]"``.
        """
        offset = self._get_offset(entity_id)
        delta = timedelta(days=offset)
        text = date_str.strip()

        # Try common formats; input and output share the same layout.
        for layout in ("%Y-%m-%d", "%Y.%m.%d", "%Y/%m/%d", "%Y%m%d"):
            try:
                return (datetime.strptime(text, layout) + delta).strftime(layout)
            except (ValueError, OverflowError):
                # ValueError: layout mismatch; OverflowError: shifted out of range.
                continue

        # Korean format
        m = re.match(r"(\d{4})년\s*(\d{1,2})월\s*(\d{1,2})일", text)
        if m:
            try:
                dt = datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)))
                shifted = dt + delta
                return f"{shifted.year}년 {shifted.month}월 {shifted.day}일"
            except (ValueError, OverflowError):
                pass

        # Cannot parse — return suppressed
        return "[DATE_SHIFTED]"

    @property
    def offsets(self) -> dict[str, int]:
        """A copy of the entity_id -> day-offset table."""
        return dict(self._offsets)
797
+
798
+
799
class IDReplacer:
    """Swaps identifiers for sequential surrogate IDs (ID0001, ID0002, ...).

    A given original identifier always maps to the same surrogate; numbers
    are assigned in first-seen order and zero-padded to at least four digits.
    """

    def __init__(self, prefix: str = "ID"):
        self._prefix = prefix
        self._counter = 0
        self._map: dict[str, str] = {}

    def get(self, original: str) -> str:
        """Return the surrogate for *original*, creating it when unseen."""
        surrogate = self._map.get(original)
        if surrogate is None:
            self._counter += 1
            surrogate = f"{self._prefix}{self._counter:04d}"
            self._map[original] = surrogate
        return surrogate

    @property
    def mapping(self) -> dict[str, str]:
        """A copy of the original -> surrogate table."""
        return self._map.copy()
816
+
817
+
818
def _suppress(val: str) -> str:
    """Return the fixed redaction marker; the input value is ignored."""
    marker = "[REDACTED]"
    return marker
820
+
821
+
822
def _sha256(val: str) -> str:
    """Return the hexadecimal SHA-256 digest of *val* encoded as UTF-8.

    NOTE(review): the hash is unsalted, so values from a small or
    guessable space can be recovered by hashing candidates.
    """
    digest = hashlib.sha256()
    digest.update(val.encode("utf-8"))
    return digest.hexdigest()
824
+
825
+
826
def apply_anonymization(data: list[dict], report: dict,
                        date_shift_seed: int | None = None) -> tuple[list[dict], dict, list[dict]]:
    """Apply approved anonymization actions.

    Only columns whose classification has ``approved_action == "anonymize"``
    are changed; blank cells are left alone. Treatment by PHI type:

    - ``name``: sequential pseudonym
    - ``id``: sequential surrogate ID
    - ``date``: shifted by a per-entity offset; the entity is the first
      non-blank value among the ID-typed columns (anonymized or kept),
      else a shared default
    - ``free_text``: every PHI value-pattern match replaced by "[REDACTED]"
    - anything else (phone, rrn, email, insurance, address, unknown):
      the whole cell replaced by "[REDACTED]"

    When *date_shift_seed* is None a random seed is drawn and recorded in
    the mapping's ``_meta``.

    Returns (de-identified data, mapping dict, audit entries). The input
    rows are not modified; when nothing is marked for anonymization the
    original list is returned together with an empty mapping and audit.
    """
    classifications = report["classifications"]
    to_anonymize = {c["column"]: c for c in classifications
                    if c.get("approved_action") == "anonymize"}

    if not to_anonymize:
        log.info("No columns marked for anonymization.")
        return data, {}, []

    # Detect the entity/patient ID column for date shifting
    id_columns = [col for col, c in to_anonymize.items() if c.get("phi_type") == "id"]
    # Also check non-anonymized columns that look like IDs
    all_id_cols = id_columns + [
        c["column"] for c in classifications
        if c.get("phi_type") == "id" and c.get("approved_action") == "keep"
    ]

    # Initialize anonymizers
    name_gen = PseudonymGenerator(prefix="P")
    id_gen = IDReplacer(prefix="ID")
    seed = date_shift_seed if date_shift_seed is not None else random.randint(1, 999999)
    date_shifter = DateShifter(seed=seed)

    mapping: dict[str, dict] = {
        "_meta": {
            "date_shift_seed": seed,
            "timestamp": datetime.now().isoformat(),
            "version": REPORT_VERSION,
        }
    }
    audit: list[dict] = []

    # Process each row
    clean_data = []
    for row_idx, row in enumerate(data):
        new_row = dict(row)

        # Determine entity ID for this row (for date shifting). "or ''"
        # covers cells that are None rather than a string.
        entity_id = "__default__"
        for id_col in all_id_cols:
            candidate = (row.get(id_col) or "").strip()
            if candidate:
                entity_id = candidate
                break

        for col, spec in to_anonymize.items():
            original = row.get(col, "")
            if not original or not original.strip():
                continue

            phi_type = spec.get("phi_type", "unknown")
            original_stripped = original.strip()

            if phi_type == "name":
                replacement = name_gen.get(original_stripped)
            elif phi_type == "id":
                replacement = id_gen.get(original_stripped)
            elif phi_type == "date":
                replacement = date_shifter.shift(original_stripped, entity_id)
            elif phi_type == "free_text":
                # Redact known patterns within the text.
                # NOTE(review): PHI_VALUE_PATTERNS is not defined in the part
                # of the file shown alongside this function, and the scanner
                # uses UNIVERSAL_VALUE_PATTERNS / locale patterns — confirm
                # this name exists and covers the active locale.
                replacement = original_stripped
                for regex, _ in PHI_VALUE_PATTERNS:
                    replacement = regex.sub("[REDACTED]", replacement)
            else:
                # phone, rrn, email, insurance, address and unknown types
                # are all fully suppressed.
                replacement = _suppress(original_stripped)

            new_row[col] = replacement

            audit.append({
                "row": row_idx,
                "column": col,
                "phi_type": phi_type,
                "action": "anonymize",
                "before_hash": _sha256(original_stripped),
                "after_value": replacement,
            })

        clean_data.append(new_row)

    # Build mapping
    mapping["names"] = name_gen.mapping
    mapping["ids"] = id_gen.mapping
    mapping["date_offsets"] = date_shifter.offsets

    return clean_data, mapping, audit
921
+
922
+
923
+ # ================================================================
924
+ # Section 6: Output
925
+ # ================================================================
926
+
927
def write_deidentified_file(data: list[dict], input_path: Path,
                            output_dir: Path) -> Path:
    """Save de-identified rows as ``{stem}_deidentified{suffix}`` in *output_dir*.

    The output format is derived from the input file's extension. Returns
    the path that was passed to the writer.
    """
    target = output_dir / f"{input_path.stem}_deidentified{input_path.suffix}"
    save_tabular(data, target, detect_format(input_path))
    log.info("De-identified data written to: %s", target)
    return target
936
+
937
+
938
def write_mapping(mapping: dict, path: Path, hash_mode: bool = False) -> Path:
    """Write mapping file. In hash mode, original values are SHA-256 hashed.

    In hash mode the keys of the ``names``, ``ids`` and ``date_offsets``
    sections are replaced by their SHA-256 hex digests; ``_meta`` is copied
    unchanged. Otherwise *mapping* is written as-is.

    The file is created with owner-only permissions (0600) and the mode is
    tightened before any content is written, so the mapping is not exposed
    with default permissions at any point. If the permissions cannot be
    set, a warning is logged and the file is still written.

    Returns *path*.
    """
    if hash_mode:
        out_data = {"_meta": mapping.get("_meta", {})}
        for section in ("names", "ids", "date_offsets"):
            if section in mapping:
                out_data[section] = {_sha256(k): v for k, v in mapping[section].items()}
    else:
        out_data = mapping

    payload = json.dumps(out_data, ensure_ascii=False, indent=2)
    owner_only = stat.S_IRUSR | stat.S_IWUSR  # 0600

    # Create the file already restricted instead of chmod-ing after writing.
    fd = os.open(path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, owner_only)
    restricted = True
    with os.fdopen(fd, "w", encoding="utf-8") as fh:
        try:
            # The creation mode is ignored for a file that already existed.
            os.chmod(path, owner_only)
        except OSError:
            restricted = False
            log.warning("Could not set restrictive permissions on mapping file: %s", path)
        fh.write(payload)

    # Only claim 0600 when it was actually applied.
    if restricted:
        log.info("Mapping file written to: %s (permissions: 0600)", path)
    else:
        log.info("Mapping file written to: %s", path)
    return path
961
+
962
+
963
def write_audit_log(audit: list[dict], path: Path) -> Path:
    """Write the audit entries to *path* as UTF-8 CSV and return *path*.

    Columns are row, column, phi_type, action, before_hash and after_value;
    before_hash is the SHA-256 of the original value. When *audit* is empty
    no file is written and only an informational message is logged.
    """
    if audit:
        columns = ["row", "column", "phi_type", "action", "before_hash", "after_value"]
        with open(path, "w", newline="", encoding="utf-8") as handle:
            out = csv.DictWriter(handle, fieldnames=columns)
            out.writeheader()
            out.writerows(audit)
        log.info("Audit log written to: %s (%d entries)", path, len(audit))
    else:
        log.info("No changes made; audit log is empty.")
    return path
977
+
978
+
979
+ # ================================================================
980
+ # Section 7: Main + CLI
981
+ # ================================================================
982
+
983
def _resolve_locale(args: argparse.Namespace) -> dict | None:
    """Pick the locale definition to scan with.

    Precedence: a custom locale file (``args.locale_file``), then a named
    locale (``args.locale``), and finally interactive selection when
    neither option was supplied.
    """
    custom_file = getattr(args, "locale_file", None)
    if custom_file:
        locale = load_locale_file(custom_file)
        log.info("Using custom locale: %s", locale.get("name", "Custom"))
        return locale

    locale_code = getattr(args, "locale", None)
    if locale_code:
        locale = load_locale(locale_code)
        log.info("Using locale: %s (%s)", locale["name"], locale["code"])
        return locale

    # Interactive selection
    return select_locale_interactive()
995
+
996
+
997
def cmd_scan(args: argparse.Namespace) -> None:
    """Scan command: load the input, classify its columns, save the report.

    Writes ``scan_report.json`` into ``args.output_dir`` (created when
    missing), prints the PHI / REVIEW_NEEDED / SAFE counts and the command
    for the next step. Exits when the input file does not exist.
    """
    input_path = Path(args.input_file)
    if not input_path.exists():
        sys.exit(f"File not found: {input_path}")

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    locale = _resolve_locale(args)

    log.info("Loading %s ...", input_path)
    data, meta = load_tabular(input_path)
    log.info("Loaded %d rows, %d columns", meta["rows"], meta["columns"])

    log.info("Scanning for PHI ...")
    classifications = classify_columns(data, meta["headers"], locale)

    report = build_scan_report(input_path, data, meta, classifications, locale)
    report_path = output_dir / "scan_report.json"
    serialized = json.dumps(report, ensure_ascii=False, indent=2)
    report_path.write_text(serialized, encoding="utf-8")

    # Print summary
    def tally(label: str) -> int:
        return sum(1 for c in classifications if c["classification"] == label)

    phi, review, safe = tally("PHI"), tally("REVIEW_NEEDED"), tally("SAFE")
    print(f"\n{_bold('Scan Results')}:")
    print(f" {_red(f'PHI: {phi}')} | {_yellow(f'REVIEW_NEEDED: {review}')} | {_green(f'SAFE: {safe}')}")
    print(f"\nReport saved: {report_path}")
    print(f"Next step: python deidentify.py review {report_path}")
1027
+
1028
+
1029
def cmd_review(args: argparse.Namespace) -> None:
    """Review command: interactive terminal review of scan report.

    Loads the scan report named by ``args.report_file``, reloads the
    original input file it refers to (for sample display), runs the
    interactive review and writes ``reviewed_report.json`` next to the
    scan report.

    Exits when the report or the original file is missing, or when the
    report does not name an input file. A report version mismatch only
    produces a warning.
    """
    report_path = Path(args.report_file)
    if not report_path.exists():
        sys.exit(f"Report not found: {report_path}")

    report = json.loads(report_path.read_text(encoding="utf-8"))
    found_version = report.get("version", 0)
    if found_version != REPORT_VERSION:
        # %s rather than %d: the stored version may not be an integer.
        log.warning("Report version mismatch (expected %s, got %s)",
                    REPORT_VERSION, found_version)

    # Reload original data for sample display
    source = report.get("input_file")
    if not source:
        sys.exit(f"Report does not name an input file: {report_path}")
    input_path = Path(source)
    if not input_path.exists():
        sys.exit(f"Original file not found: {input_path}")
    data, _ = load_tabular(input_path)

    reviewed = review_scan_report(report, data,
                                  auto_accept_safe=getattr(args, "auto_accept_safe", False))

    out_path = report_path.parent / "reviewed_report.json"
    out_path.write_text(json.dumps(reviewed, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"\nReviewed report saved: {out_path}")
    print(f"Next step: python deidentify.py apply {out_path}")
1053
+
1054
+
1055
def cmd_apply(args: argparse.Namespace) -> None:
    """Apply command: anonymize based on reviewed report.

    Reads the report named by ``args.report_file``, reloads the original
    input file, applies the approved actions and writes the de-identified
    file, ``mapping.json`` and ``audit_log.csv`` into the report's
    directory.

    Exits when the report or the original file is missing, when the report
    does not name an input file, or when the report is unreviewed and has
    no approved action at all. In that last case the output would be an
    unmodified copy of the input labelled as de-identified. An unreviewed
    report that does carry approved actions is processed with a warning.
    """
    report_path = Path(args.report_file)
    if not report_path.exists():
        sys.exit(f"Report not found: {report_path}")

    report = json.loads(report_path.read_text(encoding="utf-8"))
    if not report.get("reviewed"):
        has_decisions = any(
            c.get("approved_action") for c in report.get("classifications", []))
        if not has_decisions:
            sys.exit("Report has not been reviewed and contains no approved actions. "
                     "Run 'review' first.")
        log.warning("Report has not been reviewed. Run 'review' first.")

    source = report.get("input_file")
    if not source:
        sys.exit(f"Report does not name an input file: {report_path}")
    input_path = Path(source)
    if not input_path.exists():
        sys.exit(f"Original file not found: {input_path}")

    output_dir = report_path.parent
    data, _ = load_tabular(input_path)

    log.info("Applying anonymization ...")
    clean_data, mapping, audit = apply_anonymization(data, report)

    # Write outputs
    deid_path = write_deidentified_file(clean_data, input_path, output_dir)
    mapping_path = write_mapping(mapping, output_dir / "mapping.json",
                                 hash_mode=getattr(args, "hash_mapping", False))
    audit_path = write_audit_log(audit, output_dir / "audit_log.csv")

    # Warn if mapping is in same dir as de-identified data
    if mapping_path.parent == deid_path.parent:
        print(f"\n{_yellow('WARNING')}: mapping.json is in the same directory as the "
              "de-identified data. For security, store mapping.json separately.")

    # Summary
    changes = len(audit)
    cols_changed = len({a["column"] for a in audit})
    print(f"\n{_bold('De-identification Complete')}:")
    print(f" Changes: {changes} cells across {cols_changed} columns")
    print(f" Output: {deid_path}")
    print(f" Mapping: {mapping_path}")
    print(f" Audit: {audit_path}")
1094
+
1095
+
1096
def cmd_full(args: argparse.Namespace) -> None:
    """Full pipeline: scan -> review -> apply in one go.

    Args:
        args: Parsed CLI arguments. Reads ``input_file``, ``output_dir``,
            ``auto_accept_safe`` and ``hash_mapping``; the whole namespace is
            also passed to ``_resolve_locale``.

    Raises:
        SystemExit: If ``args.input_file`` does not exist.
    """
    source = Path(args.input_file)
    if not source.exists():
        sys.exit(f"File not found: {source}")

    out_dir = Path(args.output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    locale = _resolve_locale(args)

    # Scan
    log.info("Loading %s ...", source)
    data, meta = load_tabular(source)
    log.info("Loaded %d rows, %d columns", meta["rows"], meta["columns"])

    log.info("Scanning for PHI ...")
    classifications = classify_columns(data, meta["headers"], locale)
    report = build_scan_report(source, data, meta, classifications, locale)

    # Quick summary before review: tally each classification label.
    labels = [entry["classification"] for entry in classifications]
    phi = labels.count("PHI")
    review_n = labels.count("REVIEW_NEEDED")
    safe = labels.count("SAFE")
    print(f"\n{_bold('Scan Results')}:")
    print(f" {_red(f'PHI: {phi}')} | {_yellow(f'REVIEW_NEEDED: {review_n}')} | {_green(f'SAFE: {safe}')}")

    # Nothing flagged at all: ask before continuing with the pipeline.
    nothing_flagged = phi == 0 and review_n == 0
    if nothing_flagged:
        print(f"\n{_green('No PHI detected.')} Your data appears clean.")
        answer = input("Proceed anyway? (y/n) ").strip().lower()
        if answer != "y":
            return

    # Review
    reviewed = review_scan_report(report, data,
                                  auto_accept_safe=args.auto_accept_safe)

    # Save report
    report_path = out_dir / "reviewed_report.json"
    serialized = json.dumps(reviewed, ensure_ascii=False, indent=2)
    report_path.write_text(serialized, encoding="utf-8")

    # Apply
    log.info("Applying anonymization ...")
    clean_data, mapping, audit = apply_anonymization(data, reviewed)

    # Write outputs
    deid_path = write_deidentified_file(clean_data, source, out_dir)
    mapping_path = write_mapping(mapping, out_dir / "mapping.json",
                                 hash_mode=args.hash_mapping)
    audit_path = write_audit_log(audit, out_dir / "audit_log.csv")

    same_dir = mapping_path.parent == deid_path.parent
    if same_dir:
        print(f"\n{_yellow('WARNING')}: mapping.json is in the same directory as the "
              "de-identified data. For security, store mapping.json separately.")

    # Summary
    changes = len(audit)
    cols_changed = len({row["column"] for row in audit})
    print(f"\n{_bold('De-identification Complete')}:")
    print(f" Changes: {changes} cells across {cols_changed} columns")
    print(f" Output: {deid_path}")
    print(f" Mapping: {mapping_path}")
    print(f" Audit: {audit_path}")
1158
+
1159
+
1160
def main() -> None:
    """CLI entry point: build the argument parser and run the chosen sub-command.

    Defines the ``scan``, ``review``, ``apply`` and ``full`` sub-commands,
    configures logging from the ``-v/--verbose`` flag, and calls the matching
    ``cmd_*`` function with the parsed arguments.
    """
    parser = argparse.ArgumentParser(
        prog="deidentify",
        description="Clinical research data de-identification (LLM-free).",
    )
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="Enable verbose logging")
    sub = parser.add_subparsers(dest="command", required=True)

    # Input file, output directory and locale options (shared by scan and full)
    def _add_input_args(p: argparse.ArgumentParser) -> None:
        p.add_argument("input_file", help="Path to CSV/TSV/XLSX file")
        p.add_argument("-o", "--output-dir", default=".", help="Output directory (default: .)")
        locale_group = p.add_mutually_exclusive_group()
        locale_group.add_argument("--locale", type=str, metavar="CODE",
                                  help="Country code (kr, us, jp, cn, de, uk, fr, ca, au, in). "
                                       "If omitted, interactive selection is shown.")
        locale_group.add_argument("--locale-file", type=str, metavar="PATH",
                                  help="Path to a custom locale JSON file")

    # Shared by review and full
    def _add_auto_accept_arg(p: argparse.ArgumentParser) -> None:
        p.add_argument("--auto-accept-safe", action="store_true",
                       help="Automatically accept SAFE columns without prompting")

    # Shared by apply and full
    def _add_hash_mapping_arg(p: argparse.ArgumentParser) -> None:
        p.add_argument("--hash-mapping", action="store_true",
                       help="Hash original values in mapping file (one-way)")

    # scan
    scan_parser = sub.add_parser("scan", help="Scan a file for PHI")
    _add_input_args(scan_parser)

    # review
    review_parser = sub.add_parser("review", help="Interactive review of scan report")
    review_parser.add_argument("report_file", help="Path to scan_report.json")
    _add_auto_accept_arg(review_parser)

    # apply
    apply_parser = sub.add_parser("apply", help="Apply anonymization from reviewed report")
    apply_parser.add_argument("report_file", help="Path to reviewed_report.json")
    _add_hash_mapping_arg(apply_parser)

    # full
    full_parser = sub.add_parser("full", help="Full pipeline: scan + review + apply")
    _add_input_args(full_parser)
    _add_auto_accept_arg(full_parser)
    _add_hash_mapping_arg(full_parser)

    args = parser.parse_args()

    if args.verbose:
        level = logging.DEBUG
    else:
        level = logging.INFO
    logging.basicConfig(level=level, format="%(levelname)s: %(message)s")

    # Built after parsing so usage errors and --help exit before any handler
    # is looked up.
    handlers = {
        "scan": cmd_scan,
        "review": cmd_review,
        "apply": cmd_apply,
        "full": cmd_full,
    }
    handler = handlers.get(args.command)
    if handler is not None:
        handler(args)
1222
+
1223
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()