@thierrynakoa/fire-flow 12.2.1 → 13.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (500)
  1. package/CREDITS.md +25 -0
  2. package/DOMINION-FLOW-OVERVIEW.md +182 -38
  3. package/README.md +399 -455
  4. package/TROUBLESHOOTING.md +264 -264
  5. package/agents/fire-debugger.md +54 -0
  6. package/agents/fire-executor.md +1610 -1033
  7. package/agents/fire-fact-checker.md +1 -1
  8. package/agents/fire-planner.md +85 -17
  9. package/agents/fire-project-researcher.md +1 -1
  10. package/agents/fire-researcher.md +4 -22
  11. package/agents/{fire-phoenix-analyst.md → fire-resurrection-analyst.md} +394 -394
  12. package/agents/fire-reviewer.md +552 -499
  13. package/agents/fire-verifier.md +114 -19
  14. package/bin/cli.js +18 -101
  15. package/commands/fire-0-orient.md +2 -2
  16. package/commands/fire-1a-new.md +50 -15
  17. package/commands/fire-1c-setup.md +33 -5
  18. package/commands/fire-1d-discuss.md +87 -1
  19. package/commands/fire-2-plan.md +556 -527
  20. package/commands/fire-3-execute.md +2046 -1356
  21. package/commands/fire-4-verify.md +975 -906
  22. package/commands/fire-5-handoff.md +46 -5
  23. package/commands/fire-6-resume.md +2 -31
  24. package/commands/fire-add-new-skill.md +138 -19
  25. package/commands/fire-autonomous.md +14 -2
  26. package/commands/fire-complete-milestone.md +1 -1
  27. package/commands/fire-cost.md +179 -183
  28. package/commands/fire-debug.md +1 -6
  29. package/commands/fire-loop-resume.md +2 -2
  30. package/commands/fire-loop-stop.md +1 -1
  31. package/commands/fire-loop.md +2 -15
  32. package/commands/fire-map-codebase.md +1 -1
  33. package/commands/fire-migrate-database.md +548 -0
  34. package/commands/fire-new-milestone.md +1 -1
  35. package/commands/fire-reflect.md +1 -2
  36. package/commands/fire-research.md +142 -21
  37. package/commands/{fire-phoenix.md → fire-resurrect.md} +859 -603
  38. package/commands/fire-scaffold.md +297 -0
  39. package/commands/fire-search.md +1 -2
  40. package/commands/fire-security-scan.md +483 -484
  41. package/commands/fire-setup.md +359 -0
  42. package/commands/fire-skill.md +770 -0
  43. package/commands/fire-skills-diff.md +506 -506
  44. package/commands/fire-skills-history.md +388 -388
  45. package/commands/fire-skills-rollback.md +7 -7
  46. package/commands/fire-skills-sync.md +470 -470
  47. package/commands/fire-test.md +5 -5
  48. package/commands/fire-todos.md +1 -1
  49. package/commands/fire-update.md +5 -5
  50. package/commands/fire-validate-skills.md +282 -0
  51. package/commands/fire-vuln-scan.md +492 -493
  52. package/hooks/run-hook.sh +8 -8
  53. package/hooks/run-session-end.sh +7 -7
  54. package/hooks/session-end.sh +90 -90
  55. package/hooks/session-start.sh +1 -1
  56. package/package.json +4 -24
  57. package/plugin.json +7 -7
  58. package/references/autonomy-levels.md +235 -0
  59. package/references/behavioral-directives.md +95 -3
  60. package/references/blocker-tracking.md +1 -1
  61. package/references/circuit-breaker.md +93 -2
  62. package/references/context-engineering.md +227 -9
  63. package/references/honesty-protocols.md +70 -1
  64. package/references/issue-to-pr-pipeline.md +149 -150
  65. package/references/metrics-and-trends.md +1 -2
  66. package/references/research-improvements.md +4 -108
  67. package/references/sdlc-mapping.md +73 -0
  68. package/references/state-machine.md +151 -0
  69. package/skills-library/AVAILABLE_TOOLS_REFERENCE.md +333 -0
  70. package/skills-library/SKILLS-INDEX.md +57 -558
  71. package/skills-library/SKILLS_LIBRARY_INDEX.md +532 -0
  72. package/skills-library/_general/api-patterns/api-field-name-mismatch.md +107 -0
  73. package/skills-library/_general/api-patterns/streaming-command-timeout.md +122 -0
  74. package/skills-library/_general/api-patterns/streaming-proxy-cors-bypass.md +102 -0
  75. package/skills-library/_general/automation/settings-gui-generator.md +172 -0
  76. package/skills-library/_general/database-solutions/data-type-mapping-reference.md +181 -0
  77. package/skills-library/_general/database-solutions/mysql-limit-offset-string-coercion.md +102 -0
  78. package/skills-library/_general/database-solutions/mysql-to-pg-migration.md +195 -0
  79. package/skills-library/_general/database-solutions/orm-schema-portability.md +193 -0
  80. package/skills-library/_general/database-solutions/persistent-analysis-storage.md +207 -0
  81. package/skills-library/_general/database-solutions/pg-to-mysql-schema-migration-methodology.md +190 -0
  82. package/skills-library/_general/database-solutions/sql-dialect-compatibility-matrix.md +306 -0
  83. package/skills-library/_general/database-solutions/sqlite-to-pg-migration.md +219 -0
  84. package/skills-library/_general/frontend/canvas-bubble-animation-grouping.md +270 -0
  85. package/skills-library/_general/frontend/color-token-migration.md +112 -0
  86. package/skills-library/_general/frontend/framer-motion-layoutid-grouping.md +150 -0
  87. package/skills-library/_general/frontend/pyqt6-settings-dialog.md +191 -0
  88. package/skills-library/_general/frontend/react-flow-animated-layout-switching.md +101 -0
  89. package/skills-library/_general/frontend/react-hooks-order-debugging.md +141 -0
  90. package/skills-library/_general/frontend/redux-localstorage-auth-desync.md +126 -0
  91. package/skills-library/_general/frontend/safari-csp-theme-color-debugging.md +124 -0
  92. package/skills-library/_general/frontend/safari-sw-cache-poisoning.md +138 -0
  93. package/skills-library/_general/frontend/svg-sparkline-no-charting-library.md +131 -0
  94. package/skills-library/_general/growth-marketing/oss-daily-growth-intelligence.md +224 -0
  95. package/skills-library/_general/integrations/claude-code-local-mcp-integration.md +250 -0
  96. package/skills-library/_general/integrations/mcp-composite-tool-orchestration.md +200 -0
  97. package/skills-library/_general/methodology/AGENT_SDK_STANDALONE_TOOLING.md +181 -0
  98. package/skills-library/_general/methodology/AGENT_TEAMS_GUIDE.md +169 -0
  99. package/skills-library/_general/methodology/ALAS_STATEFUL_EXECUTION.md +207 -0
  100. package/skills-library/_general/methodology/AUTO_REVIEWER_SUBAGENT.md +211 -0
  101. package/skills-library/_general/methodology/CONSISTENCY_CHECK_AMBIGUITY_GATE.md +96 -0
  102. package/skills-library/_general/methodology/DEAD_ENDS_SHELF.md +4 -4
  103. package/skills-library/_general/methodology/DISTILL_NOT_DUMP.md +108 -0
  104. package/skills-library/_general/methodology/EXECUTION_PROGRESS_MONITOR.md +157 -0
  105. package/skills-library/_general/methodology/HIERARCHICAL_REVIEW_MARS.md +122 -0
  106. package/skills-library/_general/methodology/MCP_INTER_AGENT_BRIDGE.md +207 -0
  107. package/skills-library/_general/methodology/MERMAID_WIZARD_DIAGRAMS.md +77 -0
  108. package/skills-library/_general/methodology/MISSING_DIMENSION_DETECTOR.md +89 -0
  109. package/skills-library/_general/methodology/MULTI_AGENT_COORDINATION.md +397 -0
  110. package/skills-library/_general/methodology/OBSERVATION_MASKING.md +100 -0
  111. package/skills-library/_general/methodology/PHOENIX_REBUILD_METHODOLOGY.md +82 -11
  112. package/skills-library/_general/methodology/REVIEW_BACKTRACK_PANEL.md +140 -0
  113. package/skills-library/_general/methodology/REVIEW_FIX_LOOP.md +117 -0
  114. package/skills-library/_general/methodology/VOTING_VERDICT_ARBITRATION.md +155 -0
  115. package/skills-library/_general/methodology/ZERO_FRICTION_CLI_SETUP.md +2 -2
  116. package/skills-library/_general/methodology/dead-code-activation.md +123 -0
  117. package/skills-library/_general/methodology/debug-swarm-researcher-escape-hatch.md +240 -240
  118. package/skills-library/_general/methodology/shell-autonomous-loop-fixplan.md +1 -1
  119. package/skills-library/_general/patterns-standards/GOF_DESIGN_PATTERNS_FOR_AI_AGENTS.md +5 -5
  120. package/skills-library/_general/patterns-standards/cascading-failure-diagnosis.md +119 -0
  121. package/skills-library/_general/patterns-standards/domain-specific-layout-algorithms.md +209 -0
  122. package/skills-library/_general/patterns-standards/python-desktop-app-architecture.md +399 -0
  123. package/skills-library/_general/patterns-standards/realtime-monitoring-dashboard.md +457 -0
  124. package/skills-library/_general/patterns-standards/togglable-processing-pipeline.md +169 -0
  125. package/skills-library/_general/performance/liveclock-extraction.md +112 -0
  126. package/skills-library/_general/performance/ref-based-canvas-animation.md +117 -0
  127. package/skills-library/_general/performance/use-visible-interval.md +131 -0
  128. package/skills-library/_general/testing/playwright-firefox-withcredentials-auth-issue.md +104 -0
  129. package/skills-library/_quarantine/README.md +30 -0
  130. package/skills-library/api-patterns/BROADCAST_SCHEDULER_SHARED_EXECUTE_FUNCTION.md +150 -0
  131. package/skills-library/api-patterns/ERROR_RESPONSE_STANDARDS.md +145 -0
  132. package/skills-library/api-patterns/EXPRESS_ROUTE_ORDERING_MIDDLEWARE_INTERCEPTION.md +326 -0
  133. package/skills-library/api-patterns/PAGINATION_PATTERNS.md +137 -0
  134. package/skills-library/api-patterns/PODCAST_PROGRESS_TRACKING_THREE_ROOT_CAUSES.md +277 -0
  135. package/skills-library/api-patterns/RATE_LIMITING_TOGGLE.md +155 -0
  136. package/skills-library/api-patterns/graphql-content-queries.md +708 -0
  137. package/skills-library/appointment-scheduler-design.md +423 -0
  138. package/skills-library/automation/AUTO_POPULATE_COMPLETE_GUIDE.md +631 -0
  139. package/skills-library/automation/CC_WORKFLOW_STUDIO.md +83 -0
  140. package/skills-library/automation/CLAUDE_CODE_SWARM_MODE.md +95 -0
  141. package/skills-library/automation/DAEMON_TRIGGER_FILE_IPC.md +195 -0
  142. package/skills-library/automation/scheduled-content-publishing.md +608 -0
  143. package/skills-library/awesome-workflows/Blogging-Platform-Instructions/view_commands.md +25 -0
  144. package/skills-library/awesome-workflows/CREDENTIAL-SECURITY-WORKFLOW.md +109 -0
  145. package/skills-library/awesome-workflows/DEBUGGING-WORKFLOW.md +124 -0
  146. package/skills-library/awesome-workflows/Design-Review-Workflow/README.md +31 -0
  147. package/skills-library/awesome-workflows/Design-Review-Workflow/design-principles-example.md +129 -0
  148. package/skills-library/awesome-workflows/Design-Review-Workflow/design-review-agent.md +107 -0
  149. package/skills-library/awesome-workflows/Design-Review-Workflow/design-review-claude-md-snippet.md +24 -0
  150. package/skills-library/awesome-workflows/Design-Review-Workflow/design-review-slash-command.md +38 -0
  151. package/skills-library/awesome-workflows/PARALLEL-RESEARCH-WORKFLOW.md +89 -0
  152. package/skills-library/awesome-workflows/PHASE-EXECUTION-WORKFLOW.md +97 -0
  153. package/skills-library/awesome-workflows/SESSION-HANDOFF-WORKFLOW.md +116 -0
  154. package/skills-library/cms-patterns/content-branch-preview.md +515 -0
  155. package/skills-library/cms-patterns/inline-visual-editing.md +666 -0
  156. package/skills-library/cms-patterns/mdx-component-content.md +649 -0
  157. package/skills-library/cms-patterns/media-manager-abstraction.md +827 -0
  158. package/skills-library/cms-patterns/schema-driven-form-generator.md +838 -0
  159. package/skills-library/complexity-metrics/complexity-divider.md +707 -0
  160. package/skills-library/complexity-metrics/work-with-complexity.md +193 -0
  161. package/skills-library/creative-multimedia/animation-stack-guide.md +577 -0
  162. package/skills-library/creative-multimedia/audio-enhancement-pipeline.md +625 -0
  163. package/skills-library/creative-multimedia/content-repurposing-pipeline.md +1146 -0
  164. package/skills-library/creative-multimedia/data-visualization-generator.md +862 -0
  165. package/skills-library/creative-multimedia/doc-to-podcast-pipeline.md +2184 -0
  166. package/skills-library/creative-multimedia/ffmpeg-command-generator.md +405 -0
  167. package/skills-library/creative-multimedia/image-optimization-pipeline.md +605 -0
  168. package/skills-library/creative-multimedia/multi-format-content-generator.md +1759 -0
  169. package/skills-library/creative-multimedia/og-image-generator.md +635 -0
  170. package/skills-library/creative-multimedia/podcast-audio-composition.md +1355 -0
  171. package/skills-library/creative-multimedia/podcast-quality-evaluation.md +1452 -0
  172. package/skills-library/creative-multimedia/podcast-script-generation.md +1841 -0
  173. package/skills-library/creative-multimedia/svg-generation.md +750 -0
  174. package/skills-library/creative-multimedia/text-to-speech-provider-selector.md +1414 -0
  175. package/skills-library/creative-multimedia/transcription-pipeline-selector.md +677 -0
  176. package/skills-library/creative-multimedia/video-streaming-setup.md +559 -0
  177. package/skills-library/database-solutions/AI_RESPONSE_DATABASE_CACHING.md +520 -0
  178. package/skills-library/database-solutions/CONDITIONAL_SQL_MIGRATION_PATTERN.md +119 -0
  179. package/skills-library/database-solutions/DATABASE_COLUMN_NAME_MISMATCH.md +393 -0
  180. package/skills-library/database-solutions/DATABASE_SCHEMA.md +394 -0
  181. package/skills-library/database-solutions/DATABASE_SCHEMA_VERIFICATION_GUIDE.md +348 -0
  182. package/skills-library/database-solutions/DATABASE_STRATEGY.md +71 -0
  183. package/skills-library/database-solutions/ES_MODULE_SEED_SCRIPT_PATTERN.md +52 -0
  184. package/skills-library/database-solutions/MIGRATION_GUIDE.md +3 -0
  185. package/skills-library/database-solutions/PLPGSQL_VARIABLE_CONFLICT_FIX.md +208 -0
  186. package/skills-library/database-solutions/POSTGRESQL_JSONB_DOUBLE_STRINGIFY_FIX.md +245 -0
  187. package/skills-library/database-solutions/POSTGRESQL_LICENSE_TABLE_DESIGN.md +393 -0
  188. package/skills-library/database-solutions/POSTGRESQL_UUID_DOCUMENT_RAG_DUAL_SCOPE.md +732 -0
  189. package/skills-library/database-solutions/POSTGRES_SQL_TEMPLATE_BINDING_ERROR.md +240 -0
  190. package/skills-library/database-solutions/PRISMA_DB_PUSH_DATA_LOSS_PREVENTION.md +141 -0
  191. package/skills-library/database-solutions/PRODUCTION_QUERY_OPTIMIZATION_RESTART_FIX.md +389 -0
  192. package/skills-library/database-solutions/RLS_SECURITY_GUIDE.md +107 -0
  193. package/skills-library/database-solutions/SCHEMA_ENHANCEMENTS_GUIDE.md +373 -0
  194. package/skills-library/database-solutions/SCHEMA_MIGRATION_GUIDE.md +368 -0
  195. package/skills-library/database-solutions/SCHEMA_VERIFICATION_QUICK_REFERENCE.md +104 -0
  196. package/skills-library/database-solutions/ai-erd-generator.md +1213 -0
  197. package/skills-library/database-solutions/content-publishing-states.md +631 -0
  198. package/skills-library/database-solutions/database-schema-designer.md +522 -0
  199. package/skills-library/database-solutions/er-diagram-components.md +569 -0
  200. package/skills-library/database-solutions/er-to-ddl-mapping.md +1405 -0
  201. package/skills-library/database-solutions/erd-creator-textbook-research.md +433 -0
  202. package/skills-library/database-solutions/erd-react-flow-architecture.md +1965 -0
  203. package/skills-library/database-solutions/mariadb-aggregate-function-replacement.md +145 -0
  204. package/skills-library/database-solutions/normalization-validator.md +778 -0
  205. package/skills-library/database-solutions/postgres-full-text-search-content.md +494 -0
  206. package/skills-library/database-solutions/postgresql-to-mysql-runtime-translation.md +286 -0
  207. package/skills-library/database-solutions/regex-alternation-ordering-sql-types.md +92 -0
  208. package/skills-library/database-solutions/reserved-word-context-aware-quoting.md +142 -0
  209. package/skills-library/database-solutions/sql-ddl-generator.md +756 -0
  210. package/skills-library/database-solutions/supabase-connection-pooler-fix.md +102 -0
  211. package/skills-library/deployment-security/CPANEL_NODE_DEPLOYMENT.md +166 -0
  212. package/skills-library/deployment-security/DEPLOYMENT.md +275 -0
  213. package/skills-library/deployment-security/DEPLOYMENT_CHECKLIST.md +363 -0
  214. package/skills-library/deployment-security/DEPLOYMENT_PLAN.md +669 -0
  215. package/skills-library/deployment-security/KNEX_DATABASE_ABSTRACTION.md +444 -0
  216. package/skills-library/deployment-security/LICENSE_KEY_SYSTEM.md +206 -0
  217. package/skills-library/deployment-security/NODE18_DEPENDENCY_COMPATIBILITY.md +284 -0
  218. package/skills-library/deployment-security/PHP_INSTALLER_WIZARD_GUIDE.md +315 -0
  219. package/skills-library/deployment-security/PM2_ENVIRONMENT_VARIABLE_CACHING.md +256 -0
  220. package/skills-library/deployment-security/PM2_MEMORY_EXHAUSTION_FIX.md +370 -0
  221. package/skills-library/deployment-security/PRODUCTION_DEPLOYMENT_GUIDE.md +592 -0
  222. package/skills-library/deployment-security/PRODUCTION_HARDENING_DOCUMENTATION.md +307 -0
  223. package/skills-library/deployment-security/PRODUCTION_RECOVERY_CHERRY_PICK_PATTERN.md +202 -0
  224. package/skills-library/deployment-security/PYINSTALLER_CUDA_WHISPER_BUNDLING.md +236 -0
  225. package/skills-library/deployment-security/SECURITY.md +41 -0
  226. package/skills-library/deployment-security/SMTP_SSL_HOSTNAME_MISMATCH_SHARED_HOSTING.md +220 -0
  227. package/skills-library/deployment-security/SPA_SEO_OPTIMIZATION_CPANEL.md +200 -0
  228. package/skills-library/deployment-security/SUPABASE_EDGE_FUNCTIONS.md +338 -0
  229. package/skills-library/deployment-security/VERCEL_GITHUB_DEPLOYMENT_GUIDE.md +858 -0
  230. package/skills-library/deployment-security/VPS_DEPLOYMENT_READINESS.md +356 -0
  231. package/skills-library/deployment-security/deployment-changes-not-applying.md +241 -0
  232. package/skills-library/deployment-security/env-file-management-production-local.md +203 -0
  233. package/skills-library/deployment-security/express-secure-file-downloads.md +413 -0
  234. package/skills-library/deployment-security/react-production-deployment-desktop-guide.md +2011 -0
  235. package/skills-library/deployment-security/self-hosted-supabase-coolify-guide.md +1684 -0
  236. package/skills-library/deployment-security/unique-features-ai-strategy-plaid-security.md +1613 -0
  237. package/skills-library/deployment-security/vps-deployment.md +135 -0
  238. package/skills-library/document-processing/WORD_EXPORT_MARKDOWN_FORMATTING.md +482 -0
  239. package/skills-library/document-processing/document-ai-landingai-integration.md +677 -0
  240. package/skills-library/document-processing/express-secure-file-downloads-mern.md +413 -0
  241. package/skills-library/document-processing/express-secure-file-downloads.md +413 -0
  242. package/skills-library/document-processing/md-to-word-converter.md +318 -0
  243. package/skills-library/document-processing/pdf-forms-integration/README.md +101 -0
  244. package/skills-library/document-processing/pdf-forms-integration/SKILL.md +662 -0
  245. package/skills-library/ecommerce/ADMIN_PRODUCTS_GUIDE.md +428 -0
  246. package/skills-library/ecommerce/ECOMMERCE_API_REFERENCE.md +776 -0
  247. package/skills-library/ecommerce/ECOMMERCE_COMPLETION_SUMMARY.md +673 -0
  248. package/skills-library/ecommerce/ECOMMERCE_IMPLEMENTATION_GUIDE.md +729 -0
  249. package/skills-library/ecommerce/ECOMMERCE_QUICK_REFERENCE.md +521 -0
  250. package/skills-library/ecommerce/ECOMMERCE_TESTING_CHECKLIST.md +565 -0
  251. package/skills-library/ecommerce/ECOMMERCE_WORKFLOW_GUIDE.md +1059 -0
  252. package/skills-library/ecommerce/PRODUCT_CREATION_EXPANDED.md +522 -0
  253. package/skills-library/ecommerce/agentic-commerce-protocol.md +203 -0
  254. package/skills-library/ecommerce/cart-abandonment-recovery.md +236 -0
  255. package/skills-library/ecommerce/cart-architecture-patterns.md +300 -0
  256. package/skills-library/ecommerce/cart-item-count-indicator.md +264 -0
  257. package/skills-library/ecommerce/checkout-ux-conversion.md +227 -0
  258. package/skills-library/ecommerce/composable-commerce-selection.md +166 -0
  259. package/skills-library/ecommerce/ecommerce-analytics-patterns.md +167 -0
  260. package/skills-library/ecommerce/fraud-detection-patterns.md +179 -0
  261. package/skills-library/ecommerce/inventory-stock-management.md +270 -0
  262. package/skills-library/ecommerce/order-saga-state-machine.md +336 -0
  263. package/skills-library/ecommerce/payment-provider-abstraction.md +245 -0
  264. package/skills-library/ecommerce/pci-compliance-checklist.md +192 -0
  265. package/skills-library/ecommerce/refund-chargeback-handling.md +177 -0
  266. package/skills-library/ecommerce/shipping-carrier-integration.md +218 -0
  267. package/skills-library/ecommerce/webhook-idempotency-patterns.md +253 -0
  268. package/skills-library/excalidraw-diagrams/.github/workflows/ci.yml +558 -0
  269. package/skills-library/excalidraw-diagrams/.github/workflows/prompt-gallery.yml +448 -0
  270. package/skills-library/excalidraw-diagrams/.github/workflows/release.yml +42 -0
  271. package/skills-library/excalidraw-diagrams/.github/workflows/test-reusable-ci.yml +25 -0
  272. package/skills-library/excalidraw-diagrams/CLAUDE.md +57 -0
  273. package/skills-library/excalidraw-diagrams/LICENSE +21 -0
  274. package/skills-library/excalidraw-diagrams/README.md +178 -0
  275. package/skills-library/excalidraw-diagrams/SKILL.md +715 -0
  276. package/skills-library/form-solutions/BUTTON_TYPE_FORM_SUBMISSION.md +336 -0
  277. package/skills-library/form-solutions/FILLABLE_PDF_IMPLEMENTATION.md +226 -0
  278. package/skills-library/form-solutions/SURVEYJS_QUESTIONNAIRE_SYSTEM.md +367 -0
  279. package/skills-library/form-solutions/tiptap-minimal-setup.md +690 -0
  280. package/skills-library/frontend/scholarly-classification-bubble-map.md +149 -0
  281. package/skills-library/infrastructure/ci-cd-pipeline-builder.md +517 -0
  282. package/skills-library/infrastructure/observability-designer.md +264 -0
  283. package/skills-library/infrastructure/performance-profiler.md +621 -0
  284. package/skills-library/installer-wizard-patterns.md +249 -0
  285. package/skills-library/integrations/CLAUDE_CODE_TOKEN_ANALYTICS.md +160 -0
  286. package/skills-library/integrations/CONFIGURABLE_AI_PROVIDER_SELECTION.md +728 -0
  287. package/skills-library/integrations/SOCKET_IO_BROADCAST_ALL_VS_ROOM.md +141 -0
  288. package/skills-library/integrations/VIRTUAL_MEETINGS_IMPLEMENTATION.md +374 -0
  289. package/skills-library/integrations/WORDPRESS_LEARNDASH_DATA_RECOVERY.md +53 -0
  290. package/skills-library/integrations/YOUTUBE_API_SETUP.md +141 -0
  291. package/skills-library/integrations/YOUTUBE_BOOKMARKING_EXPLANATION.md +252 -0
  292. package/skills-library/integrations/YOUTUBE_BOOKMARKING_SOLUTION.md +268 -0
  293. package/skills-library/integrations/YOUTUBE_OAUTH_SETUP_GUIDE.md +200 -0
  294. package/skills-library/integrations/YOUTUBE_VIDEO_FIX_COMPLETE.md +192 -0
  295. package/skills-library/integrations/ai-ml/GEMINI_AI_RAG_PIPELINE_COMPLETE_GUIDE.md +195 -0
  296. package/skills-library/integrations/ai-ml/GEMINI_IMAGE_GENERATION_SETUP.md +64 -0
  297. package/skills-library/integrations/cloudflare/cloudflare-turnstile-debugging.md +202 -0
  298. package/skills-library/integrations/cloudflare/cloudflare-turnstile-implementation.md +476 -0
  299. package/skills-library/integrations/cloudflare-turnstile-debugging.md +202 -0
  300. package/skills-library/integrations/cloudflare-turnstile-implementation.md +476 -0
  301. package/skills-library/integrations/ghost-creator-monetization-pattern.md +454 -0
  302. package/skills-library/integrations/headless-cms-architecture.md +484 -0
  303. package/skills-library/integrations/headless-cms-stack-selection.md +183 -0
  304. package/skills-library/integrations/payload-cms-patterns.md +674 -0
  305. package/skills-library/integrations/realtimestt-openwakeword-cuda-windows.md +229 -0
  306. package/skills-library/integrations/rss-podcast-integration.md +300 -0
  307. package/skills-library/integrations/wordpress/WORDPRESS_LEARNDASH_DATA_RECOVERY.md +53 -0
  308. package/skills-library/integrations/youtube/YOUTUBE_API_SETUP.md +141 -0
  309. package/skills-library/integrations/youtube/YOUTUBE_BOOKMARKING_EXPLANATION.md +252 -0
  310. package/skills-library/integrations/youtube/YOUTUBE_BOOKMARKING_SOLUTION.md +268 -0
  311. package/skills-library/integrations/youtube/YOUTUBE_OAUTH_SETUP_GUIDE.md +200 -0
  312. package/skills-library/integrations/youtube/YOUTUBE_VIDEO_FIX_COMPLETE.md +192 -0
  313. package/skills-library/marketing/campaign-analytics.md +97 -0
  314. package/skills-library/marketing/content-creator.md +105 -0
  315. package/skills-library/marketing/marketing-strategy-pmm.md +94 -0
  316. package/skills-library/marketing/social-media-analyzer.md +81 -0
  317. package/skills-library/methodology/ADVANCED_ORCHESTRATION_PATTERNS.md +401 -0
  318. package/skills-library/methodology/AGENT_SELF_IMPROVEMENT_LOOP.md +179 -0
  319. package/skills-library/methodology/BREATH_BASED_PARALLEL_EXECUTION.md +1 -1
  320. package/skills-library/methodology/CLEANSING_CYCLE.md +358 -0
  321. package/skills-library/methodology/CONFIDENCE_ANNOTATION_PATTERN.md +143 -0
  322. package/skills-library/methodology/CRITICAL_PATTERNS_DOCUMENTATION_COMPLETE.md +204 -0
  323. package/skills-library/methodology/DELIVERABLES_SUMMARY.md +341 -0
  324. package/skills-library/methodology/DIFFICULTY_AWARE_AGENT_ROUTING.md +252 -0
  325. package/skills-library/methodology/EVOLUTIONARY_SKILL_SYNTHESIS.md +219 -0
  326. package/skills-library/methodology/GLOMERULUS_DECISION_GATE.md +223 -0
  327. package/skills-library/methodology/HIBERNATION_SYSTEM.md +231 -0
  328. package/skills-library/methodology/INSTRUMENTATION_OVER_RESTRICTION.md +192 -0
  329. package/skills-library/methodology/MASTER_COMPLETION_SUMMARY.md +444 -0
  330. package/skills-library/methodology/MASTER_SESSION_COMPLETION.md +743 -0
  331. package/skills-library/methodology/MERN_QUICK_REFERENCE.md +358 -0
  332. package/skills-library/methodology/ORGAN_AGENT_MAPPING.md +177 -0
  333. package/skills-library/methodology/PARALLEL_WAVE_BASED_REFACTORING.md +440 -0
  334. package/skills-library/methodology/QUICK_REFERENCE.md +358 -0
  335. package/skills-library/methodology/SDFT_ONPOLICY_SELF_DISTILLATION.md +186 -0
  336. package/skills-library/methodology/SELF_QUESTIONING_TASK_GENERATION.md +270 -0
  337. package/skills-library/methodology/SESSION_COMPLETION_SUMMARY.md +304 -0
  338. package/skills-library/methodology/SESSION_SUMMARY.md +432 -0
  339. package/skills-library/methodology/WARRIOR_WORKFLOW_DEBUGGING_PROTOCOL.md +252 -0
  340. package/skills-library/methodology/tech-debt-tracker.md +570 -0
  341. package/skills-library/parallel-debug/SKILL.md +60 -0
  342. package/skills-library/patterns-standards/API_PATTERN_FIX_SUMMARY.md +236 -0
  343. package/skills-library/patterns-standards/BATCH_OPERATIONS_WITH_PROGRESS_MODAL.md +362 -0
  344. package/skills-library/patterns-standards/CRITICAL_CODING_PATTERNS.md +639 -0
  345. package/skills-library/patterns-standards/DARK_MODE_MODAL_VISIBILITY.md +258 -0
  346. package/skills-library/patterns-standards/ERROR_RESILIENCE_IMPLEMENTATION.md +375 -0
  347. package/skills-library/patterns-standards/ES_MODULE_IMPORT_HOISTING_DOTENV.md +298 -0
  348. package/skills-library/patterns-standards/NESTED_BACKDROP_FILTER_CSS_ARTIFACT_FIX.md +76 -0
  349. package/skills-library/patterns-standards/ORDERED_DETECTOR_PIPELINE_GRACEFUL_FALLBACK.md +333 -0
  350. package/skills-library/patterns-standards/PHASE_IMPORT_ERROR_DEBUGGING.md +271 -0
  351. package/skills-library/patterns-standards/PYNPUT_GLOBAL_HOTKEY_VK_MATCHING.md +252 -0
  352. package/skills-library/patterns-standards/REACT_USEEFFECT_CASCADE_RESET_FIX.md +132 -0
  353. package/skills-library/patterns-standards/SUBMENU_HOVER_DROPDOWN_PATTERN.md +225 -0
  354. package/skills-library/patterns-standards/TAILWIND_TEXT_VISIBILITY_OVERRIDE.md +322 -0
  355. package/skills-library/patterns-standards/THEME_AWARE_CSS_VARIABLES_PATTERN.md +209 -0
  356. package/skills-library/patterns-standards/THEME_USER_OBJECT_PROPERTY_NAMING.md +194 -0
  357. package/skills-library/patterns-standards/TOOLTIP_BLOCKING_CLICKS_FIX.md +267 -0
  358. package/skills-library/patterns-standards/claude-code-plugin-structure.md +235 -0
  359. package/skills-library/patterns-standards/react-i18next-setup.md +429 -0
  360. package/skills-library/patterns-standards/thesys-c1-generative-ui-integration.md +967 -0
  361. package/skills-library/plugin-development/CLAUDE_CODE_COMMAND_REGISTRATION_SILENT_FAILURE.md +315 -0
  362. package/skills-library/plugin-development/plugin-command-namespace-vs-global.md +390 -0
  363. package/skills-library/plugin-development/plugin-doc-auto-generation.md +172 -0
  364. package/skills-library/security/GITHUB_REPO_SECURITY_AUDIT.md +115 -0
  365. package/skills-library/security/admin-deletion-safety.md +396 -0
  366. package/skills-library/security/application-vuln-patterns.md +477 -0
  367. package/skills-library/security/env-secrets-manager.md +686 -0
  368. package/skills-library/security/secure-ai-application-templates.md +347 -0
  369. package/skills-library/security/sql-injection-prevention-postgresjs.md +151 -0
  370. package/skills-library/supabase-connection-pooler-fix.md +102 -0
  371. package/skills-library/system-context/POWERSHELL_BASH_INTEROP.md +82 -0
  372. package/skills-library/system-context/SERVICE_LIFECYCLE_MANAGEMENT.md +119 -0
  373. package/skills-library/system-context/SKILL.md +40 -0
  374. package/skills-library/system-context/WINDOWS_DEV_ENVIRONMENT.md +73 -0
  375. package/skills-library/testing/E2E_PLAYWRIGHT_PATTERNS.md +99 -0
  376. package/skills-library/testing/INTEGRATION_TEST_STRATEGY.md +82 -0
  377. package/skills-library/testing/RED_GREEN_BUGFIX_GATE.md +203 -0
  378. package/skills-library/testing/TEST_DATA_MANAGEMENT.md +69 -0
  379. package/skills-library/testing/VITEST_UNIT_TEST_PATTERNS.md +75 -0
  380. package/skills-library/testing/playwright-api-security-tests.md +202 -0
  381. package/skills-library/toolbox/SKILL.md +84 -0
  382. package/skills-library/toolbox/code-graph-and-web-scraping-mcps.md +237 -0
  383. package/skills-library/ui-ux-pro-max/ACCESSIBILITY_ESSENTIALS.md +115 -0
  384. package/skills-library/ui-ux-pro-max/DESIGN_SYSTEM_SCAFFOLDING.md +133 -0
  385. package/skills-library/ui-ux-pro-max/RESPONSIVE_LAYOUT_PATTERNS.md +119 -0
  386. package/skills-library/ui-ux-pro-max/SKILL.md +386 -0
  387. package/skills-library/ui-ux-pro-max/data/charts.csv +26 -0
  388. package/skills-library/ui-ux-pro-max/data/colors.csv +97 -0
  389. package/skills-library/ui-ux-pro-max/data/icons.csv +101 -0
  390. package/skills-library/ui-ux-pro-max/data/landing.csv +31 -0
  391. package/skills-library/ui-ux-pro-max/data/products.csv +97 -0
  392. package/skills-library/ui-ux-pro-max/data/react-performance.csv +45 -0
  393. package/skills-library/ui-ux-pro-max/data/stacks/astro.csv +54 -0
  394. package/skills-library/ui-ux-pro-max/data/stacks/flutter.csv +53 -0
  395. package/skills-library/ui-ux-pro-max/data/stacks/html-tailwind.csv +56 -0
  396. package/skills-library/ui-ux-pro-max/data/stacks/jetpack-compose.csv +53 -0
  397. package/skills-library/ui-ux-pro-max/data/stacks/nextjs.csv +53 -0
  398. package/skills-library/ui-ux-pro-max/data/stacks/nuxt-ui.csv +51 -0
  399. package/skills-library/ui-ux-pro-max/data/stacks/nuxtjs.csv +59 -0
  400. package/skills-library/ui-ux-pro-max/data/stacks/react-native.csv +52 -0
  401. package/skills-library/ui-ux-pro-max/data/stacks/react.csv +54 -0
  402. package/skills-library/ui-ux-pro-max/data/stacks/shadcn.csv +61 -0
  403. package/skills-library/ui-ux-pro-max/data/stacks/svelte.csv +54 -0
  404. package/skills-library/ui-ux-pro-max/data/stacks/swiftui.csv +51 -0
  405. package/skills-library/ui-ux-pro-max/data/stacks/vue.csv +50 -0
  406. package/skills-library/ui-ux-pro-max/data/styles.csv +68 -0
  407. package/skills-library/ui-ux-pro-max/data/typography.csv +58 -0
  408. package/skills-library/ui-ux-pro-max/data/ui-reasoning.csv +101 -0
  409. package/skills-library/ui-ux-pro-max/data/ux-guidelines.csv +100 -0
  410. package/skills-library/ui-ux-pro-max/data/web-interface.csv +31 -0
  411. package/skills-library/wordpress-style-theme-components.md +1526 -0
  412. package/templates/ASSUMPTIONS.md +1 -1
  413. package/templates/DECISION_LOG.md +0 -1
  414. package/templates/phase-prompt.md +1 -1
  415. package/templates/phoenix-comparison.md +6 -6
  416. package/templates/skill-api-integration.md +106 -0
  417. package/templates/skill-architecture-pattern.md +92 -0
  418. package/templates/skill-debug-pattern.md +98 -0
  419. package/templates/skill-devops-recipe.md +107 -0
  420. package/templates/skill-general.md +65 -0
  421. package/templates/skill-ui-component.md +113 -0
  422. package/tools/uat-runner.py +179 -0
  423. package/version.json +7 -3
  424. package/workflows/handoff-session.md +2 -2
  425. package/workflows/new-project.md +2 -2
  426. package/workflows/plan-phase.md +1 -1
  427. package/.claude-plugin/plugin.json +0 -64
  428. package/skills-library/_general/methodology/LIVE_BREADCRUMB_PROTOCOL.md +0 -242
  429. package/skills-library/_general/methodology/llm-judge-memory-crud.md +0 -241
  430. package/skills-library/methodology/REFLEXION_MEMORY_PATTERN.md +0 -183
  431. package/skills-library/methodology/RESEARCH_BACKED_WORKFLOW_UPGRADE.md +0 -263
  432. package/skills-library/methodology/SABBATH_REST_PATTERN.md +0 -267
  433. package/skills-library/methodology/STONE_AND_SCAFFOLD.md +0 -220
  434. package/skills-library/specialists/api-architecture/api-designer.md +0 -49
  435. package/skills-library/specialists/api-architecture/graphql-architect.md +0 -49
  436. package/skills-library/specialists/api-architecture/mcp-developer.md +0 -51
  437. package/skills-library/specialists/api-architecture/microservices-architect.md +0 -50
  438. package/skills-library/specialists/api-architecture/websocket-engineer.md +0 -48
  439. package/skills-library/specialists/backend/django-expert.md +0 -52
  440. package/skills-library/specialists/backend/fastapi-expert.md +0 -52
  441. package/skills-library/specialists/backend/laravel-specialist.md +0 -52
  442. package/skills-library/specialists/backend/nestjs-expert.md +0 -51
  443. package/skills-library/specialists/backend/rails-expert.md +0 -53
  444. package/skills-library/specialists/backend/spring-boot-engineer.md +0 -56
  445. package/skills-library/specialists/data-ml/fine-tuning-expert.md +0 -48
  446. package/skills-library/specialists/data-ml/ml-pipeline.md +0 -47
  447. package/skills-library/specialists/data-ml/pandas-pro.md +0 -47
  448. package/skills-library/specialists/data-ml/rag-architect.md +0 -51
  449. package/skills-library/specialists/data-ml/spark-engineer.md +0 -47
  450. package/skills-library/specialists/frontend/angular-architect.md +0 -52
  451. package/skills-library/specialists/frontend/flutter-expert.md +0 -51
  452. package/skills-library/specialists/frontend/nextjs-developer.md +0 -54
  453. package/skills-library/specialists/frontend/react-native-expert.md +0 -50
  454. package/skills-library/specialists/frontend/vue-expert.md +0 -51
  455. package/skills-library/specialists/infrastructure/chaos-engineer.md +0 -74
  456. package/skills-library/specialists/infrastructure/cloud-architect.md +0 -70
  457. package/skills-library/specialists/infrastructure/database-optimizer.md +0 -64
  458. package/skills-library/specialists/infrastructure/devops-engineer.md +0 -70
  459. package/skills-library/specialists/infrastructure/kubernetes-specialist.md +0 -52
  460. package/skills-library/specialists/infrastructure/monitoring-expert.md +0 -70
  461. package/skills-library/specialists/infrastructure/sre-engineer.md +0 -70
  462. package/skills-library/specialists/infrastructure/terraform-engineer.md +0 -51
  463. package/skills-library/specialists/languages/cpp-pro.md +0 -74
  464. package/skills-library/specialists/languages/csharp-developer.md +0 -69
  465. package/skills-library/specialists/languages/dotnet-core-expert.md +0 -54
  466. package/skills-library/specialists/languages/golang-pro.md +0 -51
  467. package/skills-library/specialists/languages/java-architect.md +0 -49
  468. package/skills-library/specialists/languages/javascript-pro.md +0 -68
  469. package/skills-library/specialists/languages/kotlin-specialist.md +0 -68
  470. package/skills-library/specialists/languages/php-pro.md +0 -49
  471. package/skills-library/specialists/languages/python-pro.md +0 -52
  472. package/skills-library/specialists/languages/react-expert.md +0 -51
  473. package/skills-library/specialists/languages/rust-engineer.md +0 -50
  474. package/skills-library/specialists/languages/sql-pro.md +0 -56
  475. package/skills-library/specialists/languages/swift-expert.md +0 -69
  476. package/skills-library/specialists/languages/typescript-pro.md +0 -51
  477. package/skills-library/specialists/platform/atlassian-mcp.md +0 -52
  478. package/skills-library/specialists/platform/embedded-systems.md +0 -53
  479. package/skills-library/specialists/platform/game-developer.md +0 -53
  480. package/skills-library/specialists/platform/salesforce-developer.md +0 -53
  481. package/skills-library/specialists/platform/shopify-expert.md +0 -49
  482. package/skills-library/specialists/platform/wordpress-pro.md +0 -49
  483. package/skills-library/specialists/quality/code-documenter.md +0 -51
  484. package/skills-library/specialists/quality/code-reviewer.md +0 -67
  485. package/skills-library/specialists/quality/debugging-wizard.md +0 -51
  486. package/skills-library/specialists/quality/fullstack-guardian.md +0 -51
  487. package/skills-library/specialists/quality/legacy-modernizer.md +0 -50
  488. package/skills-library/specialists/quality/playwright-expert.md +0 -65
  489. package/skills-library/specialists/quality/spec-miner.md +0 -56
  490. package/skills-library/specialists/quality/test-master.md +0 -65
  491. package/skills-library/specialists/security/secure-code-guardian.md +0 -55
  492. package/skills-library/specialists/security/security-reviewer.md +0 -53
  493. package/skills-library/specialists/workflow/architecture-designer.md +0 -53
  494. package/skills-library/specialists/workflow/cli-developer.md +0 -70
  495. package/skills-library/specialists/workflow/feature-forge.md +0 -65
  496. package/skills-library/specialists/workflow/prompt-engineer.md +0 -54
  497. package/skills-library/specialists/workflow/the-fool.md +0 -62
  498. /package/skills-library/{performance → _general/performance}/cache-augmented-generation.md +0 -0
  499. /package/skills-library/{debugging → parallel-debug}/FAILURE_TAXONOMY_CLASSIFICATION.md +0 -0
  500. /package/skills-library/{debugging → parallel-debug}/THREE_AGENT_HYPOTHESIS_DEBUGGING.md +0 -0
@@ -0,0 +1,2184 @@
1
+ ---
2
+ name: doc-to-podcast-pipeline
3
+ category: creative-multimedia
4
+ version: 1.0.0
5
+ contributed: 2026-03-10
6
+ contributor: dominion-flow-research
7
+ last_updated: 2026-03-10
8
+ tags: [podcast, pipeline, document-to-audio, notebooklm, architecture, end-to-end, ffmpeg, tts, rag]
9
+ difficulty: hard
10
+ ---
11
+
12
+ # Document-to-Podcast Pipeline
13
+ ## Description
14
+
15
+ End-to-end pipeline that transforms any document (PDF, DOCX, URL, YouTube transcript) into a
16
+ polished podcast episode with multiple speakers, natural conversation flow, intro/outro music,
17
+ and broadcast-quality loudness normalization. Combines RAG-based content understanding, multi-agent
18
+ script generation (PodAgent pattern), neural TTS synthesis, and FFmpeg audio composition into a
19
+ single automated workflow.
20
+
21
+ This is the "full stack" audio generation skill -- it orchestrates capabilities from several
22
+ sibling skills (transcription-pipeline-selector, ffmpeg-command-generator, audio-enhancement-pipeline,
23
+ content-repurposing-pipeline) into one cohesive pipeline.
24
+
25
+ ## When to Use
26
+
27
+ - Transforming written documents (PDFs, articles, papers) into listenable podcast episodes
28
+ - Building a NotebookLM-style "Audio Overview" feature for your application
29
+ - Converting sermons, Bible studies, or teaching notes into podcast format
30
+ - Creating educational audio content from textbooks or course materials
31
+ - Automating podcast production from blog posts or newsletters
32
+ - Building an internal tool that generates audio briefings from reports
33
+
34
+ ## Related Skills
35
+
36
+ - `transcription-pipeline-selector.md` -- Input stage: transcribe audio/video sources before processing
37
+ - `ffmpeg-command-generator.md` -- Output stage: all FFmpeg commands for audio composition
38
+ - `audio-enhancement-pipeline.md` -- Post-production: loudness normalization, noise reduction
39
+ - `content-repurposing-pipeline.md` -- Broader pipeline: podcast is one output format among many
40
+ - `podcast-script-generation.md` -- Stage 3 deep-dive: PodAgent multi-agent script writing
41
+
42
+ ---
43
+
44
+ ## Architecture Overview
45
+
46
+ ### The 4-Stage Pipeline
47
+
48
+ ```
49
+ Stage 1: INGEST --> Parse documents, extract text, chunk semantically
50
+ Stage 2: UNDERSTAND --> RAG retrieval, key point extraction, outline generation
51
+ Stage 3: SCRIPT --> Multi-agent podcast script generation (PodAgent pattern)
52
+ Stage 4: SYNTHESIZE --> TTS audio generation, mixing, post-production
53
+ ```
54
+
55
+ ### Full Architecture Diagram
56
+
57
+ ```
58
+ +--------------------------------------------------+
59
+ | DOCUMENT SOURCES |
60
+ | [PDF] [DOCX] [URL] [YouTube] [Audio/Video] |
61
+ +--------------------------------------------------+
62
+ |
63
+ v
64
+ +--------------------------------------------------+
65
+ | STAGE 1: INGEST |
66
+ | |
67
+ | Document Parser (pdf-parse / mammoth / cheerio) |
68
+ | | |
69
+ | v |
70
+ | [Clean Text + Metadata] |
71
+ | | |
72
+ | v |
73
+ | Semantic Chunker (400-600 tokens, 50 overlap) |
74
+ | | |
75
+ | v |
76
+ | [Chunks + Embeddings --> Vector DB] |
77
+ +--------------------------------------------------+
78
+ |
79
+ v
80
+ +--------------------------------------------------+
81
+ | STAGE 2: UNDERSTAND |
82
+ | |
83
+ | Key Point Extractor (AI) |
84
+ | | |
85
+ | v |
86
+ | [Ranked Discussion Points] |
87
+ | | |
88
+ | v |
89
+ | Outline Generator (AI) |
90
+ | | |
91
+ | v |
92
+ | [Podcast Outline: Intro -> Segments -> Outro] |
93
+ +--------------------------------------------------+
94
+ |
95
+ v
96
+ +--------------------------------------------------+
97
+ | STAGE 3: SCRIPT |
98
+ | |
99
+ | Multi-Agent Script Writer (PodAgent pattern) |
100
+ | - Host Agent: drives conversation |
101
+ | - Guest Agent: provides expert responses |
102
+ | - Writer Agent: structures + verifies |
103
+ | | |
104
+ | v |
105
+ | [Structured Script JSON] |
106
+ | { speaker, text, emotion, duration }[] |
107
+ +--------------------------------------------------+
108
+ |
109
+ v
110
+ +--------------------------------------------------+
111
+ | STAGE 4: SYNTHESIZE |
112
+ | |
113
+ | TTS Engine (per segment, per speaker) |
114
+ | | |
115
+ | v |
116
+ | [Audio Segments WAV] |
117
+ | | |
118
+ | v |
119
+ | FFmpeg Composer |
120
+ | - Concatenate segments |
121
+ | - Insert pauses (200-500ms) |
122
+ | - Add intro/outro music |
123
+ | - Normalize loudness (EBU R128, -16 LUFS) |
124
+ | | |
125
+ | v |
126
+ | [Final Podcast MP3 + ID3 Metadata] |
127
+ +--------------------------------------------------+
128
+ ```
129
+
130
+ ---
131
+
132
+ ## Stage 1: Document Ingestion
133
+
134
+ The ingestion stage accepts multiple document formats and produces clean, chunked text ready
135
+ for AI understanding. Each parser extracts both text content and structural metadata (titles,
136
+ headings, page numbers) to preserve document context.
137
+
138
+ ### Dependencies
139
+
140
+ ```json
141
+ {
142
+ "dependencies": {
143
+ "pdf-parse": "^1.1.1",
144
+ "mammoth": "^1.8.0",
145
+ "@mozilla/readability": "^0.5.0",
146
+ "cheerio": "^1.0.0",
147
+ "linkedom": "^0.18.0",
148
+ "youtube-transcript": "^1.2.1"
149
+ }
150
+ }
151
+ ```
152
+
153
+ ### PDF Parser
154
+
155
+ ```typescript
156
+ import pdfParse from 'pdf-parse';
157
+ import { readFile } from 'fs/promises';
158
+
159
+ interface ParsedDocument {
160
+ text: string;
161
+ metadata: {
162
+ title: string;
163
+ author: string;
164
+ source: string;
165
+ sourceType: 'pdf' | 'docx' | 'url' | 'youtube' | 'transcript';
166
+ pageCount?: number;
167
+ wordCount: number;
168
+ extractedAt: string;
169
+ };
170
+ sections: { heading: string; content: string; page?: number }[];
171
+ }
172
+
173
/**
 * Decides whether a single trimmed PDF text line looks like a section heading.
 *
 * A heading candidate is non-empty, shorter than 100 characters, does not end
 * with a period, and is either ALL CAPS or Title Case. Lines without any cased
 * letter (page numbers, rules, caseless scripts) are never headings, and Title
 * Case requires every word to be capitalised apart from common connector words,
 * so ordinary wrapped body lines are not mistaken for headings.
 */
function isLikelyPdfHeading(line: string): boolean {
  if (line.length === 0 || line.length >= 100 || line.endsWith('.')) return false;

  // Without at least one cased letter the ALL-CAPS comparison is vacuously true.
  const hasCasedLetter = line.toUpperCase() !== line.toLowerCase();
  if (!hasCasedLetter) return false;
  if (line === line.toUpperCase()) return true;

  const connectors = new Set([
    'a', 'an', 'and', 'as', 'at', 'but', 'by', 'for', 'from', 'in', 'of',
    'on', 'or', 'the', 'to', 'vs', 'via', 'with',
  ]);
  const words = line.split(/\s+/);
  if (words.length > 12 || !/^[A-Z]/.test(words[0])) return false;
  return words.every(
    (word, i) => /^[^a-z]/.test(word) || (i > 0 && connectors.has(word.toLowerCase()))
  );
}

/**
 * Reads a PDF from disk and converts it into a ParsedDocument.
 *
 * The extracted text is split on newlines and grouped into sections using the
 * isLikelyPdfHeading heuristic. Text before the first detected heading is filed
 * under "Introduction". The `page` value on each section is carried over from
 * the initial value of 1 and is never advanced by this function.
 *
 * @param filePath Path of the PDF file to read.
 * @returns The full text, metadata (title/author from the PDF info dictionary
 *          when present, otherwise derived from the file name) and sections.
 */
async function parsePDF(filePath: string): Promise<ParsedDocument> {
  const buffer = await readFile(filePath);
  const data = await pdfParse(buffer);

  const sections: ParsedDocument['sections'] = [];
  let currentSection = { heading: 'Introduction', content: '', page: 1 };

  for (const line of data.text.split('\n')) {
    const trimmed = line.trim();
    if (isLikelyPdfHeading(trimmed)) {
      // Only keep the previous section if it actually collected body text.
      if (currentSection.content.trim()) {
        sections.push({ ...currentSection });
      }
      currentSection = { heading: trimmed, content: '', page: currentSection.page };
    } else if (trimmed) {
      currentSection.content += trimmed + ' ';
    }
  }
  if (currentSection.content.trim()) {
    sections.push(currentSection);
  }

  // Accept both POSIX and Windows separators and strip only a trailing extension.
  const fileName = filePath.split(/[\\/]/).pop()?.replace(/\.pdf$/i, '');

  return {
    text: data.text,
    metadata: {
      title: data.info?.Title || fileName || 'Untitled',
      author: data.info?.Author || 'Unknown',
      source: filePath,
      sourceType: 'pdf',
      pageCount: data.numpages,
      // Count real words; ''.split(/\s+/) would report 1 for empty text.
      wordCount: (data.text.match(/\S+/g) ?? []).length,
      extractedAt: new Date().toISOString(),
    },
    sections,
  };
}
217
+ ```
218
+
219
+ ### DOCX Parser
220
+
221
+ ```typescript
222
+ import mammoth from 'mammoth';
223
+ import { readFile } from 'fs/promises';
224
+
225
/**
 * Reads a DOCX file and converts it into a ParsedDocument.
 *
 * The file is converted twice with mammoth: once to raw text (used as the
 * document text and for the word count) and once to HTML, which is handed to
 * extractSectionsFromHtml to recover the heading structure.
 *
 * @param filePath Path of the DOCX file to read.
 * @returns Text, metadata (title derived from the file name) and sections.
 */
async function parseDOCX(filePath: string): Promise<ParsedDocument> {
  const docxBytes = await readFile(filePath);

  const { value: plainText } = await mammoth.extractRawText({ buffer: docxBytes });
  const { value: html } = await mammoth.convertToHtml({ buffer: docxBytes });
  const headingSections = extractSectionsFromHtml(html);

  const baseName = filePath.split('/').pop();
  const metadata: ParsedDocument['metadata'] = {
    title: baseName?.replace('.docx', '') || 'Untitled',
    author: 'Unknown',
    source: filePath,
    sourceType: 'docx',
    wordCount: plainText.split(/\s+/).length,
    extractedAt: new Date().toISOString(),
  };

  return { text: plainText, metadata, sections: headingSections };
}
247
+
248
/**
 * Splits an HTML string into sections delimited by h1-h4 headings.
 *
 * Content before the first heading is filed under "Introduction". A heading
 * that is not followed by any text produces no section. Works with regular
 * expressions only (no DOM).
 *
 * Text is extracted by turning block-level boundaries into spaces (so adjacent
 * paragraphs do not fuse into one word), removing the remaining tags, decoding
 * the basic HTML entities and collapsing whitespace.
 *
 * @param html HTML markup to split.
 * @returns Sections in document order, each with a heading and plain-text content.
 */
function extractSectionsFromHtml(html: string): ParsedDocument['sections'] {
  const toPlainText = (fragment: string): string =>
    fragment
      // Block boundaries separate words; inline tags (em, strong, a) do not.
      .replace(/<\/(?:p|div|li|ul|ol|h[1-6]|tr|td|th|table|blockquote)>|<br\s*\/?>/gi, ' ')
      .replace(/<[^>]*>/g, '')
      .replace(/&nbsp;|&#160;/g, ' ')
      .replace(/&lt;/g, '<')
      .replace(/&gt;/g, '>')
      .replace(/&quot;/g, '"')
      .replace(/&#39;|&apos;/g, "'")
      // Decode &amp; last so "&amp;lt;" becomes the literal text "&lt;".
      .replace(/&amp;/g, '&')
      .replace(/\s+/g, ' ')
      .trim();

  // [\s\S] lets a heading span lines; \1 requires the matching closing level.
  const headingPattern = /<h([1-4])\b[^>]*>([\s\S]*?)<\/h\1>/gi;
  const sections: ParsedDocument['sections'] = [];
  let lastIndex = 0;
  let lastHeading = 'Introduction';
  let match: RegExpExecArray | null;

  while ((match = headingPattern.exec(html)) !== null) {
    const content = toPlainText(html.slice(lastIndex, match.index));
    if (content) {
      sections.push({ heading: lastHeading, content });
    }
    lastHeading = toPlainText(match[2]);
    lastIndex = match.index + match[0].length;
  }

  // Remaining content after the last heading
  const remaining = toPlainText(html.slice(lastIndex));
  if (remaining) {
    sections.push({ heading: lastHeading, content: remaining });
  }

  return sections;
}
273
+ ```
274
+
275
+ ### URL Parser (Web Articles)
276
+
277
+ ```typescript
278
+ import { Readability } from '@mozilla/readability';
279
+ import { parseHTML } from 'linkedom';
280
+
281
/**
 * Downloads a web page and extracts its main article as a ParsedDocument.
 *
 * The HTML is parsed with linkedom and passed to Readability. The whole
 * article becomes a single section titled with the article title.
 *
 * @param url Address of the page to fetch.
 * @returns Article text, metadata (title, byline) and one section.
 * @throws Error when the HTTP response is not successful or when Readability
 *         cannot extract an article from the page.
 */
async function parseURL(url: string): Promise<ParsedDocument> {
  const response = await fetch(url);
  // fetch() resolves on 4xx/5xx too; do not parse an error page as the article.
  if (!response.ok) {
    throw new Error(`Failed to fetch ${url}: HTTP ${response.status} ${response.statusText}`);
  }
  const html = await response.text();

  // linkedom provides a DOM-like environment for Readability
  const { document } = parseHTML(html);
  const reader = new Readability(document as any);
  const article = reader.parse();

  if (!article) {
    throw new Error(`Could not extract readable content from ${url}`);
  }

  const text = article.textContent || '';

  return {
    text,
    metadata: {
      title: article.title || url,
      author: article.byline || 'Unknown',
      source: url,
      sourceType: 'url',
      // Count real words; ''.split(/\s+/) would report 1 for empty text.
      wordCount: (text.match(/\S+/g) ?? []).length,
      extractedAt: new Date().toISOString(),
    },
    sections: [{ heading: article.title || 'Article', content: text }],
  };
}
309
+ ```
310
+
311
+ ### YouTube Transcript Parser
312
+
313
+ ```typescript
314
+ import { YoutubeTranscript } from 'youtube-transcript';
315
+
316
/**
 * Fetches the transcript of a YouTube video and converts it into a ParsedDocument.
 *
 * Transcript entries are concatenated into the document text and also grouped
 * into sections: a section is closed once the summed entry durations reach 120
 * (the original groups these as "~2-minute segments", i.e. it presumably treats
 * `duration` and `offset` as seconds -- confirm against the transcript library).
 * The first section is titled "Opening"; later ones are titled
 * "Segment N (m:ss)" using the offset of the entry that closed the previous one.
 *
 * @param videoUrl YouTube URL understood by extractVideoId.
 * @returns Transcript text, metadata and time-based sections.
 */
async function parseYouTube(videoUrl: string): Promise<ParsedDocument> {
  const videoId = extractVideoId(videoUrl);
  const entries = await YoutubeTranscript.fetchTranscript(videoId);

  const text = entries.map((entry) => entry.text).join(' ');

  const sections: ParsedDocument['sections'] = [];
  let heading = 'Opening';
  let content = '';
  let elapsed = 0;
  let segmentNumber = 1;

  entries.forEach((entry) => {
    content += entry.text + ' ';
    elapsed += entry.duration;
    if (elapsed < 120) return;

    // Threshold reached: close this section and start the next one.
    sections.push({ heading, content });
    segmentNumber += 1;
    heading = `Segment ${segmentNumber} (${formatTime(entry.offset)})`;
    content = '';
    elapsed = 0;
  });

  // Keep the trailing partial section if it collected any text.
  if (content.trim()) {
    sections.push({ heading, content });
  }

  const metadata: ParsedDocument['metadata'] = {
    title: `YouTube: ${videoId}`,
    author: 'Unknown',
    source: videoUrl,
    sourceType: 'youtube',
    wordCount: text.split(/\s+/).length,
    extractedAt: new Date().toISOString(),
  };

  return { text, metadata, sections };
}
360
+
361
/**
 * Extracts the 11-character video ID from a YouTube URL.
 *
 * Supported shapes: `youtube.com/watch?v=ID` (with `v` anywhere in the query
 * string), `/embed/ID`, `/shorts/ID`, `/live/ID`, `/v/ID`, the
 * `youtube-nocookie.com` host, and `youtu.be/ID` short links.
 *
 * @param url URL to inspect.
 * @returns The video ID.
 * @throws Error when no video ID can be found in the URL.
 */
function extractVideoId(url: string): string {
  const match = url.match(
    /(?:youtube(?:-nocookie)?\.com\/(?:watch\?(?:[^#\s]*&)?v=|embed\/|shorts\/|live\/|v\/)|youtu\.be\/)([a-zA-Z0-9_-]{11})/
  );
  if (!match) throw new Error(`Invalid YouTube URL: ${url}`);
  return match[1];
}
368
+
369
/**
 * Formats a number of seconds as `m:ss`.
 *
 * Minutes are not capped or padded (3600 becomes "60:00"); the seconds part is
 * floored and left-padded to two digits.
 */
function formatTime(seconds: number): string {
  const wholeMinutes = Math.floor(seconds / 60);
  const paddedSeconds = String(Math.floor(seconds % 60)).padStart(2, '0');
  return wholeMinutes + ':' + paddedSeconds;
}
374
+ ```
375
+
376
+ ### Unified Document Ingestor
377
+
378
+ ```typescript
379
+ type DocumentSource =
380
+ | { type: 'pdf'; path: string }
381
+ | { type: 'docx'; path: string }
382
+ | { type: 'url'; url: string }
383
+ | { type: 'youtube'; url: string }
384
+ | { type: 'text'; content: string; title?: string };
385
+
386
/**
 * Single entry point for ingestion: routes a DocumentSource to the parser that
 * matches its `type` and returns the resulting ParsedDocument.
 *
 * `text` sources are wrapped directly, without parsing: the content becomes the
 * document text and its only section, and the source type is recorded as
 * 'transcript'.
 */
async function ingestDocument(source: DocumentSource): Promise<ParsedDocument> {
  // Wraps caller-supplied text in the ParsedDocument shape.
  const wrapRawText = (content: string, title?: string): ParsedDocument => ({
    text: content,
    metadata: {
      title: title || 'Direct Text',
      author: 'User',
      source: 'direct-input',
      sourceType: 'transcript',
      wordCount: content.split(/\s+/).length,
      extractedAt: new Date().toISOString(),
    },
    sections: [{ heading: title || 'Content', content }],
  });

  switch (source.type) {
    case 'pdf':
      return parsePDF(source.path);
    case 'docx':
      return parseDOCX(source.path);
    case 'url':
      return parseURL(source.url);
    case 'youtube':
      return parseYouTube(source.url);
    case 'text':
      return wrapRawText(source.content, source.title);
  }
}
411
+ ```
412
+
413
+ ### Semantic Chunker
414
+
415
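+ Chunking strategy: 400-600 tokens per chunk with 50-token overlap. This ensures each chunk
416
+ has enough context for meaningful embedding while maintaining continuity across chunk boundaries. Token counts are estimated from word counts (1 word ~ 1.3 tokens), so actual chunk sizes are approximate.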
417
+
418
+ ```typescript
419
+ interface TextChunk {
420
+ id: string;
421
+ text: string;
422
+ index: number;
423
+ sectionHeading: string;
424
+ tokenCount: number;
425
+ embedding?: number[];
426
+ }
427
+
428
/**
 * Splits every section of a document into overlapping word-window chunks.
 *
 * Token sizes are approximated as 1 word ~ 1.3 tokens. Chunks never cross a
 * section boundary, and chunks of 20 characters or fewer are skipped. Chunk
 * ids and indexes are numbered consecutively across the whole document.
 *
 * @param doc Parsed document whose sections are chunked.
 * @param targetTokens Approximate chunk size in tokens (default 500).
 * @param overlapTokens Approximate overlap between consecutive chunks in
 *        tokens (default 50). It is clamped below the chunk size so the window
 *        always advances.
 * @returns The chunks in document order (without embeddings).
 */
function semanticChunk(
  doc: ParsedDocument,
  targetTokens: number = 500,
  overlapTokens: number = 50
): TextChunk[] {
  // Rough token estimate: 1 word ~ 1.3 tokens
  const TOKENS_PER_WORD = 1.3;
  const wordsPerChunk = Math.max(1, Math.floor(targetTokens / TOKENS_PER_WORD));
  // An overlap >= chunk size would move the window backwards forever.
  const overlapWords = Math.min(
    Math.max(0, Math.floor(overlapTokens / TOKENS_PER_WORD)),
    wordsPerChunk - 1
  );

  const chunks: TextChunk[] = [];

  for (const section of doc.sections) {
    // Drop empty tokens produced by leading/trailing whitespace.
    const words = section.content.split(/\s+/).filter(Boolean);

    let start = 0;
    while (start < words.length) {
      const end = Math.min(start + wordsPerChunk, words.length);
      const chunkText = words.slice(start, end).join(' ');

      if (chunkText.length > 20) {
        // Skip tiny fragments
        chunks.push({
          id: `chunk-${chunks.length}`,
          text: chunkText,
          index: chunks.length,
          sectionHeading: section.heading,
          tokenCount: Math.ceil((end - start) * TOKENS_PER_WORD),
        });
      }

      // The window reached the end of the section; no further chunk is needed.
      if (end === words.length) break;
      start = end - overlapWords;
    }
  }

  return chunks;
}
466
+ ```
467
+
468
+ ### Embedding Generation
469
+
470
+ Two embedding options: Gemini embedding-001 (768d, cloud) or nomic-embed-text (local via Ollama).
471
+
472
+ ```typescript
473
+ // Option A: Gemini Embedding API
474
/**
 * Adds an embedding to every chunk using the Gemini `batchEmbedContents`
 * endpoint of the `embedding-001` model.
 *
 * Chunks are sent in batches of 100 with task type RETRIEVAL_DOCUMENT. The
 * chunk objects are mutated in place and the same array is returned.
 *
 * @param chunks Chunks to embed.
 * @returns The input array, with `embedding` set on each chunk.
 * @throws Error when GEMINI_API_KEY is not set, when a request fails, or when
 *         the response does not contain one embedding per chunk in the batch.
 */
async function embedWithGemini(chunks: TextChunk[]): Promise<TextChunk[]> {
  const API_KEY = process.env.GEMINI_API_KEY;
  if (!API_KEY) {
    throw new Error('GEMINI_API_KEY is not set');
  }
  const BATCH_SIZE = 100; // Gemini supports batch embedding

  for (let i = 0; i < chunks.length; i += BATCH_SIZE) {
    const batch = chunks.slice(i, i + BATCH_SIZE);
    const response = await fetch(
      'https://generativelanguage.googleapis.com/v1beta/models/embedding-001:batchEmbedContents',
      {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          // Send the key as a header so it does not end up in logged URLs.
          'x-goog-api-key': API_KEY,
        },
        body: JSON.stringify({
          requests: batch.map((chunk) => ({
            model: 'models/embedding-001',
            content: { parts: [{ text: chunk.text }] },
            taskType: 'RETRIEVAL_DOCUMENT',
          })),
        }),
      }
    );

    if (!response.ok) {
      throw new Error(
        `Gemini embedding request failed: HTTP ${response.status} ${response.statusText}`
      );
    }

    const data = await response.json();
    const embeddings = data?.embeddings;
    if (!Array.isArray(embeddings) || embeddings.length !== batch.length) {
      throw new Error(
        `Gemini returned ${Array.isArray(embeddings) ? embeddings.length : 0} embeddings for a batch of ${batch.length} chunks`
      );
    }

    batch.forEach((chunk, j) => {
      chunk.embedding = embeddings[j].values;
    });
  }

  return chunks;
}
503
+
504
+ // Option B: Local embedding via Ollama (nomic-embed-text, 768d)
505
/**
 * Adds an embedding to every chunk using a local Ollama server
 * (`nomic-embed-text` model at http://localhost:11434).
 *
 * Chunks are embedded one request at a time. The chunk objects are mutated in
 * place and the same array is returned.
 *
 * @param chunks Chunks to embed.
 * @returns The input array, with `embedding` set on each chunk.
 * @throws Error when a request fails or the response carries no embedding.
 */
async function embedWithOllama(chunks: TextChunk[]): Promise<TextChunk[]> {
  for (const chunk of chunks) {
    const response = await fetch('http://localhost:11434/api/embeddings', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({
        model: 'nomic-embed-text',
        prompt: chunk.text,
      }),
    });

    // Without this check an error body would silently store `undefined`.
    if (!response.ok) {
      throw new Error(
        `Ollama embedding request failed for ${chunk.id}: HTTP ${response.status} ${response.statusText}`
      );
    }

    const data = await response.json();
    if (!Array.isArray(data?.embedding)) {
      throw new Error(`Ollama returned no embedding for ${chunk.id}`);
    }
    chunk.embedding = data.embedding;
  }
  return chunks;
}
520
+ ```
521
+
522
+ ---
523
+
524
+ ## Stage 2: Understanding
525
+
526
+ The understanding stage transforms raw chunks into a structured podcast outline.
527
+ It identifies the most discussion-worthy points, ranks them by importance, and
528
+ generates a conversational flow.
529
+
530
+ ### Key Point Extraction
531
+
532
+ ```typescript
533
+ import { GoogleGenerativeAI } from '@google/generative-ai';
534
+
535
+ interface KeyPoint {
536
+ topic: string;
537
+ summary: string;
538
+ relevantChunks: string[]; // chunk IDs
539
+ importance: number; // 1-10
540
+ discussionAngle: string; // how to frame it for conversation
541
+ }
542
+
543
+ interface PodcastOutline {
544
+ title: string;
545
+ description: string;
546
+ targetDuration: string;
547
+ keyPoints: KeyPoint[];
548
+ segments: PodcastSegment[];
549
+ }
550
+
551
+ interface PodcastSegment {
552
+ type: 'intro' | 'discussion' | 'deep-dive' | 'recap' | 'outro';
553
+ title: string;
554
+ keyPointRefs: number[];
555
+ estimatedDuration: number; // seconds
556
+ notes: string;
557
+ }
558
+
559
/**
 * Asks the `gemini-2.0-flash` model for the most discussion-worthy points of a
 * document.
 *
 * The prompt contains the document title, author and word count plus one
 * preview line per chunk (id, section heading, first 200 characters). The
 * first JSON array found in the reply is parsed as the key points.
 *
 * @param doc Parsed document (only its metadata is used here).
 * @param chunks Chunks whose previews are shown to the model.
 * @param maxPoints Maximum number of key points to request and return (default 8).
 * @returns Key points sorted by descending importance, at most `maxPoints`.
 * @throws Error when the reply contains no JSON array; JSON.parse errors propagate.
 */
async function extractKeyPoints(
  doc: ParsedDocument,
  chunks: TextChunk[],
  maxPoints: number = 8
): Promise<KeyPoint[]> {
  const client = new GoogleGenerativeAI(process.env.GEMINI_API_KEY!);
  const flashModel = client.getGenerativeModel({ model: 'gemini-2.0-flash' });

  // One preview line per chunk so the model can cite chunk ids.
  const previewLines: string[] = [];
  for (const chunk of chunks) {
    previewLines.push(
      `[${chunk.id}] (Section: ${chunk.sectionHeading}): ${chunk.text.slice(0, 200)}...`
    );
  }
  const chunkSummaries = previewLines.join('\n');

  const { title, author, wordCount } = doc.metadata;

  const prompt = `You are an expert podcast producer analyzing a document for conversion into a podcast episode.

Document Title: ${title}
Author: ${author}
Word Count: ${wordCount}

Document chunks:
${chunkSummaries}

Identify the top ${maxPoints} most discussion-worthy points from this document.
For each point, provide:
1. A clear topic name
2. A 1-2 sentence summary
3. The chunk IDs that are most relevant (as an array)
4. An importance score (1-10)
5. A discussion angle (how would podcast hosts naturally discuss this?)

Return ONLY valid JSON in this format:
[
{
"topic": "string",
"summary": "string",
"relevantChunks": ["chunk-0", "chunk-3"],
"importance": 8,
"discussionAngle": "string"
}
]

Focus on points that:
- Would be interesting to a general audience
- Have enough depth for 2-3 minutes of discussion
- Connect to broader themes or real-world applications
- Would benefit from being explained conversationally`;

  const reply = (await flashModel.generateContent(prompt)).response.text();

  // The reply may wrap the JSON in prose or a markdown fence; grab the array.
  const jsonArray = /\[[\s\S]*\]/.exec(reply);
  if (jsonArray === null) {
    throw new Error('Failed to extract key points JSON from AI response');
  }

  const parsedPoints: KeyPoint[] = JSON.parse(jsonArray[0]);
  parsedPoints.sort((a, b) => b.importance - a.importance);
  return parsedPoints.slice(0, maxPoints);
}
615
+ ```
616
+
617
+ ### Outline Generation
618
+
619
+ ```typescript
620
/**
 * Asks the `gemini-2.0-flash` model to arrange key points into a podcast outline.
 *
 * Key points are listed in the prompt with their zero-based array index, and
 * the model is told to use those indexes in `keyPointRefs`, because callers
 * resolve refs with `outline.keyPoints[ref]`. After parsing, refs that are not
 * valid indexes into `keyPoints` are dropped so a stray value cannot resolve to
 * `undefined` later.
 *
 * @param doc Parsed document (title and author are used in the prompt).
 * @param keyPoints Key points to cover; attached to the result as `keyPoints`.
 * @param config Episode format and target duration (parsed with parseDuration).
 * @returns The outline parsed from the first JSON object in the reply.
 * @throws Error when the reply contains no JSON object; JSON.parse errors propagate.
 */
async function generateOutline(
  doc: ParsedDocument,
  keyPoints: KeyPoint[],
  config: { format: string; duration: string }
): Promise<PodcastOutline> {
  const genAI = new GoogleGenerativeAI(process.env.GEMINI_API_KEY!);
  const model = genAI.getGenerativeModel({ model: 'gemini-2.0-flash' });

  const durationSeconds = parseDuration(config.duration);
  // Show the same zero-based index that keyPointRefs must contain.
  const pointsSummary = keyPoints
    .map((kp, i) => `[${i}] [Importance: ${kp.importance}] ${kp.topic}: ${kp.summary}`)
    .join('\n');

  const prompt = `You are a podcast producer creating an episode outline.

Document: "${doc.metadata.title}" by ${doc.metadata.author}
Format: ${config.format}
Target Duration: ${config.duration} (${durationSeconds} seconds)

Key discussion points (ranked by importance; the number in square brackets is the zero-based index to use in keyPointRefs):
${pointsSummary}

Create a podcast outline with these segments:
1. INTRO (10-15% of duration): Hook the listener, introduce the topic
2. DISCUSSION segments (70-80%): Cover the key points in a logical flow
3. RECAP/OUTRO (10-15%): Summarize takeaways, closing thoughts

For a "${config.format}" format:
- "deep-dive": Thorough exploration, technical depth, expert tone
- "brief": Quick overview, highlight the top 3-4 points only
- "debate": Present contrasting viewpoints on each point
- "narration": Single narrator, storytelling approach

Return ONLY valid JSON:
{
"title": "Episode title",
"description": "1-2 sentence episode description",
"targetDuration": "${config.duration}",
"segments": [
{
"type": "intro|discussion|deep-dive|recap|outro",
"title": "Segment title",
"keyPointRefs": [0, 1],
"estimatedDuration": 120,
"notes": "Production notes for script writer"
}
]
}

Ensure total estimatedDuration across all segments equals approximately ${durationSeconds} seconds.`;

  const result = await model.generateContent(prompt);
  const text = result.response.text();
  const jsonMatch = text.match(/\{[\s\S]*\}/);
  if (!jsonMatch) throw new Error('Failed to extract outline JSON from AI response');

  const outline: PodcastOutline = JSON.parse(jsonMatch[0]);
  outline.keyPoints = keyPoints;

  // Model output is untrusted: keep only refs that index into keyPoints.
  const rawSegments = Array.isArray(outline.segments) ? outline.segments : [];
  outline.segments = rawSegments.map((segment) => ({
    ...segment,
    keyPointRefs: (Array.isArray(segment.keyPointRefs) ? segment.keyPointRefs : []).filter(
      (ref) => Number.isInteger(ref) && ref >= 0 && ref < keyPoints.length
    ),
  }));

  return outline;
}
680
+
681
/**
 * Converts a human-readable duration into seconds.
 *
 * Understands hours, minutes and seconds with optional decimals and sums every
 * amount found, e.g. "15 min", "1 hour", "1h 30min", "1.5 min", "45 s".
 * Strings with no recognisable duration fall back to 900 seconds (15 minutes).
 *
 * @param duration Duration text such as "15 min".
 * @returns Duration in whole seconds, or 900 when nothing can be parsed.
 */
function parseDuration(duration: string): number {
  const DEFAULT_SECONDS = 900; // default 15min
  const unitPattern =
    /(\d+(?:\.\d+)?)\s*(hours?|hrs?|h|minutes?|mins?|m|seconds?|secs?|s)\b/gi;

  let totalSeconds = 0;
  let matched = false;
  for (const [, amount, unit] of duration.matchAll(unitPattern)) {
    matched = true;
    // Every accepted unit spelling is identified by its first letter.
    const kind = unit[0].toLowerCase();
    const secondsPerUnit = kind === 'h' ? 3600 : kind === 'm' ? 60 : 1;
    totalSeconds += parseFloat(amount) * secondsPerUnit;
  }
  if (matched) return Math.round(totalSeconds);

  // Keep accepting anything the previous minutes-only pattern accepted.
  const legacy = duration.match(/(\d+)\s*min/);
  return legacy ? parseInt(legacy[1], 10) * 60 : DEFAULT_SECONDS;
}
685
+ ```
686
+
687
+ ---
688
+
689
+ ## Stage 3: Script Generation
690
+
691
+ The script generation stage uses the PodAgent pattern (ACL 2025): three specialized AI agents
692
+ collaborate to produce a natural, engaging podcast script with faithfulness verification.
693
+
694
+ ### PodAgent Multi-Agent Architecture
695
+
696
+ ```
697
+ +-------------------+ +-------------------+ +-------------------+
698
+ | HOST AGENT | | GUEST AGENT | | WRITER AGENT |
699
+ | | | | | |
700
+ | - Drives convo | | - Expert voice | | - Structures flow |
701
+ | - Asks questions | | - Provides depth | | - Verifies facts |
702
+ | - Transitions | | - Uses analogies | | - Controls timing |
703
+ | - Engages listener| | - Cites sources | | - Ensures quality |
704
+ +-------------------+ +-------------------+ +-------------------+
705
+ | | |
706
+ +-------------------------+-------------------------+
707
+ |
708
+ v
709
+ +-----------------------------+
710
+ | STRUCTURED SCRIPT JSON |
711
+ | [{speaker, text, emotion, |
712
+ | duration, segmentRef}] |
713
+ +-----------------------------+
714
+ ```
715
+
716
+ ### Script Data Types
717
+
718
+ ```typescript
719
+ interface ScriptLine {
720
+ speaker: 'host' | 'guest';
721
+ text: string;
722
+ emotion: 'neutral' | 'excited' | 'thoughtful' | 'humorous' | 'serious' | 'curious';
723
+ estimatedDuration: number; // seconds (based on ~150 words/minute speaking rate)
724
+ segmentRef: string; // which outline segment this belongs to
725
+ }
726
+
727
+ interface PodcastScript {
728
+ title: string;
729
+ totalDuration: number;
730
+ speakers: {
731
+ host: { name: string; personality: string };
732
+ guest: { name: string; personality: string };
733
+ };
734
+ lines: ScriptLine[];
735
+ }
736
+ ```
737
+
738
+ ### Script Generator
739
+
740
+ ```typescript
741
/**
 * Stage 3 entry point: ask Gemini to write a two-speaker script for `outline`,
 * grounded in the source `chunks`, then run the faithfulness pass over it.
 *
 * @param outline  Podcast outline; `title`, `description`, `targetDuration`,
 *                 `segments` and `keyPoints` are read.
 * @param chunks   Source chunks, matched to key points via `relevantChunks` ids.
 * @param config   Pipeline configuration; `format` and `speakers` are read.
 * @returns The parsed script after `verifyFaithfulness` has applied its revisions.
 * @throws Error when the response holds no JSON object, the JSON is malformed,
 *         or the parsed object has no `lines` array.
 */
async function generateScript(
  outline: PodcastOutline,
  chunks: TextChunk[],
  config: PipelineConfig
): Promise<PodcastScript> {
  const genAI = new GoogleGenerativeAI(process.env.GEMINI_API_KEY!);
  // Use the most capable model for creative script writing
  const model = genAI.getGenerativeModel({ model: 'gemini-2.5-pro-preview-05-06' });

  // The first two configured speakers are taken as host and guest, in that order.
  const speakerNames = config.speakers.length >= 2
    ? { host: config.speakers[0].name, guest: config.speakers[1].name }
    : { host: 'Alex', guest: 'Jordan' };

  // Parsed once; used by both the script rules and the JSON template.
  const targetSeconds = parseDuration(outline.targetDuration);

  // Gather source material for each segment
  const segmentContext = outline.segments.map((seg) => {
    // A ref that does not resolve to a key point is dropped instead of
    // crashing on `undefined.relevantChunks` further down.
    const relevantPoints = seg.keyPointRefs
      .map((ref) => outline.keyPoints[ref])
      .filter((kp) => kp != null);
    const relevantChunkIds = new Set(
      relevantPoints.flatMap((kp) => kp.relevantChunks ?? [])
    );
    const sourceText = chunks
      .filter((c) => relevantChunkIds.has(c.id))
      .map((c) => c.text)
      .join('\n\n');

    return {
      segment: seg,
      sourceText: sourceText.slice(0, 2000), // Token budget management
      points: relevantPoints,
    };
  });

  const segmentsBlock = segmentContext
    .map(
      (sc, i) => `
--- SEGMENT ${i + 1}: ${sc.segment.title} (${sc.segment.type}, ~${sc.segment.estimatedDuration}s) ---
Key Points: ${sc.points.map((p) => p.topic).join(', ')}
Discussion Angles: ${sc.points.map((p) => p.discussionAngle).join('; ')}
Source Material: ${sc.sourceText}
`
    )
    .join('\n');

  // JSON.stringify keeps the template valid JSON even when the title or a
  // speaker name contains quotes or backslashes.
  const prompt = `You are a team of three podcast production agents creating a script.

PODCAST: "${outline.title}"
DESCRIPTION: ${outline.description}
FORMAT: ${config.format}
TARGET DURATION: ${outline.targetDuration}
HOST: ${speakerNames.host} - Curious, engaging, asks great follow-up questions
GUEST: ${speakerNames.guest} - Expert, uses analogies, explains complex ideas simply

SEGMENTS AND SOURCE MATERIAL:
${segmentsBlock}

SCRIPT RULES:
1. Write natural, conversational dialogue -- NOT robotic or scripted-sounding
2. Host asks questions, makes transitions, keeps energy up
3. Guest provides substance, uses analogies and examples
4. Include verbal fillers sparingly ("you know", "right", "exactly") for naturalness
5. Each speaker turn should be 20-60 words (30 words = ~12 seconds at speaking pace)
6. Total script must hit approximately ${targetSeconds} seconds
7. Speaking rate assumption: 150 words per minute (2.5 words per second)
8. FAITHFULNESS: Every claim must be traceable to the source material. Do not fabricate facts.
9. Include emotional tone markers for TTS guidance

Return ONLY valid JSON:
{
  "title": ${JSON.stringify(outline.title)},
  "totalDuration": ${targetSeconds},
  "speakers": {
    "host": { "name": ${JSON.stringify(speakerNames.host)}, "personality": "Curious and engaging" },
    "guest": { "name": ${JSON.stringify(speakerNames.guest)}, "personality": "Expert and insightful" }
  },
  "lines": [
    {
      "speaker": "host",
      "text": "Welcome to the show! Today we are diving into...",
      "emotion": "excited",
      "estimatedDuration": 8,
      "segmentRef": "intro"
    }
  ]
}`;

  const result = await model.generateContent(prompt);
  const text = result.response.text();
  const jsonMatch = text.match(/\{[\s\S]*\}/);
  if (!jsonMatch) throw new Error('Failed to extract script JSON from AI response');

  let script: PodcastScript;
  try {
    script = JSON.parse(jsonMatch[0]);
  } catch (err) {
    // Surface a pipeline-level message rather than a bare SyntaxError
    throw new Error(
      `Failed to parse script JSON from AI response: ${(err as Error).message}`
    );
  }
  if (!Array.isArray(script.lines)) {
    throw new Error('Script JSON from AI response has no "lines" array');
  }

  // Verify faithfulness with a second pass
  return verifyFaithfulness(script, chunks, genAI);
}
833
+ ```
834
+
835
+ ### Faithfulness Verification
836
+
837
+ The Writer Agent's verification pass checks each script line against the source material and
838
+ rewrites unsupported claims. This is critical -- the podcast claims must be traceable to source material.
839
+
840
+ ```typescript
841
/**
 * Second-pass fact check: asks a fast model to compare every script line with
 * the source text and applies the rewrites it proposes.
 *
 * The script is modified in place and the same object is returned. Any reply
 * that cannot be used (no JSON array, malformed JSON, wrong shape) leaves the
 * script untouched -- verification is best-effort and never throws on a bad
 * model reply.
 *
 * @param script  Script to verify; `lines[i].text` may be overwritten.
 * @param chunks  Source chunks; only the first 8000 characters of their
 *                concatenation are sent, so claims drawn from later chunks
 *                may be judged unsupported.
 * @param genAI   Gemini client used to obtain the verifier model.
 * @returns The same `script` object, with revised lines applied.
 */
async function verifyFaithfulness(
  script: PodcastScript,
  chunks: TextChunk[],
  genAI: GoogleGenerativeAI
): Promise<PodcastScript> {
  const model = genAI.getGenerativeModel({ model: 'gemini-2.0-flash' });

  const sourceText = chunks.map((c) => c.text).join('\n\n').slice(0, 8000);
  // Each line carries its 0-based index so the model can report `lineIndex`
  // without counting lines itself. Parentheses (not brackets) keep the labels
  // from colliding with the JSON-array extraction below.
  const scriptText = script.lines
    .map((l, i) => `(${i}) ${l.speaker}: ${l.text}`)
    .join('\n');

  const prompt = `You are a fact-checker for a podcast script. Compare the script against the source material.

SOURCE MATERIAL:
${sourceText}

PODCAST SCRIPT:
${scriptText}

For each line in the script, check if the claims are supported by the source material.
If a line contains unsupported claims, rewrite it to be faithful to the source.
If a line is opinion/transition/question, treat it as OK.

Return ONLY valid JSON -- an array of objects, where "lineIndex" is the number shown in parentheses before the line:
[
  { "lineIndex": 3, "status": "revised", "revisedText": "corrected text here" }
]

Only include lines that need revision. If all lines are faithful, return an empty array [].`;

  const result = await model.generateContent(prompt);
  const text = result.response.text();
  const jsonMatch = text.match(/\[[\s\S]*\]/);
  if (!jsonMatch) return script; // No JSON array in the reply: return original

  let revisions: unknown;
  try {
    revisions = JSON.parse(jsonMatch[0]);
  } catch {
    return script; // Malformed JSON: return original
  }
  if (!Array.isArray(revisions)) return script;

  for (const rev of revisions) {
    if (
      rev &&
      rev.status === 'revised' &&
      typeof rev.revisedText === 'string' &&
      rev.revisedText.trim() !== '' &&
      Number.isInteger(rev.lineIndex) &&
      script.lines[rev.lineIndex]
    ) {
      script.lines[rev.lineIndex].text = rev.revisedText;
    }
  }

  return script;
}
885
+ ```
886
+
887
+ ### Duration Control
888
+
889
+ ```typescript
890
/**
 * Compare the summed line estimates with the script's target length.
 *
 * @param script  Script whose `lines[].estimatedDuration` and `totalDuration`
 *                (both in seconds) are compared.
 * @returns `actual` (sum of estimates), `target`, the relative `deviation`
 *          (|actual - target| / target) and whether it is within 15%.
 */
function validateScriptDuration(script: PodcastScript): {
  actual: number;
  target: number;
  deviation: number;
  withinTolerance: boolean;
} {
  const target = script.totalDuration;
  const actual = script.lines.reduce((sum, line) => sum + line.estimatedDuration, 0);
  const deviation = Math.abs(actual - target) / target;

  return {
    actual,
    target,
    deviation,
    withinTolerance: deviation <= 0.15, // 15% tolerance
  };
}

/**
 * Trim an over-long script towards its target duration.
 *
 * Lines outside the `intro` and `outro` segments are removed shortest-first.
 * Trimming stops as soon as the running total reaches the target, or as soon
 * as removing the next candidate would leave the script further from the
 * target than it already is. The second rule prevents one long line from
 * swinging the script far below target. Lines with a non-positive estimate
 * are never removed, because dropping them would not shorten anything.
 *
 * Scripts that are within tolerance or too short are returned unchanged; this
 * function never adds content. `script.lines` is replaced on the object that
 * was passed in, and that same object is returned.
 *
 * NOTE: lines are dropped individually, so a question can be removed without
 * its answer; callers that need coherent dialogue should review the result.
 *
 * @param script  Script to adjust.
 * @returns The same `script` object.
 */
function adjustScriptDuration(script: PodcastScript): PodcastScript {
  const { actual, target, withinTolerance } = validateScriptDuration(script);
  if (withinTolerance) return script;
  // Too short, or durations not comparable (NaN): nothing to trim.
  if (!(actual > target)) return script;

  // Script too long -- trim from the middle (keep intro/outro intact)
  const candidates = script.lines
    .filter(
      (l) =>
        l.segmentRef !== 'intro' &&
        l.segmentRef !== 'outro' &&
        l.estimatedDuration > 0
    )
    .sort((a, b) => a.estimatedDuration - b.estimatedDuration);

  const linesToRemove = new Set<ScriptLine>();
  let remaining = actual;

  for (const line of candidates) {
    if (remaining <= target) break;
    const afterRemoval = remaining - line.estimatedDuration;
    // Candidates are sorted ascending, so once one removal overshoots,
    // every later (longer) one would overshoot by more.
    if (Math.abs(afterRemoval - target) >= Math.abs(remaining - target)) break;
    linesToRemove.add(line);
    remaining = afterRemoval;
  }

  script.lines = script.lines.filter((l) => !linesToRemove.has(l));
  return script;
}
938
+ ```
939
+
940
+ ---
941
+
942
+ ## Stage 4: Audio Synthesis
943
+
944
+ The synthesis stage converts the structured script into a polished podcast audio file.
945
+ Each script line is synthesized individually with the appropriate speaker voice, then
946
+ composed into a final mix with pauses, optional music, and loudness normalization.
947
+
948
+ ### TTS Provider Interface
949
+
950
+ ```typescript
951
/** Contract every text-to-speech backend in this pipeline implements. */
interface TTSProvider {
  /**
   * Render `text` with the given voice.
   * @param emotion Optional tone hint; each backend decides how to honor it.
   * @returns The encoded audio bytes.
   */
  synthesize(text: string, voice: string, emotion?: string): Promise<Buffer>;

  /** Enumerate the voices this backend offers. */
  listVoices(): Promise<{ id: string; name: string; gender: string }[]>;
}

/** Binds a named speaker to a script role, a voice, and the backend that owns it. */
interface SpeakerConfig {
  name: string;
  role: 'host' | 'guest' | 'narrator';
  voiceId: string;
  provider: 'elevenlabs' | 'orpheus' | 'chatterbox' | 'google-cloud';
}

/** Describes the finished podcast file returned by the pipeline. */
interface PodcastAudio {
  filePath: string;
  /** Length in seconds. */
  duration: number;
  format: string;
  /** Size of the file in bytes. */
  fileSize: number;
  metadata: {
    title: string;
    description: string;
    speakers: string[];
    /** ISO-8601 timestamp. */
    generatedAt: string;
  };
}
975
+ ```
976
+
977
+ ### ElevenLabs TTS Implementation
978
+
979
+ ```typescript
980
/**
 * ElevenLabs text-to-speech backend.
 *
 * `synthesize` returns a complete 24 kHz, mono, 16-bit WAV file. The endpoint
 * returns MP3 by default, but the rest of this pipeline writes every segment
 * to a `.wav` file and stream-copies it next to 24 kHz mono PCM silence, so
 * raw PCM is requested explicitly and wrapped in a WAV header here.
 */
class ElevenLabsTTS implements TTSProvider {
  private apiKey: string;
  private baseUrl = 'https://api.elevenlabs.io/v1';

  constructor(apiKey: string) {
    this.apiKey = apiKey;
  }

  /**
   * Synthesize one utterance.
   *
   * @param text     Text to speak.
   * @param voiceId  ElevenLabs voice id.
   * @param emotion  Optional tone hint, mapped onto the voice settings below.
   * @returns A WAV file (24 kHz, mono, signed 16-bit little-endian).
   * @throws Error carrying the HTTP status and response body when the request fails.
   */
  async synthesize(text: string, voiceId: string, emotion?: string): Promise<Buffer> {
    // ElevenLabs supports emotion through stability/similarity settings
    const stability = emotion === 'excited' ? 0.3 : emotion === 'serious' ? 0.8 : 0.5;
    const similarityBoost = 0.75;

    const url =
      `${this.baseUrl}/text-to-speech/${encodeURIComponent(voiceId)}` +
      '?output_format=pcm_24000';

    const response = await fetch(url, {
      method: 'POST',
      headers: {
        'xi-api-key': this.apiKey,
        'Content-Type': 'application/json',
      },
      body: JSON.stringify({
        text,
        model_id: 'eleven_multilingual_v2',
        voice_settings: {
          stability,
          similarity_boost: similarityBoost,
          style: emotion === 'excited' ? 0.7 : 0.3,
          use_speaker_boost: true,
        },
      }),
    });

    if (!response.ok) {
      throw new Error(`ElevenLabs TTS failed: ${response.status} ${await response.text()}`);
    }

    const pcm = Buffer.from(await response.arrayBuffer());
    return ElevenLabsTTS.wrapPcmAsWav(pcm, 24000);
  }

  /**
   * List the voices available to this API key.
   * @throws Error carrying the HTTP status when the request fails.
   */
  async listVoices() {
    const response = await fetch(`${this.baseUrl}/voices`, {
      headers: { 'xi-api-key': this.apiKey },
    });
    if (!response.ok) {
      throw new Error(
        `ElevenLabs voice listing failed: ${response.status} ${await response.text()}`
      );
    }
    const data = await response.json();
    return (data.voices ?? []).map((v: any) => ({
      id: v.voice_id,
      name: v.name,
      gender: v.labels?.gender || 'unknown',
    }));
  }

  /** Prepend a canonical 44-byte RIFF/WAVE header to mono 16-bit PCM samples. */
  private static wrapPcmAsWav(pcm: Buffer, sampleRate: number): Buffer {
    const channels = 1;
    const bytesPerSample = 2;
    const header = Buffer.alloc(44);

    header.write('RIFF', 0, 'ascii');
    header.writeUInt32LE(36 + pcm.length, 4); // bytes remaining after this field
    header.write('WAVE', 8, 'ascii');
    header.write('fmt ', 12, 'ascii');
    header.writeUInt32LE(16, 16); // size of the fmt chunk
    header.writeUInt16LE(1, 20); // format tag 1 = linear PCM
    header.writeUInt16LE(channels, 22);
    header.writeUInt32LE(sampleRate, 24);
    header.writeUInt32LE(sampleRate * channels * bytesPerSample, 28); // byte rate
    header.writeUInt16LE(channels * bytesPerSample, 32); // block align
    header.writeUInt16LE(bytesPerSample * 8, 34); // bits per sample
    header.write('data', 36, 'ascii');
    header.writeUInt32LE(pcm.length, 40);

    return Buffer.concat([header, pcm]);
  }
}
1030
+ ```
1031
+
1032
+ ### Google Cloud TTS Implementation
1033
+
1034
+ ```typescript
1035
/**
 * Google Cloud Text-to-Speech backend (REST API, API-key authentication).
 *
 * Audio is requested as LINEAR16 at 24 kHz. The API key is sent in the
 * `X-Goog-Api-Key` header rather than the query string so it does not end up
 * in request URLs that get logged.
 */
class GoogleCloudTTS implements TTSProvider {
  private apiKey: string;

  constructor(apiKey: string) {
    this.apiKey = apiKey;
  }

  /**
   * Synthesize one utterance.
   *
   * @param text     Text to speak.
   * @param voiceId  Google voice name,
   *                 e.g. 'en-US-Studio-O' (female) or 'en-US-Studio-Q' (male).
   * @param emotion  Optional tone hint, mapped onto speaking rate and pitch.
   * @returns The decoded audio bytes.
   * @throws Error when the request fails or the reply carries no `audioContent`.
   */
  async synthesize(text: string, voiceId: string, emotion?: string): Promise<Buffer> {
    // Voice names start with their language tag ('en-GB-Neural2-A' -> 'en-GB');
    // fall back to en-US when the name does not follow that pattern.
    const languageCode = /^[a-z]{2,3}-[A-Z]{2}(?=-)/.exec(voiceId)?.[0] ?? 'en-US';

    const response = await fetch(
      'https://texttospeech.googleapis.com/v1/text:synthesize',
      {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          'X-Goog-Api-Key': this.apiKey,
        },
        body: JSON.stringify({
          input: { text },
          voice: {
            languageCode,
            name: voiceId,
          },
          audioConfig: {
            audioEncoding: 'LINEAR16',
            sampleRateHertz: 24000,
            speakingRate: emotion === 'excited' ? 1.1 : emotion === 'thoughtful' ? 0.9 : 1.0,
            pitch: emotion === 'curious' ? 1.5 : 0,
          },
        }),
      }
    );

    if (!response.ok) {
      throw new Error(`Google Cloud TTS failed: ${response.status} ${await response.text()}`);
    }

    const data = await response.json();
    if (typeof data.audioContent !== 'string') {
      throw new Error('Google Cloud TTS failed: response contained no audioContent');
    }
    return Buffer.from(data.audioContent, 'base64');
  }

  /**
   * List the Studio and Neural2 voices available to this API key.
   * @throws Error carrying the HTTP status when the request fails.
   */
  async listVoices() {
    const response = await fetch('https://texttospeech.googleapis.com/v1/voices', {
      headers: { 'X-Goog-Api-Key': this.apiKey },
    });
    if (!response.ok) {
      throw new Error(
        `Google Cloud TTS voice listing failed: ${response.status} ${await response.text()}`
      );
    }
    const data = await response.json();
    return (data.voices ?? [])
      .filter((v: any) => v.name.includes('Studio') || v.name.includes('Neural2'))
      .map((v: any) => ({
        id: v.name,
        name: v.name,
        gender: v.ssmlGender?.toLowerCase() || 'unknown',
      }));
  }
}
1082
+ ```
1083
+
1084
+ ### Per-Segment Audio Generation
1085
+
1086
+ ```typescript
1087
+ import { writeFile, mkdir } from 'fs/promises';
1088
+ import { join } from 'path';
1089
+
1090
/**
 * Synthesize every script line to its own file under `outputDir`.
 *
 * Providers are looked up by speaker *role*: a script line whose `speaker`
 * value has no entry in `speakers` with the same `role` is skipped with a
 * warning. Only the 'elevenlabs' and 'google-cloud' providers are implemented
 * here; speakers configured with any other provider are reported and ignored.
 *
 * @param script     Script whose `lines` are synthesized in order.
 * @param speakers   Voice/provider configuration per role.
 * @param outputDir  Directory for `segment-NNNN.wav` files (created if missing).
 * @returns Paths of the written segment files, in script order.
 * @throws Error when a required API key is missing from the environment, or
 *         when the script has lines but none could be synthesized.
 */
async function generateAudioSegments(
  script: PodcastScript,
  speakers: SpeakerConfig[],
  outputDir: string
): Promise<string[]> {
  await mkdir(outputDir, { recursive: true });

  // Fail before the first request rather than on an opaque 401 mid-run.
  const requireEnv = (name: string): string => {
    const value = process.env[name];
    if (!value) throw new Error(`Missing required environment variable: ${name}`);
    return value;
  };

  // One client per backend, shared by every speaker that uses it.
  const clients = new Map<string, TTSProvider>();
  const clientFor = (kind: SpeakerConfig['provider']): TTSProvider | undefined => {
    if (!clients.has(kind)) {
      switch (kind) {
        case 'elevenlabs':
          clients.set(kind, new ElevenLabsTTS(requireEnv('ELEVENLABS_API_KEY')));
          break;
        case 'google-cloud':
          clients.set(kind, new GoogleCloudTTS(requireEnv('GOOGLE_TTS_API_KEY')));
          break;
        // Add other providers as needed
      }
    }
    return clients.get(kind);
  };

  // Create TTS providers for each speaker
  const ttsProviders: Record<string, { provider: TTSProvider; voiceId: string }> = {};
  for (const speaker of speakers) {
    const provider = clientFor(speaker.provider);
    if (!provider) {
      console.warn(
        `TTS provider "${speaker.provider}" is not implemented; ` +
          `speaker "${speaker.name}" (${speaker.role}) will be skipped`
      );
      continue;
    }
    ttsProviders[speaker.role] = { provider, voiceId: speaker.voiceId };
  }

  const segmentPaths: string[] = [];
  const totalLines = script.lines.length;

  for (let i = 0; i < totalLines; i++) {
    const line = script.lines[i];
    const tts = ttsProviders[line.speaker];

    if (!tts) {
      console.warn(`No TTS provider configured for speaker: ${line.speaker}, skipping`);
      continue;
    }

    console.log(`Synthesizing line ${i + 1}/${totalLines}: ${line.speaker} (${line.emotion})`);

    const audioBuffer = await tts.provider.synthesize(line.text, tts.voiceId, line.emotion);
    const segmentPath = join(outputDir, `segment-${String(i).padStart(4, '0')}.wav`);
    await writeFile(segmentPath, audioBuffer);
    segmentPaths.push(segmentPath);

    // Rate limiting: avoid API throttling (no need to wait after the last line)
    if (i < totalLines - 1) {
      await new Promise((resolve) => setTimeout(resolve, 200));
    }
  }

  if (totalLines > 0 && segmentPaths.length === 0) {
    throw new Error(
      'No audio segments were synthesized: no configured speaker role matches any script line'
    );
  }

  return segmentPaths;
}
1143
+ ```
1144
+
1145
+ ### FFmpeg Audio Composition
1146
+
1147
+ The final composition pipeline concatenates all speech segments with natural pauses,
1148
+ optionally adds intro/outro music, and normalizes to podcast-standard loudness.
1149
+
1150
+ ```typescript
1151
+ import { execFile } from 'child_process';
1152
+ import { promisify } from 'util';
1153
+ import { writeFile as writeFileAsync } from 'fs/promises';
1154
+ import { join } from 'path';
1155
+
1156
+ const execFileAsync = promisify(execFile);
1157
+
1158
/**
 * Run an FFmpeg command safely using execFile (no shell injection risk).
 * For complex filter graphs, use the -filter_complex flag as a single argument.
 *
 * FFmpeg writes its progress log to stderr. `execFile` kills the child once
 * captured output exceeds `maxBuffer` (1 MiB by default), which a long encode
 * can reach, so the limit is raised here.
 *
 * @param args  Arguments passed to the `ffmpeg` binary, one array entry each.
 * @returns The captured stdout and stderr of the finished process.
 */
async function runFFmpeg(args: string[]): Promise<{ stdout: string; stderr: string }> {
  return execFileAsync('ffmpeg', args, { maxBuffer: 64 * 1024 * 1024 });
}
1165
+
1166
/**
 * Assemble the synthesized segments into the final podcast file.
 *
 * Steps: generate a short silence clip, concatenate segments with that pause
 * between them, optionally prepend/append music, loudness-normalize, and
 * export in `config.outputFormat` with metadata tags.
 *
 * The concatenation uses stream copy, so every segment must already be a WAV
 * file with the same parameters as the generated silence (24 kHz, mono).
 *
 * Intermediate files (`silence-short.wav`, `concat-list.txt`,
 * `raw-concat.wav`, `with-music.wav`, `normalized.wav`) are written to
 * `outputDir` and left in place.
 *
 * @param segmentPaths  Segment files in playback order; must not be empty.
 * @param config        Pipeline configuration; `includeMusic` and
 *                      `outputFormat` are read.
 * @param outputDir     Directory for intermediates and the final file.
 * @param title         Value of the `title` metadata tag.
 * @returns Path of the final file, `podcast-final.<ext>`.
 * @throws Error when `segmentPaths` is empty, or when an FFmpeg call fails.
 */
async function composeAudio(
  segmentPaths: string[],
  config: PipelineConfig,
  outputDir: string,
  title: string
): Promise<string> {
  if (segmentPaths.length === 0) {
    throw new Error('composeAudio: no audio segments to compose');
  }

  const sampleRate = '24000';

  // concat-demuxer list entry: forward slashes, and a literal single quote is
  // written as '\'' (close quote, escaped quote, reopen quote).
  const concatEntry = (p: string): string =>
    `file '${p.replace(/\\/g, '/').replace(/'/g, "'\\''")}'`;

  // Step 1: Generate the silence clip for natural pauses.
  // Only per-turn pauses are inserted; this function receives no segment/topic
  // boundaries, so no longer between-topic pause is generated.
  const pauseDuration = '0.35'; // 350ms between speaker turns
  const silencePath = join(outputDir, 'silence-short.wav');

  await runFFmpeg([
    '-y', '-f', 'lavfi', '-i', `anullsrc=r=${sampleRate}:cl=mono`,
    '-t', pauseDuration, silencePath,
  ]);

  // Step 2: Build concat file list
  const concatListPath = join(outputDir, 'concat-list.txt');
  const concatEntries: string[] = [];

  segmentPaths.forEach((segmentPath, i) => {
    concatEntries.push(concatEntry(segmentPath));
    if (i < segmentPaths.length - 1) {
      concatEntries.push(concatEntry(silencePath));
    }
  });

  await writeFileAsync(concatListPath, concatEntries.join('\n'));

  // Step 3: Concatenate all segments
  const rawConcatPath = join(outputDir, 'raw-concat.wav');
  await runFFmpeg([
    '-y', '-f', 'concat', '-safe', '0',
    '-i', concatListPath, '-c', 'copy', rawConcatPath,
  ]);

  // Step 4: Optionally add intro/outro music
  let preMasterPath = rawConcatPath;

  if (config.includeMusic) {
    preMasterPath = join(outputDir, 'with-music.wav');
    const introMusicPath = join(outputDir, '..', 'assets', 'intro-music.wav');
    const outroMusicPath = join(outputDir, '..', 'assets', 'outro-music.wav');

    // Play 8s of quiet intro music, then the speech, then 8s of quiet outro
    // music. The clips are concatenated, not mixed under the speech. Both are
    // trimmed to 8s so the 2s fade-out starting at 6s ends exactly at the cut.
    await runFFmpeg([
      '-y',
      '-i', rawConcatPath,
      '-i', introMusicPath,
      '-i', outroMusicPath,
      '-filter_complex',
      '[1:a]atrim=0:8,afade=t=in:d=1:st=0,afade=t=out:d=2:st=6,volume=0.15[intro_music];' +
        '[2:a]atrim=0:8,afade=t=in:d=1:st=0,afade=t=out:d=2:st=6,volume=0.15[outro_music];' +
        '[intro_music][0:a][outro_music]concat=n=3:v=0:a=1[mixed]',
      '-map', '[mixed]', preMasterPath,
    ]);
  }

  // Step 5: Loudness normalization (EBU R128, -16 LUFS for podcasts).
  // loudnorm upsamples to 192 kHz for true-peak detection; -ar restores the
  // pipeline's sample rate on output.
  const normalizedPath = join(outputDir, 'normalized.wav');
  await runFFmpeg([
    '-y', '-i', preMasterPath,
    '-af', 'loudnorm=I=-16:TP=-1.5:LRA=11:print_format=json',
    '-ar', sampleRate,
    normalizedPath,
  ]);

  // Step 6: Export as final format with metadata
  const outputExt = config.outputFormat || 'mp3';
  const finalPath = join(outputDir, `podcast-final.${outputExt}`);

  if (outputExt === 'mp3') {
    await runFFmpeg([
      '-y', '-i', normalizedPath,
      '-codec:a', 'libmp3lame', '-b:a', '192k',
      '-metadata', `title=${title}`,
      '-metadata', 'artist=Generated Podcast',
      '-metadata', 'album=Document-to-Podcast',
      '-metadata', 'genre=Podcast',
      '-metadata', `date=${new Date().getFullYear()}`,
      finalPath,
    ]);
  } else if (outputExt === 'ogg') {
    await runFFmpeg([
      '-y', '-i', normalizedPath,
      '-codec:a', 'libvorbis', '-q:a', '6',
      '-metadata', `title=${title}`,
      finalPath,
    ]);
  } else {
    // WAV -- just copy
    await runFFmpeg(['-y', '-i', normalizedPath, finalPath]);
  }

  return finalPath;
}
1267
+ ```
1268
+
1269
+ ### FFmpeg Command Reference (Standalone)
1270
+
1271
+ For manual use or debugging, here are the key FFmpeg commands in the pipeline:
1272
+
1273
+ ```bash
1274
# Generate silence (350ms pause between turns)
ffmpeg -y -f lavfi -i anullsrc=r=24000:cl=mono -t 0.35 silence.wav

# Concatenate segments from a file list
# (-c copy requires every listed file to share codec, sample rate and channel count)
ffmpeg -y -f concat -safe 0 -i concat-list.txt -c copy raw-concat.wav

# Normalize to podcast standard (-16 LUFS, EBU R128)
# loudnorm upsamples to 192 kHz internally; -ar pins the output sample rate
ffmpeg -y -i raw-concat.wav -af "loudnorm=I=-16:TP=-1.5:LRA=11" -ar 24000 normalized.wav

# Two-pass loudness normalization (higher precision)
# Pass 1: Measure
ffmpeg -i raw-concat.wav -af "loudnorm=I=-16:TP=-1.5:LRA=11:print_format=json" -f null /dev/null
# Pass 2: Apply measured values (replace measured_* with Pass 1 output)
ffmpeg -y -i raw-concat.wav -af "loudnorm=I=-16:TP=-1.5:LRA=11:measured_I=-23.5:measured_TP=-4.2:measured_LRA=7.1:measured_thresh=-34.0:offset=-0.3:linear=true" -ar 24000 normalized.wav

# Export as MP3 with metadata tags
ffmpeg -y -i normalized.wav -codec:a libmp3lame -b:a 192k \
  -metadata title="Episode Title" \
  -metadata artist="Podcast Name" \
  -metadata album="Season 1" \
  -metadata genre="Podcast" \
  podcast-final.mp3

# Prepend 8s of quiet intro music before the speech
# (concat plays the music first; it is not mixed under the speech)
ffmpeg -y -i speech.wav -i intro.wav \
  -filter_complex "[1:a]atrim=0:8,volume=0.15,afade=t=out:d=2:st=6[music];[music][0:a]concat=n=2:v=0:a=1[out]" \
  -map "[out]" with-intro.wav

# Quick quality check: get loudness stats
ffmpeg -i podcast-final.mp3 -af "loudnorm=print_format=json" -f null /dev/null 2>&1 | tail -20
1304
+ ```
1305
+
1306
+ ---
1307
+
1308
+ ## Two Reference Implementations
1309
+
1310
+ ### A) Full-Scale (GPU Server / Cloud API)
1311
+
1312
+ Based on Meta NotebookLlama's tiered model approach. Best quality, requires GPU or API budget.
1313
+
1314
+ ```
1315
+ Architecture:
1316
+ Stage 1 (Ingest): pdf-parse / mammoth / readability (same for both)
1317
+ Stage 2 (Understand): Gemini 2.5 Pro for key point extraction + outline
1318
+ Gemini embedding-001 for chunk embeddings
1319
+ pgvector for vector storage
1320
+ Stage 3 (Script): Claude Opus / Gemini 2.5 Pro for script generation
1321
+ Gemini Flash for faithfulness verification
1322
+ (Large model = better creative writing)
1323
+ Stage 4 (Synthesize): ElevenLabs Multilingual V2 for TTS
1324
+ (Or Orpheus TTS 3B on local GPU -- open source, near-commercial quality)
1325
+ FFmpeg for composition + mastering
1326
+
1327
+ Cost estimate (15-min episode from 10-page PDF):
1328
+ - Gemini 2.5 Pro: ~$0.15 (input) + $0.30 (output) = ~$0.45
1329
+ - Gemini Flash (verification): ~$0.01
1330
+ - Gemini Embedding: ~$0.001
1331
+ - ElevenLabs TTS: ~$0.50 (15 min at scale tier)
1332
+ - Total: ~$1.00 per episode
1333
+
1334
+ Hardware: Any machine. All processing is API-based.
1335
+ Latency: 3-8 minutes for a 15-minute episode.
1336
+ ```
1337
+
1338
+ **Meta NotebookLlama Model Tiers (for self-hosted GPU):**
1339
+
1340
+ | Stage | Model | Purpose | Why This Size |
1341
+ |-------|-------|---------|---------------|
1342
+ | Text cleanup | Llama-3.2-1B | Strip headers, fix OCR errors | Small = fast, simple task |
1343
+ | Script writing | Llama-3.1-70B (or API) | Creative multi-speaker dialogue | Large = better creativity |
1344
+ | TTS prep | Llama-3.1-8B | Add SSML/emotion markers | Medium = good enough |
1345
+ | Audio | Orpheus TTS 3B | Speech synthesis | Specialized model |
1346
+
1347
+ Key insight from Meta's research: **do not use the same model for every stage.** Match model
1348
+ capability to task complexity. Small models are better (faster, cheaper) for mechanical tasks;
1349
+ large models are needed only for creative generation.
1350
+
1351
+ ### B) Local / CPU-Only
1352
+
1353
+ Based on Mozilla AI's Document-to-Podcast Blueprint. Fully private, zero API cost, runs on
1354
+ consumer hardware. Lower quality but completely offline.
1355
+
1356
+ ```
1357
+ Architecture:
1358
+ Stage 1 (Ingest): Same parsers (pdf-parse, mammoth, readability)
1359
+ Stage 2 (Understand): Llama 3.2 3B GGUF via llama_cpp (Q4_K_M quantization)
1360
+ nomic-embed-text via Ollama for embeddings
1361
+ Qdrant (local Docker) for vector storage
1362
+ Stage 3 (Script): Llama 3.1 8B GGUF via llama_cpp (Q5_K_M quantization)
1363
+ Self-verification (same model, second pass)
1364
+ Stage 4 (Synthesize): Orpheus TTS 150M (CPU-optimized) or Parler TTS Mini
1365
+ FFmpeg for composition + mastering
1366
+
1367
+ Cost: $0.00 (no API calls)
1368
+
1369
+ Hardware requirements:
1370
+ - RAM: 16GB minimum (8B model needs ~6GB at Q5_K_M)
1371
+ - Storage: ~15GB for all models
1372
+ - CPU: Modern 8-core (Intel 12th+ / AMD 5000+)
1373
+ - GPU: None required (but CUDA/Metal accelerates if available)
1374
+
1375
+ Latency: 15-45 minutes for a 15-minute episode (CPU-bound on TTS).
1376
+
1377
+ Model downloads (one-time):
1378
+ # Via Ollama (easiest)
1379
+ ollama pull llama3.2:3b # Understanding stage
1380
+ ollama pull llama3.1:8b # Script generation
1381
+ ollama pull nomic-embed-text # Embeddings
1382
+
1383
+ # Via llama_cpp (more control)
1384
+ # Download GGUF from huggingface.co/TheBloke or official repos
1385
+ ```
1386
+
1387
+ **Quality Comparison:**
1388
+
1389
+ | Aspect | Full-Scale (API) | Local (CPU) |
1390
+ |--------|-----------------|-------------|
1391
+ | Script naturalness | 9/10 | 6/10 |
1392
+ | Voice quality | 9/10 (ElevenLabs) | 5/10 (Orpheus 150M) |
1393
+ | Faithfulness | 9/10 (separate verifier) | 7/10 (self-verify) |
1394
+ | Latency (15min ep) | 3-8 min | 15-45 min |
1395
+ | Cost per episode | ~$1.00 | $0.00 |
1396
+ | Privacy | Data sent to APIs | Fully local |
1397
+ | Offline capable | No | Yes |
1398
+
1399
+ ---
1400
+
1401
+ ## Complete TypeScript Pipeline Class
1402
+
1403
+ ```typescript
1404
+ import { mkdir, writeFile, stat, readFile, rm } from 'fs/promises';
1405
+ import { join } from 'path';
1406
+ import { execFile } from 'child_process';
1407
+ import { promisify } from 'util';
1408
+ import { GoogleGenerativeAI } from '@google/generative-ai';
1409
+
1410
+ const execFileAsync = promisify(execFile);
1411
+
1412
+ // --- Configuration ---------------------------------------------------------
1413
+
1414
/** Every knob of the document-to-podcast pipeline. */
interface PipelineConfig {
  /** Preferred text-to-speech backend. */
  ttsProvider: 'elevenlabs' | 'orpheus' | 'chatterbox' | 'google-cloud';
  /** LLM vendor. */
  aiProvider: 'gemini' | 'claude';
  /** Show format; 'brief' extracts fewer key points than the others. */
  format: 'deep-dive' | 'brief' | 'debate' | 'narration';
  /** Target episode length. */
  duration: '5min' | '15min' | '30min' | '60min';
  /** Voices; the first two are used as host and guest names in the script. */
  speakers: SpeakerConfig[];
  /** Container/codec of the final file. */
  outputFormat: 'mp3' | 'wav' | 'ogg';
  /** When true, intro and outro music are added around the speech. */
  includeMusic: boolean;
  language: string;
  /** Directory that receives the script, intermediates and final audio. */
  outputDir: string;
  /** If true, keep intermediate files (segments, concat list) for debugging */
  keepIntermediates: boolean;
}

/** Defaults merged beneath whatever the caller passes to the pipeline constructor. */
const DEFAULT_CONFIG: PipelineConfig = {
  // Providers
  ttsProvider: 'elevenlabs',
  aiProvider: 'gemini',

  // Episode shape
  format: 'deep-dive',
  duration: '15min',
  language: 'en',
  includeMusic: false,

  // Voices
  speakers: [
    { name: 'Alex', role: 'host', voiceId: 'pNInz6obpgDQGcFmaJgB', provider: 'elevenlabs' },
    { name: 'Jordan', role: 'guest', voiceId: '21m00Tcm4TlvDq8ikWAM', provider: 'elevenlabs' },
  ],

  // Output
  outputFormat: 'mp3',
  outputDir: './podcast-output',
  keepIntermediates: false,
};
1443
+
1444
+ // --- Pipeline Class --------------------------------------------------------
1445
+
1446
/**
 * End-to-end pipeline: document in, podcast audio file out.
 *
 * `run()` chains the four stages; each stage is also exposed as its own
 * method so it can be called individually.
 */
class DocToPodcastPipeline {
  private config: PipelineConfig;
  private genAI: GoogleGenerativeAI;
  /** Embedded chunks from the most recent `understand()` call. */
  private chunks: TextChunk[] = [];

  /**
   * @param config  Overrides merged (shallowly) on top of `DEFAULT_CONFIG`.
   */
  constructor(config: Partial<PipelineConfig> = {}) {
    this.config = { ...DEFAULT_CONFIG, ...config };
    this.genAI = new GoogleGenerativeAI(process.env.GEMINI_API_KEY!);
  }

  /**
   * Run the complete pipeline: Document -> Podcast Audio
   *
   * Writes `script.json` and the final audio file to `config.outputDir`.
   * Unless `config.keepIntermediates` is set, intermediate files are removed
   * when the run ends, whether it succeeded or failed.
   */
  async run(source: DocumentSource): Promise<PodcastAudio> {
    const startTime = Date.now();
    const outputDir = this.config.outputDir;
    const segmentsDir = join(outputDir, 'segments');

    await mkdir(outputDir, { recursive: true });
    await mkdir(segmentsDir, { recursive: true });

    try {
      console.log('[Pipeline] Stage 1: Ingesting document...');
      const doc = await this.ingest(source);
      console.log(
        `[Pipeline] Ingested: "${doc.metadata.title}" (${doc.metadata.wordCount} words)`
      );

      console.log('[Pipeline] Stage 2: Analyzing content...');
      const outline = await this.understand(doc);
      console.log(
        `[Pipeline] Outline: ${outline.segments.length} segments, ` +
          `${outline.keyPoints.length} key points`
      );

      console.log('[Pipeline] Stage 3: Generating script...');
      const script = await this.generateScript(outline, doc);
      console.log(
        `[Pipeline] Script: ${script.lines.length} lines, ~${script.totalDuration}s`
      );

      // Save script for reference
      await writeFile(join(outputDir, 'script.json'), JSON.stringify(script, null, 2));

      console.log('[Pipeline] Stage 4: Synthesizing audio...');
      const audio = await this.synthesize(script);

      const elapsed = ((Date.now() - startTime) / 1000).toFixed(1);
      console.log(`[Pipeline] Complete in ${elapsed}s -> ${audio.filePath}`);

      return audio;
    } finally {
      // Cleanup intermediates unless debugging
      if (!this.config.keepIntermediates) {
        await this.cleanupIntermediates(outputDir);
      }
    }
  }

  /**
   * Best-effort removal of everything the synthesis stage leaves behind:
   * the per-line segments directory and the scratch files `composeAudio`
   * writes next to the final output. Failures are ignored.
   */
  private async cleanupIntermediates(outputDir: string): Promise<void> {
    const scratchFiles = [
      'silence-short.wav',
      'silence-long.wav',
      'concat-list.txt',
      'raw-concat.wav',
      'with-music.wav',
      'normalized.wav',
    ];

    await rm(join(outputDir, 'segments'), { recursive: true, force: true }).catch(() => {});
    await Promise.all(
      scratchFiles.map((name) => rm(join(outputDir, name), { force: true }).catch(() => {}))
    );
  }

  /**
   * Stage 1: Parse document source into clean text with metadata
   */
  async ingest(source: DocumentSource): Promise<ParsedDocument> {
    return ingestDocument(source);
  }

  /**
   * Stage 2: Analyze document, extract key points, generate podcast outline
   *
   * Side effect: stores the embedded chunks on the instance for Stage 3.
   */
  async understand(doc: ParsedDocument): Promise<PodcastOutline> {
    // Chunk the document
    const chunks = semanticChunk(doc);

    // Embed chunks (for potential RAG retrieval in future iterations)
    this.chunks = await embedWithGemini(chunks);

    // Extract key points
    const maxPoints = this.config.format === 'brief' ? 4 : 8;
    const keyPoints = await extractKeyPoints(doc, this.chunks, maxPoints);

    // Generate outline
    return generateOutline(doc, keyPoints, {
      format: this.config.format,
      duration: this.config.duration,
    });
  }

  /**
   * Stage 3: Generate multi-speaker podcast script from outline
   *
   * Uses the chunks stored by `understand()`. When none are stored, `doc` is
   * chunked on the spot.
   *
   * @throws Error when no chunks are stored and `doc` is not provided.
   */
  async generateScript(
    outline: PodcastOutline,
    doc?: ParsedDocument
  ): Promise<PodcastScript> {
    let chunks = this.chunks;
    if (chunks.length === 0) {
      if (!doc) {
        throw new Error(
          'generateScript: call understand() first or pass the parsed document'
        );
      }
      chunks = semanticChunk(doc);
    }

    const script = await generateScript(outline, chunks, this.config);
    return adjustScriptDuration(script);
  }

  /**
   * Stage 4: Synthesize audio from script, compose final podcast
   */
  async synthesize(script: PodcastScript): Promise<PodcastAudio> {
    const segmentsDir = join(this.config.outputDir, 'segments');

    // Generate individual audio segments via TTS
    const segmentPaths = await generateAudioSegments(
      script,
      this.config.speakers,
      segmentsDir
    );

    // Compose final audio with FFmpeg
    const finalPath = await composeAudio(
      segmentPaths,
      this.config,
      this.config.outputDir,
      script.title
    );

    // Get file stats
    const fileStat = await stat(finalPath);

    // Calculate actual duration from FFmpeg probe
    let duration = script.totalDuration;
    try {
      const { stdout } = await execFileAsync('ffprobe', [
        '-v', 'quiet',
        '-show_entries', 'format=duration',
        '-of', 'csv=p=0',
        finalPath,
      ]);
      duration = parseFloat(stdout.trim()) || duration;
    } catch {
      // Fall back to estimated duration
    }

    // The script is model-generated JSON, so its `speakers` block may be
    // missing; fall back to the configured names instead of failing after
    // the audio has already been produced.
    const [configuredHost, configuredGuest] = this.config.speakers;
    const hostName = script.speakers?.host?.name ?? configuredHost?.name ?? 'host';
    const guestName = script.speakers?.guest?.name ?? configuredGuest?.name ?? 'guest';

    return {
      filePath: finalPath,
      duration,
      format: this.config.outputFormat,
      fileSize: fileStat.size,
      metadata: {
        title: script.title,
        description:
          `Generated podcast from document. ${script.lines.length} script lines.`,
        speakers: [hostName, guestName],
        generatedAt: new Date().toISOString(),
      },
    };
  }
}
1598
+ ```
1599
+
1600
+ ### Usage Examples
1601
+
1602
+ ```typescript
1603
// Example 1: PDF to podcast (cloud API, full quality)
const pipeline = new DocToPodcastPipeline({
  ttsProvider: 'elevenlabs',
  aiProvider: 'gemini',
  format: 'deep-dive',
  duration: '15min',
  outputDir: './output/my-podcast',
  speakers: [
    { name: 'Alex', role: 'host', voiceId: 'pNInz6obpgDQGcFmaJgB', provider: 'elevenlabs' },
    { name: 'Jordan', role: 'guest', voiceId: '21m00Tcm4TlvDq8ikWAM', provider: 'elevenlabs' },
  ],
});

const result = await pipeline.run({
  type: 'pdf',
  path: './documents/research-paper.pdf',
});
console.log(`Podcast: ${result.filePath} (${result.duration}s, ${result.fileSize} bytes)`);

// Example 2: URL to brief podcast
const brief = await new DocToPodcastPipeline({
  format: 'brief',
  duration: '5min',
  outputDir: './output/quick-brief',
}).run({ type: 'url', url: 'https://example.com/article' });

// Example 3: YouTube video to podcast episode
const ytPodcast = await new DocToPodcastPipeline({
  format: 'deep-dive',
  duration: '30min',
  outputDir: './output/yt-episode',
}).run({ type: 'youtube', url: 'https://youtube.com/watch?v=example123' });

// Example 4: Direct text input (e.g., from a database or CMS)
// Script lines are always tagged 'host' or 'guest', and voices are looked up
// by role, so the narrator voice is registered under the 'host' role.
const textPodcast = await new DocToPodcastPipeline({
  format: 'narration',
  duration: '5min',
  speakers: [
    { name: 'Narrator', role: 'host', voiceId: 'en-US-Studio-O', provider: 'google-cloud' },
    { name: 'Expert', role: 'guest', voiceId: 'en-US-Studio-Q', provider: 'google-cloud' },
  ],
  outputDir: './output/text-podcast',
}).run({
  type: 'text',
  content: 'Your document text here...',
  title: 'Weekly Update',
});
1650
+ ```
1651
+
1652
+ ---
1653
+
1654
+ ## Ministry / Church Use Cases
1655
+
1656
+ Since this pipeline is designed with Thierry's ministry application stack in mind, here are
1657
+ specific configurations for common church content scenarios.
1658
+
1659
+ ### Sermon to Podcast Episode
1660
+
1661
+ Transform a recorded sermon transcript or notes into a polished podcast discussion.
1662
+
1663
+ ```typescript
1664
+ const sermonPipeline = new DocToPodcastPipeline({
1665
+ format: 'deep-dive',
1666
+ duration: '30min',
1667
+ aiProvider: 'gemini',
1668
+ ttsProvider: 'elevenlabs',
1669
+ outputDir: './output/sermon-podcast',
1670
+ speakers: [
1671
+ { name: 'Pastor Mike', role: 'host', voiceId: 'voice-id-1', provider: 'elevenlabs' },
1672
+ { name: 'Dr. Sarah', role: 'guest', voiceId: 'voice-id-2', provider: 'elevenlabs' },
1673
+ ],
1674
+ includeMusic: true,
1675
+ });
1676
+
1677
+ // From sermon notes (DOCX from pastor's study)
1678
+ const episode = await sermonPipeline.run({
1679
+ type: 'docx',
1680
+ path: './sermons/2026-03-10-grace-in-action.docx',
1681
+ });
1682
+
1683
+ // From sermon recording transcript (already transcribed via transcription-pipeline-selector)
1684
+ const fromRecording = await sermonPipeline.run({
1685
+ type: 'text',
1686
+ content: transcriptText,
1687
+ title: 'Grace in Action - Sunday Sermon Discussion',
1688
+ });
1689
+ ```
1690
+
1691
+ ### Bible Study to Discussion Format
1692
+
1693
+ Convert Bible study materials into an engaging discussion podcast where the host and guest
1694
+ explore the passage together.
1695
+
1696
+ ```typescript
1697
+ const bibleStudyPipeline = new DocToPodcastPipeline({
1698
+ format: 'deep-dive',
1699
+ duration: '15min',
1700
+ speakers: [
1701
+ { name: 'Teacher', role: 'host', voiceId: 'voice-id-1', provider: 'elevenlabs' },
1702
+ { name: 'Student', role: 'guest', voiceId: 'voice-id-2', provider: 'elevenlabs' },
1703
+ ],
1704
+ outputDir: './output/bible-study',
1705
+ });
1706
+
1707
+ // Custom AI prompt override for Bible study context:
1708
+ // The key point extractor can be tuned to focus on theological themes:
1709
+ // - Historical context of the passage
1710
+ // - Key Greek/Hebrew word meanings
1711
+ // - Practical application points
1712
+ // - Cross-references to other scripture
1713
+ ```
1714
+
1715
+ ### Church Announcement to Brief Audio Update
1716
+
1717
+ Quick 2-3 minute audio updates for the congregation.
1718
+
1719
+ ```typescript
1720
+ const announcementPipeline = new DocToPodcastPipeline({
1721
+ format: 'brief',
1722
+ duration: '5min',
1723
+ speakers: [
1724
+ { name: 'Church Office', role: 'narrator', voiceId: 'en-US-Studio-O', provider: 'google-cloud' },
1725
+ { name: 'Pastor', role: 'host', voiceId: 'voice-id-1', provider: 'elevenlabs' },
1726
+ ],
1727
+ outputDir: './output/announcements',
1728
+ includeMusic: true,
1729
+ });
1730
+
1731
+ const announcement = await announcementPipeline.run({
1732
+ type: 'text',
1733
+ content: `
1734
+ This week at Grace Community Church:
1735
+ - Sunday Service at 10am: "Walking in Faith" series continues
1736
+ - Wednesday Bible Study: Romans Chapter 8, 7pm in Fellowship Hall
1737
+ - Youth Group Friday Night: Movie and pizza, 6-9pm
1738
+ - Volunteer sign-ups for Easter service are open at the welcome desk
1739
+ - Prayer requests can be submitted online at our website
1740
+ `,
1741
+ title: 'This Week at Grace Community - March 10, 2026',
1742
+ });
1743
+ ```
1744
+
1745
+ ### Teaching Recording to Educational Deep-Dive
1746
+
1747
+ Transform a lecture or teaching session into a structured educational podcast that
1748
+ breaks down complex theological or educational topics.
1749
+
1750
+ ```typescript
1751
+ const teachingPipeline = new DocToPodcastPipeline({
1752
+ format: 'deep-dive',
1753
+ duration: '60min',
1754
+ speakers: [
1755
+ { name: 'Professor', role: 'host', voiceId: 'voice-id-1', provider: 'elevenlabs' },
1756
+ { name: 'Teaching Assistant', role: 'guest', voiceId: 'voice-id-2', provider: 'elevenlabs' },
1757
+ ],
1758
+ outputDir: './output/teaching-series',
1759
+ });
1760
+
1761
+ // From a seminary lecture PDF
1762
+ const lecture = await teachingPipeline.run({
1763
+ type: 'pdf',
1764
+ path: './materials/systematic-theology-ch3.pdf',
1765
+ });
1766
+
1767
+ // Integration with content-repurposing-pipeline:
1768
+ // After generating the podcast, feed the script into the repurposing pipeline
1769
+ // to create social media clips, quote cards, and blog posts from the same source.
1770
+ ```
1771
+
1772
+ ### Ministry Integration Architecture
1773
+
1774
+ ```
1775
+ [Sermon Recording] [Bible Study Notes] [Announcements] [Teaching Materials]
1776
+ | | | |
1777
+ v v v v
1778
+ [Transcription] [DOCX Parser] [Text Input] [PDF Parser]
1779
+ | | | |
1780
+ +--------------------+------------------+-------------------+
1781
+ |
1782
+ v
1783
+ +-------------------------------+
1784
+ | DocToPodcastPipeline.run() |
1785
+ | (format per content type) |
1786
+ +-------------------------------+
1787
+ |
1788
+ v
1789
+ +-------------------------------+
1790
+ | Output: MP3 + Script JSON |
1791
+ +-------------------------------+
1792
+ | |
1793
+ v v
1794
+ [Podcast RSS] [Content Repurposing]
1795
+ [Apple/Spotify] [Social clips, quotes]
1796
+ ```
1797
+
1798
+ ---
1799
+
1800
+ ## Error Handling and Resilience
1801
+
1802
+ ### Retry Logic for TTS APIs
1803
+
1804
+ ```typescript
1805
+ async function withRetry<T>(
1806
+ fn: () => Promise<T>,
1807
+ maxRetries: number = 3,
1808
+ delayMs: number = 1000
1809
+ ): Promise<T> {
1810
+ for (let attempt = 1; attempt <= maxRetries; attempt++) {
1811
+ try {
1812
+ return await fn();
1813
+ } catch (error: any) {
1814
+ const isRateLimit = error?.status === 429;
1815
+ const isServerError = error?.status >= 500;
1816
+
1817
+ if (attempt === maxRetries || (!isRateLimit && !isServerError)) {
1818
+ throw error;
1819
+ }
1820
+
1821
+ const backoff = isRateLimit ? delayMs * attempt * 2 : delayMs * attempt;
1822
+ console.warn(
1823
+ `[Retry ${attempt}/${maxRetries}] ${error.message}. Waiting ${backoff}ms...`
1824
+ );
1825
+ await new Promise((resolve) => setTimeout(resolve, backoff));
1826
+ }
1827
+ }
1828
+ throw new Error('Unreachable');
1829
+ }
1830
+
1831
+ // Usage in TTS generation:
1832
+ const audioBuffer = await withRetry(
1833
+ () => tts.provider.synthesize(line.text, tts.voiceId, line.emotion),
1834
+ 3,
1835
+ 2000
1836
+ );
1837
+ ```
1838
+
1839
+ ### Pipeline Checkpointing
1840
+
1841
+ For long-running pipelines (60-min episodes), save intermediate state so a failure
1842
+ in Stage 4 does not require re-running Stages 1-3.
1843
+
1844
+ ```typescript
1845
+ interface PipelineCheckpoint {
1846
+ stage: 1 | 2 | 3 | 4;
1847
+ doc?: ParsedDocument;
1848
+ outline?: PodcastOutline;
1849
+ script?: PodcastScript;
1850
+ segmentPaths?: string[];
1851
+ timestamp: string;
1852
+ }
1853
+
1854
+ async function saveCheckpoint(
1855
+ checkpoint: PipelineCheckpoint,
1856
+ outputDir: string
1857
+ ): Promise<void> {
1858
+ await writeFile(
1859
+ join(outputDir, 'checkpoint.json'),
1860
+ JSON.stringify(checkpoint, null, 2)
1861
+ );
1862
+ }
1863
+
1864
+ async function loadCheckpoint(outputDir: string): Promise<PipelineCheckpoint | null> {
1865
+ try {
1866
+ const data = await readFile(join(outputDir, 'checkpoint.json'), 'utf-8');
1867
+ return JSON.parse(data);
1868
+ } catch {
1869
+ return null;
1870
+ }
1871
+ }
1872
+
1873
+ // Enhanced run() with checkpointing. `stage` in checkpoint.json is the last COMPLETED stage, so a checkpoint at stage N lets stages 1..N be skipped on resume.
1873
+ async function runWithCheckpoints(
1875
+ pipeline: DocToPodcastPipeline,
1876
+ source: DocumentSource,
1877
+ outputDir: string
1878
+ ): Promise<PodcastAudio> {
1879
+ const existing = await loadCheckpoint(outputDir); // null when no checkpoint file can be read/parsed
1880
+
1881
+ let doc: ParsedDocument;
1882
+ let outline: PodcastOutline;
1883
+ let script: PodcastScript;
1884
+
1885
+ if (existing && existing.stage >= 1 && existing.doc) { // Stage 1 already completed in a previous run
1886
+ console.log('[Resume] Skipping Stage 1 (cached)');
1887
+ doc = existing.doc;
1888
+ } else {
1889
+ doc = await pipeline.ingest(source);
1890
+ await saveCheckpoint(
1891
+ { stage: 1, doc, timestamp: new Date().toISOString() },
1892
+ outputDir
1893
+ );
1894
+ }
1895
+
1896
+ if (existing && existing.stage >= 2 && existing.outline) { // Stage 2 already completed in a previous run
1897
+ console.log('[Resume] Skipping Stage 2 (cached)');
1898
+ outline = existing.outline;
1899
+ } else {
1900
+ outline = await pipeline.understand(doc);
1901
+ await saveCheckpoint(
1902
+ { stage: 2, doc, outline, timestamp: new Date().toISOString() },
1903
+ outputDir
1904
+ );
1905
+ }
1906
+
1907
+ if (existing && existing.stage >= 3 && existing.script) { // Stage 3 already completed; a Stage 4 failure resumes from here
1908
+ console.log('[Resume] Skipping Stage 3 (cached)');
1909
+ script = existing.script;
1910
+ } else {
1911
+ script = await pipeline.generateScript(outline, doc);
1912
+ await saveCheckpoint(
1913
+ { stage: 3, doc, outline, script, timestamp: new Date().toISOString() },
1914
+ outputDir
1915
+ );
1916
+ }
1917
+
1918
+ const audio = await pipeline.synthesize(script); // Stage 4 always runs; no stage-4 checkpoint is written
1919
+ return audio;
1920
+ }
1921
+ ```
1922
+
1923
+ ---
1924
+
1925
+ ## Performance Optimization
1926
+
1927
+ ### Parallel TTS Generation
1928
+
1929
+ For cloud TTS providers with sufficient rate limits, generate multiple segments in parallel.
1930
+
1931
+ ```typescript
1932
+ async function generateAudioSegmentsParallel(
1933
+ script: PodcastScript,
1934
+ speakers: SpeakerConfig[],
1935
+ outputDir: string,
1936
+ concurrency: number = 5
1937
+ ): Promise<string[]> {
1938
+ await mkdir(outputDir, { recursive: true });
1939
+
1940
+ const ttsProviders = buildTTSProviders(speakers);
1941
+ const segmentPaths: string[] = new Array(script.lines.length);
1942
+
1943
+ // Process in batches of `concurrency`
1944
+ for (let i = 0; i < script.lines.length; i += concurrency) {
1945
+ const batch = script.lines.slice(i, i + concurrency);
1946
+ const promises = batch.map(async (line, batchIdx) => {
1947
+ const globalIdx = i + batchIdx;
1948
+ const tts = ttsProviders[line.speaker];
1949
+ if (!tts) return;
1950
+
1951
+ const audioBuffer = await withRetry(
1952
+ () => tts.provider.synthesize(line.text, tts.voiceId, line.emotion),
1953
+ 3,
1954
+ 2000
1955
+ );
1956
+ const segmentPath = join(
1957
+ outputDir,
1958
+ `segment-${String(globalIdx).padStart(4, '0')}.wav`
1959
+ );
1960
+ await writeFile(segmentPath, audioBuffer);
1961
+ segmentPaths[globalIdx] = segmentPath;
1962
+ });
1963
+
1964
+ await Promise.all(promises);
1965
+ console.log(
1966
+ `[TTS] Completed ${Math.min(i + concurrency, script.lines.length)}` +
1967
+ `/${script.lines.length}`
1968
+ );
1969
+ }
1970
+
1971
+ return segmentPaths.filter(Boolean);
1972
+ }
1973
+
1974
+ function buildTTSProviders(
1975
+ speakers: SpeakerConfig[]
1976
+ ): Record<string, { provider: TTSProvider; voiceId: string }> {
1977
+ const providers: Record<string, { provider: TTSProvider; voiceId: string }> = {};
1978
+ for (const speaker of speakers) {
1979
+ switch (speaker.provider) {
1980
+ case 'elevenlabs':
1981
+ providers[speaker.role] = {
1982
+ provider: new ElevenLabsTTS(process.env.ELEVENLABS_API_KEY!),
1983
+ voiceId: speaker.voiceId,
1984
+ };
1985
+ break;
1986
+ case 'google-cloud':
1987
+ providers[speaker.role] = {
1988
+ provider: new GoogleCloudTTS(process.env.GOOGLE_TTS_API_KEY!),
1989
+ voiceId: speaker.voiceId,
1990
+ };
1991
+ break;
1992
+ }
1993
+ }
1994
+ return providers;
1995
+ }
1996
+ ```
1997
+
1998
+ ### Streaming Pipeline (Future Enhancement)
1999
+
2000
+ For real-time applications, each stage can emit results as they become available
2001
+ rather than waiting for the entire stage to complete:
2002
+
2003
+ ```
2004
+ Stage 1 emits chunks as they are parsed
2005
+ --> Stage 2 begins embedding as chunks arrive
2006
+ --> Stage 3 begins writing intro while later points are still extracted
2007
+ --> Stage 4 begins TTS on early script lines while later ones generate
2008
+ ```
2009
+
2010
+ This reduces end-to-end latency by ~40% for long documents but adds significant
2011
+ implementation complexity. Recommended only for production deployment.
2012
+
2013
+ ---
2014
+
2015
+ ## Testing and Validation
2016
+
2017
+ ### Unit Test Checklist
2018
+
2019
+ ```typescript
2020
+ // Tests for each stage (using vitest):
2021
+ describe('DocToPodcastPipeline', () => {
2022
+ // Stage 1
2023
+ test('parsePDF extracts text and sections from a known PDF', async () => {
2024
+ const doc = await parsePDF('./fixtures/sample.pdf');
2025
+ expect(doc.text.length).toBeGreaterThan(100);
2026
+ expect(doc.sections.length).toBeGreaterThan(0);
2027
+ expect(doc.metadata.sourceType).toBe('pdf');
2028
+ });
2029
+
2030
+ test('parseDOCX extracts heading structure', async () => {
2031
+ const doc = await parseDOCX('./fixtures/sample.docx');
2032
+ expect(doc.sections.some((s) => s.heading !== 'Introduction')).toBe(true);
2033
+ });
2034
+
2035
+ test('semanticChunk produces overlapping chunks within token budget', () => {
2036
+ const doc = createMockDocument(5000); // 5000 words
2037
+ const chunks = semanticChunk(doc);
2038
+ expect(chunks.every((c) => c.tokenCount <= 650)).toBe(true); // 500 + buffer
2039
+ expect(chunks.every((c) => c.tokenCount >= 50)).toBe(true); // Not too small
2040
+ });
2041
+
2042
+ // Stage 2
2043
+ test('extractKeyPoints returns ranked points with chunk references', async () => {
2044
+ const points = await extractKeyPoints(mockDoc, mockChunks, 5);
2045
+ expect(points.length).toBeLessThanOrEqual(5);
2046
+ expect(points[0].importance).toBeGreaterThanOrEqual(points[1].importance);
2047
+ expect(points.every((p) => p.relevantChunks.length > 0)).toBe(true);
2048
+ });
2049
+
2050
+ // Stage 3
2051
+ test('generated script has correct speaker alternation', async () => {
2052
+ const script = await generateScript(mockOutline, mockChunks, mockConfig);
2053
+ // Host should speak first
2054
+ expect(script.lines[0].speaker).toBe('host');
2055
+ // Speakers should mostly alternate
2056
+ let alternations = 0;
2057
+ for (let i = 1; i < script.lines.length; i++) {
2058
+ if (script.lines[i].speaker !== script.lines[i - 1].speaker) alternations++;
2059
+ }
2060
+ expect(alternations / script.lines.length).toBeGreaterThan(0.4);
2061
+ });
2062
+
2063
+ test('script duration is within 15% of target', () => {
2064
+ const validation = validateScriptDuration(mockScript);
2065
+ expect(validation.withinTolerance).toBe(true);
2066
+ });
2067
+
2068
+ // Stage 4
2069
+ test('FFmpeg concat produces valid audio file', async () => {
2070
+ const finalPath = await composeAudio(mockSegments, mockConfig, tmpDir, 'Test');
2071
+ const { stdout } = await execFileAsync('ffprobe', [
2072
+ '-v', 'quiet', '-show_entries', 'format=duration', '-of', 'csv=p=0', finalPath,
2073
+ ]);
2074
+ expect(parseFloat(stdout)).toBeGreaterThan(0);
2075
+ });
2076
+
2077
+ test('output loudness is within EBU R128 spec', async () => {
2078
+ const { stderr } = await execFileAsync('ffmpeg', [
2079
+ '-i', outputPath,
2080
+ '-af', 'loudnorm=print_format=json',
2081
+ '-f', 'null', '/dev/null',
2082
+ ]);
2083
+ const match = stderr.match(/"input_i"\s*:\s*"(-?\d+\.?\d*)"/); // loudnorm prints its measured input loudness as JSON on stderr
2084
+ const loudness = parseFloat(match![1]);
2085
+ expect(Math.abs(loudness + 16)).toBeLessThanOrEqual(1); // Within 1 LUFS of the -16 target (toBeCloseTo(-16, 1) would demand +/-0.05)
2086
+ });
2087
+ });
2088
+ ```
2089
+
2090
+ ### Manual Validation Checklist
2091
+
2092
+ ```
2093
+ [ ] Document parses correctly (check text extraction, no garbled characters)
2094
+ [ ] Key points are relevant (not trivial or off-topic)
2095
+ [ ] Outline has logical flow (intro -> body -> conclusion)
2096
+ [ ] Script reads naturally when read aloud
2097
+ [ ] No hallucinated facts in script (compare to source)
2098
+ [ ] Speaker voices are distinct and appropriate
2099
+ [ ] Pauses feel natural (not too short, not too long)
2100
+ [ ] Loudness is consistent throughout (-16 LUFS +/- 1)
2101
+ [ ] No audio artifacts (clicks, pops, unnatural transitions)
2102
+ [ ] Total duration matches target within 15%
2103
+ [ ] MP3 metadata tags are correct (title, artist, genre)
2104
+ ```
2105
+
2106
+ ---
2107
+
2108
+ ## Research Citations
2109
+
2110
+ > **Meta NotebookLlama (Oct 2024):** Open-source reproduction of Google NotebookLM's "Audio
2111
+ > Overview" feature. Demonstrates tiered model architecture: Llama-3.2-1B for text cleaning,
2112
+ > Llama-3.1-70B for script generation, Llama-3.1-8B for TTS transcript preparation. Key insight:
2113
+ > match model capability to task complexity rather than using one model for everything. Small
2114
+ > models handle mechanical tasks faster and cheaper; large models are reserved for creative
2115
+ > generation where quality matters most.
2116
+ > Source: github.com/meta-llama/llama-recipes/tree/main/recipes/quickstart/NotebookLlama
2117
+
2118
+ > **Mozilla AI Document-to-Podcast Blueprint (2024):** Fully local, CPU-only pipeline using
2119
+ > GGUF quantized models via llama_cpp Python bindings. Demonstrates that consumer hardware
2120
+ > (16GB RAM, no GPU) can run the complete pipeline with acceptable quality. Uses Parler TTS
2121
+ > for speech synthesis. Zero API cost makes it suitable for privacy-sensitive or budget-
2122
+ > constrained deployments.
2123
+ > Source: github.com/mozilla-ai/document-to-podcast
2124
+
2125
+ > **PodAgent (ACL 2025, arXiv 2503.00455):** Multi-agent framework for podcast generation with
2126
+ > Host, Guest, and Writer agents. The Writer agent performs faithfulness verification by checking
2127
+ > each generated claim against source material, reducing hallucination rate by 23% compared to
2128
+ > single-agent approaches. Introduces the "discussion angle" concept where each key point is
2129
+ > framed as a conversation starter rather than a lecture point.
2130
+ > Source: arxiv.org/abs/2503.00455
2131
+
2132
+ > **EBU R128 Loudness Standard:** European Broadcasting Union recommendation for loudness
2133
+ > measurement and normalization (its own broadcast target is -23 LUFS). Podcasts instead target
2134
+ > -16 LUFS (Integrated Loudness), the level Apple Podcasts recommends, with a True Peak ceiling of
2135
+ > -1.5 dBTP used here; Spotify and YouTube normalize playback to about -14 LUFS. FFmpeg's loudnorm filter implements R128-based normalization natively.
2136
+ > Source: tech.ebu.ch/docs/r/r128.pdf
2137
+
2138
+ > **Google Gemini Embedding API:** embedding-001 model produces 768-dimensional dense vectors
2139
+ > optimized for retrieval tasks. Supports batch embedding (up to 100 texts per request) and
2140
+ > task-type hints (RETRIEVAL_DOCUMENT vs RETRIEVAL_QUERY) for improved relevance.
2141
+ > Source: ai.google.dev/gemini-api/docs/embeddings
2142
+
2143
+ ---
2144
+
2145
+ ## Appendix: Environment Variables
2146
+
2147
+ ```bash
2148
+ # Required for cloud API pipeline
2149
+ GEMINI_API_KEY=your-gemini-api-key
2150
+ ELEVENLABS_API_KEY=your-elevenlabs-api-key
2151
+
2152
+ # Optional (for Google Cloud TTS instead of ElevenLabs)
2153
+ GOOGLE_TTS_API_KEY=your-google-tts-key
2154
+
2155
+ # Optional (for Anthropic Claude instead of Gemini for script generation)
2156
+ ANTHROPIC_API_KEY=your-anthropic-api-key
2157
+
2158
+ # Optional (for local pipeline via Ollama)
2159
+ OLLAMA_BASE_URL=http://localhost:11434
2160
+ ```
2161
+
2162
+ ## Appendix: npm Dependencies (Complete)
2163
+
2164
+ ```json
2165
+ {
2166
+ "dependencies": {
2167
+ "pdf-parse": "^1.1.1",
2168
+ "mammoth": "^1.8.0",
2169
+ "@mozilla/readability": "^0.5.0",
2170
+ "linkedom": "^0.18.0",
2171
+ "cheerio": "^1.0.0",
2172
+ "youtube-transcript": "^1.2.1",
2173
+ "@google/generative-ai": "^0.21.0",
2174
+ "fluent-ffmpeg": "^2.1.3"
2175
+ },
2176
+ "devDependencies": {
2177
+ "@types/fluent-ffmpeg": "^2.1.24",
2178
+ "vitest": "^2.0.0"
2179
+ },
2180
+ "peerDependencies": {
2181
+ "ffmpeg": "System-installed FFmpeg 6.x+ required"
2182
+ }
2183
+ }
2184
+ ```