@thierrynakoa/fire-flow 12.2.1 → 13.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CREDITS.md +25 -0
- package/DOMINION-FLOW-OVERVIEW.md +182 -38
- package/README.md +399 -455
- package/TROUBLESHOOTING.md +264 -264
- package/agents/fire-debugger.md +54 -0
- package/agents/fire-executor.md +1610 -1033
- package/agents/fire-fact-checker.md +1 -1
- package/agents/fire-planner.md +85 -17
- package/agents/fire-project-researcher.md +1 -1
- package/agents/fire-researcher.md +4 -22
- package/agents/{fire-phoenix-analyst.md → fire-resurrection-analyst.md} +394 -394
- package/agents/fire-reviewer.md +552 -499
- package/agents/fire-verifier.md +114 -19
- package/bin/cli.js +18 -101
- package/commands/fire-0-orient.md +2 -2
- package/commands/fire-1a-new.md +50 -15
- package/commands/fire-1c-setup.md +33 -5
- package/commands/fire-1d-discuss.md +87 -1
- package/commands/fire-2-plan.md +556 -527
- package/commands/fire-3-execute.md +2046 -1356
- package/commands/fire-4-verify.md +975 -906
- package/commands/fire-5-handoff.md +46 -5
- package/commands/fire-6-resume.md +2 -31
- package/commands/fire-add-new-skill.md +138 -19
- package/commands/fire-autonomous.md +14 -2
- package/commands/fire-complete-milestone.md +1 -1
- package/commands/fire-cost.md +179 -183
- package/commands/fire-debug.md +1 -6
- package/commands/fire-loop-resume.md +2 -2
- package/commands/fire-loop-stop.md +1 -1
- package/commands/fire-loop.md +2 -15
- package/commands/fire-map-codebase.md +1 -1
- package/commands/fire-migrate-database.md +548 -0
- package/commands/fire-new-milestone.md +1 -1
- package/commands/fire-reflect.md +1 -2
- package/commands/fire-research.md +142 -21
- package/commands/{fire-phoenix.md → fire-resurrect.md} +859 -603
- package/commands/fire-scaffold.md +297 -0
- package/commands/fire-search.md +1 -2
- package/commands/fire-security-scan.md +483 -484
- package/commands/fire-setup.md +359 -0
- package/commands/fire-skill.md +770 -0
- package/commands/fire-skills-diff.md +506 -506
- package/commands/fire-skills-history.md +388 -388
- package/commands/fire-skills-rollback.md +7 -7
- package/commands/fire-skills-sync.md +470 -470
- package/commands/fire-test.md +5 -5
- package/commands/fire-todos.md +1 -1
- package/commands/fire-update.md +5 -5
- package/commands/fire-validate-skills.md +282 -0
- package/commands/fire-vuln-scan.md +492 -493
- package/hooks/run-hook.sh +8 -8
- package/hooks/run-session-end.sh +7 -7
- package/hooks/session-end.sh +90 -90
- package/hooks/session-start.sh +1 -1
- package/package.json +4 -24
- package/plugin.json +7 -7
- package/references/autonomy-levels.md +235 -0
- package/references/behavioral-directives.md +95 -3
- package/references/blocker-tracking.md +1 -1
- package/references/circuit-breaker.md +93 -2
- package/references/context-engineering.md +227 -9
- package/references/honesty-protocols.md +70 -1
- package/references/issue-to-pr-pipeline.md +149 -150
- package/references/metrics-and-trends.md +1 -2
- package/references/research-improvements.md +4 -108
- package/references/sdlc-mapping.md +73 -0
- package/references/state-machine.md +151 -0
- package/skills-library/AVAILABLE_TOOLS_REFERENCE.md +333 -0
- package/skills-library/SKILLS-INDEX.md +57 -558
- package/skills-library/SKILLS_LIBRARY_INDEX.md +532 -0
- package/skills-library/_general/api-patterns/api-field-name-mismatch.md +107 -0
- package/skills-library/_general/api-patterns/streaming-command-timeout.md +122 -0
- package/skills-library/_general/api-patterns/streaming-proxy-cors-bypass.md +102 -0
- package/skills-library/_general/automation/settings-gui-generator.md +172 -0
- package/skills-library/_general/database-solutions/data-type-mapping-reference.md +181 -0
- package/skills-library/_general/database-solutions/mysql-limit-offset-string-coercion.md +102 -0
- package/skills-library/_general/database-solutions/mysql-to-pg-migration.md +195 -0
- package/skills-library/_general/database-solutions/orm-schema-portability.md +193 -0
- package/skills-library/_general/database-solutions/persistent-analysis-storage.md +207 -0
- package/skills-library/_general/database-solutions/pg-to-mysql-schema-migration-methodology.md +190 -0
- package/skills-library/_general/database-solutions/sql-dialect-compatibility-matrix.md +306 -0
- package/skills-library/_general/database-solutions/sqlite-to-pg-migration.md +219 -0
- package/skills-library/_general/frontend/canvas-bubble-animation-grouping.md +270 -0
- package/skills-library/_general/frontend/color-token-migration.md +112 -0
- package/skills-library/_general/frontend/framer-motion-layoutid-grouping.md +150 -0
- package/skills-library/_general/frontend/pyqt6-settings-dialog.md +191 -0
- package/skills-library/_general/frontend/react-flow-animated-layout-switching.md +101 -0
- package/skills-library/_general/frontend/react-hooks-order-debugging.md +141 -0
- package/skills-library/_general/frontend/redux-localstorage-auth-desync.md +126 -0
- package/skills-library/_general/frontend/safari-csp-theme-color-debugging.md +124 -0
- package/skills-library/_general/frontend/safari-sw-cache-poisoning.md +138 -0
- package/skills-library/_general/frontend/svg-sparkline-no-charting-library.md +131 -0
- package/skills-library/_general/growth-marketing/oss-daily-growth-intelligence.md +224 -0
- package/skills-library/_general/integrations/claude-code-local-mcp-integration.md +250 -0
- package/skills-library/_general/integrations/mcp-composite-tool-orchestration.md +200 -0
- package/skills-library/_general/methodology/AGENT_SDK_STANDALONE_TOOLING.md +181 -0
- package/skills-library/_general/methodology/AGENT_TEAMS_GUIDE.md +169 -0
- package/skills-library/_general/methodology/ALAS_STATEFUL_EXECUTION.md +207 -0
- package/skills-library/_general/methodology/AUTO_REVIEWER_SUBAGENT.md +211 -0
- package/skills-library/_general/methodology/CONSISTENCY_CHECK_AMBIGUITY_GATE.md +96 -0
- package/skills-library/_general/methodology/DEAD_ENDS_SHELF.md +4 -4
- package/skills-library/_general/methodology/DISTILL_NOT_DUMP.md +108 -0
- package/skills-library/_general/methodology/EXECUTION_PROGRESS_MONITOR.md +157 -0
- package/skills-library/_general/methodology/HIERARCHICAL_REVIEW_MARS.md +122 -0
- package/skills-library/_general/methodology/MCP_INTER_AGENT_BRIDGE.md +207 -0
- package/skills-library/_general/methodology/MERMAID_WIZARD_DIAGRAMS.md +77 -0
- package/skills-library/_general/methodology/MISSING_DIMENSION_DETECTOR.md +89 -0
- package/skills-library/_general/methodology/MULTI_AGENT_COORDINATION.md +397 -0
- package/skills-library/_general/methodology/OBSERVATION_MASKING.md +100 -0
- package/skills-library/_general/methodology/PHOENIX_REBUILD_METHODOLOGY.md +82 -11
- package/skills-library/_general/methodology/REVIEW_BACKTRACK_PANEL.md +140 -0
- package/skills-library/_general/methodology/REVIEW_FIX_LOOP.md +117 -0
- package/skills-library/_general/methodology/VOTING_VERDICT_ARBITRATION.md +155 -0
- package/skills-library/_general/methodology/ZERO_FRICTION_CLI_SETUP.md +2 -2
- package/skills-library/_general/methodology/dead-code-activation.md +123 -0
- package/skills-library/_general/methodology/debug-swarm-researcher-escape-hatch.md +240 -240
- package/skills-library/_general/methodology/shell-autonomous-loop-fixplan.md +1 -1
- package/skills-library/_general/patterns-standards/GOF_DESIGN_PATTERNS_FOR_AI_AGENTS.md +5 -5
- package/skills-library/_general/patterns-standards/cascading-failure-diagnosis.md +119 -0
- package/skills-library/_general/patterns-standards/domain-specific-layout-algorithms.md +209 -0
- package/skills-library/_general/patterns-standards/python-desktop-app-architecture.md +399 -0
- package/skills-library/_general/patterns-standards/realtime-monitoring-dashboard.md +457 -0
- package/skills-library/_general/patterns-standards/togglable-processing-pipeline.md +169 -0
- package/skills-library/_general/performance/liveclock-extraction.md +112 -0
- package/skills-library/_general/performance/ref-based-canvas-animation.md +117 -0
- package/skills-library/_general/performance/use-visible-interval.md +131 -0
- package/skills-library/_general/testing/playwright-firefox-withcredentials-auth-issue.md +104 -0
- package/skills-library/_quarantine/README.md +30 -0
- package/skills-library/api-patterns/BROADCAST_SCHEDULER_SHARED_EXECUTE_FUNCTION.md +150 -0
- package/skills-library/api-patterns/ERROR_RESPONSE_STANDARDS.md +145 -0
- package/skills-library/api-patterns/EXPRESS_ROUTE_ORDERING_MIDDLEWARE_INTERCEPTION.md +326 -0
- package/skills-library/api-patterns/PAGINATION_PATTERNS.md +137 -0
- package/skills-library/api-patterns/PODCAST_PROGRESS_TRACKING_THREE_ROOT_CAUSES.md +277 -0
- package/skills-library/api-patterns/RATE_LIMITING_TOGGLE.md +155 -0
- package/skills-library/api-patterns/graphql-content-queries.md +708 -0
- package/skills-library/appointment-scheduler-design.md +423 -0
- package/skills-library/automation/AUTO_POPULATE_COMPLETE_GUIDE.md +631 -0
- package/skills-library/automation/CC_WORKFLOW_STUDIO.md +83 -0
- package/skills-library/automation/CLAUDE_CODE_SWARM_MODE.md +95 -0
- package/skills-library/automation/DAEMON_TRIGGER_FILE_IPC.md +195 -0
- package/skills-library/automation/scheduled-content-publishing.md +608 -0
- package/skills-library/awesome-workflows/Blogging-Platform-Instructions/view_commands.md +25 -0
- package/skills-library/awesome-workflows/CREDENTIAL-SECURITY-WORKFLOW.md +109 -0
- package/skills-library/awesome-workflows/DEBUGGING-WORKFLOW.md +124 -0
- package/skills-library/awesome-workflows/Design-Review-Workflow/README.md +31 -0
- package/skills-library/awesome-workflows/Design-Review-Workflow/design-principles-example.md +129 -0
- package/skills-library/awesome-workflows/Design-Review-Workflow/design-review-agent.md +107 -0
- package/skills-library/awesome-workflows/Design-Review-Workflow/design-review-claude-md-snippet.md +24 -0
- package/skills-library/awesome-workflows/Design-Review-Workflow/design-review-slash-command.md +38 -0
- package/skills-library/awesome-workflows/PARALLEL-RESEARCH-WORKFLOW.md +89 -0
- package/skills-library/awesome-workflows/PHASE-EXECUTION-WORKFLOW.md +97 -0
- package/skills-library/awesome-workflows/SESSION-HANDOFF-WORKFLOW.md +116 -0
- package/skills-library/cms-patterns/content-branch-preview.md +515 -0
- package/skills-library/cms-patterns/inline-visual-editing.md +666 -0
- package/skills-library/cms-patterns/mdx-component-content.md +649 -0
- package/skills-library/cms-patterns/media-manager-abstraction.md +827 -0
- package/skills-library/cms-patterns/schema-driven-form-generator.md +838 -0
- package/skills-library/complexity-metrics/complexity-divider.md +707 -0
- package/skills-library/complexity-metrics/work-with-complexity.md +193 -0
- package/skills-library/creative-multimedia/animation-stack-guide.md +577 -0
- package/skills-library/creative-multimedia/audio-enhancement-pipeline.md +625 -0
- package/skills-library/creative-multimedia/content-repurposing-pipeline.md +1146 -0
- package/skills-library/creative-multimedia/data-visualization-generator.md +862 -0
- package/skills-library/creative-multimedia/doc-to-podcast-pipeline.md +2184 -0
- package/skills-library/creative-multimedia/ffmpeg-command-generator.md +405 -0
- package/skills-library/creative-multimedia/image-optimization-pipeline.md +605 -0
- package/skills-library/creative-multimedia/multi-format-content-generator.md +1759 -0
- package/skills-library/creative-multimedia/og-image-generator.md +635 -0
- package/skills-library/creative-multimedia/podcast-audio-composition.md +1355 -0
- package/skills-library/creative-multimedia/podcast-quality-evaluation.md +1452 -0
- package/skills-library/creative-multimedia/podcast-script-generation.md +1841 -0
- package/skills-library/creative-multimedia/svg-generation.md +750 -0
- package/skills-library/creative-multimedia/text-to-speech-provider-selector.md +1414 -0
- package/skills-library/creative-multimedia/transcription-pipeline-selector.md +677 -0
- package/skills-library/creative-multimedia/video-streaming-setup.md +559 -0
- package/skills-library/database-solutions/AI_RESPONSE_DATABASE_CACHING.md +520 -0
- package/skills-library/database-solutions/CONDITIONAL_SQL_MIGRATION_PATTERN.md +119 -0
- package/skills-library/database-solutions/DATABASE_COLUMN_NAME_MISMATCH.md +393 -0
- package/skills-library/database-solutions/DATABASE_SCHEMA.md +394 -0
- package/skills-library/database-solutions/DATABASE_SCHEMA_VERIFICATION_GUIDE.md +348 -0
- package/skills-library/database-solutions/DATABASE_STRATEGY.md +71 -0
- package/skills-library/database-solutions/ES_MODULE_SEED_SCRIPT_PATTERN.md +52 -0
- package/skills-library/database-solutions/MIGRATION_GUIDE.md +3 -0
- package/skills-library/database-solutions/PLPGSQL_VARIABLE_CONFLICT_FIX.md +208 -0
- package/skills-library/database-solutions/POSTGRESQL_JSONB_DOUBLE_STRINGIFY_FIX.md +245 -0
- package/skills-library/database-solutions/POSTGRESQL_LICENSE_TABLE_DESIGN.md +393 -0
- package/skills-library/database-solutions/POSTGRESQL_UUID_DOCUMENT_RAG_DUAL_SCOPE.md +732 -0
- package/skills-library/database-solutions/POSTGRES_SQL_TEMPLATE_BINDING_ERROR.md +240 -0
- package/skills-library/database-solutions/PRISMA_DB_PUSH_DATA_LOSS_PREVENTION.md +141 -0
- package/skills-library/database-solutions/PRODUCTION_QUERY_OPTIMIZATION_RESTART_FIX.md +389 -0
- package/skills-library/database-solutions/RLS_SECURITY_GUIDE.md +107 -0
- package/skills-library/database-solutions/SCHEMA_ENHANCEMENTS_GUIDE.md +373 -0
- package/skills-library/database-solutions/SCHEMA_MIGRATION_GUIDE.md +368 -0
- package/skills-library/database-solutions/SCHEMA_VERIFICATION_QUICK_REFERENCE.md +104 -0
- package/skills-library/database-solutions/ai-erd-generator.md +1213 -0
- package/skills-library/database-solutions/content-publishing-states.md +631 -0
- package/skills-library/database-solutions/database-schema-designer.md +522 -0
- package/skills-library/database-solutions/er-diagram-components.md +569 -0
- package/skills-library/database-solutions/er-to-ddl-mapping.md +1405 -0
- package/skills-library/database-solutions/erd-creator-textbook-research.md +433 -0
- package/skills-library/database-solutions/erd-react-flow-architecture.md +1965 -0
- package/skills-library/database-solutions/mariadb-aggregate-function-replacement.md +145 -0
- package/skills-library/database-solutions/normalization-validator.md +778 -0
- package/skills-library/database-solutions/postgres-full-text-search-content.md +494 -0
- package/skills-library/database-solutions/postgresql-to-mysql-runtime-translation.md +286 -0
- package/skills-library/database-solutions/regex-alternation-ordering-sql-types.md +92 -0
- package/skills-library/database-solutions/reserved-word-context-aware-quoting.md +142 -0
- package/skills-library/database-solutions/sql-ddl-generator.md +756 -0
- package/skills-library/database-solutions/supabase-connection-pooler-fix.md +102 -0
- package/skills-library/deployment-security/CPANEL_NODE_DEPLOYMENT.md +166 -0
- package/skills-library/deployment-security/DEPLOYMENT.md +275 -0
- package/skills-library/deployment-security/DEPLOYMENT_CHECKLIST.md +363 -0
- package/skills-library/deployment-security/DEPLOYMENT_PLAN.md +669 -0
- package/skills-library/deployment-security/KNEX_DATABASE_ABSTRACTION.md +444 -0
- package/skills-library/deployment-security/LICENSE_KEY_SYSTEM.md +206 -0
- package/skills-library/deployment-security/NODE18_DEPENDENCY_COMPATIBILITY.md +284 -0
- package/skills-library/deployment-security/PHP_INSTALLER_WIZARD_GUIDE.md +315 -0
- package/skills-library/deployment-security/PM2_ENVIRONMENT_VARIABLE_CACHING.md +256 -0
- package/skills-library/deployment-security/PM2_MEMORY_EXHAUSTION_FIX.md +370 -0
- package/skills-library/deployment-security/PRODUCTION_DEPLOYMENT_GUIDE.md +592 -0
- package/skills-library/deployment-security/PRODUCTION_HARDENING_DOCUMENTATION.md +307 -0
- package/skills-library/deployment-security/PRODUCTION_RECOVERY_CHERRY_PICK_PATTERN.md +202 -0
- package/skills-library/deployment-security/PYINSTALLER_CUDA_WHISPER_BUNDLING.md +236 -0
- package/skills-library/deployment-security/SECURITY.md +41 -0
- package/skills-library/deployment-security/SMTP_SSL_HOSTNAME_MISMATCH_SHARED_HOSTING.md +220 -0
- package/skills-library/deployment-security/SPA_SEO_OPTIMIZATION_CPANEL.md +200 -0
- package/skills-library/deployment-security/SUPABASE_EDGE_FUNCTIONS.md +338 -0
- package/skills-library/deployment-security/VERCEL_GITHUB_DEPLOYMENT_GUIDE.md +858 -0
- package/skills-library/deployment-security/VPS_DEPLOYMENT_READINESS.md +356 -0
- package/skills-library/deployment-security/deployment-changes-not-applying.md +241 -0
- package/skills-library/deployment-security/env-file-management-production-local.md +203 -0
- package/skills-library/deployment-security/express-secure-file-downloads.md +413 -0
- package/skills-library/deployment-security/react-production-deployment-desktop-guide.md +2011 -0
- package/skills-library/deployment-security/self-hosted-supabase-coolify-guide.md +1684 -0
- package/skills-library/deployment-security/unique-features-ai-strategy-plaid-security.md +1613 -0
- package/skills-library/deployment-security/vps-deployment.md +135 -0
- package/skills-library/document-processing/WORD_EXPORT_MARKDOWN_FORMATTING.md +482 -0
- package/skills-library/document-processing/document-ai-landingai-integration.md +677 -0
- package/skills-library/document-processing/express-secure-file-downloads-mern.md +413 -0
- package/skills-library/document-processing/express-secure-file-downloads.md +413 -0
- package/skills-library/document-processing/md-to-word-converter.md +318 -0
- package/skills-library/document-processing/pdf-forms-integration/README.md +101 -0
- package/skills-library/document-processing/pdf-forms-integration/SKILL.md +662 -0
- package/skills-library/ecommerce/ADMIN_PRODUCTS_GUIDE.md +428 -0
- package/skills-library/ecommerce/ECOMMERCE_API_REFERENCE.md +776 -0
- package/skills-library/ecommerce/ECOMMERCE_COMPLETION_SUMMARY.md +673 -0
- package/skills-library/ecommerce/ECOMMERCE_IMPLEMENTATION_GUIDE.md +729 -0
- package/skills-library/ecommerce/ECOMMERCE_QUICK_REFERENCE.md +521 -0
- package/skills-library/ecommerce/ECOMMERCE_TESTING_CHECKLIST.md +565 -0
- package/skills-library/ecommerce/ECOMMERCE_WORKFLOW_GUIDE.md +1059 -0
- package/skills-library/ecommerce/PRODUCT_CREATION_EXPANDED.md +522 -0
- package/skills-library/ecommerce/agentic-commerce-protocol.md +203 -0
- package/skills-library/ecommerce/cart-abandonment-recovery.md +236 -0
- package/skills-library/ecommerce/cart-architecture-patterns.md +300 -0
- package/skills-library/ecommerce/cart-item-count-indicator.md +264 -0
- package/skills-library/ecommerce/checkout-ux-conversion.md +227 -0
- package/skills-library/ecommerce/composable-commerce-selection.md +166 -0
- package/skills-library/ecommerce/ecommerce-analytics-patterns.md +167 -0
- package/skills-library/ecommerce/fraud-detection-patterns.md +179 -0
- package/skills-library/ecommerce/inventory-stock-management.md +270 -0
- package/skills-library/ecommerce/order-saga-state-machine.md +336 -0
- package/skills-library/ecommerce/payment-provider-abstraction.md +245 -0
- package/skills-library/ecommerce/pci-compliance-checklist.md +192 -0
- package/skills-library/ecommerce/refund-chargeback-handling.md +177 -0
- package/skills-library/ecommerce/shipping-carrier-integration.md +218 -0
- package/skills-library/ecommerce/webhook-idempotency-patterns.md +253 -0
- package/skills-library/excalidraw-diagrams/.github/workflows/ci.yml +558 -0
- package/skills-library/excalidraw-diagrams/.github/workflows/prompt-gallery.yml +448 -0
- package/skills-library/excalidraw-diagrams/.github/workflows/release.yml +42 -0
- package/skills-library/excalidraw-diagrams/.github/workflows/test-reusable-ci.yml +25 -0
- package/skills-library/excalidraw-diagrams/CLAUDE.md +57 -0
- package/skills-library/excalidraw-diagrams/LICENSE +21 -0
- package/skills-library/excalidraw-diagrams/README.md +178 -0
- package/skills-library/excalidraw-diagrams/SKILL.md +715 -0
- package/skills-library/form-solutions/BUTTON_TYPE_FORM_SUBMISSION.md +336 -0
- package/skills-library/form-solutions/FILLABLE_PDF_IMPLEMENTATION.md +226 -0
- package/skills-library/form-solutions/SURVEYJS_QUESTIONNAIRE_SYSTEM.md +367 -0
- package/skills-library/form-solutions/tiptap-minimal-setup.md +690 -0
- package/skills-library/frontend/scholarly-classification-bubble-map.md +149 -0
- package/skills-library/infrastructure/ci-cd-pipeline-builder.md +517 -0
- package/skills-library/infrastructure/observability-designer.md +264 -0
- package/skills-library/infrastructure/performance-profiler.md +621 -0
- package/skills-library/installer-wizard-patterns.md +249 -0
- package/skills-library/integrations/CLAUDE_CODE_TOKEN_ANALYTICS.md +160 -0
- package/skills-library/integrations/CONFIGURABLE_AI_PROVIDER_SELECTION.md +728 -0
- package/skills-library/integrations/SOCKET_IO_BROADCAST_ALL_VS_ROOM.md +141 -0
- package/skills-library/integrations/VIRTUAL_MEETINGS_IMPLEMENTATION.md +374 -0
- package/skills-library/integrations/WORDPRESS_LEARNDASH_DATA_RECOVERY.md +53 -0
- package/skills-library/integrations/YOUTUBE_API_SETUP.md +141 -0
- package/skills-library/integrations/YOUTUBE_BOOKMARKING_EXPLANATION.md +252 -0
- package/skills-library/integrations/YOUTUBE_BOOKMARKING_SOLUTION.md +268 -0
- package/skills-library/integrations/YOUTUBE_OAUTH_SETUP_GUIDE.md +200 -0
- package/skills-library/integrations/YOUTUBE_VIDEO_FIX_COMPLETE.md +192 -0
- package/skills-library/integrations/ai-ml/GEMINI_AI_RAG_PIPELINE_COMPLETE_GUIDE.md +195 -0
- package/skills-library/integrations/ai-ml/GEMINI_IMAGE_GENERATION_SETUP.md +64 -0
- package/skills-library/integrations/cloudflare/cloudflare-turnstile-debugging.md +202 -0
- package/skills-library/integrations/cloudflare/cloudflare-turnstile-implementation.md +476 -0
- package/skills-library/integrations/cloudflare-turnstile-debugging.md +202 -0
- package/skills-library/integrations/cloudflare-turnstile-implementation.md +476 -0
- package/skills-library/integrations/ghost-creator-monetization-pattern.md +454 -0
- package/skills-library/integrations/headless-cms-architecture.md +484 -0
- package/skills-library/integrations/headless-cms-stack-selection.md +183 -0
- package/skills-library/integrations/payload-cms-patterns.md +674 -0
- package/skills-library/integrations/realtimestt-openwakeword-cuda-windows.md +229 -0
- package/skills-library/integrations/rss-podcast-integration.md +300 -0
- package/skills-library/integrations/wordpress/WORDPRESS_LEARNDASH_DATA_RECOVERY.md +53 -0
- package/skills-library/integrations/youtube/YOUTUBE_API_SETUP.md +141 -0
- package/skills-library/integrations/youtube/YOUTUBE_BOOKMARKING_EXPLANATION.md +252 -0
- package/skills-library/integrations/youtube/YOUTUBE_BOOKMARKING_SOLUTION.md +268 -0
- package/skills-library/integrations/youtube/YOUTUBE_OAUTH_SETUP_GUIDE.md +200 -0
- package/skills-library/integrations/youtube/YOUTUBE_VIDEO_FIX_COMPLETE.md +192 -0
- package/skills-library/marketing/campaign-analytics.md +97 -0
- package/skills-library/marketing/content-creator.md +105 -0
- package/skills-library/marketing/marketing-strategy-pmm.md +94 -0
- package/skills-library/marketing/social-media-analyzer.md +81 -0
- package/skills-library/methodology/ADVANCED_ORCHESTRATION_PATTERNS.md +401 -0
- package/skills-library/methodology/AGENT_SELF_IMPROVEMENT_LOOP.md +179 -0
- package/skills-library/methodology/BREATH_BASED_PARALLEL_EXECUTION.md +1 -1
- package/skills-library/methodology/CLEANSING_CYCLE.md +358 -0
- package/skills-library/methodology/CONFIDENCE_ANNOTATION_PATTERN.md +143 -0
- package/skills-library/methodology/CRITICAL_PATTERNS_DOCUMENTATION_COMPLETE.md +204 -0
- package/skills-library/methodology/DELIVERABLES_SUMMARY.md +341 -0
- package/skills-library/methodology/DIFFICULTY_AWARE_AGENT_ROUTING.md +252 -0
- package/skills-library/methodology/EVOLUTIONARY_SKILL_SYNTHESIS.md +219 -0
- package/skills-library/methodology/GLOMERULUS_DECISION_GATE.md +223 -0
- package/skills-library/methodology/HIBERNATION_SYSTEM.md +231 -0
- package/skills-library/methodology/INSTRUMENTATION_OVER_RESTRICTION.md +192 -0
- package/skills-library/methodology/MASTER_COMPLETION_SUMMARY.md +444 -0
- package/skills-library/methodology/MASTER_SESSION_COMPLETION.md +743 -0
- package/skills-library/methodology/MERN_QUICK_REFERENCE.md +358 -0
- package/skills-library/methodology/ORGAN_AGENT_MAPPING.md +177 -0
- package/skills-library/methodology/PARALLEL_WAVE_BASED_REFACTORING.md +440 -0
- package/skills-library/methodology/QUICK_REFERENCE.md +358 -0
- package/skills-library/methodology/SDFT_ONPOLICY_SELF_DISTILLATION.md +186 -0
- package/skills-library/methodology/SELF_QUESTIONING_TASK_GENERATION.md +270 -0
- package/skills-library/methodology/SESSION_COMPLETION_SUMMARY.md +304 -0
- package/skills-library/methodology/SESSION_SUMMARY.md +432 -0
- package/skills-library/methodology/WARRIOR_WORKFLOW_DEBUGGING_PROTOCOL.md +252 -0
- package/skills-library/methodology/tech-debt-tracker.md +570 -0
- package/skills-library/parallel-debug/SKILL.md +60 -0
- package/skills-library/patterns-standards/API_PATTERN_FIX_SUMMARY.md +236 -0
- package/skills-library/patterns-standards/BATCH_OPERATIONS_WITH_PROGRESS_MODAL.md +362 -0
- package/skills-library/patterns-standards/CRITICAL_CODING_PATTERNS.md +639 -0
- package/skills-library/patterns-standards/DARK_MODE_MODAL_VISIBILITY.md +258 -0
- package/skills-library/patterns-standards/ERROR_RESILIENCE_IMPLEMENTATION.md +375 -0
- package/skills-library/patterns-standards/ES_MODULE_IMPORT_HOISTING_DOTENV.md +298 -0
- package/skills-library/patterns-standards/NESTED_BACKDROP_FILTER_CSS_ARTIFACT_FIX.md +76 -0
- package/skills-library/patterns-standards/ORDERED_DETECTOR_PIPELINE_GRACEFUL_FALLBACK.md +333 -0
- package/skills-library/patterns-standards/PHASE_IMPORT_ERROR_DEBUGGING.md +271 -0
- package/skills-library/patterns-standards/PYNPUT_GLOBAL_HOTKEY_VK_MATCHING.md +252 -0
- package/skills-library/patterns-standards/REACT_USEEFFECT_CASCADE_RESET_FIX.md +132 -0
- package/skills-library/patterns-standards/SUBMENU_HOVER_DROPDOWN_PATTERN.md +225 -0
- package/skills-library/patterns-standards/TAILWIND_TEXT_VISIBILITY_OVERRIDE.md +322 -0
- package/skills-library/patterns-standards/THEME_AWARE_CSS_VARIABLES_PATTERN.md +209 -0
- package/skills-library/patterns-standards/THEME_USER_OBJECT_PROPERTY_NAMING.md +194 -0
- package/skills-library/patterns-standards/TOOLTIP_BLOCKING_CLICKS_FIX.md +267 -0
- package/skills-library/patterns-standards/claude-code-plugin-structure.md +235 -0
- package/skills-library/patterns-standards/react-i18next-setup.md +429 -0
- package/skills-library/patterns-standards/thesys-c1-generative-ui-integration.md +967 -0
- package/skills-library/plugin-development/CLAUDE_CODE_COMMAND_REGISTRATION_SILENT_FAILURE.md +315 -0
- package/skills-library/plugin-development/plugin-command-namespace-vs-global.md +390 -0
- package/skills-library/plugin-development/plugin-doc-auto-generation.md +172 -0
- package/skills-library/security/GITHUB_REPO_SECURITY_AUDIT.md +115 -0
- package/skills-library/security/admin-deletion-safety.md +396 -0
- package/skills-library/security/application-vuln-patterns.md +477 -0
- package/skills-library/security/env-secrets-manager.md +686 -0
- package/skills-library/security/secure-ai-application-templates.md +347 -0
- package/skills-library/security/sql-injection-prevention-postgresjs.md +151 -0
- package/skills-library/supabase-connection-pooler-fix.md +102 -0
- package/skills-library/system-context/POWERSHELL_BASH_INTEROP.md +82 -0
- package/skills-library/system-context/SERVICE_LIFECYCLE_MANAGEMENT.md +119 -0
- package/skills-library/system-context/SKILL.md +40 -0
- package/skills-library/system-context/WINDOWS_DEV_ENVIRONMENT.md +73 -0
- package/skills-library/testing/E2E_PLAYWRIGHT_PATTERNS.md +99 -0
- package/skills-library/testing/INTEGRATION_TEST_STRATEGY.md +82 -0
- package/skills-library/testing/RED_GREEN_BUGFIX_GATE.md +203 -0
- package/skills-library/testing/TEST_DATA_MANAGEMENT.md +69 -0
- package/skills-library/testing/VITEST_UNIT_TEST_PATTERNS.md +75 -0
- package/skills-library/testing/playwright-api-security-tests.md +202 -0
- package/skills-library/toolbox/SKILL.md +84 -0
- package/skills-library/toolbox/code-graph-and-web-scraping-mcps.md +237 -0
- package/skills-library/ui-ux-pro-max/ACCESSIBILITY_ESSENTIALS.md +115 -0
- package/skills-library/ui-ux-pro-max/DESIGN_SYSTEM_SCAFFOLDING.md +133 -0
- package/skills-library/ui-ux-pro-max/RESPONSIVE_LAYOUT_PATTERNS.md +119 -0
- package/skills-library/ui-ux-pro-max/SKILL.md +386 -0
- package/skills-library/ui-ux-pro-max/data/charts.csv +26 -0
- package/skills-library/ui-ux-pro-max/data/colors.csv +97 -0
- package/skills-library/ui-ux-pro-max/data/icons.csv +101 -0
- package/skills-library/ui-ux-pro-max/data/landing.csv +31 -0
- package/skills-library/ui-ux-pro-max/data/products.csv +97 -0
- package/skills-library/ui-ux-pro-max/data/react-performance.csv +45 -0
- package/skills-library/ui-ux-pro-max/data/stacks/astro.csv +54 -0
- package/skills-library/ui-ux-pro-max/data/stacks/flutter.csv +53 -0
- package/skills-library/ui-ux-pro-max/data/stacks/html-tailwind.csv +56 -0
- package/skills-library/ui-ux-pro-max/data/stacks/jetpack-compose.csv +53 -0
- package/skills-library/ui-ux-pro-max/data/stacks/nextjs.csv +53 -0
- package/skills-library/ui-ux-pro-max/data/stacks/nuxt-ui.csv +51 -0
- package/skills-library/ui-ux-pro-max/data/stacks/nuxtjs.csv +59 -0
- package/skills-library/ui-ux-pro-max/data/stacks/react-native.csv +52 -0
- package/skills-library/ui-ux-pro-max/data/stacks/react.csv +54 -0
- package/skills-library/ui-ux-pro-max/data/stacks/shadcn.csv +61 -0
- package/skills-library/ui-ux-pro-max/data/stacks/svelte.csv +54 -0
- package/skills-library/ui-ux-pro-max/data/stacks/swiftui.csv +51 -0
- package/skills-library/ui-ux-pro-max/data/stacks/vue.csv +50 -0
- package/skills-library/ui-ux-pro-max/data/styles.csv +68 -0
- package/skills-library/ui-ux-pro-max/data/typography.csv +58 -0
- package/skills-library/ui-ux-pro-max/data/ui-reasoning.csv +101 -0
- package/skills-library/ui-ux-pro-max/data/ux-guidelines.csv +100 -0
- package/skills-library/ui-ux-pro-max/data/web-interface.csv +31 -0
- package/skills-library/wordpress-style-theme-components.md +1526 -0
- package/templates/ASSUMPTIONS.md +1 -1
- package/templates/DECISION_LOG.md +0 -1
- package/templates/phase-prompt.md +1 -1
- package/templates/phoenix-comparison.md +6 -6
- package/templates/skill-api-integration.md +106 -0
- package/templates/skill-architecture-pattern.md +92 -0
- package/templates/skill-debug-pattern.md +98 -0
- package/templates/skill-devops-recipe.md +107 -0
- package/templates/skill-general.md +65 -0
- package/templates/skill-ui-component.md +113 -0
- package/tools/uat-runner.py +179 -0
- package/version.json +7 -3
- package/workflows/handoff-session.md +2 -2
- package/workflows/new-project.md +2 -2
- package/workflows/plan-phase.md +1 -1
- package/.claude-plugin/plugin.json +0 -64
- package/skills-library/_general/methodology/LIVE_BREADCRUMB_PROTOCOL.md +0 -242
- package/skills-library/_general/methodology/llm-judge-memory-crud.md +0 -241
- package/skills-library/methodology/REFLEXION_MEMORY_PATTERN.md +0 -183
- package/skills-library/methodology/RESEARCH_BACKED_WORKFLOW_UPGRADE.md +0 -263
- package/skills-library/methodology/SABBATH_REST_PATTERN.md +0 -267
- package/skills-library/methodology/STONE_AND_SCAFFOLD.md +0 -220
- package/skills-library/specialists/api-architecture/api-designer.md +0 -49
- package/skills-library/specialists/api-architecture/graphql-architect.md +0 -49
- package/skills-library/specialists/api-architecture/mcp-developer.md +0 -51
- package/skills-library/specialists/api-architecture/microservices-architect.md +0 -50
- package/skills-library/specialists/api-architecture/websocket-engineer.md +0 -48
- package/skills-library/specialists/backend/django-expert.md +0 -52
- package/skills-library/specialists/backend/fastapi-expert.md +0 -52
- package/skills-library/specialists/backend/laravel-specialist.md +0 -52
- package/skills-library/specialists/backend/nestjs-expert.md +0 -51
- package/skills-library/specialists/backend/rails-expert.md +0 -53
- package/skills-library/specialists/backend/spring-boot-engineer.md +0 -56
- package/skills-library/specialists/data-ml/fine-tuning-expert.md +0 -48
- package/skills-library/specialists/data-ml/ml-pipeline.md +0 -47
- package/skills-library/specialists/data-ml/pandas-pro.md +0 -47
- package/skills-library/specialists/data-ml/rag-architect.md +0 -51
- package/skills-library/specialists/data-ml/spark-engineer.md +0 -47
- package/skills-library/specialists/frontend/angular-architect.md +0 -52
- package/skills-library/specialists/frontend/flutter-expert.md +0 -51
- package/skills-library/specialists/frontend/nextjs-developer.md +0 -54
- package/skills-library/specialists/frontend/react-native-expert.md +0 -50
- package/skills-library/specialists/frontend/vue-expert.md +0 -51
- package/skills-library/specialists/infrastructure/chaos-engineer.md +0 -74
- package/skills-library/specialists/infrastructure/cloud-architect.md +0 -70
- package/skills-library/specialists/infrastructure/database-optimizer.md +0 -64
- package/skills-library/specialists/infrastructure/devops-engineer.md +0 -70
- package/skills-library/specialists/infrastructure/kubernetes-specialist.md +0 -52
- package/skills-library/specialists/infrastructure/monitoring-expert.md +0 -70
- package/skills-library/specialists/infrastructure/sre-engineer.md +0 -70
- package/skills-library/specialists/infrastructure/terraform-engineer.md +0 -51
- package/skills-library/specialists/languages/cpp-pro.md +0 -74
- package/skills-library/specialists/languages/csharp-developer.md +0 -69
- package/skills-library/specialists/languages/dotnet-core-expert.md +0 -54
- package/skills-library/specialists/languages/golang-pro.md +0 -51
- package/skills-library/specialists/languages/java-architect.md +0 -49
- package/skills-library/specialists/languages/javascript-pro.md +0 -68
- package/skills-library/specialists/languages/kotlin-specialist.md +0 -68
- package/skills-library/specialists/languages/php-pro.md +0 -49
- package/skills-library/specialists/languages/python-pro.md +0 -52
- package/skills-library/specialists/languages/react-expert.md +0 -51
- package/skills-library/specialists/languages/rust-engineer.md +0 -50
- package/skills-library/specialists/languages/sql-pro.md +0 -56
- package/skills-library/specialists/languages/swift-expert.md +0 -69
- package/skills-library/specialists/languages/typescript-pro.md +0 -51
- package/skills-library/specialists/platform/atlassian-mcp.md +0 -52
- package/skills-library/specialists/platform/embedded-systems.md +0 -53
- package/skills-library/specialists/platform/game-developer.md +0 -53
- package/skills-library/specialists/platform/salesforce-developer.md +0 -53
- package/skills-library/specialists/platform/shopify-expert.md +0 -49
- package/skills-library/specialists/platform/wordpress-pro.md +0 -49
- package/skills-library/specialists/quality/code-documenter.md +0 -51
- package/skills-library/specialists/quality/code-reviewer.md +0 -67
- package/skills-library/specialists/quality/debugging-wizard.md +0 -51
- package/skills-library/specialists/quality/fullstack-guardian.md +0 -51
- package/skills-library/specialists/quality/legacy-modernizer.md +0 -50
- package/skills-library/specialists/quality/playwright-expert.md +0 -65
- package/skills-library/specialists/quality/spec-miner.md +0 -56
- package/skills-library/specialists/quality/test-master.md +0 -65
- package/skills-library/specialists/security/secure-code-guardian.md +0 -55
- package/skills-library/specialists/security/security-reviewer.md +0 -53
- package/skills-library/specialists/workflow/architecture-designer.md +0 -53
- package/skills-library/specialists/workflow/cli-developer.md +0 -70
- package/skills-library/specialists/workflow/feature-forge.md +0 -65
- package/skills-library/specialists/workflow/prompt-engineer.md +0 -54
- package/skills-library/specialists/workflow/the-fool.md +0 -62
- /package/skills-library/{performance → _general/performance}/cache-augmented-generation.md +0 -0
- /package/skills-library/{debugging → parallel-debug}/FAILURE_TAXONOMY_CLASSIFICATION.md +0 -0
- /package/skills-library/{debugging → parallel-debug}/THREE_AGENT_HYPOTHESIS_DEBUGGING.md +0 -0
|
@@ -0,0 +1,2184 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: doc-to-podcast-pipeline
|
|
3
|
+
category: creative-multimedia
|
|
4
|
+
version: 1.0.0
|
|
5
|
+
contributed: 2026-03-10
|
|
6
|
+
contributor: dominion-flow-research
|
|
7
|
+
last_updated: 2026-03-10
|
|
8
|
+
tags: [podcast, pipeline, document-to-audio, notebooklm, architecture, end-to-end, ffmpeg, tts, rag]
|
|
9
|
+
difficulty: hard
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
# Document-to-Podcast Pipeline
|
|
13
|
+
## Description
|
|
14
|
+
|
|
15
|
+
End-to-end pipeline that transforms any document (PDF, DOCX, URL, YouTube transcript) into a
|
|
16
|
+
polished podcast episode with multiple speakers, natural conversation flow, intro/outro music,
|
|
17
|
+
and broadcast-quality loudness normalization. Combines RAG-based content understanding, multi-agent
|
|
18
|
+
script generation (PodAgent pattern), neural TTS synthesis, and FFmpeg audio composition into a
|
|
19
|
+
single automated workflow.
|
|
20
|
+
|
|
21
|
+
This is the "full stack" audio generation skill -- it orchestrates capabilities from several
|
|
22
|
+
sibling skills (transcription-pipeline-selector, ffmpeg-command-generator, audio-enhancement-pipeline,
|
|
23
|
+
content-repurposing-pipeline) into one cohesive pipeline.
|
|
24
|
+
|
|
25
|
+
## When to Use
|
|
26
|
+
|
|
27
|
+
- Transforming written documents (PDFs, articles, papers) into listenable podcast episodes
|
|
28
|
+
- Building a NotebookLM-style "Audio Overview" feature for your application
|
|
29
|
+
- Converting sermons, Bible studies, or teaching notes into podcast format
|
|
30
|
+
- Creating educational audio content from textbooks or course materials
|
|
31
|
+
- Automating podcast production from blog posts or newsletters
|
|
32
|
+
- Building an internal tool that generates audio briefings from reports
|
|
33
|
+
|
|
34
|
+
## Related Skills
|
|
35
|
+
|
|
36
|
+
- `transcription-pipeline-selector.md` -- Input stage: transcribe audio/video sources before processing
|
|
37
|
+
- `ffmpeg-command-generator.md` -- Output stage: all FFmpeg commands for audio composition
|
|
38
|
+
- `audio-enhancement-pipeline.md` -- Post-production: loudness normalization, noise reduction
|
|
39
|
+
- `content-repurposing-pipeline.md` -- Broader pipeline: podcast is one output format among many
|
|
40
|
+
- `podcast-script-generation.md` -- Stage 3 deep-dive: PodAgent multi-agent script writing
|
|
41
|
+
|
|
42
|
+
---
|
|
43
|
+
|
|
44
|
+
## Architecture Overview
|
|
45
|
+
|
|
46
|
+
### The 4-Stage Pipeline
|
|
47
|
+
|
|
48
|
+
```
|
|
49
|
+
Stage 1: INGEST --> Parse documents, extract text, chunk semantically
|
|
50
|
+
Stage 2: UNDERSTAND --> RAG retrieval, key point extraction, outline generation
|
|
51
|
+
Stage 3: SCRIPT --> Multi-agent podcast script generation (PodAgent pattern)
|
|
52
|
+
Stage 4: SYNTHESIZE --> TTS audio generation, mixing, post-production
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
### Full Architecture Diagram
|
|
56
|
+
|
|
57
|
+
```
|
|
58
|
+
+--------------------------------------------------+
|
|
59
|
+
| DOCUMENT SOURCES |
|
|
60
|
+
| [PDF] [DOCX] [URL] [YouTube] [Audio/Video] |
|
|
61
|
+
+--------------------------------------------------+
|
|
62
|
+
|
|
|
63
|
+
v
|
|
64
|
+
+--------------------------------------------------+
|
|
65
|
+
| STAGE 1: INGEST |
|
|
66
|
+
| |
|
|
67
|
+
| Document Parser (pdf-parse / mammoth / cheerio) |
|
|
68
|
+
| | |
|
|
69
|
+
| v |
|
|
70
|
+
| [Clean Text + Metadata] |
|
|
71
|
+
| | |
|
|
72
|
+
| v |
|
|
73
|
+
| Semantic Chunker (400-600 tokens, 50 overlap) |
|
|
74
|
+
| | |
|
|
75
|
+
| v |
|
|
76
|
+
| [Chunks + Embeddings --> Vector DB] |
|
|
77
|
+
+--------------------------------------------------+
|
|
78
|
+
|
|
|
79
|
+
v
|
|
80
|
+
+--------------------------------------------------+
|
|
81
|
+
| STAGE 2: UNDERSTAND |
|
|
82
|
+
| |
|
|
83
|
+
| Key Point Extractor (AI) |
|
|
84
|
+
| | |
|
|
85
|
+
| v |
|
|
86
|
+
| [Ranked Discussion Points] |
|
|
87
|
+
| | |
|
|
88
|
+
| v |
|
|
89
|
+
| Outline Generator (AI) |
|
|
90
|
+
| | |
|
|
91
|
+
| v |
|
|
92
|
+
| [Podcast Outline: Intro -> Segments -> Outro] |
|
|
93
|
+
+--------------------------------------------------+
|
|
94
|
+
|
|
|
95
|
+
v
|
|
96
|
+
+--------------------------------------------------+
|
|
97
|
+
| STAGE 3: SCRIPT |
|
|
98
|
+
| |
|
|
99
|
+
| Multi-Agent Script Writer (PodAgent pattern) |
|
|
100
|
+
| - Host Agent: drives conversation |
|
|
101
|
+
| - Guest Agent: provides expert responses |
|
|
102
|
+
| - Writer Agent: structures + verifies |
|
|
103
|
+
| | |
|
|
104
|
+
| v |
|
|
105
|
+
| [Structured Script JSON] |
|
|
106
|
+
| { speaker, text, emotion, duration }[] |
|
|
107
|
+
+--------------------------------------------------+
|
|
108
|
+
|
|
|
109
|
+
v
|
|
110
|
+
+--------------------------------------------------+
|
|
111
|
+
| STAGE 4: SYNTHESIZE |
|
|
112
|
+
| |
|
|
113
|
+
| TTS Engine (per segment, per speaker) |
|
|
114
|
+
| | |
|
|
115
|
+
| v |
|
|
116
|
+
| [Audio Segments WAV] |
|
|
117
|
+
| | |
|
|
118
|
+
| v |
|
|
119
|
+
| FFmpeg Composer |
|
|
120
|
+
| - Concatenate segments |
|
|
121
|
+
| - Insert pauses (200-500ms) |
|
|
122
|
+
| - Add intro/outro music |
|
|
123
|
+
| - Normalize loudness (EBU R128, -16 LUFS) |
|
|
124
|
+
| | |
|
|
125
|
+
| v |
|
|
126
|
+
| [Final Podcast MP3 + ID3 Metadata] |
|
|
127
|
+
+--------------------------------------------------+
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
---
|
|
131
|
+
|
|
132
|
+
## Stage 1: Document Ingestion
|
|
133
|
+
|
|
134
|
+
The ingestion stage accepts multiple document formats and produces clean, chunked text ready
|
|
135
|
+
for AI understanding. Each parser extracts both text content and structural metadata (titles,
|
|
136
|
+
headings, page numbers) to preserve document context.
|
|
137
|
+
|
|
138
|
+
### Dependencies
|
|
139
|
+
|
|
140
|
+
```json
|
|
141
|
+
{
|
|
142
|
+
"dependencies": {
|
|
143
|
+
"pdf-parse": "^1.1.1",
|
|
144
|
+
"mammoth": "^1.8.0",
|
|
145
|
+
"@mozilla/readability": "^0.5.0",
|
|
146
|
+
"cheerio": "^1.0.0",
|
|
147
|
+
"linkedom": "^0.18.0",
|
|
148
|
+
"youtube-transcript": "^1.2.1"
|
|
149
|
+
}
|
|
150
|
+
}
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
### PDF Parser
|
|
154
|
+
|
|
155
|
+
```typescript
|
|
156
|
+
import pdfParse from 'pdf-parse';
|
|
157
|
+
import { readFile } from 'fs/promises';
|
|
158
|
+
|
|
159
|
+
/**
 * Normalized result produced by every ingestion parser
 * (PDF, DOCX, URL, YouTube, direct text).
 */
interface ParsedDocument {
  /** Full extracted text of the source. */
  text: string;

  /** Descriptive information about where the text came from. */
  metadata: {
    title: string;
    author: string;
    /** File path, URL, or another identifier of the origin. */
    source: string;
    sourceType: 'pdf' | 'docx' | 'url' | 'youtube' | 'transcript';
    /** Only populated by parsers that know the page count. */
    pageCount?: number;
    wordCount: number;
    /** ISO-8601 timestamp of when extraction ran. */
    extractedAt: string;
  };

  /** Text split under headings; `page` is optional per section. */
  sections: Array<{ heading: string; content: string; page?: number }>;
}
|
|
172
|
+
|
|
173
|
+
/**
 * Parse a PDF file into a ParsedDocument.
 *
 * The flat text returned by pdf-parse is split into sections with a heading
 * heuristic: a short line that does not end in a period and is either
 * ALL CAPS or Title Case starts a new section.
 *
 * @param filePath Path of the PDF on disk.
 * @returns The extracted text, metadata, and heading-delimited sections.
 * @throws Whatever `readFile` or `pdfParse` throw (missing file, corrupt PDF).
 */
async function parsePDF(filePath: string): Promise<ParsedDocument> {
  const buffer = await readFile(filePath);
  const data = await pdfParse(buffer);

  // Small words that may stay lowercase inside a Title Case heading.
  const connectors = new Set([
    'a', 'an', 'and', 'as', 'at', 'but', 'by', 'for', 'in',
    'of', 'on', 'or', 'the', 'to', 'vs', 'with',
  ]);

  const isLikelyHeading = (line: string): boolean => {
    if (line.length === 0 || line.length >= 100 || line.endsWith('.')) return false;
    // Lines without any letter (page numbers, rules, "3.2") equal their own
    // upper-case form but are not headings.
    if (!/\p{L}/u.test(line)) return false;
    if (line === line.toUpperCase()) return true;
    // Title Case: first word capitalized, every other word capitalized,
    // numeric, or a connector. A plain sentence-case line that merely starts
    // with a capital letter (a wrapped body line) no longer qualifies.
    const words = line.split(/\s+/);
    return (
      /^\p{Lu}\p{Ll}/u.test(words[0]) &&
      words.every((w) => /^[\p{Lu}\d]/u.test(w) || connectors.has(w.toLowerCase()))
    );
  };

  const sections: ParsedDocument['sections'] = [];
  // NOTE: `page` is never advanced below, so every section reports page 1.
  let currentSection = { heading: 'Introduction', content: '', page: 1 };

  for (const line of data.text.split('\n')) {
    const trimmed = line.trim();
    if (isLikelyHeading(trimmed)) {
      if (currentSection.content.trim()) {
        sections.push({ ...currentSection });
      }
      currentSection = { heading: trimmed, content: '', page: currentSection.page };
    } else {
      currentSection.content += trimmed + ' ';
    }
  }
  if (currentSection.content.trim()) {
    sections.push(currentSection);
  }

  // Accept both "/" and "\" separators and strip only a trailing extension.
  const fileName = filePath.split(/[\\/]/).pop()?.replace(/\.pdf$/i, '');

  return {
    text: data.text,
    metadata: {
      title: data.info?.Title || fileName || 'Untitled',
      author: data.info?.Author || 'Unknown',
      source: filePath,
      sourceType: 'pdf',
      pageCount: data.numpages,
      // filter(Boolean) drops the empty tokens produced by empty or
      // whitespace-padded text, which would otherwise inflate the count.
      wordCount: data.text.split(/\s+/).filter(Boolean).length,
      extractedAt: new Date().toISOString(),
    },
    sections,
  };
}
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
### DOCX Parser
|
|
220
|
+
|
|
221
|
+
```typescript
|
|
222
|
+
import mammoth from 'mammoth';
|
|
223
|
+
import { readFile } from 'fs/promises';
|
|
224
|
+
|
|
225
|
+
/**
 * Parse a DOCX file into a ParsedDocument.
 *
 * mammoth is run twice on the same buffer: once for the raw text and once
 * for HTML, from which the heading structure is recovered.
 *
 * @param filePath Path of the .docx file on disk.
 * @returns The extracted text, metadata, and heading-delimited sections.
 * @throws Whatever `readFile` or mammoth throw (missing file, invalid DOCX).
 */
async function parseDOCX(filePath: string): Promise<ParsedDocument> {
  const buffer = await readFile(filePath);

  const rawResult = await mammoth.extractRawText({ buffer });
  const text = rawResult.value;

  // The HTML conversion keeps <h1>-<h4> tags that the raw text loses.
  const htmlResult = await mammoth.convertToHtml({ buffer });
  const sections = extractSectionsFromHtml(htmlResult.value);

  // Accept both "/" and "\" separators and strip only a trailing extension.
  const fileName = filePath.split(/[\\/]/).pop()?.replace(/\.docx$/i, '');

  return {
    text,
    metadata: {
      title: fileName || 'Untitled',
      author: 'Unknown',
      source: filePath,
      sourceType: 'docx',
      // filter(Boolean) keeps empty or whitespace-padded text from counting
      // phantom words.
      wordCount: text.split(/\s+/).filter(Boolean).length,
      extractedAt: new Date().toISOString(),
    },
    sections,
  };
}
|
|
247
|
+
|
|
248
|
+
/**
 * Split an HTML string into sections at <h1>-<h4> headings.
 *
 * Text before the first heading is filed under "Introduction". A heading
 * with no text before the next heading produces no section.
 *
 * @param html HTML markup to split.
 * @returns Sections in document order, as plain text.
 */
function extractSectionsFromHtml(html: string): ParsedDocument['sections'] {
  // [\s\S] lets a heading span line breaks; \1 forces the closing tag to
  // match the opening level.
  const headingPattern = /<h([1-4])\b[^>]*>([\s\S]*?)<\/h\1\s*>/gi;

  const namedEntities: Record<string, string> = {
    amp: '&', lt: '<', gt: '>', quot: '"', apos: "'", nbsp: ' ',
  };

  const toPlainText = (fragment: string): string =>
    fragment
      // Block boundaries become a space so adjacent paragraphs do not fuse
      // ("<p>One</p><p>Two</p>" -> "One Two", not "OneTwo").
      .replace(/<\/(?:p|li|tr|td|th|div|h[1-6]|ul|ol|table|blockquote)\s*>|<br\s*\/?>/gi, ' ')
      // Inline tags are dropped without a space so words are not split.
      .replace(/<[^>]*>/g, '')
      // Decode after stripping, in a single pass, so "&amp;lt;" is not
      // decoded twice.
      .replace(/&(#x[0-9a-f]+|#\d+|amp|lt|gt|quot|apos|nbsp);/gi, (whole, ent: string) => {
        const key = ent.toLowerCase();
        if (key[0] !== '#') return namedEntities[key];
        const code = key[1] === 'x' ? parseInt(key.slice(2), 16) : parseInt(key.slice(1), 10);
        return code > 0 && code <= 0x10ffff ? String.fromCodePoint(code) : whole;
      })
      .replace(/\s+/g, ' ')
      .trim();

  const sections: ParsedDocument['sections'] = [];
  let lastIndex = 0;
  let lastHeading = 'Introduction';
  let match: RegExpExecArray | null;

  while ((match = headingPattern.exec(html)) !== null) {
    const content = toPlainText(html.slice(lastIndex, match.index));
    if (content) {
      sections.push({ heading: lastHeading, content });
    }
    lastHeading = toPlainText(match[2]);
    lastIndex = match.index + match[0].length;
  }

  // Remaining content after the last heading.
  const remaining = toPlainText(html.slice(lastIndex));
  if (remaining) {
    sections.push({ heading: lastHeading, content: remaining });
  }

  return sections;
}
|
|
273
|
+
```
|
|
274
|
+
|
|
275
|
+
### URL Parser (Web Articles)
|
|
276
|
+
|
|
277
|
+
```typescript
|
|
278
|
+
import { Readability } from '@mozilla/readability';
|
|
279
|
+
import { parseHTML } from 'linkedom';
|
|
280
|
+
|
|
281
|
+
/**
 * Fetch a web page and extract its readable article content.
 *
 * @param url Address of the article.
 * @returns A ParsedDocument with a single section holding the article text.
 * @throws Error when the HTTP request fails, the server answers with a
 *         non-2xx status, or Readability cannot find article content.
 */
async function parseURL(url: string): Promise<ParsedDocument> {
  const response = await fetch(url);
  // Without this check a 404/500 error page would be parsed and narrated as
  // if it were the article.
  if (!response.ok) {
    throw new Error(`Failed to fetch ${url}: HTTP ${response.status} ${response.statusText}`);
  }
  const html = await response.text();

  // linkedom provides a DOM-like environment for Readability.
  const { document } = parseHTML(html);
  const article = new Readability(document as any).parse();

  if (!article) {
    throw new Error(`Could not extract readable content from ${url}`);
  }

  const text = article.textContent || '';

  return {
    text,
    metadata: {
      title: article.title || url,
      author: article.byline || 'Unknown',
      source: url,
      sourceType: 'url',
      // filter(Boolean) keeps empty or whitespace-padded text from counting
      // phantom words.
      wordCount: text.split(/\s+/).filter(Boolean).length,
      extractedAt: new Date().toISOString(),
    },
    sections: [{ heading: article.title || 'Article', content: text }],
  };
}
|
|
309
|
+
```
|
|
310
|
+
|
|
311
|
+
### YouTube Transcript Parser
|
|
312
|
+
|
|
313
|
+
```typescript
|
|
314
|
+
import { YoutubeTranscript } from 'youtube-transcript';
|
|
315
|
+
|
|
316
|
+
/**
 * Fetch a YouTube transcript and group it into roughly two-minute sections.
 *
 * @param videoUrl A YouTube URL accepted by `extractVideoId`.
 * @returns A ParsedDocument whose sections are consecutive transcript slices.
 * @throws Error from `extractVideoId` for unrecognized URLs, or whatever
 *         `YoutubeTranscript.fetchTranscript` throws.
 */
async function parseYouTube(videoUrl: string): Promise<ParsedDocument> {
  const videoId = extractVideoId(videoUrl);
  const transcript = await YoutubeTranscript.fetchTranscript(videoId);

  const text = transcript.map((entry) => entry.text).join(' ');

  // assumes entry.duration / entry.offset are in seconds, as the 120
  // threshold and formatTime() imply - TODO confirm against the installed
  // youtube-transcript version.
  const SEGMENT_SECONDS = 120;

  const sections: ParsedDocument['sections'] = [];
  let currentSection = { heading: 'Opening', content: '' };
  let segmentDuration = 0;
  let segmentIndex = 1;

  for (const entry of transcript) {
    // Label a segment with the offset of its own first entry, not the offset
    // of the entry that closed the previous segment.
    if (segmentIndex > 1 && currentSection.content === '') {
      currentSection.heading = `Segment ${segmentIndex} (${formatTime(entry.offset)})`;
    }

    currentSection.content += entry.text + ' ';
    segmentDuration += entry.duration;

    if (segmentDuration >= SEGMENT_SECONDS) {
      sections.push({ ...currentSection });
      segmentIndex++;
      // The heading is filled in when the next entry arrives.
      currentSection = { heading: `Segment ${segmentIndex}`, content: '' };
      segmentDuration = 0;
    }
  }
  if (currentSection.content.trim()) {
    sections.push(currentSection);
  }

  return {
    text,
    metadata: {
      title: `YouTube: ${videoId}`,
      author: 'Unknown',
      source: videoUrl,
      sourceType: 'youtube',
      // filter(Boolean) keeps an empty transcript from counting as one word.
      wordCount: text.split(/\s+/).filter(Boolean).length,
      extractedAt: new Date().toISOString(),
    },
    sections,
  };
}
|
|
360
|
+
|
|
361
|
+
/**
 * Extract the 11-character video ID from a YouTube URL.
 *
 * Recognized forms: `youtube.com/watch?v=ID` (with `v` anywhere in the
 * query string), `/embed/ID`, `/shorts/ID`, `/live/ID`, `/v/ID`, the
 * `youtube-nocookie.com` host, and `youtu.be/ID`.
 *
 * @param url URL to inspect.
 * @returns The video ID.
 * @throws Error when no video ID can be found.
 */
function extractVideoId(url: string): string {
  const match = url.match(
    // (?:[^#]*&)? lets other query parameters precede v=, e.g.
    // watch?feature=share&v=ID.
    /(?:youtube(?:-nocookie)?\.com\/(?:watch\?(?:[^#]*&)?v=|embed\/|shorts\/|live\/|v\/)|youtu\.be\/)([a-zA-Z0-9_-]{11})/
  );
  if (!match) throw new Error(`Invalid YouTube URL: ${url}`);
  return match[1];
}
|
|
368
|
+
|
|
369
|
+
/**
 * Render a number of seconds as `M:SS`.
 * Minutes are not wrapped at 60, so 3725 seconds renders as "62:05".
 */
function formatTime(seconds: number): string {
  const wholeMinutes = Math.floor(seconds / 60);
  const paddedSeconds = String(Math.floor(seconds % 60)).padStart(2, '0');
  return [wholeMinutes, paddedSeconds].join(':');
}
|
|
374
|
+
```
|
|
375
|
+
|
|
376
|
+
### Unified Document Ingestor
|
|
377
|
+
|
|
378
|
+
```typescript
|
|
379
|
+
/**
 * Discriminated union of every input `ingestDocument` accepts.
 * File-based sources carry a `path`, remote sources a `url`, and direct text
 * carries its `content` inline with an optional `title`.
 */
type DocumentSource =
  | { type: 'pdf'; path: string }
  | { type: 'docx'; path: string }
  | { type: 'url'; url: string }
  | { type: 'youtube'; url: string }
  | {
      type: 'text';
      content: string;
      title?: string;
    };
|
|
385
|
+
|
|
386
|
+
/**
 * Route a document source to the matching parser.
 *
 * Direct text needs no parsing and is wrapped in a ParsedDocument here. It
 * is tagged `sourceType: 'transcript'` because ParsedDocument's union has
 * no `'text'` member.
 *
 * @param source The document to ingest.
 * @returns The parsed document.
 * @throws Whatever the selected parser throws.
 */
async function ingestDocument(source: DocumentSource): Promise<ParsedDocument> {
  switch (source.type) {
    case 'pdf':
      return parsePDF(source.path);
    case 'docx':
      return parseDOCX(source.path);
    case 'url':
      return parseURL(source.url);
    case 'youtube':
      return parseYouTube(source.url);
    case 'text':
      return {
        text: source.content,
        metadata: {
          title: source.title || 'Direct Text',
          author: 'User',
          source: 'direct-input',
          sourceType: 'transcript',
          // filter(Boolean) keeps empty or whitespace-padded content from
          // counting phantom words.
          wordCount: source.content.split(/\s+/).filter(Boolean).length,
          extractedAt: new Date().toISOString(),
        },
        sections: [{ heading: source.title || 'Content', content: source.content }],
      };
  }
}
|
|
411
|
+
```
|
|
412
|
+
|
|
413
|
+
### Semantic Chunker
|
|
414
|
+
|
|
415
|
+
Chunking strategy: 400-600 tokens per chunk with 50-token overlap. This ensures each chunk
|
|
416
|
+
has enough context for meaningful embedding while maintaining continuity across chunk boundaries.
|
|
417
|
+
|
|
418
|
+
```typescript
|
|
419
|
+
/** One slice of a document section, sized for embedding. */
interface TextChunk {
  /** Identifier of the form `chunk-<index>`. */
  id: string;
  /** The chunk's words joined by single spaces. */
  text: string;
  /** Zero-based position of the chunk across the whole document. */
  index: number;
  /** Heading of the section the chunk was cut from. */
  sectionHeading: string;
  /** Estimated token count (word count x 1.3, rounded up). */
  tokenCount: number;
  /** Embedding vector; absent until an embed function fills it in. */
  embedding?: number[];
}
|
|
427
|
+
|
|
428
|
+
/**
 * Split each section of a document into overlapping word-window chunks.
 *
 * Token sizes are converted to word counts with the rough estimate
 * 1 word ~ 1.3 tokens. Chunks never cross section boundaries, and chunks of
 * 20 characters or fewer are skipped as tiny fragments.
 *
 * @param doc           Parsed document whose `sections` are chunked.
 * @param targetTokens  Approximate chunk size in tokens (default 500).
 * @param overlapTokens Approximate overlap between consecutive chunks in
 *                      tokens (default 50). Clamped below the chunk size.
 * @returns Chunks in document order with sequential ids and indices.
 */
function semanticChunk(
  doc: ParsedDocument,
  targetTokens: number = 500,
  overlapTokens: number = 50
): TextChunk[] {
  // At least one word per chunk, and an overlap strictly smaller than the
  // chunk, so every iteration moves forward. Without the clamps an overlap
  // >= the chunk size, or a tiny targetTokens, never terminates.
  const wordsPerChunk = Math.max(1, Math.floor(targetTokens / 1.3));
  const overlapWords = Math.min(
    Math.max(0, Math.floor(overlapTokens / 1.3)),
    wordsPerChunk - 1
  );

  const chunks: TextChunk[] = [];
  let chunkIndex = 0;

  for (const section of doc.sections) {
    // filter(Boolean) drops the empty tokens that leading or trailing
    // whitespace produces.
    const words = section.content.split(/\s+/).filter(Boolean);

    let start = 0;
    while (start < words.length) {
      const end = Math.min(start + wordsPerChunk, words.length);
      const chunkText = words.slice(start, end).join(' ');

      if (chunkText.length > 20) {
        // Skip tiny fragments.
        chunks.push({
          id: `chunk-${chunkIndex}`,
          text: chunkText,
          index: chunkIndex,
          sectionHeading: section.heading,
          tokenCount: Math.ceil((end - start) * 1.3),
        });
        chunkIndex++;
      }

      // The section is exhausted once a window reaches its last word.
      if (end >= words.length) break;
      start = end - overlapWords;
    }
  }

  return chunks;
}
|
|
466
|
+
```
|
|
467
|
+
|
|
468
|
+
### Embedding Generation
|
|
469
|
+
|
|
470
|
+
Two embedding options: Gemini embedding-001 (768d, cloud) or nomic-embed-text (local via Ollama).
|
|
471
|
+
|
|
472
|
+
```typescript
|
|
473
|
+
// Option A: Gemini Embedding API
|
|
474
|
+
/**
 * Attach Gemini embeddings to chunks, in batches of 100 per request.
 *
 * The chunks are mutated in place (`embedding` is set) and the same array
 * is returned.
 *
 * @param chunks Chunks to embed.
 * @returns The input array, with `embedding` populated on every chunk.
 * @throws Error when GEMINI_API_KEY is unset, a request returns a non-2xx
 *         status, or the response lacks one embedding per requested chunk.
 */
async function embedWithGemini(chunks: TextChunk[]): Promise<TextChunk[]> {
  if (chunks.length === 0) return chunks;

  const API_KEY = process.env.GEMINI_API_KEY;
  if (!API_KEY) {
    throw new Error('GEMINI_API_KEY is not set');
  }
  const BATCH_SIZE = 100; // Gemini supports batch embedding
  const ENDPOINT =
    'https://generativelanguage.googleapis.com/v1beta/models/embedding-001:batchEmbedContents';

  for (let i = 0; i < chunks.length; i += BATCH_SIZE) {
    const batch = chunks.slice(i, i + BATCH_SIZE);
    const response = await fetch(ENDPOINT, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        // Sent as a header rather than a query parameter so the key does
        // not end up in URLs that get logged.
        'x-goog-api-key': API_KEY,
      },
      body: JSON.stringify({
        requests: batch.map((chunk) => ({
          model: 'models/embedding-001',
          content: { parts: [{ text: chunk.text }] },
          taskType: 'RETRIEVAL_DOCUMENT',
        })),
      }),
    });

    if (!response.ok) {
      const detail = await response.text().catch(() => '');
      throw new Error(
        `Gemini embedding request failed: HTTP ${response.status} ${detail.slice(0, 500)}`
      );
    }

    const data = await response.json();
    // Fail with a clear message instead of a TypeError on an unexpected
    // payload.
    if (!Array.isArray(data?.embeddings) || data.embeddings.length !== batch.length) {
      throw new Error(
        `Gemini embedding response did not contain ${batch.length} embeddings`
      );
    }
    for (let j = 0; j < batch.length; j++) {
      batch[j].embedding = data.embeddings[j].values;
    }
  }

  return chunks;
}
|
|
503
|
+
|
|
504
|
+
// Option B: Local embedding via Ollama (nomic-embed-text, 768d)
|
|
505
|
+
/**
 * Attach embeddings from a local Ollama server (model `nomic-embed-text`)
 * to chunks, one request per chunk.
 *
 * The chunks are mutated in place (`embedding` is set) and the same array
 * is returned.
 *
 * @param chunks Chunks to embed.
 * @returns The input array, with `embedding` populated on every chunk.
 * @throws Error when a request returns a non-2xx status or the response
 *         carries no embedding array.
 */
async function embedWithOllama(chunks: TextChunk[]): Promise<TextChunk[]> {
  for (const chunk of chunks) {
    const response = await fetch('http://localhost:11434/api/embeddings', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({
        model: 'nomic-embed-text',
        prompt: chunk.text,
      }),
    });

    if (!response.ok) {
      const detail = await response.text().catch(() => '');
      throw new Error(
        `Ollama embedding request failed for ${chunk.id}: HTTP ${response.status} ${detail.slice(0, 500)}`
      );
    }

    const data = await response.json();
    // Without this check a malformed response would silently leave the
    // chunk with an undefined embedding.
    if (!Array.isArray(data?.embedding)) {
      throw new Error(`Ollama embedding response for ${chunk.id} did not contain an embedding`);
    }
    chunk.embedding = data.embedding;
  }
  return chunks;
}
|
|
520
|
+
```
|
|
521
|
+
|
|
522
|
+
---
|
|
523
|
+
|
|
524
|
+
## Stage 2: Understanding
|
|
525
|
+
|
|
526
|
+
The understanding stage transforms raw chunks into a structured podcast outline.
|
|
527
|
+
It identifies the most discussion-worthy points, ranks them by importance, and
|
|
528
|
+
generates a conversational flow.
|
|
529
|
+
|
|
530
|
+
### Key Point Extraction
|
|
531
|
+
|
|
532
|
+
```typescript
|
|
533
|
+
import { GoogleGenerativeAI } from '@google/generative-ai';
|
|
534
|
+
|
|
535
|
+
/** A discussion-worthy point extracted from the document. */
interface KeyPoint {
  topic: string;
  summary: string;
  /** IDs of the chunks this point draws on. */
  relevantChunks: string[];
  /** Importance score on a 1-10 scale. */
  importance: number;
  /** How to frame the point for conversation. */
  discussionAngle: string;
}
|
|
542
|
+
|
|
543
|
+
/** Episode-level plan: the key points plus the ordered segments covering them. */
interface PodcastOutline {
  title: string;
  description: string;
  /** Requested episode length, kept as the caller's original string. */
  targetDuration: string;
  keyPoints: KeyPoint[];
  segments: PodcastSegment[];
}
|
|
550
|
+
|
|
551
|
+
/** One segment of the episode outline. */
interface PodcastSegment {
  type:
    | 'intro'
    | 'discussion'
    | 'deep-dive'
    | 'recap'
    | 'outro';
  title: string;
  /** Numeric references to the key points this segment covers. */
  keyPointRefs: number[];
  /** Estimated length in seconds. */
  estimatedDuration: number;
  notes: string;
}
|
|
558
|
+
|
|
559
|
+
/**
 * Ask Gemini for the most discussion-worthy points in a document.
 *
 * The model sees the document metadata and the first 200 characters of
 * every chunk, and must answer with a JSON array. The answer is validated
 * before use: entries without a topic are dropped, unknown chunk IDs are
 * removed, and `importance` is coerced into the 1-10 range.
 *
 * @param doc       Parsed document (metadata is used for the prompt).
 * @param chunks    Chunks of the document; their ids are offered to the model.
 * @param maxPoints Maximum number of points to return (default 8).
 * @returns Key points sorted by descending importance, at most `maxPoints`.
 * @throws Error when GEMINI_API_KEY is unset or the response contains no
 *         parseable JSON array.
 */
async function extractKeyPoints(
  doc: ParsedDocument,
  chunks: TextChunk[],
  maxPoints: number = 8
): Promise<KeyPoint[]> {
  const apiKey = process.env.GEMINI_API_KEY;
  if (!apiKey) {
    throw new Error('GEMINI_API_KEY is not set');
  }
  const genAI = new GoogleGenerativeAI(apiKey);
  const model = genAI.getGenerativeModel({ model: 'gemini-2.0-flash' });

  const chunkSummaries = chunks
    .map((c) => `[${c.id}] (Section: ${c.sectionHeading}): ${c.text.slice(0, 200)}...`)
    .join('\n');

  const prompt = `You are an expert podcast producer analyzing a document for conversion into a podcast episode.

Document Title: ${doc.metadata.title}
Author: ${doc.metadata.author}
Word Count: ${doc.metadata.wordCount}

Document chunks:
${chunkSummaries}

Identify the top ${maxPoints} most discussion-worthy points from this document.
For each point, provide:
1. A clear topic name
2. A 1-2 sentence summary
3. The chunk IDs that are most relevant (as an array)
4. An importance score (1-10)
5. A discussion angle (how would podcast hosts naturally discuss this?)

Return ONLY valid JSON in this format:
[
{
"topic": "string",
"summary": "string",
"relevantChunks": ["chunk-0", "chunk-3"],
"importance": 8,
"discussionAngle": "string"
}
]

Focus on points that:
- Would be interesting to a general audience
- Have enough depth for 2-3 minutes of discussion
- Connect to broader themes or real-world applications
- Would benefit from being explained conversationally`;

  const result = await model.generateContent(prompt);
  const text = result.response.text();

  // Extract JSON from response (handle markdown code blocks).
  const jsonMatch = text.match(/\[[\s\S]*\]/);
  if (!jsonMatch) throw new Error('Failed to extract key points JSON from AI response');

  let parsed: unknown;
  try {
    parsed = JSON.parse(jsonMatch[0]);
  } catch (err) {
    throw new Error(
      `Failed to parse key points JSON from AI response: ${(err as Error).message}`
    );
  }

  // Model output is untrusted: keep only well-formed entries and only chunk
  // IDs that really exist, so later lookups cannot hit invented IDs.
  const knownIds = new Set(chunks.map((c) => c.id));
  const keyPoints: KeyPoint[] = (parsed as any[])
    .filter((p) => p && typeof p.topic === 'string' && p.topic.trim() !== '')
    .map((p) => {
      const score = Number(p.importance);
      return {
        topic: p.topic,
        summary: typeof p.summary === 'string' ? p.summary : '',
        relevantChunks: Array.isArray(p.relevantChunks)
          ? p.relevantChunks.filter(
              (id: unknown) => typeof id === 'string' && knownIds.has(id)
            )
          : [],
        // A non-numeric score would turn the sort comparator into NaN.
        importance: Number.isFinite(score) ? Math.min(10, Math.max(1, score)) : 1,
        discussionAngle: typeof p.discussionAngle === 'string' ? p.discussionAngle : '',
      };
    });

  return keyPoints.sort((a, b) => b.importance - a.importance).slice(0, maxPoints);
}
|
|
615
|
+
```
|
|
616
|
+
|
|
617
|
+
### Outline Generation
|
|
618
|
+
|
|
619
|
+
```typescript
|
|
620
|
+
/**
 * Ask Gemini to arrange key points into an episode outline.
 *
 * Key points are listed in the prompt with zero-based numbers in brackets
 * so that the `keyPointRefs` the model returns are direct indices into
 * `keyPoints`; the prompt's JSON example (`[0, 1]`) is zero-based too.
 * Out-of-range references are dropped from the result.
 * NOTE(review): assumes downstream code indexes `outline.keyPoints` with
 * `keyPointRefs` - confirm against the script-generation stage.
 *
 * @param doc       Parsed document (title and author go into the prompt).
 * @param keyPoints Ranked key points; attached to the returned outline.
 * @param config    `format` (e.g. "deep-dive", "brief", "debate",
 *                  "narration") and `duration` (parsed by `parseDuration`).
 * @returns The outline, with `keyPoints` set to the input array.
 * @throws Error when GEMINI_API_KEY is unset, or the response contains no
 *         parseable JSON object with a `segments` array.
 */
async function generateOutline(
  doc: ParsedDocument,
  keyPoints: KeyPoint[],
  config: { format: string; duration: string }
): Promise<PodcastOutline> {
  const apiKey = process.env.GEMINI_API_KEY;
  if (!apiKey) {
    throw new Error('GEMINI_API_KEY is not set');
  }
  const genAI = new GoogleGenerativeAI(apiKey);
  const model = genAI.getGenerativeModel({ model: 'gemini-2.0-flash' });

  const durationSeconds = parseDuration(config.duration);
  const pointsSummary = keyPoints
    .map((kp, i) => `[${i}] [Importance: ${kp.importance}] ${kp.topic}: ${kp.summary}`)
    .join('\n');

  const prompt = `You are a podcast producer creating an episode outline.

Document: "${doc.metadata.title}" by ${doc.metadata.author}
Format: ${config.format}
Target Duration: ${config.duration} (${durationSeconds} seconds)

Key discussion points (ranked by importance, numbered from 0):
${pointsSummary}

Create a podcast outline with these segments:
1. INTRO (10-15% of duration): Hook the listener, introduce the topic
2. DISCUSSION segments (70-80%): Cover the key points in a logical flow
3. RECAP/OUTRO (10-15%): Summarize takeaways, closing thoughts

For a "${config.format}" format:
- "deep-dive": Thorough exploration, technical depth, expert tone
- "brief": Quick overview, highlight the top 3-4 points only
- "debate": Present contrasting viewpoints on each point
- "narration": Single narrator, storytelling approach

Return ONLY valid JSON:
{
"title": "Episode title",
"description": "1-2 sentence episode description",
"targetDuration": "${config.duration}",
"segments": [
{
"type": "intro|discussion|deep-dive|recap|outro",
"title": "Segment title",
"keyPointRefs": [0, 1],
"estimatedDuration": 120,
"notes": "Production notes for script writer"
}
]
}

In keyPointRefs, use the bracketed zero-based numbers of the key discussion points above.
Ensure total estimatedDuration across all segments equals approximately ${durationSeconds} seconds.`;

  const result = await model.generateContent(prompt);
  const text = result.response.text();
  const jsonMatch = text.match(/\{[\s\S]*\}/);
  if (!jsonMatch) throw new Error('Failed to extract outline JSON from AI response');

  let outline: PodcastOutline;
  try {
    outline = JSON.parse(jsonMatch[0]);
  } catch (err) {
    throw new Error(
      `Failed to parse outline JSON from AI response: ${(err as Error).message}`
    );
  }
  if (!Array.isArray(outline.segments)) {
    throw new Error('Outline JSON from AI response has no "segments" array');
  }

  // Model output is untrusted: drop references that do not point at an
  // existing key point.
  for (const segment of outline.segments) {
    segment.keyPointRefs = Array.isArray(segment.keyPointRefs)
      ? segment.keyPointRefs.filter(
          (ref) => Number.isInteger(ref) && ref >= 0 && ref < keyPoints.length
        )
      : [];
  }

  outline.keyPoints = keyPoints;
  return outline;
}
|
|
680
|
+
|
|
681
|
+
/**
 * Convert a human-readable duration ("15 min", "1.5 min", "1 hour 30 min",
 * "90 seconds") into seconds.
 *
 * Every `<number><unit>` pair found is summed. Decimal values are
 * supported; "1.5 min" is 90 seconds rather than being misread as "5 min".
 *
 * @param duration Duration text.
 * @returns Total seconds, rounded to an integer; 900 (15 minutes) when no
 *          recognizable duration is present.
 */
function parseDuration(duration: string): number {
  const DEFAULT_SECONDS = 900; // default 15min
  const pattern =
    /(\d+(?:\.\d+)?)\s*(hours?|hrs?|h|min\w*|m|seconds?|secs?|s)\b/gi;

  let total = 0;
  let matched = false;
  let match: RegExpExecArray | null;

  while ((match = pattern.exec(duration)) !== null) {
    matched = true;
    const value = parseFloat(match[1]);
    // The first letter identifies the unit family: h, m, or s.
    const unit = match[2][0].toLowerCase();
    total += value * (unit === 'h' ? 3600 : unit === 'm' ? 60 : 1);
  }

  return matched ? Math.round(total) : DEFAULT_SECONDS;
}
|
|
685
|
+
```
|
|
686
|
+
|
|
687
|
+
---
|
|
688
|
+
|
|
689
|
+
## Stage 3: Script Generation
|
|
690
|
+
|
|
691
|
+
The script generation stage uses the PodAgent pattern (ACL 2025): three specialized AI agents
|
|
692
|
+
collaborate to produce a natural, engaging podcast script with faithfulness verification.
|
|
693
|
+
|
|
694
|
+
### PodAgent Multi-Agent Architecture
|
|
695
|
+
|
|
696
|
+
```
|
|
697
|
+
+-------------------+ +-------------------+ +-------------------+
|
|
698
|
+
| HOST AGENT | | GUEST AGENT | | WRITER AGENT |
|
|
699
|
+
| | | | | |
|
|
700
|
+
| - Drives convo | | - Expert voice | | - Structures flow |
|
|
701
|
+
| - Asks questions | | - Provides depth | | - Verifies facts |
|
|
702
|
+
| - Transitions | | - Uses analogies | | - Controls timing |
|
|
703
|
+
| - Engages listener| | - Cites sources | | - Ensures quality |
|
|
704
|
+
+-------------------+ +-------------------+ +-------------------+
|
|
705
|
+
| | |
|
|
706
|
+
+-------------------------+-------------------------+
|
|
707
|
+
|
|
|
708
|
+
v
|
|
709
|
+
+-----------------------------+
|
|
710
|
+
| STRUCTURED SCRIPT JSON |
|
|
711
|
+
| [{speaker, text, emotion, |
|
|
712
|
+
| duration, segmentRef}] |
|
|
713
|
+
+-----------------------------+
|
|
714
|
+
```
|
|
715
|
+
|
|
716
|
+
### Script Data Types
|
|
717
|
+
|
|
718
|
+
```typescript
|
|
719
|
+
/** Speaker labels a script line may carry. */
type ScriptLineSpeaker = 'host' | 'guest';

/** Emotional tone markers a script line may carry. */
type ScriptLineEmotion =
  | 'neutral'
  | 'excited'
  | 'thoughtful'
  | 'humorous'
  | 'serious'
  | 'curious';

/** One spoken turn of the generated podcast script. */
interface ScriptLine {
  speaker: ScriptLineSpeaker;
  text: string;
  emotion: ScriptLineEmotion;
  /** Seconds (based on ~150 words/minute speaking rate). */
  estimatedDuration: number;
  /** Which outline segment this belongs to. */
  segmentRef: string;
}
|
|
726
|
+
|
|
727
|
+
/** Display name and personality blurb for one script speaker. */
type ScriptSpeakerProfile = { name: string; personality: string };

/** Complete structured script: header fields plus the ordered spoken lines. */
interface PodcastScript {
  title: string;
  /** Target length that the summed line durations are validated against. */
  totalDuration: number;
  speakers: {
    host: ScriptSpeakerProfile;
    guest: ScriptSpeakerProfile;
  };
  lines: ScriptLine[];
}
|
|
736
|
+
```
|
|
737
|
+
|
|
738
|
+
### Script Generator
|
|
739
|
+
|
|
740
|
+
```typescript
|
|
741
|
+
/**
 * Generate a two-speaker podcast script from an outline and its source chunks.
 *
 * For every outline segment the referenced key points and their source chunks
 * are gathered (source text capped at 2000 characters per segment), a single
 * prompt is sent to Gemini, the first JSON object in the reply is parsed, and
 * the result is passed through verifyFaithfulness() before being returned.
 *
 * @param outline - Outline whose segments reference entries of outline.keyPoints by index.
 * @param chunks  - Source text chunks; matched to key points through chunk ids.
 * @param config  - Pipeline configuration (format and speaker names are read here).
 * @returns The parsed script after the faithfulness pass.
 * @throws Error when the reply contains no JSON object, the JSON cannot be
 *         parsed, or the parsed object has no `lines` array.
 */
async function generateScript(
  outline: PodcastOutline,
  chunks: TextChunk[],
  config: PipelineConfig
): Promise<PodcastScript> {
  const genAI = new GoogleGenerativeAI(process.env.GEMINI_API_KEY!);
  // Use the most capable model for creative script writing
  // NOTE(review): this is a dated preview model id -- confirm it is still served.
  const model = genAI.getGenerativeModel({ model: 'gemini-2.5-pro-preview-05-06' });

  const speakerNames = config.speakers.length >= 2
    ? { host: config.speakers[0].name, guest: config.speakers[1].name }
    : { host: 'Alex', guest: 'Jordan' };

  // Gather source material for each segment
  const segmentContext = outline.segments.map((seg) => {
    // keyPointRefs come from model-generated JSON: tolerate a missing array and
    // drop indices that do not resolve to a key point instead of crashing on
    // `undefined.relevantChunks`.
    const relevantPoints = (seg.keyPointRefs ?? [])
      .map((ref) => outline.keyPoints[ref])
      .filter((kp): kp is NonNullable<typeof kp> => kp != null);

    // Set lookup avoids rescanning the id list for every chunk.
    const relevantChunkIds = new Set(relevantPoints.flatMap((kp) => kp.relevantChunks ?? []));
    const sourceText = chunks
      .filter((c) => relevantChunkIds.has(c.id))
      .map((c) => c.text)
      .join('\n\n');

    return {
      segment: seg,
      sourceText: sourceText.slice(0, 2000), // Token budget management
      points: relevantPoints,
    };
  });

  const targetSeconds = parseDuration(outline.targetDuration);

  const segmentsBlock = segmentContext
    .map(
      (sc, i) => `
--- SEGMENT ${i + 1}: ${sc.segment.title} (${sc.segment.type}, ~${sc.segment.estimatedDuration}s) ---
Key Points: ${sc.points.map((p) => p.topic).join(', ')}
Discussion Angles: ${sc.points.map((p) => p.discussionAngle).join('; ')}
Source Material: ${sc.sourceText}
`
    )
    .join('\n');

  const prompt = `You are a team of three podcast production agents creating a script.

PODCAST: "${outline.title}"
DESCRIPTION: ${outline.description}
FORMAT: ${config.format}
TARGET DURATION: ${outline.targetDuration}
HOST: ${speakerNames.host} - Curious, engaging, asks great follow-up questions
GUEST: ${speakerNames.guest} - Expert, uses analogies, explains complex ideas simply

SEGMENTS AND SOURCE MATERIAL:
${segmentsBlock}

SCRIPT RULES:
1. Write natural, conversational dialogue -- NOT robotic or scripted-sounding
2. Host asks questions, makes transitions, keeps energy up
3. Guest provides substance, uses analogies and examples
4. Include verbal fillers sparingly ("you know", "right", "exactly") for naturalness
5. Each speaker turn should be 20-60 words (30 words = ~12 seconds at speaking pace)
6. Total script must hit approximately ${targetSeconds} seconds
7. Speaking rate assumption: 150 words per minute (2.5 words per second)
8. FAITHFULNESS: Every claim must be traceable to the source material. Do not fabricate facts.
9. Include emotional tone markers for TTS guidance

Return ONLY valid JSON:
{
"title": "${outline.title}",
"totalDuration": ${targetSeconds},
"speakers": {
"host": { "name": "${speakerNames.host}", "personality": "Curious and engaging" },
"guest": { "name": "${speakerNames.guest}", "personality": "Expert and insightful" }
},
"lines": [
{
"speaker": "host",
"text": "Welcome to the show! Today we are diving into...",
"emotion": "excited",
"estimatedDuration": 8,
"segmentRef": "intro"
}
]
}`;

  const result = await model.generateContent(prompt);
  const text = result.response.text();
  const jsonMatch = text.match(/\{[\s\S]*\}/);
  if (!jsonMatch) throw new Error('Failed to extract script JSON from AI response');

  let script: PodcastScript;
  try {
    script = JSON.parse(jsonMatch[0]);
  } catch (err) {
    // Surface a pipeline-level message instead of a bare SyntaxError.
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(`Failed to parse script JSON from AI response: ${reason}`);
  }

  // Everything downstream iterates script.lines; fail here with a clear message.
  if (!script || !Array.isArray(script.lines)) {
    throw new Error('Script JSON from AI response is missing a "lines" array');
  }

  // Verify faithfulness with a second pass
  const verified = await verifyFaithfulness(script, chunks, genAI);

  return verified;
}
|
|
833
|
+
```
|
|
834
|
+
|
|
835
|
+
### Faithfulness Verification
|
|
836
|
+
|
|
837
|
+
The Writer Agent's verification pass reduces the risk of hallucinated facts sneaking into the script.
|
|
838
|
+
This is critical -- the podcast's claims must be traceable to the source material.
|
|
839
|
+
|
|
840
|
+
```typescript
|
|
841
|
+
/**
 * Second-pass fact check: ask a model to compare each script line with the
 * source material and apply any rewrites it returns.
 *
 * Only the first 8000 characters of the joined chunk text are sent, so claims
 * sourced from later chunks cannot be checked by this pass. The `script`
 * argument is mutated in place (line texts are overwritten) and the same
 * object is returned. Line durations are not recomputed after a rewrite.
 *
 * This pass is best-effort: if the reply has no JSON array, the JSON does not
 * parse, or an entry is malformed, the affected revisions are skipped and the
 * script is returned unchanged for those lines.
 *
 * @param script - Script to verify; modified in place.
 * @param chunks - Source chunks the script must stay faithful to.
 * @param genAI  - Client used to obtain the verification model.
 * @returns The same `script` object, with revised line texts where applicable.
 */
async function verifyFaithfulness(
  script: PodcastScript,
  chunks: TextChunk[],
  genAI: GoogleGenerativeAI
): Promise<PodcastScript> {
  const model = genAI.getGenerativeModel({ model: 'gemini-2.0-flash' });

  const sourceText = chunks.map((c) => c.text).join('\n\n').slice(0, 8000);
  // Show the zero-based index on every line: the reply must reference lines by
  // lineIndex, and without visible indices the model would have to count them.
  const scriptText = script.lines.map((l, i) => `[${i}] ${l.speaker}: ${l.text}`).join('\n');

  const prompt = `You are a fact-checker for a podcast script. Compare the script against the source material.

SOURCE MATERIAL:
${sourceText}

PODCAST SCRIPT (each line starts with its zero-based lineIndex in square brackets):
${scriptText}

For each line in the script, check if the claims are supported by the source material.
If a line contains unsupported claims, rewrite it to be faithful to the source.
If a line is opinion/transition/question, mark it as OK.

Return ONLY valid JSON -- an array of objects:
[
{ "lineIndex": 0, "status": "ok" },
{ "lineIndex": 3, "status": "revised", "revisedText": "corrected text here" }
]

revisedText must contain only the corrected spoken text, without the [index] or speaker prefix.
Only include lines that need revision. If all lines are faithful, return an empty array [].`;

  const result = await model.generateContent(prompt);
  const text = result.response.text();
  const jsonMatch = text.match(/\[[\s\S]*\]/);
  if (!jsonMatch) return script; // If parsing fails, return original

  let revisions: unknown;
  try {
    revisions = JSON.parse(jsonMatch[0]);
  } catch {
    // Malformed JSON is a parse failure too: keep the original script.
    return script;
  }
  if (!Array.isArray(revisions)) return script;

  for (const rev of revisions as any[]) {
    if (!rev || rev.status !== 'revised') continue;
    if (typeof rev.revisedText !== 'string' || rev.revisedText.trim() === '') continue;
    if (!Number.isInteger(rev.lineIndex)) continue;

    const target = script.lines[rev.lineIndex];
    if (target) {
      target.text = rev.revisedText;
    }
  }

  return script;
}
|
|
885
|
+
```
|
|
886
|
+
|
|
887
|
+
### Duration Control
|
|
888
|
+
|
|
889
|
+
```typescript
|
|
890
|
+
/**
 * Compare the summed per-line duration estimates against the script's target.
 *
 * Line durations come from model-generated JSON, so each one is coerced with
 * Number() and ignored when it is not finite. This prevents a string such as
 * "8" from turning the sum into string concatenation. When the target is not
 * a positive finite number the deviation is reported as Infinity (never NaN)
 * and the script is treated as out of tolerance.
 *
 * @param script - Script whose `lines[].estimatedDuration` and `totalDuration` are read.
 * @returns actual: summed line durations; target: `script.totalDuration` as given;
 *          deviation: |actual - target| / target; withinTolerance: deviation <= 15%.
 */
function validateScriptDuration(script: PodcastScript): {
  actual: number;
  target: number;
  deviation: number;
  withinTolerance: boolean;
} {
  const actual = script.lines.reduce((sum, line) => {
    const seconds = Number(line.estimatedDuration);
    return Number.isFinite(seconds) ? sum + seconds : sum;
  }, 0);

  const targetSeconds = Number(script.totalDuration);
  const hasUsableTarget = Number.isFinite(targetSeconds) && targetSeconds > 0;

  // Avoid dividing by zero or NaN: an unusable target can never be "met".
  const deviation = hasUsableTarget
    ? Math.abs(actual - targetSeconds) / targetSeconds
    : Number.POSITIVE_INFINITY;

  return {
    actual,
    target: script.totalDuration,
    deviation,
    withinTolerance: deviation <= 0.15, // 15% tolerance
  };
}
|
|
906
|
+
|
|
907
|
+
/**
 * Shorten an over-long script by dropping its shortest non-intro/outro lines.
 *
 * Scripts that are already within tolerance, or that are shorter than the
 * target, are returned untouched. Otherwise lines whose segmentRef is neither
 * 'intro' nor 'outro' are removed in ascending order of estimated duration
 * until the removed time covers the excess. The input object is modified in
 * place and returned.
 *
 * @param script - Script to adjust; its `lines` array may be replaced.
 * @returns The same `script` object.
 */
function adjustScriptDuration(script: PodcastScript): PodcastScript {
  const { withinTolerance, actual } = validateScriptDuration(script);
  if (withinTolerance) return script;

  // Only the "too long" case is handled; anything else passes through.
  const isTooLong = script.totalDuration / actual < 1;
  if (!isTooLong) return script;

  const surplusSeconds = actual - script.totalDuration;

  // Candidates for removal: everything outside intro/outro, shortest first.
  const candidates = script.lines
    .filter((entry) => !(entry.segmentRef === 'intro' || entry.segmentRef === 'outro'))
    .sort((left, right) => left.estimatedDuration - right.estimatedDuration);

  const dropped = new Set<ScriptLine>();
  let removedSeconds = 0;
  for (let idx = 0; idx < candidates.length; idx += 1) {
    if (removedSeconds >= surplusSeconds) break;
    const candidate = candidates[idx];
    dropped.add(candidate);
    removedSeconds += candidate.estimatedDuration;
  }

  script.lines = script.lines.filter((entry) => !dropped.has(entry));
  return script;
}
|
|
938
|
+
```
|
|
939
|
+
|
|
940
|
+
---
|
|
941
|
+
|
|
942
|
+
## Stage 4: Audio Synthesis
|
|
943
|
+
|
|
944
|
+
The synthesis stage converts the structured script into a polished podcast audio file.
|
|
945
|
+
Each script line is synthesized individually with the appropriate speaker voice, then
|
|
946
|
+
composed into a final mix with pauses, optional music, and loudness normalization.
|
|
947
|
+
|
|
948
|
+
### TTS Provider Interface
|
|
949
|
+
|
|
950
|
+
```typescript
|
|
951
|
+
/** Summary of one voice as reported by a provider's listVoices(). */
type TTSVoiceSummary = { id: string; name: string; gender: string };

/** Contract every text-to-speech backend in this pipeline implements. */
interface TTSProvider {
  /** Render `text` with the given voice; `emotion` is an optional delivery hint. */
  synthesize(text: string, voice: string, emotion?: string): Promise<Buffer>;
  /** Enumerate the voices the backend offers. */
  listVoices(): Promise<TTSVoiceSummary[]>;
}
|
|
955
|
+
|
|
956
|
+
/** Roles a configured speaker can take. */
type SpeakerRole = 'host' | 'guest' | 'narrator';

/** TTS backends a speaker's voice can be bound to. */
type SpeakerVoiceBackend = 'elevenlabs' | 'orpheus' | 'chatterbox' | 'google-cloud';

/** Binds a named speaker and role to a concrete voice on a TTS backend. */
interface SpeakerConfig {
  name: string;
  role: SpeakerRole;
  voiceId: string;
  provider: SpeakerVoiceBackend;
}
|
|
962
|
+
|
|
963
|
+
/** Descriptive tags attached to a finished episode. */
interface PodcastAudioMetadata {
  title: string;
  description: string;
  speakers: string[];
  generatedAt: string;
}

/** Final pipeline result: where the audio file lives plus basic facts about it. */
interface PodcastAudio {
  filePath: string;
  duration: number;
  format: string;
  fileSize: number;
  metadata: PodcastAudioMetadata;
}
|
|
975
|
+
```
|
|
976
|
+
|
|
977
|
+
### ElevenLabs TTS Implementation
|
|
978
|
+
|
|
979
|
+
```typescript
|
|
980
|
+
/**
 * TTSProvider backed by the ElevenLabs REST API.
 *
 * synthesize() requests raw 24 kHz, 16-bit mono PCM and wraps it in a WAV
 * header. The rest of this document writes every synthesized line to a `.wav`
 * file and stream-copies those files together with 24 kHz mono silence, so
 * returning the API's default compressed format would yield mislabeled files
 * that cannot be concatenated with `-c copy`.
 */
class ElevenLabsTTS implements TTSProvider {
  private apiKey: string;
  private baseUrl = 'https://api.elevenlabs.io/v1';

  constructor(apiKey: string) {
    this.apiKey = apiKey;
  }

  /**
   * Prefix raw little-endian PCM samples with a canonical 44-byte RIFF/WAVE header.
   *
   * @param pcm - Headerless PCM sample data.
   * @param sampleRate - Samples per second.
   * @param channels - Channel count.
   * @param bitsPerSample - Bits per sample.
   * @returns A buffer holding a complete WAV file.
   */
  private static wrapPcmAsWav(
    pcm: Buffer,
    sampleRate = 24000,
    channels = 1,
    bitsPerSample = 16
  ): Buffer {
    const blockAlign = channels * (bitsPerSample / 8);
    const byteRate = sampleRate * blockAlign;

    const header = Buffer.alloc(44);
    header.write('RIFF', 0, 'ascii');
    header.writeUInt32LE(36 + pcm.length, 4); // RIFF chunk size
    header.write('WAVE', 8, 'ascii');
    header.write('fmt ', 12, 'ascii');
    header.writeUInt32LE(16, 16); // fmt chunk size for PCM
    header.writeUInt16LE(1, 20); // audio format 1 = uncompressed PCM
    header.writeUInt16LE(channels, 22);
    header.writeUInt32LE(sampleRate, 24);
    header.writeUInt32LE(byteRate, 28);
    header.writeUInt16LE(blockAlign, 32);
    header.writeUInt16LE(bitsPerSample, 34);
    header.write('data', 36, 'ascii');
    header.writeUInt32LE(pcm.length, 40);

    return Buffer.concat([header, pcm]);
  }

  /**
   * Synthesize one utterance.
   *
   * @param text - Text to speak.
   * @param voiceId - ElevenLabs voice id.
   * @param emotion - Optional tone marker; 'excited' and 'serious' adjust the voice settings.
   * @returns WAV audio (24 kHz, 16-bit, mono).
   * @throws Error when the API responds with a non-2xx status.
   */
  async synthesize(text: string, voiceId: string, emotion?: string): Promise<Buffer> {
    // Emotion is approximated through the stability/style voice settings.
    const stability = emotion === 'excited' ? 0.3 : emotion === 'serious' ? 0.8 : 0.5;
    const similarityBoost = 0.75;

    // pcm_24000 matches the 24 kHz mono silence segments used during composition.
    const url =
      `${this.baseUrl}/text-to-speech/${encodeURIComponent(voiceId)}?output_format=pcm_24000`;

    const response = await fetch(url, {
      method: 'POST',
      headers: {
        'xi-api-key': this.apiKey,
        'Content-Type': 'application/json',
      },
      body: JSON.stringify({
        text,
        model_id: 'eleven_multilingual_v2',
        voice_settings: {
          stability,
          similarity_boost: similarityBoost,
          style: emotion === 'excited' ? 0.7 : 0.3,
          use_speaker_boost: true,
        },
      }),
    });

    if (!response.ok) {
      throw new Error(`ElevenLabs TTS failed: ${response.status} ${await response.text()}`);
    }

    const pcm = Buffer.from(await response.arrayBuffer());
    return ElevenLabsTTS.wrapPcmAsWav(pcm, 24000, 1, 16);
  }

  /**
   * List the voices available to this API key.
   *
   * @returns id, name and gender label ('unknown' when the API gives none) per voice.
   * @throws Error when the API responds with a non-2xx status.
   */
  async listVoices() {
    const response = await fetch(`${this.baseUrl}/voices`, {
      headers: { 'xi-api-key': this.apiKey },
    });

    if (!response.ok) {
      throw new Error(
        `ElevenLabs voice listing failed: ${response.status} ${await response.text()}`
      );
    }

    const data = await response.json();
    return (data.voices ?? []).map((v: any) => ({
      id: v.voice_id,
      name: v.name,
      gender: v.labels?.gender || 'unknown',
    }));
  }
}
|
|
1030
|
+
```
|
|
1031
|
+
|
|
1032
|
+
### Google Cloud TTS Implementation
|
|
1033
|
+
|
|
1034
|
+
```typescript
|
|
1035
|
+
/**
 * TTSProvider backed by the Google Cloud Text-to-Speech REST API (API-key auth).
 *
 * Requests LINEAR16 audio at 24 kHz so the output lines up with the 24 kHz
 * silence segments used during composition.
 */
class GoogleCloudTTS implements TTSProvider {
  private apiKey: string;

  constructor(apiKey: string) {
    this.apiKey = apiKey;
  }

  /**
   * Synthesize one utterance.
   *
   * @param text - Text to speak.
   * @param voiceId - Google voice name, sent as `voice.name`.
   * @param emotion - Optional tone marker; 'excited'/'thoughtful' change the
   *                  speaking rate and 'curious' raises the pitch.
   * @returns Decoded audio bytes from the response's base64 `audioContent`.
   * @throws Error on a non-2xx response or when the response carries no audio.
   */
  async synthesize(text: string, voiceId: string, emotion?: string): Promise<Buffer> {
    const response = await fetch(
      `https://texttospeech.googleapis.com/v1/text:synthesize?key=${this.apiKey}`,
      {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({
          input: { text },
          voice: {
            languageCode: 'en-US',
            name: voiceId, // e.g., 'en-US-Studio-O' (female) or 'en-US-Studio-Q' (male)
          },
          audioConfig: {
            audioEncoding: 'LINEAR16',
            sampleRateHertz: 24000,
            speakingRate: emotion === 'excited' ? 1.1 : emotion === 'thoughtful' ? 0.9 : 1.0,
            pitch: emotion === 'curious' ? 1.5 : 0,
          },
        }),
      }
    );

    // Without this check an error body would reach Buffer.from(undefined, 'base64')
    // and fail with an unrelated TypeError.
    if (!response.ok) {
      throw new Error(`Google Cloud TTS failed: ${response.status} ${await response.text()}`);
    }

    const data = await response.json();
    if (typeof data.audioContent !== 'string') {
      throw new Error('Google Cloud TTS response did not include audioContent');
    }

    return Buffer.from(data.audioContent, 'base64');
  }

  /**
   * List Studio and Neural2 voices.
   *
   * @returns id/name (both the Google voice name) and lower-cased gender per voice.
   * @throws Error on a non-2xx response.
   */
  async listVoices() {
    const response = await fetch(
      `https://texttospeech.googleapis.com/v1/voices?key=${this.apiKey}`
    );

    if (!response.ok) {
      throw new Error(
        `Google Cloud TTS voice listing failed: ${response.status} ${await response.text()}`
      );
    }

    const data = await response.json();
    return (data.voices ?? [])
      .filter((v: any) => v.name.includes('Studio') || v.name.includes('Neural2'))
      .map((v: any) => ({
        id: v.name,
        name: v.name,
        gender: v.ssmlGender?.toLowerCase() || 'unknown',
      }));
  }
}
|
|
1082
|
+
```
|
|
1083
|
+
|
|
1084
|
+
### Per-Segment Audio Generation
|
|
1085
|
+
|
|
1086
|
+
```typescript
|
|
1087
|
+
import { writeFile, mkdir } from 'fs/promises';
|
|
1088
|
+
import { join } from 'path';
|
|
1089
|
+
|
|
1090
|
+
/**
 * Synthesize every script line to its own audio file.
 *
 * One TTS client is created per configured speaker and stored under the
 * speaker's role; each script line is then routed to the client registered
 * for `line.speaker`. Lines whose speaker has no registered client (including
 * speakers configured with a backend other than 'elevenlabs'/'google-cloud')
 * are logged and skipped. Files are named `segment-NNNN.wav` after the line's
 * index, and a 200 ms pause follows each synthesized line.
 *
 * @param script - Script whose lines are synthesized in order.
 * @param speakers - Speaker-to-voice bindings.
 * @param outputDir - Directory for the segment files; created if missing.
 * @returns Paths of the files written, in script order.
 */
async function generateAudioSegments(
  script: PodcastScript,
  speakers: SpeakerConfig[],
  outputDir: string
): Promise<string[]> {
  await mkdir(outputDir, { recursive: true });

  // Role -> { client, voice } lookup used to route each line.
  const voiceByRole: Record<string, { provider: TTSProvider; voiceId: string }> = {};

  for (const { provider: backend, role, voiceId } of speakers) {
    if (backend === 'elevenlabs') {
      voiceByRole[role] = {
        provider: new ElevenLabsTTS(process.env.ELEVENLABS_API_KEY!),
        voiceId,
      };
    } else if (backend === 'google-cloud') {
      voiceByRole[role] = {
        provider: new GoogleCloudTTS(process.env.GOOGLE_TTS_API_KEY!),
        voiceId,
      };
    }
    // Add other providers as needed
  }

  const writtenPaths: string[] = [];
  const lineCount = script.lines.length;

  for (let index = 0; index !== lineCount; index += 1) {
    const current = script.lines[index];
    const binding = voiceByRole[current.speaker];

    if (binding === undefined) {
      console.warn(`No TTS provider configured for speaker: ${current.speaker}, skipping`);
      continue;
    }

    console.log(
      `Synthesizing line ${index + 1}/${lineCount}: ${current.speaker} (${current.emotion})`
    );

    const audio = await binding.provider.synthesize(current.text, binding.voiceId, current.emotion);
    const fileName = `segment-${String(index).padStart(4, '0')}.wav`;
    const target = join(outputDir, fileName);
    await writeFile(target, audio);
    writtenPaths.push(target);

    // Rate limiting: avoid API throttling
    await new Promise((done) => setTimeout(done, 200));
  }

  return writtenPaths;
}
|
|
1143
|
+
```
|
|
1144
|
+
|
|
1145
|
+
### FFmpeg Audio Composition
|
|
1146
|
+
|
|
1147
|
+
The final composition pipeline concatenates all speech segments with natural pauses,
|
|
1148
|
+
optionally adds intro/outro music, and normalizes to podcast-standard loudness.
|
|
1149
|
+
|
|
1150
|
+
```typescript
|
|
1151
|
+
import { execFile } from 'child_process';
import { writeFile as writeFileAsync } from 'fs/promises';
import { join, resolve } from 'path';
import { promisify } from 'util';
|
|
1155
|
+
|
|
1156
|
+
const execFileAsync = promisify(execFile);
|
|
1157
|
+
|
|
1158
|
+
/**
 * Invoke the `ffmpeg` binary with an explicit argument vector.
 *
 * The call goes through execFile, so no shell is involved and arguments are
 * passed to ffmpeg verbatim (no shell injection risk). Complex filter graphs
 * should be supplied as the single argument following `-filter_complex`.
 *
 * @param args - Arguments passed to ffmpeg, one array element per argument.
 * @returns The captured stdout and stderr of the ffmpeg process.
 */
async function runFFmpeg(args: string[]): Promise<{ stdout: string; stderr: string }> {
  const captured = await execFileAsync('ffmpeg', args);
  return captured;
}
|
|
1165
|
+
|
|
1166
|
+
/**
 * Assemble per-line speech segments into the final episode file.
 *
 * Steps: generate silence clips, write an ffmpeg concat list that alternates
 * segments with the short silence, concatenate with stream copy, optionally
 * add intro/outro music, loudness-normalize to -16 LUFS, then encode to the
 * configured output format with metadata tags.
 *
 * @param segmentPaths - Speech segment files in playback order.
 * @param config - Pipeline configuration (`includeMusic` and `outputFormat` are read).
 * @param outputDir - Directory for intermediates and the final file.
 * @param title - Episode title written to the output metadata.
 * @returns Path of the final audio file (`podcast-final.<ext>` inside outputDir).
 */
async function composeAudio(
  segmentPaths: string[],
  config: PipelineConfig,
  outputDir: string,
  title: string
): Promise<string> {
  // Step 1: Generate silence segments for natural pauses
  const pauseDuration = '0.35'; // 350ms between speaker turns
  const longPauseDuration = '0.8'; // 800ms between segments/topics
  const silencePath = join(outputDir, 'silence-short.wav');
  // The long silence is generated but not referenced by the concat list below.
  const longSilencePath = join(outputDir, 'silence-long.wav');

  await runFFmpeg([
    '-y', '-f', 'lavfi', '-i', `anullsrc=r=24000:cl=mono`,
    '-t', pauseDuration, silencePath,
  ]);
  await runFFmpeg([
    '-y', '-f', 'lavfi', '-i', `anullsrc=r=24000:cl=mono`,
    '-t', longPauseDuration, longSilencePath,
  ]);

  // Step 2: Build concat file list
  //
  // ffmpeg resolves relative `file` entries against the directory of the list
  // file, not the process working directory, so relative segment paths would
  // be looked up in the wrong place. Entries are therefore made absolute.
  // A single quote inside the quoted path is escaped as '\'' per ffmpeg's
  // quoting rules.
  const toConcatEntry = (filePath: string): string => {
    const absolute = resolve(filePath).replace(/\\/g, '/');
    return `file '${absolute.replace(/'/g, "'\\''")}'`;
  };

  const concatListPath = join(outputDir, 'concat-list.txt');
  const concatEntries: string[] = [];

  for (let i = 0; i < segmentPaths.length; i++) {
    concatEntries.push(toConcatEntry(segmentPaths[i]));

    if (i < segmentPaths.length - 1) {
      concatEntries.push(toConcatEntry(silencePath));
    }
  }

  await writeFileAsync(concatListPath, concatEntries.join('\n'));

  // Step 3: Concatenate all segments
  const rawConcatPath = join(outputDir, 'raw-concat.wav');
  await runFFmpeg([
    '-y', '-f', 'concat', '-safe', '0',
    '-i', concatListPath, '-c', 'copy', rawConcatPath,
  ]);

  // Step 4: Optionally add intro/outro music
  let preMasterPath = rawConcatPath;

  if (config.includeMusic) {
    preMasterPath = join(outputDir, 'with-music.wav');
    const introMusicPath = join(outputDir, '..', 'assets', 'intro-music.wav');
    const outroMusicPath = join(outputDir, '..', 'assets', 'outro-music.wav');

    // Play attenuated intro music before the speech and outro music after it.
    // The filter concatenates the three streams; the music is not mixed under
    // the speech.
    await runFFmpeg([
      '-y',
      '-i', rawConcatPath,
      '-i', introMusicPath,
      '-i', outroMusicPath,
      '-filter_complex',
      '[1:a]atrim=0:8,afade=t=in:d=1:st=0,afade=t=out:d=2:st=6,volume=0.15[intro_music];' +
        '[2:a]afade=t=in:d=1:st=0,afade=t=out:d=2:st=6,volume=0.15[outro_music];' +
        '[intro_music][0:a][outro_music]concat=n=3:v=0:a=1[mixed]',
      '-map', '[mixed]', preMasterPath,
    ]);
  }

  // Step 5: Loudness normalization (EBU R128, -16 LUFS for podcasts)
  const normalizedPath = join(outputDir, 'normalized.wav');
  await runFFmpeg([
    '-y', '-i', preMasterPath,
    '-af', 'loudnorm=I=-16:TP=-1.5:LRA=11:print_format=json',
    normalizedPath,
  ]);

  // Step 6: Export as final format with metadata
  const outputExt = config.outputFormat || 'mp3';
  const finalPath = join(outputDir, `podcast-final.${outputExt}`);

  if (outputExt === 'mp3') {
    await runFFmpeg([
      '-y', '-i', normalizedPath,
      '-codec:a', 'libmp3lame', '-b:a', '192k',
      '-metadata', `title=${title}`,
      '-metadata', 'artist=Generated Podcast',
      '-metadata', 'album=Document-to-Podcast',
      '-metadata', 'genre=Podcast',
      '-metadata', `date=${new Date().getFullYear()}`,
      finalPath,
    ]);
  } else if (outputExt === 'ogg') {
    await runFFmpeg([
      '-y', '-i', normalizedPath,
      '-codec:a', 'libvorbis', '-q:a', '6',
      '-metadata', `title=${title}`,
      finalPath,
    ]);
  } else {
    // WAV -- just copy
    await runFFmpeg(['-y', '-i', normalizedPath, finalPath]);
  }

  return finalPath;
}
|
|
1267
|
+
```
|
|
1268
|
+
|
|
1269
|
+
### FFmpeg Command Reference (Standalone)
|
|
1270
|
+
|
|
1271
|
+
For manual use or debugging, here are the key FFmpeg commands in the pipeline:
|
|
1272
|
+
|
|
1273
|
+
```bash
|
|
1274
|
+
# Generate silence (350ms pause between turns)
|
|
1275
|
+
ffmpeg -y -f lavfi -i anullsrc=r=24000:cl=mono -t 0.35 silence.wav
|
|
1276
|
+
|
|
1277
|
+
# Concatenate segments from a file list
|
|
1278
|
+
ffmpeg -y -f concat -safe 0 -i concat-list.txt -c copy raw-concat.wav
|
|
1279
|
+
|
|
1280
|
+
# Normalize to podcast standard (-16 LUFS, EBU R128)
|
|
1281
|
+
ffmpeg -y -i raw-concat.wav -af "loudnorm=I=-16:TP=-1.5:LRA=11" normalized.wav
|
|
1282
|
+
|
|
1283
|
+
# Two-pass loudness normalization (higher precision)
|
|
1284
|
+
# Pass 1: Measure
|
|
1285
|
+
ffmpeg -i raw-concat.wav -af "loudnorm=I=-16:TP=-1.5:LRA=11:print_format=json" -f null /dev/null
|
|
1286
|
+
# Pass 2: Apply measured values (replace measured_* with Pass 1 output)
|
|
1287
|
+
ffmpeg -i raw-concat.wav -af "loudnorm=I=-16:TP=-1.5:LRA=11:measured_I=-23.5:measured_TP=-4.2:measured_LRA=7.1:measured_thresh=-34.0:offset=-0.3:linear=true" normalized.wav
|
|
1288
|
+
|
|
1289
|
+
# Export as MP3 with metadata tags
|
|
1290
|
+
ffmpeg -y -i normalized.wav -codec:a libmp3lame -b:a 192k \
|
|
1291
|
+
-metadata title="Episode Title" \
|
|
1292
|
+
-metadata artist="Podcast Name" \
|
|
1293
|
+
-metadata album="Season 1" \
|
|
1294
|
+
-metadata genre="Podcast" \
|
|
1295
|
+
podcast-final.mp3
|
|
1296
|
+
|
|
1297
|
+
# Prepend intro music (15% volume, 2s fade-out) ahead of the speech
|
|
1298
|
+
ffmpeg -y -i speech.wav -i intro.wav \
|
|
1299
|
+
-filter_complex "[1:a]volume=0.15,afade=t=out:d=2:st=6[music];[music][0:a]concat=n=2:v=0:a=1[out]" \
|
|
1300
|
+
-map "[out]" with-intro.wav
|
|
1301
|
+
|
|
1302
|
+
# Quick quality check: get loudness stats
|
|
1303
|
+
ffmpeg -i podcast-final.mp3 -af "loudnorm=print_format=json" -f null /dev/null 2>&1 | tail -20
|
|
1304
|
+
```
|
|
1305
|
+
|
|
1306
|
+
---
|
|
1307
|
+
|
|
1308
|
+
## Two Reference Implementations
|
|
1309
|
+
|
|
1310
|
+
### A) Full-Scale (GPU Server / Cloud API)
|
|
1311
|
+
|
|
1312
|
+
Based on Meta NotebookLlama's tiered model approach. Best quality, requires GPU or API budget.
|
|
1313
|
+
|
|
1314
|
+
```
|
|
1315
|
+
Architecture:
|
|
1316
|
+
Stage 1 (Ingest): pdf-parse / mammoth / readability (same for both)
|
|
1317
|
+
Stage 2 (Understand): Gemini 2.5 Pro for key point extraction + outline
|
|
1318
|
+
Gemini embedding-001 for chunk embeddings
|
|
1319
|
+
pgvector for vector storage
|
|
1320
|
+
Stage 3 (Script): Claude Opus / Gemini 2.5 Pro for script generation
|
|
1321
|
+
Gemini Flash for faithfulness verification
|
|
1322
|
+
(Large model = better creative writing)
|
|
1323
|
+
Stage 4 (Synthesize): ElevenLabs Multilingual V2 for TTS
|
|
1324
|
+
(Or Orpheus TTS 3B on local GPU -- open source, near-commercial quality)
|
|
1325
|
+
FFmpeg for composition + mastering
|
|
1326
|
+
|
|
1327
|
+
Cost estimate (15-min episode from 10-page PDF):
|
|
1328
|
+
- Gemini 2.5 Pro: ~$0.15 (input) + $0.30 (output) = ~$0.45
|
|
1329
|
+
- Gemini Flash (verification): ~$0.01
|
|
1330
|
+
- Gemini Embedding: ~$0.001
|
|
1331
|
+
- ElevenLabs TTS: ~$0.50 (15 min at scale tier)
|
|
1332
|
+
- Total: ~$1.00 per episode
|
|
1333
|
+
|
|
1334
|
+
Hardware: Any machine. All processing is API-based.
|
|
1335
|
+
Latency: 3-8 minutes for a 15-minute episode.
|
|
1336
|
+
```
|
|
1337
|
+
|
|
1338
|
+
**Meta NotebookLlama Model Tiers (for self-hosted GPU):**
|
|
1339
|
+
|
|
1340
|
+
| Stage | Model | Purpose | Why This Size |
|
|
1341
|
+
|-------|-------|---------|---------------|
|
|
1342
|
+
| Text cleanup | Llama-3.2-1B | Strip headers, fix OCR errors | Small = fast, simple task |
|
|
1343
|
+
| Script writing | Llama-3.1-70B (or API) | Creative multi-speaker dialogue | Large = better creativity |
|
|
1344
|
+
| TTS prep | Llama-3.1-8B | Add SSML/emotion markers | Medium = good enough |
|
|
1345
|
+
| Audio | Orpheus TTS 3B | Speech synthesis | Specialized model |
|
|
1346
|
+
|
|
1347
|
+
Key insight from Meta's research: **do not use the same model for every stage.** Match model
|
|
1348
|
+
capability to task complexity. Small models are better (faster, cheaper) for mechanical tasks;
|
|
1349
|
+
large models are needed only for creative generation.
|
|
1350
|
+
|
|
1351
|
+
### B) Local / CPU-Only
|
|
1352
|
+
|
|
1353
|
+
Based on Mozilla AI's Document-to-Podcast Blueprint. Fully private, zero API cost, runs on
|
|
1354
|
+
consumer hardware. Lower quality but completely offline.
|
|
1355
|
+
|
|
1356
|
+
```
|
|
1357
|
+
Architecture:
|
|
1358
|
+
Stage 1 (Ingest): Same parsers (pdf-parse, mammoth, readability)
|
|
1359
|
+
Stage 2 (Understand): Llama 3.2 3B GGUF via llama_cpp (Q4_K_M quantization)
|
|
1360
|
+
nomic-embed-text via Ollama for embeddings
|
|
1361
|
+
Qdrant (local Docker) for vector storage
|
|
1362
|
+
Stage 3 (Script): Llama 3.1 8B GGUF via llama_cpp (Q5_K_M quantization)
|
|
1363
|
+
Self-verification (same model, second pass)
|
|
1364
|
+
Stage 4 (Synthesize): Orpheus TTS 150M (CPU-optimized) or Parler TTS Mini
|
|
1365
|
+
FFmpeg for composition + mastering
|
|
1366
|
+
|
|
1367
|
+
Cost: $0.00 (no API calls)
|
|
1368
|
+
|
|
1369
|
+
Hardware requirements:
|
|
1370
|
+
- RAM: 16GB minimum (8B model needs ~6GB in Q5_K_M)
|
|
1371
|
+
- Storage: ~15GB for all models
|
|
1372
|
+
- CPU: Modern 8-core (Intel 12th+ / AMD 5000+)
|
|
1373
|
+
- GPU: None required (but CUDA/Metal accelerates if available)
|
|
1374
|
+
|
|
1375
|
+
Latency: 15-45 minutes for a 15-minute episode (CPU-bound on TTS).
|
|
1376
|
+
|
|
1377
|
+
Model downloads (one-time):
|
|
1378
|
+
# Via Ollama (easiest)
|
|
1379
|
+
ollama pull llama3.2:3b # Understanding stage
|
|
1380
|
+
ollama pull llama3.1:8b # Script generation
|
|
1381
|
+
ollama pull nomic-embed-text # Embeddings
|
|
1382
|
+
|
|
1383
|
+
# Via llama_cpp (more control)
|
|
1384
|
+
# Download GGUF from huggingface.co/TheBloke or official repos
|
|
1385
|
+
```
|
|
1386
|
+
|
|
1387
|
+
**Quality Comparison:**
|
|
1388
|
+
|
|
1389
|
+
| Aspect | Full-Scale (API) | Local (CPU) |
|
|
1390
|
+
|--------|-----------------|-------------|
|
|
1391
|
+
| Script naturalness | 9/10 | 6/10 |
|
|
1392
|
+
| Voice quality | 9/10 (ElevenLabs) | 5/10 (Orpheus 150M) |
|
|
1393
|
+
| Faithfulness | 9/10 (separate verifier) | 7/10 (self-verify) |
|
|
1394
|
+
| Latency (15min ep) | 3-8 min | 15-45 min |
|
|
1395
|
+
| Cost per episode | ~$1.00 | $0.00 |
|
|
1396
|
+
| Privacy | Data sent to APIs | Fully local |
|
|
1397
|
+
| Offline capable | No | Yes |
|
|
1398
|
+
|
|
1399
|
+
---
|
|
1400
|
+
|
|
1401
|
+
## Complete TypeScript Pipeline Class
|
|
1402
|
+
|
|
1403
|
+
```typescript
|
|
1404
|
+
import { mkdir, writeFile, stat, readFile, rm } from 'fs/promises';
|
|
1405
|
+
import { join } from 'path';
|
|
1406
|
+
import { execFile } from 'child_process';
|
|
1407
|
+
import { promisify } from 'util';
|
|
1408
|
+
import { GoogleGenerativeAI } from '@google/generative-ai';
|
|
1409
|
+
|
|
1410
|
+
const execFileAsync = promisify(execFile);
|
|
1411
|
+
|
|
1412
|
+
// --- Configuration ---------------------------------------------------------
|
|
1413
|
+
|
|
1414
|
+
/** TTS backends selectable at pipeline level. */
type PipelineTTSBackend = 'elevenlabs' | 'orpheus' | 'chatterbox' | 'google-cloud';

/** LLM vendors selectable at pipeline level. */
type PipelineAIBackend = 'gemini' | 'claude';

/** Episode styles the pipeline can produce. */
type PipelineEpisodeFormat = 'deep-dive' | 'brief' | 'debate' | 'narration';

/** Supported target lengths. */
type PipelineEpisodeDuration = '5min' | '15min' | '30min' | '60min';

/** Container/codec choices for the final file. */
type PipelineOutputFormat = 'mp3' | 'wav' | 'ogg';

/** All user-tunable settings for one pipeline run. */
interface PipelineConfig {
  ttsProvider: PipelineTTSBackend;
  aiProvider: PipelineAIBackend;
  format: PipelineEpisodeFormat;
  duration: PipelineEpisodeDuration;
  speakers: SpeakerConfig[];
  outputFormat: PipelineOutputFormat;
  includeMusic: boolean;
  language: string;
  outputDir: string;
  /** If true, keep intermediate files (segments, concat list) for debugging */
  keepIntermediates: boolean;
}
|
|
1427
|
+
|
|
1428
|
+
const DEFAULT_CONFIG: PipelineConfig = {
|
|
1429
|
+
ttsProvider: 'elevenlabs',
|
|
1430
|
+
aiProvider: 'gemini',
|
|
1431
|
+
format: 'deep-dive',
|
|
1432
|
+
duration: '15min',
|
|
1433
|
+
speakers: [
|
|
1434
|
+
{ name: 'Alex', role: 'host', voiceId: 'pNInz6obpgDQGcFmaJgB', provider: 'elevenlabs' },
|
|
1435
|
+
{ name: 'Jordan', role: 'guest', voiceId: '21m00Tcm4TlvDq8ikWAM', provider: 'elevenlabs' },
|
|
1436
|
+
],
|
|
1437
|
+
outputFormat: 'mp3',
|
|
1438
|
+
includeMusic: false,
|
|
1439
|
+
language: 'en',
|
|
1440
|
+
outputDir: './podcast-output',
|
|
1441
|
+
keepIntermediates: false,
|
|
1442
|
+
};
|
|
1443
|
+
|
|
1444
|
+
// --- Pipeline Class --------------------------------------------------------
|
|
1445
|
+
|
|
1446
|
+
class DocToPodcastPipeline {
|
|
1447
|
+
private config: PipelineConfig;
|
|
1448
|
+
private genAI: GoogleGenerativeAI;
|
|
1449
|
+
private chunks: TextChunk[] = [];
|
|
1450
|
+
|
|
1451
|
+
constructor(config: Partial<PipelineConfig> = {}) {
|
|
1452
|
+
this.config = { ...DEFAULT_CONFIG, ...config };
|
|
1453
|
+
this.genAI = new GoogleGenerativeAI(process.env.GEMINI_API_KEY!);
|
|
1454
|
+
}
|
|
1455
|
+
|
|
1456
|
+
/**
|
|
1457
|
+
* Run the complete pipeline: Document -> Podcast Audio
|
|
1458
|
+
*/
|
|
1459
|
+
async run(source: DocumentSource): Promise<PodcastAudio> {
|
|
1460
|
+
const startTime = Date.now();
|
|
1461
|
+
const outputDir = this.config.outputDir;
|
|
1462
|
+
const segmentsDir = join(outputDir, 'segments');
|
|
1463
|
+
|
|
1464
|
+
await mkdir(outputDir, { recursive: true });
|
|
1465
|
+
await mkdir(segmentsDir, { recursive: true });
|
|
1466
|
+
|
|
1467
|
+
console.log('[Pipeline] Stage 1: Ingesting document...');
|
|
1468
|
+
const doc = await this.ingest(source);
|
|
1469
|
+
console.log(
|
|
1470
|
+
`[Pipeline] Ingested: "${doc.metadata.title}" (${doc.metadata.wordCount} words)`
|
|
1471
|
+
);
|
|
1472
|
+
|
|
1473
|
+
console.log('[Pipeline] Stage 2: Analyzing content...');
|
|
1474
|
+
const outline = await this.understand(doc);
|
|
1475
|
+
console.log(
|
|
1476
|
+
`[Pipeline] Outline: ${outline.segments.length} segments, ` +
|
|
1477
|
+
`${outline.keyPoints.length} key points`
|
|
1478
|
+
);
|
|
1479
|
+
|
|
1480
|
+
console.log('[Pipeline] Stage 3: Generating script...');
|
|
1481
|
+
const script = await this.generateScript(outline, doc);
|
|
1482
|
+
console.log(
|
|
1483
|
+
`[Pipeline] Script: ${script.lines.length} lines, ~${script.totalDuration}s`
|
|
1484
|
+
);
|
|
1485
|
+
|
|
1486
|
+
// Save script for reference
|
|
1487
|
+
await writeFile(join(outputDir, 'script.json'), JSON.stringify(script, null, 2));
|
|
1488
|
+
|
|
1489
|
+
console.log('[Pipeline] Stage 4: Synthesizing audio...');
|
|
1490
|
+
const audio = await this.synthesize(script);
|
|
1491
|
+
|
|
1492
|
+
const elapsed = ((Date.now() - startTime) / 1000).toFixed(1);
|
|
1493
|
+
console.log(`[Pipeline] Complete in ${elapsed}s -> ${audio.filePath}`);
|
|
1494
|
+
|
|
1495
|
+
// Cleanup intermediates unless debugging
|
|
1496
|
+
if (!this.config.keepIntermediates) {
|
|
1497
|
+
await rm(segmentsDir, { recursive: true, force: true }).catch(() => {});
|
|
1498
|
+
}
|
|
1499
|
+
|
|
1500
|
+
return audio;
|
|
1501
|
+
}
|
|
1502
|
+
|
|
1503
|
+
/**
|
|
1504
|
+
* Stage 1: Parse document source into clean text with metadata
|
|
1505
|
+
*/
|
|
1506
|
+
async ingest(source: DocumentSource): Promise<ParsedDocument> {
|
|
1507
|
+
return ingestDocument(source);
|
|
1508
|
+
}
|
|
1509
|
+
|
|
1510
|
+
/**
|
|
1511
|
+
* Stage 2: Analyze document, extract key points, generate podcast outline
|
|
1512
|
+
*/
|
|
1513
|
+
async understand(doc: ParsedDocument): Promise<PodcastOutline> {
|
|
1514
|
+
// Chunk the document
|
|
1515
|
+
const chunks = semanticChunk(doc);
|
|
1516
|
+
|
|
1517
|
+
// Embed chunks (for potential RAG retrieval in future iterations)
|
|
1518
|
+
this.chunks = await embedWithGemini(chunks);
|
|
1519
|
+
|
|
1520
|
+
// Extract key points
|
|
1521
|
+
const maxPoints = this.config.format === 'brief' ? 4 : 8;
|
|
1522
|
+
const keyPoints = await extractKeyPoints(doc, this.chunks, maxPoints);
|
|
1523
|
+
|
|
1524
|
+
// Generate outline
|
|
1525
|
+
const outline = await generateOutline(doc, keyPoints, {
|
|
1526
|
+
format: this.config.format,
|
|
1527
|
+
duration: this.config.duration,
|
|
1528
|
+
});
|
|
1529
|
+
|
|
1530
|
+
return outline;
|
|
1531
|
+
}
|
|
1532
|
+
|
|
1533
|
+
/**
|
|
1534
|
+
* Stage 3: Generate multi-speaker podcast script from outline
|
|
1535
|
+
*/
|
|
1536
|
+
async generateScript(
|
|
1537
|
+
outline: PodcastOutline,
|
|
1538
|
+
doc?: ParsedDocument
|
|
1539
|
+
): Promise<PodcastScript> {
|
|
1540
|
+
const chunks = this.chunks.length > 0 ? this.chunks : semanticChunk(doc!);
|
|
1541
|
+
const script = await generateScript(outline, chunks, this.config);
|
|
1542
|
+
return adjustScriptDuration(script);
|
|
1543
|
+
}
|
|
1544
|
+
|
|
1545
|
+
/**
|
|
1546
|
+
* Stage 4: Synthesize audio from script, compose final podcast
|
|
1547
|
+
*/
|
|
1548
|
+
async synthesize(script: PodcastScript): Promise<PodcastAudio> {
|
|
1549
|
+
const segmentsDir = join(this.config.outputDir, 'segments');
|
|
1550
|
+
|
|
1551
|
+
// Generate individual audio segments via TTS
|
|
1552
|
+
const segmentPaths = await generateAudioSegments(
|
|
1553
|
+
script,
|
|
1554
|
+
this.config.speakers,
|
|
1555
|
+
segmentsDir
|
|
1556
|
+
);
|
|
1557
|
+
|
|
1558
|
+
// Compose final audio with FFmpeg
|
|
1559
|
+
const finalPath = await composeAudio(
|
|
1560
|
+
segmentPaths,
|
|
1561
|
+
this.config,
|
|
1562
|
+
this.config.outputDir,
|
|
1563
|
+
script.title
|
|
1564
|
+
);
|
|
1565
|
+
|
|
1566
|
+
// Get file stats
|
|
1567
|
+
const fileStat = await stat(finalPath);
|
|
1568
|
+
|
|
1569
|
+
// Calculate actual duration from FFmpeg probe
|
|
1570
|
+
let duration = script.totalDuration;
|
|
1571
|
+
try {
|
|
1572
|
+
const { stdout } = await execFileAsync('ffprobe', [
|
|
1573
|
+
'-v', 'quiet',
|
|
1574
|
+
'-show_entries', 'format=duration',
|
|
1575
|
+
'-of', 'csv=p=0',
|
|
1576
|
+
finalPath,
|
|
1577
|
+
]);
|
|
1578
|
+
duration = parseFloat(stdout.trim()) || duration;
|
|
1579
|
+
} catch {
|
|
1580
|
+
// Fall back to estimated duration
|
|
1581
|
+
}
|
|
1582
|
+
|
|
1583
|
+
return {
|
|
1584
|
+
filePath: finalPath,
|
|
1585
|
+
duration,
|
|
1586
|
+
format: this.config.outputFormat,
|
|
1587
|
+
fileSize: fileStat.size,
|
|
1588
|
+
metadata: {
|
|
1589
|
+
title: script.title,
|
|
1590
|
+
description:
|
|
1591
|
+
`Generated podcast from document. ${script.lines.length} script lines.`,
|
|
1592
|
+
speakers: [script.speakers.host.name, script.speakers.guest.name],
|
|
1593
|
+
generatedAt: new Date().toISOString(),
|
|
1594
|
+
},
|
|
1595
|
+
};
|
|
1596
|
+
}
|
|
1597
|
+
}
|
|
1598
|
+
```
|
|
1599
|
+
|
|
1600
|
+
### Usage Examples
|
|
1601
|
+
|
|
1602
|
+
```typescript
|
|
1603
|
+
// Example 1: PDF to podcast (cloud API, full quality)
|
|
1604
|
+
const pipeline = new DocToPodcastPipeline({
|
|
1605
|
+
ttsProvider: 'elevenlabs',
|
|
1606
|
+
aiProvider: 'gemini',
|
|
1607
|
+
format: 'deep-dive',
|
|
1608
|
+
duration: '15min',
|
|
1609
|
+
outputDir: './output/my-podcast',
|
|
1610
|
+
speakers: [
|
|
1611
|
+
{ name: 'Alex', role: 'host', voiceId: 'pNInz6obpgDQGcFmaJgB', provider: 'elevenlabs' },
|
|
1612
|
+
{ name: 'Jordan', role: 'guest', voiceId: '21m00Tcm4TlvDq8ikWAM', provider: 'elevenlabs' },
|
|
1613
|
+
],
|
|
1614
|
+
});
|
|
1615
|
+
|
|
1616
|
+
const result = await pipeline.run({
|
|
1617
|
+
type: 'pdf',
|
|
1618
|
+
path: './documents/research-paper.pdf',
|
|
1619
|
+
});
|
|
1620
|
+
console.log(`Podcast: ${result.filePath} (${result.duration}s, ${result.fileSize} bytes)`);
|
|
1621
|
+
|
|
1622
|
+
// Example 2: URL to brief podcast
|
|
1623
|
+
const brief = await new DocToPodcastPipeline({
|
|
1624
|
+
format: 'brief',
|
|
1625
|
+
duration: '5min',
|
|
1626
|
+
outputDir: './output/quick-brief',
|
|
1627
|
+
}).run({ type: 'url', url: 'https://example.com/article' });
|
|
1628
|
+
|
|
1629
|
+
// Example 3: YouTube video to podcast episode
|
|
1630
|
+
const ytPodcast = await new DocToPodcastPipeline({
|
|
1631
|
+
format: 'deep-dive',
|
|
1632
|
+
duration: '30min',
|
|
1633
|
+
outputDir: './output/yt-episode',
|
|
1634
|
+
}).run({ type: 'youtube', url: 'https://youtube.com/watch?v=example123' });
|
|
1635
|
+
|
|
1636
|
+
// Example 4: Direct text input (e.g., from a database or CMS)
|
|
1637
|
+
const textPodcast = await new DocToPodcastPipeline({
|
|
1638
|
+
format: 'narration',
|
|
1639
|
+
duration: '5min',
|
|
1640
|
+
speakers: [
|
|
1641
|
+
{ name: 'Narrator', role: 'narrator', voiceId: 'en-US-Studio-O', provider: 'google-cloud' },
|
|
1642
|
+
{ name: 'Expert', role: 'guest', voiceId: 'en-US-Studio-Q', provider: 'google-cloud' },
|
|
1643
|
+
],
|
|
1644
|
+
outputDir: './output/text-podcast',
|
|
1645
|
+
}).run({
|
|
1646
|
+
type: 'text',
|
|
1647
|
+
content: 'Your document text here...',
|
|
1648
|
+
title: 'Weekly Update',
|
|
1649
|
+
});
|
|
1650
|
+
```
|
|
1651
|
+
|
|
1652
|
+
---
|
|
1653
|
+
|
|
1654
|
+
## Ministry / Church Use Cases
|
|
1655
|
+
|
|
1656
|
+
Since this pipeline is designed with Thierry's ministry application stack in mind, here are
|
|
1657
|
+
specific configurations for common church content scenarios.
|
|
1658
|
+
|
|
1659
|
+
### Sermon to Podcast Episode
|
|
1660
|
+
|
|
1661
|
+
Transform a recorded sermon transcript or notes into a polished podcast discussion.
|
|
1662
|
+
|
|
1663
|
+
```typescript
|
|
1664
|
+
const sermonPipeline = new DocToPodcastPipeline({
|
|
1665
|
+
format: 'deep-dive',
|
|
1666
|
+
duration: '30min',
|
|
1667
|
+
aiProvider: 'gemini',
|
|
1668
|
+
ttsProvider: 'elevenlabs',
|
|
1669
|
+
outputDir: './output/sermon-podcast',
|
|
1670
|
+
speakers: [
|
|
1671
|
+
{ name: 'Pastor Mike', role: 'host', voiceId: 'voice-id-1', provider: 'elevenlabs' },
|
|
1672
|
+
{ name: 'Dr. Sarah', role: 'guest', voiceId: 'voice-id-2', provider: 'elevenlabs' },
|
|
1673
|
+
],
|
|
1674
|
+
includeMusic: true,
|
|
1675
|
+
});
|
|
1676
|
+
|
|
1677
|
+
// From sermon notes (DOCX from pastor's study)
|
|
1678
|
+
const episode = await sermonPipeline.run({
|
|
1679
|
+
type: 'docx',
|
|
1680
|
+
path: './sermons/2026-03-10-grace-in-action.docx',
|
|
1681
|
+
});
|
|
1682
|
+
|
|
1683
|
+
// From sermon recording transcript (already transcribed via transcription-pipeline-selector)
|
|
1684
|
+
const fromRecording = await sermonPipeline.run({
|
|
1685
|
+
type: 'text',
|
|
1686
|
+
content: transcriptText,
|
|
1687
|
+
title: 'Grace in Action - Sunday Sermon Discussion',
|
|
1688
|
+
});
|
|
1689
|
+
```
|
|
1690
|
+
|
|
1691
|
+
### Bible Study to Discussion Format
|
|
1692
|
+
|
|
1693
|
+
Convert Bible study materials into an engaging discussion podcast where the host and guest
|
|
1694
|
+
explore the passage together.
|
|
1695
|
+
|
|
1696
|
+
```typescript
|
|
1697
|
+
const bibleStudyPipeline = new DocToPodcastPipeline({
|
|
1698
|
+
format: 'deep-dive',
|
|
1699
|
+
duration: '15min',
|
|
1700
|
+
speakers: [
|
|
1701
|
+
{ name: 'Teacher', role: 'host', voiceId: 'voice-id-1', provider: 'elevenlabs' },
|
|
1702
|
+
{ name: 'Student', role: 'guest', voiceId: 'voice-id-2', provider: 'elevenlabs' },
|
|
1703
|
+
],
|
|
1704
|
+
outputDir: './output/bible-study',
|
|
1705
|
+
});
|
|
1706
|
+
|
|
1707
|
+
// Custom AI prompt override for Bible study context:
|
|
1708
|
+
// The key point extractor can be tuned to focus on theological themes:
|
|
1709
|
+
// - Historical context of the passage
|
|
1710
|
+
// - Key Greek/Hebrew word meanings
|
|
1711
|
+
// - Practical application points
|
|
1712
|
+
// - Cross-references to other scripture
|
|
1713
|
+
```
|
|
1714
|
+
|
|
1715
|
+
### Church Announcement to Brief Audio Update
|
|
1716
|
+
|
|
1717
|
+
Quick 2-3 minute audio updates for the congregation (configured below with `5min`, the shortest `duration` value the pipeline accepts).
|
|
1718
|
+
|
|
1719
|
+
```typescript
|
|
1720
|
+
const announcementPipeline = new DocToPodcastPipeline({
|
|
1721
|
+
format: 'brief',
|
|
1722
|
+
duration: '5min',
|
|
1723
|
+
speakers: [
|
|
1724
|
+
{ name: 'Church Office', role: 'narrator', voiceId: 'en-US-Studio-O', provider: 'google-cloud' },
|
|
1725
|
+
{ name: 'Pastor', role: 'host', voiceId: 'voice-id-1', provider: 'elevenlabs' },
|
|
1726
|
+
],
|
|
1727
|
+
outputDir: './output/announcements',
|
|
1728
|
+
includeMusic: true,
|
|
1729
|
+
});
|
|
1730
|
+
|
|
1731
|
+
const announcement = await announcementPipeline.run({
|
|
1732
|
+
type: 'text',
|
|
1733
|
+
content: `
|
|
1734
|
+
This week at Grace Community Church:
|
|
1735
|
+
- Sunday Service at 10am: "Walking in Faith" series continues
|
|
1736
|
+
- Wednesday Bible Study: Romans Chapter 8, 7pm in Fellowship Hall
|
|
1737
|
+
- Youth Group Friday Night: Movie and pizza, 6-9pm
|
|
1738
|
+
- Volunteer sign-ups for Easter service are open at the welcome desk
|
|
1739
|
+
- Prayer requests can be submitted online at our website
|
|
1740
|
+
`,
|
|
1741
|
+
title: 'This Week at Grace Community - March 10, 2026',
|
|
1742
|
+
});
|
|
1743
|
+
```
|
|
1744
|
+
|
|
1745
|
+
### Teaching Recording to Educational Deep-Dive
|
|
1746
|
+
|
|
1747
|
+
Transform a lecture or teaching session into a structured educational podcast that
|
|
1748
|
+
breaks down complex theological or educational topics.
|
|
1749
|
+
|
|
1750
|
+
```typescript
|
|
1751
|
+
const teachingPipeline = new DocToPodcastPipeline({
|
|
1752
|
+
format: 'deep-dive',
|
|
1753
|
+
duration: '60min',
|
|
1754
|
+
speakers: [
|
|
1755
|
+
{ name: 'Professor', role: 'host', voiceId: 'voice-id-1', provider: 'elevenlabs' },
|
|
1756
|
+
{ name: 'Teaching Assistant', role: 'guest', voiceId: 'voice-id-2', provider: 'elevenlabs' },
|
|
1757
|
+
],
|
|
1758
|
+
outputDir: './output/teaching-series',
|
|
1759
|
+
});
|
|
1760
|
+
|
|
1761
|
+
// From a seminary lecture PDF
|
|
1762
|
+
const lecture = await teachingPipeline.run({
|
|
1763
|
+
type: 'pdf',
|
|
1764
|
+
path: './materials/systematic-theology-ch3.pdf',
|
|
1765
|
+
});
|
|
1766
|
+
|
|
1767
|
+
// Integration with content-repurposing-pipeline:
|
|
1768
|
+
// After generating the podcast, feed the script into the repurposing pipeline
|
|
1769
|
+
// to create social media clips, quote cards, and blog posts from the same source.
|
|
1770
|
+
```
|
|
1771
|
+
|
|
1772
|
+
### Ministry Integration Architecture
|
|
1773
|
+
|
|
1774
|
+
```
|
|
1775
|
+
[Sermon Recording] [Bible Study Notes] [Announcements] [Teaching Materials]
|
|
1776
|
+
| | | |
|
|
1777
|
+
v v v v
|
|
1778
|
+
[Transcription] [DOCX Parser] [Text Input] [PDF Parser]
|
|
1779
|
+
| | | |
|
|
1780
|
+
+--------------------+------------------+-------------------+
|
|
1781
|
+
|
|
|
1782
|
+
v
|
|
1783
|
+
+-------------------------------+
|
|
1784
|
+
| DocToPodcastPipeline.run() |
|
|
1785
|
+
| (format per content type) |
|
|
1786
|
+
+-------------------------------+
|
|
1787
|
+
|
|
|
1788
|
+
v
|
|
1789
|
+
+-------------------------------+
|
|
1790
|
+
| Output: MP3 + Script JSON |
|
|
1791
|
+
+-------------------------------+
|
|
1792
|
+
| |
|
|
1793
|
+
v v
|
|
1794
|
+
[Podcast RSS] [Content Repurposing]
|
|
1795
|
+
[Apple/Spotify] [Social clips, quotes]
|
|
1796
|
+
```
|
|
1797
|
+
|
|
1798
|
+
---
|
|
1799
|
+
|
|
1800
|
+
## Error Handling and Resilience
|
|
1801
|
+
|
|
1802
|
+
### Retry Logic for TTS APIs
|
|
1803
|
+
|
|
1804
|
+
```typescript
|
|
1805
|
+
async function withRetry<T>(
|
|
1806
|
+
fn: () => Promise<T>,
|
|
1807
|
+
maxRetries: number = 3,
|
|
1808
|
+
delayMs: number = 1000
|
|
1809
|
+
): Promise<T> {
|
|
1810
|
+
for (let attempt = 1; attempt <= maxRetries; attempt++) {
|
|
1811
|
+
try {
|
|
1812
|
+
return await fn();
|
|
1813
|
+
} catch (error: any) {
|
|
1814
|
+
const isRateLimit = error?.status === 429;
|
|
1815
|
+
const isServerError = error?.status >= 500;
|
|
1816
|
+
|
|
1817
|
+
if (attempt === maxRetries || (!isRateLimit && !isServerError)) {
|
|
1818
|
+
throw error;
|
|
1819
|
+
}
|
|
1820
|
+
|
|
1821
|
+
const backoff = isRateLimit ? delayMs * attempt * 2 : delayMs * attempt;
|
|
1822
|
+
console.warn(
|
|
1823
|
+
`[Retry ${attempt}/${maxRetries}] ${error.message}. Waiting ${backoff}ms...`
|
|
1824
|
+
);
|
|
1825
|
+
await new Promise((resolve) => setTimeout(resolve, backoff));
|
|
1826
|
+
}
|
|
1827
|
+
}
|
|
1828
|
+
throw new Error(`withRetry: maxRetries must be >= 1 (got ${maxRetries})`); // Only reached when the loop never runs, i.e. fn was never attempted
|
|
1829
|
+
}
|
|
1830
|
+
|
|
1831
|
+
// Usage in TTS generation:
|
|
1832
|
+
const audioBuffer = await withRetry(
|
|
1833
|
+
() => tts.provider.synthesize(line.text, tts.voiceId, line.emotion),
|
|
1834
|
+
3,
|
|
1835
|
+
2000
|
|
1836
|
+
);
|
|
1837
|
+
```
|
|
1838
|
+
|
|
1839
|
+
### Pipeline Checkpointing
|
|
1840
|
+
|
|
1841
|
+
For long-running pipelines (60-min episodes), save intermediate state so a failure
|
|
1842
|
+
in Stage 4 does not require re-running Stages 1-3.
|
|
1843
|
+
|
|
1844
|
+
```typescript
|
|
1845
|
+
interface PipelineCheckpoint {
|
|
1846
|
+
stage: 1 | 2 | 3 | 4;
|
|
1847
|
+
doc?: ParsedDocument;
|
|
1848
|
+
outline?: PodcastOutline;
|
|
1849
|
+
script?: PodcastScript;
|
|
1850
|
+
segmentPaths?: string[];
|
|
1851
|
+
timestamp: string;
|
|
1852
|
+
}
|
|
1853
|
+
|
|
1854
|
+
async function saveCheckpoint(
|
|
1855
|
+
checkpoint: PipelineCheckpoint,
|
|
1856
|
+
outputDir: string
|
|
1857
|
+
): Promise<void> {
|
|
1858
|
+
await writeFile(
|
|
1859
|
+
join(outputDir, 'checkpoint.json'),
|
|
1860
|
+
JSON.stringify(checkpoint, null, 2)
|
|
1861
|
+
);
|
|
1862
|
+
}
|
|
1863
|
+
|
|
1864
|
+
async function loadCheckpoint(outputDir: string): Promise<PipelineCheckpoint | null> {
|
|
1865
|
+
try {
|
|
1866
|
+
const data = await readFile(join(outputDir, 'checkpoint.json'), 'utf-8');
|
|
1867
|
+
return JSON.parse(data);
|
|
1868
|
+
} catch {
|
|
1869
|
+
return null;
|
|
1870
|
+
}
|
|
1871
|
+
}
|
|
1872
|
+
|
|
1873
|
+
// Enhanced run() with checkpointing:
|
|
1874
|
+
async function runWithCheckpoints(
|
|
1875
|
+
pipeline: DocToPodcastPipeline,
|
|
1876
|
+
source: DocumentSource,
|
|
1877
|
+
outputDir: string
|
|
1878
|
+
): Promise<PodcastAudio> {
|
|
1879
|
+
const existing = await loadCheckpoint(outputDir);
|
|
1880
|
+
|
|
1881
|
+
let doc: ParsedDocument;
|
|
1882
|
+
let outline: PodcastOutline;
|
|
1883
|
+
let script: PodcastScript;
|
|
1884
|
+
|
|
1885
|
+
if (existing && existing.stage >= 1 && existing.doc) { // `stage` is the last completed stage; a stage-1 checkpoint already holds the parsed doc
|
|
1886
|
+
console.log('[Resume] Skipping Stage 1 (cached)');
|
|
1887
|
+
doc = existing.doc;
|
|
1888
|
+
} else {
|
|
1889
|
+
doc = await pipeline.ingest(source);
|
|
1890
|
+
await saveCheckpoint(
|
|
1891
|
+
{ stage: 1, doc, timestamp: new Date().toISOString() },
|
|
1892
|
+
outputDir
|
|
1893
|
+
);
|
|
1894
|
+
}
|
|
1895
|
+
|
|
1896
|
+
if (existing && existing.stage >= 2 && existing.outline) { // `stage` is the last completed stage; a stage-2 checkpoint already holds the outline
|
|
1897
|
+
console.log('[Resume] Skipping Stage 2 (cached)');
|
|
1898
|
+
outline = existing.outline;
|
|
1899
|
+
} else {
|
|
1900
|
+
outline = await pipeline.understand(doc);
|
|
1901
|
+
await saveCheckpoint(
|
|
1902
|
+
{ stage: 2, doc, outline, timestamp: new Date().toISOString() },
|
|
1903
|
+
outputDir
|
|
1904
|
+
);
|
|
1905
|
+
}
|
|
1906
|
+
|
|
1907
|
+
if (existing && existing.stage >= 3 && existing.script) { // `stage` is the last completed stage; stage 4 is never checkpointed, so 3 is the highest value a saved checkpoint can carry
|
|
1908
|
+
console.log('[Resume] Skipping Stage 3 (cached)');
|
|
1909
|
+
script = existing.script;
|
|
1910
|
+
} else {
|
|
1911
|
+
script = await pipeline.generateScript(outline, doc);
|
|
1912
|
+
await saveCheckpoint(
|
|
1913
|
+
{ stage: 3, doc, outline, script, timestamp: new Date().toISOString() },
|
|
1914
|
+
outputDir
|
|
1915
|
+
);
|
|
1916
|
+
}
|
|
1917
|
+
|
|
1918
|
+
const audio = await pipeline.synthesize(script);
|
|
1919
|
+
return audio;
|
|
1920
|
+
}
|
|
1921
|
+
```
|
|
1922
|
+
|
|
1923
|
+
---
|
|
1924
|
+
|
|
1925
|
+
## Performance Optimization
|
|
1926
|
+
|
|
1927
|
+
### Parallel TTS Generation
|
|
1928
|
+
|
|
1929
|
+
For cloud TTS providers with sufficient rate limits, generate multiple segments in parallel.
|
|
1930
|
+
|
|
1931
|
+
```typescript
|
|
1932
|
+
async function generateAudioSegmentsParallel(
|
|
1933
|
+
script: PodcastScript,
|
|
1934
|
+
speakers: SpeakerConfig[],
|
|
1935
|
+
outputDir: string,
|
|
1936
|
+
concurrency: number = 5
|
|
1937
|
+
): Promise<string[]> {
|
|
1938
|
+
await mkdir(outputDir, { recursive: true });
|
|
1939
|
+
|
|
1940
|
+
const ttsProviders = buildTTSProviders(speakers);
|
|
1941
|
+
const segmentPaths: string[] = new Array(script.lines.length);
|
|
1942
|
+
|
|
1943
|
+
// Process in batches of `concurrency`
|
|
1944
|
+
for (let i = 0; i < script.lines.length; i += concurrency) {
|
|
1945
|
+
const batch = script.lines.slice(i, i + concurrency);
|
|
1946
|
+
const promises = batch.map(async (line, batchIdx) => {
|
|
1947
|
+
const globalIdx = i + batchIdx;
|
|
1948
|
+
const tts = ttsProviders[line.speaker];
|
|
1949
|
+
if (!tts) { console.warn(`[TTS] No provider configured for speaker "${line.speaker}"; skipping line ${globalIdx}`); return; } // Still best-effort, but no longer drops dialogue silently
|
|
1950
|
+
|
|
1951
|
+
const audioBuffer = await withRetry(
|
|
1952
|
+
() => tts.provider.synthesize(line.text, tts.voiceId, line.emotion),
|
|
1953
|
+
3,
|
|
1954
|
+
2000
|
|
1955
|
+
);
|
|
1956
|
+
const segmentPath = join(
|
|
1957
|
+
outputDir,
|
|
1958
|
+
`segment-${String(globalIdx).padStart(4, '0')}.wav`
|
|
1959
|
+
);
|
|
1960
|
+
await writeFile(segmentPath, audioBuffer);
|
|
1961
|
+
segmentPaths[globalIdx] = segmentPath;
|
|
1962
|
+
});
|
|
1963
|
+
|
|
1964
|
+
await Promise.all(promises);
|
|
1965
|
+
console.log(
|
|
1966
|
+
`[TTS] Completed ${Math.min(i + concurrency, script.lines.length)}` +
|
|
1967
|
+
`/${script.lines.length}`
|
|
1968
|
+
);
|
|
1969
|
+
}
|
|
1970
|
+
|
|
1971
|
+
return segmentPaths.filter(Boolean);
|
|
1972
|
+
}
|
|
1973
|
+
|
|
1974
|
+
function buildTTSProviders(
|
|
1975
|
+
speakers: SpeakerConfig[]
|
|
1976
|
+
): Record<string, { provider: TTSProvider; voiceId: string }> {
|
|
1977
|
+
const providers: Record<string, { provider: TTSProvider; voiceId: string }> = {};
|
|
1978
|
+
for (const speaker of speakers) {
|
|
1979
|
+
switch (speaker.provider) {
|
|
1980
|
+
case 'elevenlabs':
|
|
1981
|
+
providers[speaker.role] = {
|
|
1982
|
+
provider: new ElevenLabsTTS(process.env.ELEVENLABS_API_KEY!),
|
|
1983
|
+
voiceId: speaker.voiceId,
|
|
1984
|
+
};
|
|
1985
|
+
break;
|
|
1986
|
+
case 'google-cloud':
|
|
1987
|
+
providers[speaker.role] = {
|
|
1988
|
+
provider: new GoogleCloudTTS(process.env.GOOGLE_TTS_API_KEY!),
|
|
1989
|
+
voiceId: speaker.voiceId,
|
|
1990
|
+
};
|
|
1991
|
+
break;
|
|
1992
|
+
}
|
|
1993
|
+
}
|
|
1994
|
+
return providers;
|
|
1995
|
+
}
|
|
1996
|
+
```
|
|
1997
|
+
|
|
1998
|
+
### Streaming Pipeline (Future Enhancement)
|
|
1999
|
+
|
|
2000
|
+
For real-time applications, each stage can emit results as they become available
|
|
2001
|
+
rather than waiting for the entire stage to complete:
|
|
2002
|
+
|
|
2003
|
+
```
|
|
2004
|
+
Stage 1 emits chunks as they are parsed
|
|
2005
|
+
--> Stage 2 begins embedding as chunks arrive
|
|
2006
|
+
--> Stage 3 begins writing intro while later points are still extracted
|
|
2007
|
+
--> Stage 4 begins TTS on early script lines while later ones generate
|
|
2008
|
+
```
|
|
2009
|
+
|
|
2010
|
+
This reduces end-to-end latency by ~40% for long documents but adds significant
|
|
2011
|
+
implementation complexity. Recommended only for production deployment.
|
|
2012
|
+
|
|
2013
|
+
---
|
|
2014
|
+
|
|
2015
|
+
## Testing and Validation
|
|
2016
|
+
|
|
2017
|
+
### Unit Test Checklist
|
|
2018
|
+
|
|
2019
|
+
```typescript
|
|
2020
|
+
// Tests for each stage (using vitest):
|
|
2021
|
+
describe('DocToPodcastPipeline', () => {
|
|
2022
|
+
// Stage 1
|
|
2023
|
+
test('parsePDF extracts text and sections from a known PDF', async () => {
|
|
2024
|
+
const doc = await parsePDF('./fixtures/sample.pdf');
|
|
2025
|
+
expect(doc.text.length).toBeGreaterThan(100);
|
|
2026
|
+
expect(doc.sections.length).toBeGreaterThan(0);
|
|
2027
|
+
expect(doc.metadata.sourceType).toBe('pdf');
|
|
2028
|
+
});
|
|
2029
|
+
|
|
2030
|
+
test('parseDOCX extracts heading structure', async () => {
|
|
2031
|
+
const doc = await parseDOCX('./fixtures/sample.docx');
|
|
2032
|
+
expect(doc.sections.some((s) => s.heading !== 'Introduction')).toBe(true);
|
|
2033
|
+
});
|
|
2034
|
+
|
|
2035
|
+
test('semanticChunk produces overlapping chunks within token budget', () => {
|
|
2036
|
+
const doc = createMockDocument(5000); // 5000 words
|
|
2037
|
+
const chunks = semanticChunk(doc);
|
|
2038
|
+
expect(chunks.every((c) => c.tokenCount <= 650)).toBe(true); // 500 + buffer
|
|
2039
|
+
expect(chunks.every((c) => c.tokenCount >= 50)).toBe(true); // Not too small
|
|
2040
|
+
});
|
|
2041
|
+
|
|
2042
|
+
// Stage 2
|
|
2043
|
+
test('extractKeyPoints returns ranked points with chunk references', async () => {
|
|
2044
|
+
const points = await extractKeyPoints(mockDoc, mockChunks, 5);
|
|
2045
|
+
expect(points.length).toBeLessThanOrEqual(5);
|
|
2046
|
+
expect(points[0].importance).toBeGreaterThanOrEqual(points[1].importance);
|
|
2047
|
+
expect(points.every((p) => p.relevantChunks.length > 0)).toBe(true);
|
|
2048
|
+
});
|
|
2049
|
+
|
|
2050
|
+
// Stage 3
|
|
2051
|
+
test('generated script has correct speaker alternation', async () => {
|
|
2052
|
+
const script = await generateScript(mockOutline, mockChunks, mockConfig);
|
|
2053
|
+
// Host should speak first
|
|
2054
|
+
expect(script.lines[0].speaker).toBe('host');
|
|
2055
|
+
// Speakers should mostly alternate
|
|
2056
|
+
let alternations = 0;
|
|
2057
|
+
for (let i = 1; i < script.lines.length; i++) {
|
|
2058
|
+
if (script.lines[i].speaker !== script.lines[i - 1].speaker) alternations++;
|
|
2059
|
+
}
|
|
2060
|
+
expect(alternations / script.lines.length).toBeGreaterThan(0.4);
|
|
2061
|
+
});
|
|
2062
|
+
|
|
2063
|
+
test('script duration is within 15% of target', () => {
|
|
2064
|
+
const validation = validateScriptDuration(mockScript);
|
|
2065
|
+
expect(validation.withinTolerance).toBe(true);
|
|
2066
|
+
});
|
|
2067
|
+
|
|
2068
|
+
// Stage 4
|
|
2069
|
+
test('FFmpeg concat produces valid audio file', async () => {
|
|
2070
|
+
const finalPath = await composeAudio(mockSegments, mockConfig, tmpDir, 'Test');
|
|
2071
|
+
const { stdout } = await execFileAsync('ffprobe', [
|
|
2072
|
+
'-v', 'quiet', '-show_entries', 'format=duration', '-of', 'csv=p=0', finalPath,
|
|
2073
|
+
]);
|
|
2074
|
+
expect(parseFloat(stdout)).toBeGreaterThan(0);
|
|
2075
|
+
});
|
|
2076
|
+
|
|
2077
|
+
test('output loudness is within EBU R128 spec', async () => {
|
|
2078
|
+
const { stderr } = await execFileAsync('ffmpeg', [
|
|
2079
|
+
'-i', outputPath,
|
|
2080
|
+
'-af', 'loudnorm=print_format=json',
|
|
2081
|
+
'-f', 'null', '-', // null muxer discards output; '-' works on every platform, unlike '/dev/null'
|
|
2082
|
+
]);
|
|
2083
|
+
const match = stderr.match(/"input_i"\s*:\s*"(-?\d+\.?\d*)"/);
|
|
2084
|
+
const loudness = parseFloat(match![1]);
|
|
2085
|
+
expect(Math.abs(loudness + 16)).toBeLessThanOrEqual(1); // Within 1 LUFS of the -16 LUFS target
|
|
2086
|
+
});
|
|
2087
|
+
});
|
|
2088
|
+
```
|
|
2089
|
+
|
|
2090
|
+
### Manual Validation Checklist
|
|
2091
|
+
|
|
2092
|
+
```
|
|
2093
|
+
[ ] Document parses correctly (check text extraction, no garbled characters)
|
|
2094
|
+
[ ] Key points are relevant (not trivial or off-topic)
|
|
2095
|
+
[ ] Outline has logical flow (intro -> body -> conclusion)
|
|
2096
|
+
[ ] Script reads naturally when read aloud
|
|
2097
|
+
[ ] No hallucinated facts in script (compare to source)
|
|
2098
|
+
[ ] Speaker voices are distinct and appropriate
|
|
2099
|
+
[ ] Pauses feel natural (not too short, not too long)
|
|
2100
|
+
[ ] Loudness is consistent throughout (-16 LUFS +/- 1)
|
|
2101
|
+
[ ] No audio artifacts (clicks, pops, unnatural transitions)
|
|
2102
|
+
[ ] Total duration matches target within 15%
|
|
2103
|
+
[ ] MP3 metadata tags are correct (title, artist, genre)
|
|
2104
|
+
```
|
|
2105
|
+
|
|
2106
|
+
---
|
|
2107
|
+
|
|
2108
|
+
## Research Citations
|
|
2109
|
+
|
|
2110
|
+
> **Meta NotebookLlama (Oct 2024):** Open-source reproduction of Google NotebookLM's "Audio
|
|
2111
|
+
> Overview" feature. Demonstrates tiered model architecture: Llama-3.2-1B for text cleaning,
|
|
2112
|
+
> Llama-3.1-70B for script generation, Llama-3.1-8B for TTS transcript preparation. Key insight:
|
|
2113
|
+
> match model capability to task complexity rather than using one model for everything. Small
|
|
2114
|
+
> models handle mechanical tasks faster and cheaper; large models are reserved for creative
|
|
2115
|
+
> generation where quality matters most.
|
|
2116
|
+
> Source: github.com/meta-llama/llama-recipes/tree/main/recipes/quickstart/NotebookLlama
|
|
2117
|
+
|
|
2118
|
+
> **Mozilla AI Document-to-Podcast Blueprint (2024):** Fully local, CPU-only pipeline using
|
|
2119
|
+
> GGUF quantized models via llama_cpp Python bindings. Demonstrates that consumer hardware
|
|
2120
|
+
> (16GB RAM, no GPU) can run the complete pipeline with acceptable quality. Uses Parler TTS
|
|
2121
|
+
> for speech synthesis. Zero API cost makes it suitable for privacy-sensitive or
|
|
2122
|
+
> budget-constrained deployments.
|
|
2123
|
+
> Source: github.com/mozilla-ai/document-to-podcast
|
|
2124
|
+
|
|
2125
|
+
> **PodAgent (ACL 2025, arXiv 2503.00455):** Multi-agent framework for podcast generation with
|
|
2126
|
+
> Host, Guest, and Writer agents. The Writer agent performs faithfulness verification by checking
|
|
2127
|
+
> each generated claim against source material, reducing hallucination rate by 23% compared to
|
|
2128
|
+
> single-agent approaches. Introduces the "discussion angle" concept where each key point is
|
|
2129
|
+
> framed as a conversation starter rather than a lecture point.
|
|
2130
|
+
> Source: arxiv.org/abs/2503.00455
|
|
2131
|
+
|
|
2132
|
+
> **EBU R128 Loudness Standard:** European Broadcasting Union recommendation for loudness
|
|
2133
|
+
> normalization. Podcasts target -16 LUFS (Integrated Loudness) with a True Peak ceiling of
|
|
2134
|
+
> -1.5 dBTP. This matches Apple Podcasts' -16 LUFS recommendation (Spotify and YouTube normalize to roughly -14 LUFS). FFmpeg's
|
|
2135
|
+
> loudnorm filter implements this standard natively.
|
|
2136
|
+
> Source: tech.ebu.ch/docs/r/r128.pdf
|
|
2137
|
+
|
|
2138
|
+
> **Google Gemini Embedding API:** embedding-001 model produces 768-dimensional dense vectors
|
|
2139
|
+
> optimized for retrieval tasks. Supports batch embedding (up to 100 texts per request) and
|
|
2140
|
+
> task-type hints (RETRIEVAL_DOCUMENT vs RETRIEVAL_QUERY) for improved relevance.
|
|
2141
|
+
> Source: ai.google.dev/gemini-api/docs/embeddings
|
|
2142
|
+
|
|
2143
|
+
---
|
|
2144
|
+
|
|
2145
|
+
## Appendix: Environment Variables
|
|
2146
|
+
|
|
2147
|
+
```bash
|
|
2148
|
+
# Required for cloud API pipeline
|
|
2149
|
+
GEMINI_API_KEY=your-gemini-api-key
|
|
2150
|
+
ELEVENLABS_API_KEY=your-elevenlabs-api-key
|
|
2151
|
+
|
|
2152
|
+
# Optional (for Google Cloud TTS instead of ElevenLabs)
|
|
2153
|
+
GOOGLE_TTS_API_KEY=your-google-tts-key
|
|
2154
|
+
|
|
2155
|
+
# Optional (for Anthropic Claude instead of Gemini for script generation)
|
|
2156
|
+
ANTHROPIC_API_KEY=your-anthropic-api-key
|
|
2157
|
+
|
|
2158
|
+
# Optional (for local pipeline via Ollama)
|
|
2159
|
+
OLLAMA_BASE_URL=http://localhost:11434
|
|
2160
|
+
```
|
|
2161
|
+
|
|
2162
|
+
## Appendix: npm Dependencies (Complete)
|
|
2163
|
+
|
|
2164
|
+
```json
|
|
2165
|
+
{
|
|
2166
|
+
"dependencies": {
|
|
2167
|
+
"pdf-parse": "^1.1.1",
|
|
2168
|
+
"mammoth": "^1.8.0",
|
|
2169
|
+
"@mozilla/readability": "^0.5.0",
|
|
2170
|
+
"linkedom": "^0.18.0",
|
|
2171
|
+
"cheerio": "^1.0.0",
|
|
2172
|
+
"youtube-transcript": "^1.2.1",
|
|
2173
|
+
"@google/generative-ai": "^0.21.0",
|
|
2174
|
+
"fluent-ffmpeg": "^2.1.3"
|
|
2175
|
+
},
|
|
2176
|
+
"devDependencies": {
|
|
2177
|
+
"@types/fluent-ffmpeg": "^2.1.24",
|
|
2178
|
+
"vitest": "^2.0.0"
|
|
2179
|
+
},
|
|
2180
|
+
"systemRequirements": {
|
|
2181
|
+
"ffmpeg": "System-installed FFmpeg 6.x+ required"
|
|
2182
|
+
}
|
|
2183
|
+
}
|
|
2184
|
+
```
|