pulse-engine 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (149)
  1. pulse_engine-0.2.0/PKG-INFO +654 -0
  2. pulse_engine-0.2.0/README.md +615 -0
  3. pulse_engine-0.2.0/pyproject.toml +116 -0
  4. pulse_engine-0.2.0/src/pulse_engine/__init__.py +0 -0
  5. pulse_engine-0.2.0/src/pulse_engine/adapters/__init__.py +58 -0
  6. pulse_engine-0.2.0/src/pulse_engine/adapters/audio_transcription.py +167 -0
  7. pulse_engine-0.2.0/src/pulse_engine/adapters/batcher.py +36 -0
  8. pulse_engine-0.2.0/src/pulse_engine/adapters/digital_news.py +128 -0
  9. pulse_engine-0.2.0/src/pulse_engine/adapters/digital_news_metadata.py +536 -0
  10. pulse_engine-0.2.0/src/pulse_engine/adapters/exceptions.py +10 -0
  11. pulse_engine-0.2.0/src/pulse_engine/adapters/models.py +134 -0
  12. pulse_engine-0.2.0/src/pulse_engine/adapters/opensearch_storage.py +160 -0
  13. pulse_engine-0.2.0/src/pulse_engine/adapters/speech_content.py +130 -0
  14. pulse_engine-0.2.0/src/pulse_engine/adapters/speech_metadata.py +374 -0
  15. pulse_engine-0.2.0/src/pulse_engine/adapters/twitter.py +423 -0
  16. pulse_engine-0.2.0/src/pulse_engine/adapters/youtube_downloader.py +186 -0
  17. pulse_engine-0.2.0/src/pulse_engine/adapters/youtube_metadata.py +261 -0
  18. pulse_engine-0.2.0/src/pulse_engine/api/__init__.py +0 -0
  19. pulse_engine-0.2.0/src/pulse_engine/api/v1/__init__.py +0 -0
  20. pulse_engine-0.2.0/src/pulse_engine/api/v1/auth.py +91 -0
  21. pulse_engine-0.2.0/src/pulse_engine/api/v1/health.py +62 -0
  22. pulse_engine-0.2.0/src/pulse_engine/api/v1/router.py +16 -0
  23. pulse_engine-0.2.0/src/pulse_engine/chain_recovery.py +131 -0
  24. pulse_engine-0.2.0/src/pulse_engine/cli/__init__.py +0 -0
  25. pulse_engine-0.2.0/src/pulse_engine/cli/main.py +169 -0
  26. pulse_engine-0.2.0/src/pulse_engine/cli/templates/cookiecutter.json +4 -0
  27. pulse_engine-0.2.0/src/pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/.gitignore +13 -0
  28. pulse_engine-0.2.0/src/pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/Dockerfile +32 -0
  29. pulse_engine-0.2.0/src/pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/pipeline.yaml +17 -0
  30. pulse_engine-0.2.0/src/pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/pyproject.toml +25 -0
  31. pulse_engine-0.2.0/src/pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/src/pulse_{{cookiecutter.product_slug}}/__init__.py +8 -0
  32. pulse_engine-0.2.0/src/pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/__init__.py +0 -0
  33. pulse_engine-0.2.0/src/pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/unit/__init__.py +0 -0
  34. pulse_engine-0.2.0/src/pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/unit/test_manifest.py +15 -0
  35. pulse_engine-0.2.0/src/pulse_engine/client.py +95 -0
  36. pulse_engine-0.2.0/src/pulse_engine/config.py +157 -0
  37. pulse_engine-0.2.0/src/pulse_engine/core/__init__.py +0 -0
  38. pulse_engine-0.2.0/src/pulse_engine/core/error_handlers.py +64 -0
  39. pulse_engine-0.2.0/src/pulse_engine/core/exceptions.py +67 -0
  40. pulse_engine-0.2.0/src/pulse_engine/core/job_token.py +109 -0
  41. pulse_engine-0.2.0/src/pulse_engine/core/logging.py +45 -0
  42. pulse_engine-0.2.0/src/pulse_engine/core/scope.py +23 -0
  43. pulse_engine-0.2.0/src/pulse_engine/core/security.py +130 -0
  44. pulse_engine-0.2.0/src/pulse_engine/database.py +30 -0
  45. pulse_engine-0.2.0/src/pulse_engine/dependencies.py +166 -0
  46. pulse_engine-0.2.0/src/pulse_engine/deployment/__init__.py +0 -0
  47. pulse_engine-0.2.0/src/pulse_engine/deployment/backend_deployment_repository.py +83 -0
  48. pulse_engine-0.2.0/src/pulse_engine/deployment/backends/__init__.py +0 -0
  49. pulse_engine-0.2.0/src/pulse_engine/deployment/backends/base.py +50 -0
  50. pulse_engine-0.2.0/src/pulse_engine/deployment/backends/exceptions.py +20 -0
  51. pulse_engine-0.2.0/src/pulse_engine/deployment/backends/native_lambda.py +125 -0
  52. pulse_engine-0.2.0/src/pulse_engine/deployment/backends/prefect_ecs.py +116 -0
  53. pulse_engine-0.2.0/src/pulse_engine/deployment/backends/prefect_k8s.py +131 -0
  54. pulse_engine-0.2.0/src/pulse_engine/deployment/backends/registry.py +50 -0
  55. pulse_engine-0.2.0/src/pulse_engine/deployment/infra_provisioner.py +285 -0
  56. pulse_engine-0.2.0/src/pulse_engine/deployment/job_launcher.py +178 -0
  57. pulse_engine-0.2.0/src/pulse_engine/deployment/models.py +48 -0
  58. pulse_engine-0.2.0/src/pulse_engine/deployment/repository.py +54 -0
  59. pulse_engine-0.2.0/src/pulse_engine/deployment/router.py +22 -0
  60. pulse_engine-0.2.0/src/pulse_engine/deployment/schemas.py +18 -0
  61. pulse_engine-0.2.0/src/pulse_engine/deployment/service.py +65 -0
  62. pulse_engine-0.2.0/src/pulse_engine/extractor/__init__.py +0 -0
  63. pulse_engine-0.2.0/src/pulse_engine/extractor/adapters/__init__.py +0 -0
  64. pulse_engine-0.2.0/src/pulse_engine/extractor/base.py +48 -0
  65. pulse_engine-0.2.0/src/pulse_engine/extractor/models.py +50 -0
  66. pulse_engine-0.2.0/src/pulse_engine/extractor/orchestrator/__init__.py +15 -0
  67. pulse_engine-0.2.0/src/pulse_engine/extractor/orchestrator/base.py +34 -0
  68. pulse_engine-0.2.0/src/pulse_engine/extractor/orchestrator/noop.py +37 -0
  69. pulse_engine-0.2.0/src/pulse_engine/extractor/orchestrator/prefect.py +163 -0
  70. pulse_engine-0.2.0/src/pulse_engine/extractor/repository.py +163 -0
  71. pulse_engine-0.2.0/src/pulse_engine/extractor/router.py +102 -0
  72. pulse_engine-0.2.0/src/pulse_engine/extractor/schemas.py +93 -0
  73. pulse_engine-0.2.0/src/pulse_engine/extractor/service.py +431 -0
  74. pulse_engine-0.2.0/src/pulse_engine/extractor/stage_models.py +36 -0
  75. pulse_engine-0.2.0/src/pulse_engine/extractor/stage_repository.py +109 -0
  76. pulse_engine-0.2.0/src/pulse_engine/main.py +195 -0
  77. pulse_engine-0.2.0/src/pulse_engine/mcp/__init__.py +0 -0
  78. pulse_engine-0.2.0/src/pulse_engine/mcp/__main__.py +5 -0
  79. pulse_engine-0.2.0/src/pulse_engine/mcp/server.py +108 -0
  80. pulse_engine-0.2.0/src/pulse_engine/mcp/tools_jobs.py +159 -0
  81. pulse_engine-0.2.0/src/pulse_engine/mcp/tools_kb.py +88 -0
  82. pulse_engine-0.2.0/src/pulse_engine/mcp/tools_modules.py +115 -0
  83. pulse_engine-0.2.0/src/pulse_engine/mcp/tools_pipelines.py +215 -0
  84. pulse_engine-0.2.0/src/pulse_engine/mcp/tools_processor.py +208 -0
  85. pulse_engine-0.2.0/src/pulse_engine/middleware/__init__.py +0 -0
  86. pulse_engine-0.2.0/src/pulse_engine/middleware/rate_limit.py +144 -0
  87. pulse_engine-0.2.0/src/pulse_engine/middleware/request_id.py +16 -0
  88. pulse_engine-0.2.0/src/pulse_engine/middleware/security_headers.py +25 -0
  89. pulse_engine-0.2.0/src/pulse_engine/middleware/tenant.py +90 -0
  90. pulse_engine-0.2.0/src/pulse_engine/pipeline/__init__.py +0 -0
  91. pulse_engine-0.2.0/src/pulse_engine/pipeline/config_parser.py +148 -0
  92. pulse_engine-0.2.0/src/pulse_engine/pipeline/expression.py +268 -0
  93. pulse_engine-0.2.0/src/pulse_engine/pipeline/models.py +98 -0
  94. pulse_engine-0.2.0/src/pulse_engine/pipeline/repositories.py +224 -0
  95. pulse_engine-0.2.0/src/pulse_engine/pipeline/router_modules.py +66 -0
  96. pulse_engine-0.2.0/src/pulse_engine/pipeline/router_pipelines.py +198 -0
  97. pulse_engine-0.2.0/src/pulse_engine/pipeline/schemas.py +200 -0
  98. pulse_engine-0.2.0/src/pulse_engine/pipeline/service.py +250 -0
  99. pulse_engine-0.2.0/src/pulse_engine/pipeline/translators/__init__.py +44 -0
  100. pulse_engine-0.2.0/src/pulse_engine/pipeline/translators/airflow_status.py +11 -0
  101. pulse_engine-0.2.0/src/pulse_engine/pipeline/translators/airflow_translator.py +22 -0
  102. pulse_engine-0.2.0/src/pulse_engine/pipeline/translators/base.py +42 -0
  103. pulse_engine-0.2.0/src/pulse_engine/pipeline/translators/prefect_status.py +93 -0
  104. pulse_engine-0.2.0/src/pulse_engine/pipeline/translators/prefect_translator.py +195 -0
  105. pulse_engine-0.2.0/src/pulse_engine/processor/__init__.py +0 -0
  106. pulse_engine-0.2.0/src/pulse_engine/processor/base.py +36 -0
  107. pulse_engine-0.2.0/src/pulse_engine/processor/core/__init__.py +0 -0
  108. pulse_engine-0.2.0/src/pulse_engine/processor/core/analysis.py +148 -0
  109. pulse_engine-0.2.0/src/pulse_engine/processor/core/chunking.py +158 -0
  110. pulse_engine-0.2.0/src/pulse_engine/processor/core/prompts.py +340 -0
  111. pulse_engine-0.2.0/src/pulse_engine/processor/core/topic_splitter.py +105 -0
  112. pulse_engine-0.2.0/src/pulse_engine/processor/defaults/__init__.py +11 -0
  113. pulse_engine-0.2.0/src/pulse_engine/processor/defaults/core_processor.py +12 -0
  114. pulse_engine-0.2.0/src/pulse_engine/processor/defaults/postprocessor.py +12 -0
  115. pulse_engine-0.2.0/src/pulse_engine/processor/defaults/preprocessor.py +12 -0
  116. pulse_engine-0.2.0/src/pulse_engine/processor/llm/__init__.py +0 -0
  117. pulse_engine-0.2.0/src/pulse_engine/processor/llm/provider.py +58 -0
  118. pulse_engine-0.2.0/src/pulse_engine/processor/ocr/gemini.py +52 -0
  119. pulse_engine-0.2.0/src/pulse_engine/processor/pipeline.py +107 -0
  120. pulse_engine-0.2.0/src/pulse_engine/processor/postprocessor/__init__.py +0 -0
  121. pulse_engine-0.2.0/src/pulse_engine/processor/postprocessor/embeddings.py +34 -0
  122. pulse_engine-0.2.0/src/pulse_engine/processor/postprocessor/tasks.py +180 -0
  123. pulse_engine-0.2.0/src/pulse_engine/processor/preprocessor/__init__.py +0 -0
  124. pulse_engine-0.2.0/src/pulse_engine/processor/preprocessor/tasks.py +71 -0
  125. pulse_engine-0.2.0/src/pulse_engine/processor/router.py +192 -0
  126. pulse_engine-0.2.0/src/pulse_engine/processor/schemas.py +167 -0
  127. pulse_engine-0.2.0/src/pulse_engine/registry.py +117 -0
  128. pulse_engine-0.2.0/src/pulse_engine/runners/__init__.py +0 -0
  129. pulse_engine-0.2.0/src/pulse_engine/runners/lambda_runner.py +26 -0
  130. pulse_engine-0.2.0/src/pulse_engine/runners/pipeline_runner.py +43 -0
  131. pulse_engine-0.2.0/src/pulse_engine/runners/prefect_pipeline_flow.py +904 -0
  132. pulse_engine-0.2.0/src/pulse_engine/runners/prefect_runner.py +33 -0
  133. pulse_engine-0.2.0/src/pulse_engine/s3.py +72 -0
  134. pulse_engine-0.2.0/src/pulse_engine/secrets.py +46 -0
  135. pulse_engine-0.2.0/src/pulse_engine/services/__init__.py +0 -0
  136. pulse_engine-0.2.0/src/pulse_engine/services/bootstrap.py +211 -0
  137. pulse_engine-0.2.0/src/pulse_engine/services/opensearch.py +84 -0
  138. pulse_engine-0.2.0/src/pulse_engine/storage/__init__.py +0 -0
  139. pulse_engine-0.2.0/src/pulse_engine/storage/connectors/__init__.py +0 -0
  140. pulse_engine-0.2.0/src/pulse_engine/storage/connectors/athena.py +226 -0
  141. pulse_engine-0.2.0/src/pulse_engine/storage/connectors/base.py +32 -0
  142. pulse_engine-0.2.0/src/pulse_engine/storage/connectors/opensearch.py +344 -0
  143. pulse_engine-0.2.0/src/pulse_engine/storage/knowledge_base.py +68 -0
  144. pulse_engine-0.2.0/src/pulse_engine/storage/router.py +78 -0
  145. pulse_engine-0.2.0/src/pulse_engine/storage/schemas.py +93 -0
  146. pulse_engine-0.2.0/src/pulse_engine/testing/__init__.py +13 -0
  147. pulse_engine-0.2.0/src/pulse_engine/testing/fixtures.py +50 -0
  148. pulse_engine-0.2.0/src/pulse_engine/testing/mocks.py +104 -0
  149. pulse_engine-0.2.0/src/pulse_engine/worker.py +53 -0
@@ -0,0 +1,654 @@
1
+ Metadata-Version: 2.1
2
+ Name: pulse-engine
3
+ Version: 0.2.0
4
+ Summary: Pulse Engine — Hybrid framework for building Pulse products
5
+ Author: Pulse Team
6
+ Requires-Python: >=3.11,<3.13
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: Programming Language :: Python :: 3.11
9
+ Classifier: Programming Language :: Python :: 3.12
10
+ Requires-Dist: alembic (>=1.14.0,<2.0.0)
11
+ Requires-Dist: asyncpg (>=0.30.0,<0.31.0)
12
+ Requires-Dist: beautifulsoup4 (>=4.12.0,<5.0.0)
13
+ Requires-Dist: boto3 (>=1.35.0,<2.0.0)
14
+ Requires-Dist: celery[redis] (>=5.4.0,<6.0.0)
15
+ Requires-Dist: cookiecutter (>=2.6.0,<3.0.0)
16
+ Requires-Dist: curl-cffi (>=0.15.0,<0.16.0)
17
+ Requires-Dist: email-validator (>=2.3.0,<3.0.0)
18
+ Requires-Dist: fastapi (>=0.115.0,<0.116.0)
19
+ Requires-Dist: httpx (>=0.28.0,<0.29.0)
20
+ Requires-Dist: langchain (>=1.2.13,<2.0.0)
21
+ Requires-Dist: langchain-anthropic (>=1.4.0,<2.0.0)
22
+ Requires-Dist: langchain-openai (>=1.1.11,<2.0.0)
23
+ Requires-Dist: langdetect (>=1.0.9,<2.0.0)
24
+ Requires-Dist: mcp[cli] (>=1.0.0,<2.0.0)
25
+ Requires-Dist: opensearch-py[async] (>=3.1.0,<4.0.0)
26
+ Requires-Dist: pydantic-settings (>=2.7.0,<3.0.0)
27
+ Requires-Dist: pymupdf (>=1.27.2.3,<2.0.0.0)
28
+ Requires-Dist: python-jose[cryptography] (>=3.3.0,<4.0.0)
29
+ Requires-Dist: redis (>=5.0.0,<6.0.0)
30
+ Requires-Dist: sqlalchemy[asyncio] (>=2.0,<3.0)
31
+ Requires-Dist: structlog (>=24.4.0,<25.0.0)
32
+ Requires-Dist: tiktoken (>=0.8.0,<0.9.0)
33
+ Requires-Dist: twikit (>=2.3.3,<3.0.0)
34
+ Requires-Dist: typer (>=0.15.0,<0.16.0)
35
+ Requires-Dist: uvicorn[standard] (>=0.34.0,<0.35.0)
36
+ Requires-Dist: yt-dlp (>=2026.3.17,<2027.0.0)
37
+ Description-Content-Type: text/markdown
38
+
39
+ # Pulse Engine
40
+
41
+ Hybrid Python framework for building multi-tenant data products. Products `pip install pulse-engine`, declare a manifest, and get a full FastAPI app with OpenSearch, Athena, Celery, Prefect, and MCP — out of the box.
42
+
43
+ ## How It Works
44
+
45
+ ```
46
+ ┌──────────────────────────────────────────────────┐
47
+ │ Product (pip install pulse-engine) │
48
+ │ manifest = ProductManifest( │
49
+ │ extractors=[...], preprocessor=..., ... │
50
+ │ ) │
51
+ └──────────────┬───────────────────────────────────┘
52
+
53
+ ┌──────────────▼───────────────────────────────────┐
54
+ │ pulse-engine │
55
+ │ Base ABCs · Default implementations · App │
56
+ │ factory · Storage connectors · Job lifecycle │
57
+ │ · CLI · Testing utilities │
58
+ └──────────────┬───────────────────────────────────┘
59
+
60
+ ┌──────────────▼───────────────────────────────────┐
61
+ │ Shared Infrastructure │
62
+ │ Prefect · OpenSearch · Redis · PostgreSQL │
63
+ └──────────────────────────────────────────────────┘
64
+ ```
65
+
66
+ Products customize behaviour by:
67
+ - Implementing `BaseExtractor` subclasses for data extraction
68
+ - Overriding pipeline stages (preprocessor, core, postprocessor) or using defaults
69
+ - Adding product-specific API routes, MCP tools, and Celery tasks
70
+ - Declaring everything in a `ProductManifest`
71
+
72
+ ## Prerequisites
73
+
74
+ - Python 3.11-3.12
75
+ - [Poetry](https://python-poetry.org/docs/#installation)
76
+ - Docker & Docker Compose (for shared infrastructure)
77
+
78
+ ## Quick Start
79
+
80
+ ### For Engine Development
81
+
82
+ ```bash
83
+ git clone <repo-url> && cd pulse-engine
84
+
85
+ # Install deps and pre-commit hooks
86
+ make install
87
+
88
+ # Copy env and configure
89
+ cp .env.example .env
90
+
91
+ # Run database migrations
92
+ make migrate
93
+
94
+ # Start the dev server
95
+ make run
96
+ ```
97
+
98
+ ### For Building a New Product
99
+
100
+ ```bash
101
+ # Install the engine (stable / prod release)
102
+ pip install pulse-engine
103
+
104
+ # Or install the latest dev build instead (pre-release, published on every push to `dev`)
105
+ pip install pulse-engine --pre
106
+
107
+ # Scaffold a new product
108
+ pulse init my-product
109
+ cd pulse-my-product
110
+
111
+ # Set up the product
112
+ make install
113
+ cp .env.example .env
114
+
115
+ # Validate and test
116
+ make validate
117
+ make test
118
+
119
+ # Run
120
+ make run
121
+ ```
122
+
123
+ See **[docs/building-a-product.md](docs/building-a-product.md)** for the full guide, including how to add time-based filters (e.g. "last 2 days", "last 5 years") via the `config` dict in the trigger payload.
124
+
125
+ ## Versioning
126
+
127
+ Every push to `dev` publishes a pre-release to PyPI with the format `x.y.z.devYYYYMMDDHHMMSS` (e.g. `0.2.0.dev20250506120000`). Every push to `prod` publishes a stable release. pip and Poetry skip pre-releases by default.
128
+
129
+ | Goal | pip | Poetry (`pyproject.toml`) |
130
+ |------|-----|---------------------------|
131
+ | Latest stable | `pip install pulse-engine` | `pulse-engine = ">=0.2.0"` |
132
+ | Latest dev build | `pip install pulse-engine --pre` | `pulse-engine = {version = ">=0.2.0", allow-prereleases = true}` |
133
+ | Specific dev snapshot | `pip install "pulse-engine==0.2.0.dev20250506120000"` | `pulse-engine = "==0.2.0.dev20250506120000"` |
134
+
135
+ ## CLI
136
+
137
+ The `pulse` command is the primary interface:
138
+
139
+ | Command | Description |
140
+ |---------|-------------|
141
+ | `pulse init <name>` | Scaffold a new product from template |
142
+ | `pulse validate [module]` | Validate a product manifest |
143
+ | `pulse run` | Discover manifest and start FastAPI server |
144
+ | `pulse run-worker` | Discover manifest and start Celery worker |
145
+ | `pulse run-mcp` | Discover manifest and start MCP server |
146
+
147
+ ## Product Manifest
148
+
149
+ Products declare their components via a `ProductManifest`:
150
+
151
+ ```python
152
+ from pulse_engine.registry import ProductManifest
153
+
154
+ manifest = ProductManifest(
155
+ name="my-product",
156
+ version="0.1.0"
157
+ )
158
+ ```
159
+
160
+ Products register via a `pyproject.toml` entry point:
161
+
162
+ ```toml
163
+ [tool.poetry.plugins."pulse_engine.products"]
164
+ my_product = "pulse_my_product:manifest"
165
+ ```
166
+
167
+ ## Project Structure
168
+
169
+ ```
170
+ src/pulse_engine/
171
+ ├── main.py # App factory (create_app)
172
+ ├── config.py # Settings (pydantic-settings)
173
+ ├── registry.py # ProductManifest, validation, discovery
174
+ ├── worker.py # Celery app factory
175
+ ├── database.py # SQLAlchemy async setup
176
+ ├── dependencies.py # FastAPI dependency injection
177
+ ├── client.py # PulseEngineClient (container → engine HTTP)
178
+ ├── s3.py # S3Stage (NDJSON inter-stage data exchange)
179
+ ├── chain_recovery.py # Background task for stalled pipeline recovery
180
+ ├── cli/
181
+ │ ├── main.py # pulse CLI (typer)
182
+ │ └── templates/ # Cookiecutter product template
183
+ ├── api/v1/
184
+ │ ├── router.py # v1 router aggregation
185
+ │ └── health.py # Health check endpoint
186
+ ├── core/
187
+ │ ├── security.py # JWT verification (Cognito + Job-scoped)
188
+ │ ├── job_token.py # Job-scoped JWT issuance & verification
189
+ │ ├── scope.py # require_scope() FastAPI dependency
190
+ │ ├── exceptions.py # Exception hierarchy
191
+ │ ├── error_handlers.py # Global error handlers
192
+ │ └── logging.py # Structured logging (structlog)
193
+ ├── middleware/
194
+ │ ├── request_id.py # X-Request-ID middleware
195
+ │ ├── security_headers.py # Defensive HTTP security headers (CSP, HSTS, etc.)
196
+ │ ├── rate_limit.py # Sliding-window per-IP rate limiter (100 req/60 s)
197
+ │ └── tenant.py # Dual-token middleware (Cognito + Job JWT)
198
+ ├── deployment/
199
+ │ ├── models.py # DeploymentModel ORM
200
+ │ ├── repository.py # DeploymentRepository
201
+ │ ├── service.py # DeploymentService
202
+ │ ├── router.py # POST /api/v1/deployments
203
+ │ └── schemas.py # Registration request/response
204
+ ├── extractor/
205
+ │ ├── base.py # BaseExtractor ABC
206
+ │ ├── models.py # SQLAlchemy ORM: job_records
207
+ │ ├── stage_models.py # SQLAlchemy ORM: job_stages
208
+ │ ├── repository.py # JobRepository
209
+ │ ├── stage_repository.py # StageRepository
210
+ │ ├── service.py # JobService (stage-aware)
211
+ │ ├── router.py # /api/v1/jobs/ endpoints
212
+ │ ├── schemas.py # Pydantic models
213
+ │ └── orchestrator/
214
+ │ ├── base.py # BaseOrchestratorAdapter ABC
215
+ │ ├── prefect.py # PrefectAdapter (deployments + flow runs)
216
+ │ └── noop.py # NoopAdapter
217
+ ├── processor/
218
+ │ ├── base.py # BasePreprocessor, BaseCoreProcessor,
219
+ │ │ # BasePostprocessor ABCs
220
+ │ ├── pipeline.py # Pluggable ProcessingPipeline
221
+ │ ├── router.py # /api/v1/process/ endpoints
222
+ │ ├── schemas.py # ProcessingContext, options
223
+ │ ├── defaults/ # Default stage implementations
224
+ │ ├── preprocessor/ # clean_html, normalize, detect_language
225
+ │ ├── core/ # chunking, NER, sentiment, topics
226
+ │ └── postprocessor/ # embeddings, dedup, quality scoring
227
+ ├── storage/
228
+ │ ├── knowledge_base.py # KnowledgeBaseService
229
+ │ ├── router.py # /api/v1/kb/ endpoints
230
+ │ ├── schemas.py # Document, SearchQuery, etc.
231
+ │ └── connectors/
232
+ │ ├── base.py # BaseStorageConnector ABC
233
+ │ ├── opensearch.py # OpenSearch connector
234
+ │ └── athena.py # Athena connector
235
+ ├── mcp/
236
+ │ ├── server.py # FastMCP instance
237
+ │ ├── tools_kb.py # KB tools (6)
238
+ │ ├── tools_jobs.py # Jobs tools (6)
239
+ │ ├── tools_processor.py # Processor tools (5)
240
+ │ ├── tools_pipelines.py # Pipeline tools (5)
241
+ │ └── tools_modules.py # Module registry tools (3)
242
+ ├── services/
243
+ │ ├── bootstrap.py # ServiceContainer, bootstrap_services()
244
+ │ └── opensearch.py # OpenSearch client wrapper
245
+ └── testing/
246
+ ├── fixtures.py # Reusable pytest fixtures
247
+ └── mocks.py # MockStorageConnector, MockExtractor, etc.
248
+
249
+ infra/
250
+ ├── docker-compose.yml # Prefect, Redis, OpenSearch, PostgreSQL
251
+ └── terraform/ # AWS modules (networking, ECS, ECR, ALB)
252
+
253
+ tests/
254
+ ├── unit/ # Unit tests
255
+ │ ├── framework/ # Manifest, pipeline, base class tests
256
+ │ ├── processor/
257
+ │ ├── storage/
258
+ │ ├── deployment/ # Deployment registration tests
259
+ │ └── extractor/
260
+ └── integration/ # Integration tests
261
+ ├── api/
262
+ ├── mcp/
263
+ └── pipelines/
264
+ ```
265
+
266
+ ## Code Quality
267
+
268
+ Pre-commit hooks run on every commit:
269
+
270
+ | Hook | What it checks |
271
+ |------|----------------|
272
+ | `trailing-whitespace` | No trailing whitespace |
273
+ | `end-of-file-fixer` | Files end with a newline |
274
+ | `check-yaml` | Valid YAML syntax |
275
+ | `ruff` | Linting (auto-fix enabled) |
276
+ | `ruff-format` | Code formatting |
277
+ | `mypy` | Strict static type-checking |
278
+
279
+ ```bash
280
+ make lint # run all hooks manually
281
+ ```
282
+
283
+ ## CI/CD
284
+
285
+ ### PR Checks (`pr-checks.yml`)
286
+
287
+ Runs on every pull request to `dev`, `uat`, `prod`:
288
+ - **lint** — ruff check + format
289
+ - **typecheck** — mypy strict
290
+ - **test** — unit tests with coverage
291
+ - **trivy** — vulnerability scan
292
+
293
+ ### Deploy (`deploy.yml`)
294
+
295
+ Runs on push to `dev`, `uat`, `prod`:
296
+
297
+ | Branch | PyPI Target | Infrastructure |
298
+ |--------|-------------|----------------|
299
+ | `dev` | PyPI (`.dev` suffix) | VM via docker-compose |
300
+ | `uat` | PyPI (`.dev` suffix) | VM via docker-compose |
301
+ | `prod` | PyPI (stable) | ECS cluster |
302
+
303
+ The pipeline: test → publish to PyPI → build Docker → push ECR → deploy → health check.
304
+
305
+ Terraform runs separately on `infra/terraform/**` changes: plan → apply → sync outputs to GitHub Secrets (pipeline infra vars auto-flow to `.env` on next deploy).
306
+
307
+ ### Required Secrets (per GitHub environment)
308
+
309
+ | Secret | Description |
310
+ |--------|-------------|
311
+ | `AWS_ROLE_ARN` | OIDC role for GitHub → AWS auth |
312
+ | `ECR_REPOSITORY_URL` | ECR repository URL |
313
+ | `PYPI_TOKEN` | PyPI API token |
314
+
315
+ ## MCP Server
316
+
317
+ Exposes 25 tools for AI agents via the Model Context Protocol:
318
+
319
+ | Category | Tools |
320
+ |---|---|
321
+ | **Jobs** (6) | `jobs_register`, `jobs_get`, `jobs_list`, `jobs_push_status`, `jobs_cancel`, `jobs_delete` |
322
+ | **Knowledge Base** (6) | `kb_store_documents`, `kb_retrieve_document`, `kb_search`, `kb_delete_document`, `kb_get_stats`, `kb_run_query` |
323
+ | **Processor** (5) | `process_pipeline`, `process_preprocess`, `process_analyze`, `process_postprocess`, `process_chunk` |
324
+ | **Pipelines** (5) | `pipelines_trigger`, `pipelines_status`, `pipelines_list`, `pipelines_cancel`, `pipelines_steps` |
325
+ | **Modules** (3) | `modules_register`, `modules_list`, `modules_delete` |
326
+
327
+ ```bash
328
+ pulse run-mcp
329
+ ```
330
+
331
+ Products register additional tools via `mcp_tool_modules` in the manifest.
332
+
333
+ ## Environment Variables
334
+
335
+ ### Core
336
+
337
+ | Variable | Required | Default | Description |
338
+ |----------|----------|---------|-------------|
339
+ | `APP_ENV` | No | `development` | Environment name (`development`, `production`, etc.) |
340
+ | `APP_VERSION` | No | `0.1.0` | Application version |
341
+ | `LOG_LEVEL` | No | `INFO` | Logging level |
342
+ | `AWS_REGION` | Yes | `ap-south-1` | AWS region |
343
+ | `AWS_ACCESS_KEY_ID` | No | — | AWS credentials (use IAM role in production) |
344
+ | `AWS_SECRET_ACCESS_KEY` | No | — | AWS credentials (use IAM role in production) |
345
+
346
+ ### Authentication (Cognito)
347
+
348
+ | Variable | Required | Default | Description |
349
+ |----------|----------|---------|-------------|
350
+ | `COGNITO_USER_POOL_ID` | Yes | — | Cognito User Pool ID |
351
+ | `COGNITO_APP_CLIENT_ID` | Yes | — | Cognito App Client ID |
352
+ | `COGNITO_APP_CLIENT_SECRET` | No | — | Client secret (required if app client has one) |
353
+
354
+ ### OpenSearch
355
+
356
+ | Variable | Required | Default | Description |
357
+ |----------|----------|---------|-------------|
358
+ | `OPENSEARCH_URL` | Yes | — | OpenSearch endpoint |
359
+ | `OPENSEARCH_USERNAME` | No | — | Basic-auth username (AWS managed domains) |
360
+ | `OPENSEARCH_PASSWORD` | No | — | Basic-auth password |
361
+ | `OPENSEARCH_USE_SSL` | No | `true` | Enable TLS |
362
+ | `OPENSEARCH_VERIFY_CERTS` | No | `true` | Verify TLS certificates |
363
+ | `OPENSEARCH_INDEX_PREFIX` | No | `pulse_kb` | Index name prefix per tenant |
364
+ | `EMBEDDING_DIMENSION` | No | `1536` | Vector dimension for kNN indexes |
365
+
366
+ ### Database & Cache
367
+
368
+ | Variable | Required | Default | Description |
369
+ |----------|----------|---------|-------------|
370
+ | `DATABASE_URL` | Yes | — | Async PostgreSQL DSN (`postgresql+asyncpg://...`) |
371
+ | `REDIS_URL` | No | `redis://localhost:6379/0` | Redis URL (enables Celery) |
372
+ | `CELERY_BROKER_URL` | No | — | Celery broker (defaults to `REDIS_URL`) |
373
+ | `CELERY_RESULT_BACKEND` | No | — | Celery result backend (defaults to `REDIS_URL`) |
374
+
375
+ ### Athena
376
+
377
+ | Variable | Required | Default | Description |
378
+ |----------|----------|---------|-------------|
379
+ | `ATHENA_AWS_ACCESS_KEY_ID` | No | — | Athena-specific AWS credentials |
380
+ | `ATHENA_AWS_SECRET_ACCESS_KEY` | No | — | Athena-specific AWS credentials |
381
+ | `ATHENA_OUTPUT_LOCATION` | Yes* | — | S3 URI for Athena query results |
382
+ | `ATHENA_WORKGROUP` | No | `primary` | Athena workgroup |
383
+ | `ATHENA_QUERY_TIMEOUT_SECONDS` | No | `60` | Athena query timeout |
384
+
385
+ ### Orchestrator (Prefect)
386
+
387
+ | Variable | Required | Default | Description |
388
+ |----------|----------|---------|-------------|
389
+ | `PULSE_ORCHESTRATOR_BACKEND` | No | `none` | `prefect` or `none` |
390
+ | `PREFECT_API_URL` | No | — | Prefect API endpoint |
391
+ | `PREFECT_API_KEY` | No | — | Prefect Cloud API key |
392
+ | `PREFECT_ECS_WORK_POOL_NAME` | No | `products-worker-pool` | ECS work pool name |
393
+ | `PREFECT_LAMBDA_WORK_POOL_NAME` | No | `lambda-worker-pool` | Lambda work pool name |
394
+ | `PREFECT_LAMBDA_FUNCTION_NAME_TEMPLATE` | No | `{product}-{stage}` | Lambda function name pattern |
395
+ | `PREFECT_K8S_WORK_POOL_NAME` | No | `k8s-worker-pool` | Kubernetes work pool name |
396
+ | `PREFECT_K8S_NAMESPACE` | No | `pulse-jobs` | Kubernetes namespace |
397
+ | `PREFECT_K8S_DEFAULT_CPU` | No | `500m` | Default CPU request |
398
+ | `PREFECT_K8S_DEFAULT_MEMORY` | No | `1Gi` | Default memory request |
399
+
400
+ ### LLM & Embeddings
401
+
402
+ | Variable | Required | Default | Description |
403
+ |----------|----------|---------|-------------|
404
+ | `PULSE_LLM_PROVIDER` | No | `openai` | LLM provider |
405
+ | `PULSE_LLM_MODEL` | No | `gpt-4o-mini` | LLM model ID |
406
+ | `PULSE_LLM_API_KEY` | No | — | LLM API key (also used as embedding fallback) |
407
+ | `PULSE_LLM_TEMPERATURE` | No | `0.0` | LLM sampling temperature |
408
+ | `PULSE_EMBEDDING_PROVIDER` | No | `openai` | Embedding provider |
409
+ | `PULSE_OPENAI_EMBEDDING_MODEL` | No | `text-embedding-3-small` | OpenAI embedding model |
410
+ | `PULSE_OPENAI_API_KEY` | No | — | OpenAI API key (overrides `PULSE_LLM_API_KEY`) |
411
+
412
+ ### Pipeline & Jobs
413
+
414
+ | Variable | Required | Default | Description |
415
+ |----------|----------|---------|-------------|
416
+ | `PULSE_ENGINE_URL` | No | — | Public URL containers use for callbacks |
417
+ | `PULSE_JOB_TOKEN_SECRET` | No | — | HMAC secret for job-scoped JWTs |
418
+ | `PULSE_S3_BUCKET` | No | — | S3 bucket for inter-stage NDJSON data |
419
+ | `PULSE_CHAIN_GRACE_PERIOD_SECONDS` | No | `300` | Seconds before chain recovery auto-triggers |
420
+ | `PULSE_MAX_CONCURRENT_JOBS_PER_TENANT` | No | `10` | Max concurrent jobs per tenant |
421
+ | `PULSE_DEFAULT_CHUNK_SIZE` | No | `512` | Default chunk token size |
422
+ | `PULSE_DEFAULT_CHUNK_STRATEGY` | No | `token_count` | Default chunking strategy |
423
+ | `PULSE_DEDUP_SIMILARITY_THRESHOLD` | No | `0.95` | Cosine similarity dedup threshold |
424
+
425
+ ### Pipeline Infrastructure (from Terraform)
426
+
427
+ These are auto-synced from Terraform outputs to GitHub Secrets, then written to `.env` at deploy time:
428
+
429
+ | Variable | Description |
430
+ |----------|-------------|
431
+ | `PIPELINE_TASK_DEFINITION` | ECS task definition family for pipeline steps |
432
+ | `PIPELINE_CLUSTER_NAME` | ECS cluster for pipeline step tasks |
433
+ | `PIPELINE_EXECUTION_ROLE_ARN` | ECS task execution role (ECR pull, logs, secrets) |
434
+ | `PIPELINE_TASK_ROLE_ARN` | ECS task role (S3, Lambda invoke, ECS dispatch) |
435
+ | `PIPELINE_LOG_GROUP` | CloudWatch log group for ECS pipeline steps |
436
+ | `PIPELINE_SUBNETS` | Comma-separated private subnet IDs |
437
+ | `PIPELINE_SECURITY_GROUPS` | Comma-separated security group IDs |
438
+ | `LAMBDA_EXECUTION_ROLE_ARN` | Lambda execution role for pipeline functions |
439
+ | `LAMBDA_SUBNETS` | Comma-separated subnet IDs for Lambda VPC config |
440
+ | `LAMBDA_SECURITY_GROUPS` | Comma-separated security group IDs for Lambda |
441
+ | `LAMBDA_LOG_GROUP` | CloudWatch log group for Lambda pipeline steps |
442
+
443
+ ### Data Source Adapters
444
+
445
+ | Variable | Required | Default | Description |
446
+ |----------|----------|---------|-------------|
447
+ | `YT_DLP_COOKIES_SECRET_ID` | No | — | Secrets Manager secret ID for YouTube cookies (Netscape format). Required for age-restricted or member-only videos. |
448
+ | `YT_DLP_PLAYER_CLIENTS` | No | `tv_embedded,web` | Comma-separated yt-dlp player client override. |
449
+ | `TWITTER_COOKIES_SECRET_ID` | No | — | Secrets Manager secret ID for Twitter cookies (JSON format). Takes precedence over `TWITTER_COOKIES_PATH`. |
450
+ | `TWITTER_COOKIES_PATH` | No | — | Local filesystem path to Twitter cookies JSON file. Used when Secrets Manager is not configured. |
451
+ | `OPENAI_API_KEY` | No | — | OpenAI API key for Whisper transcription in `YouTubeAudioAdapter`. |
452
+
453
+ ### MCP Server
454
+
455
+ | Variable | Required | Default | Description |
456
+ |----------|----------|---------|-------------|
457
+ | `MCP_TRANSPORT` | No | `sse` | Transport mode: `sse` or `stdio` |
458
+ | `MCP_SSE_HOST` | No | `127.0.0.1` | MCP SSE server host |
459
+ | `MCP_SSE_PORT` | No | `8001` | MCP SSE server port |
460
+
461
+ ## API Authentication
462
+
463
+ All endpoints (except `/api/v1/health` and `/api/v1/auth/login`) require a JWT token.
464
+
465
+ > **Rate limits** — enforced per IP address:
466
+ > - **Login** (`POST /api/v1/auth/login`): 5 attempts per 60 seconds. Returns `429` with `Retry-After: 60` on breach.
467
+ > - **Global**: 100 requests per 60 seconds across all endpoints. Responses include `X-RateLimit-Limit` and `X-RateLimit-Remaining` headers.
468
+ >
469
+ > **Note** — Swagger UI (`/docs`) and ReDoc (`/redoc`) are disabled in production (`APP_ENV=production`).
470
+
471
+ ### Get a Token
472
+
473
+ ```bash
474
+ curl -X POST https://api.dev.pulse.mananalabs.ai/api/v1/auth/login \
475
+ -H "Content-Type: application/json" \
476
+ -d "{\"email\": \"$PULSE_AUTH_EMAIL\", \"password\": \"$PULSE_AUTH_PASSWORD\"}"
477
+ ```
478
+
479
+ Response:
480
+
481
+ ```json
482
+ {
483
+ "id_token": "eyJ...",
484
+ "access_token": "eyJ...",
485
+ "refresh_token": "eyJ...",
486
+ "expires_in": 3600,
487
+ "token_type": "Bearer",
488
+ "tenant_id": "tenant-dev-001",
489
+ "email": "dev@pulse-engine.com"
490
+ }
491
+ ```
492
+
493
+ ### Use the Token
494
+
495
+ Pass the `id_token` as a Bearer token in the `Authorization` header:
496
+
497
+ ```bash
498
+ curl -H "Authorization: Bearer <id_token>" \
499
+ https://api.dev.pulse.mananalabs.ai/api/v1/kb/stats
500
+ ```
501
+
502
+ Tokens expire after **1 hour**. Call the login endpoint again to get a new one.
503
+
504
+ ## API Documentation
505
+
506
+ - Swagger UI: https://api.dev.pulse.mananalabs.ai/docs
507
+ - ReDoc: https://api.dev.pulse.mananalabs.ai/redoc
508
+
509
+ ## Library Usage
510
+
511
+ Pulse Engine can also be used as a standalone library for content processing:
512
+
513
+ ```python
514
+ from pulse_engine.processor.core.topic_splitter import TopicSplitter
515
+
516
+ splitter = TopicSplitter(provider="openai", api_key="sk-...")
517
+ result = splitter.split([
518
+ (1, "Hi there"),
519
+ (2, "Let's discuss Q1 metrics"),
520
+ (3, "Revenue grew 20%"),
521
+ ])
522
+ print(result.topic_shifts)
523
+ ```
524
+
525
+ See **[docs/pulse_engine_library.md](docs/pulse_engine_library.md)** for full documentation on the Topic Splitter, LLM configuration, and configurable embeddings.
526
+
527
+ ## Data Source Adapters
528
+
529
+ `pulse_engine.adapters` provides standalone data-fetching utilities. Each adapter is an independently importable class with no dependency on the Pulse pipeline — use them in any Python context.
530
+
531
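+ All adapter data models are plain `@dataclass` instances defined in `pulse_engine.adapters.models`. Credentials are generally read from environment variables rather than passed as constructor parameters; the exceptions are the API-key arguments shown in the examples below (`api_key` on `YouTubeMetadataAdapter`, `openai_api_key` on `YouTubeAudioAdapter`).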
532
+
533
+ ### YouTube Metadata
534
+
535
+ Fetches video metadata from a YouTube channel via the YouTube Data API v3.
536
+
537
+ ```python
538
+ import asyncio
539
+ from pulse_engine.adapters.youtube_metadata import YouTubeMetadataAdapter
540
+
541
+ adapter = YouTubeMetadataAdapter(channel_name="@BBCNews", api_key="YOUR_YT_API_KEY")
542
+ videos = asyncio.run(adapter.fetch(max_results=50))
543
+ for v in videos:
544
+ print(v.title, v.published_at, v.url)
545
+ ```
546
+
547
+ Returns `list[VideoMetadata]` with fields: `video_id`, `title`, `description`, `published_at`, `channel_id`, `channel_name`, `thumbnail_url`, `duration`, `view_count`, `url`.
548
+
549
+ ### YouTube Audio (Download + Transcription)
550
+
551
+ Downloads audio from a YouTube video and transcribes it with OpenAI Whisper.
552
+
553
+ ```python
554
+ import asyncio
555
+ from pulse_engine.adapters.youtube_metadata import YouTubeMetadataAdapter
556
+ from pulse_engine.adapters.youtube_audio import YouTubeAudioAdapter
557
+
558
+ meta_adapter = YouTubeMetadataAdapter(channel_name="@BBCNews", api_key="YOUR_YT_API_KEY")
559
+ audio_adapter = YouTubeAudioAdapter(openai_api_key="sk-...")
560
+
561
+ videos = asyncio.run(meta_adapter.fetch(max_results=5))
562
+ audio = asyncio.run(audio_adapter.fetch(videos[0]))
563
+ print(audio.transcript)
564
+ ```
565
+
566
+ Returns `VideoAudio` with fields: `video_id`, `title`, `transcript`, `segments` (`list[TranscriptSegment]`), `language`, `duration_seconds`.
567
+
568
+ Set `YT_DLP_COOKIES_SECRET_ID` (Secrets Manager) to enable downloading age-restricted or member-only videos. See [Secrets Manager Cookie Format](#secrets-manager-cookie-format) below.
569
+
570
+ ### Speeches (inc.in)
571
+
572
+ Scrapes speech metadata and extracts full text from PDFs published on inc.in.
573
+
574
+ ```python
575
+ import asyncio
576
+ from pulse_engine.adapters.speech_metadata import SpeechMetadataAdapter
577
+ from pulse_engine.adapters.speech_content import SpeechContentAdapter
578
+
579
+ meta_adapter = SpeechMetadataAdapter(base_url="https://www.inc.in/en/media/speeches")
580
+ content_adapter = SpeechContentAdapter()
581
+
582
+ speeches = asyncio.run(meta_adapter.fetch(pages=3))
583
+ results = asyncio.run(content_adapter.fetch(speeches))
584
+ for r in results:
585
+ print(r.metadata.title, r.content.page_count, "pages")
586
+ print(r.content.text[:500])
587
+ ```
588
+
589
+ `SpeechMetadata` fields: `title`, `speaker`, `date`, `pdf_url`, `source_url`.
590
+ `SpeechResult` fields: `metadata` (`SpeechMetadata`), `content` (`SpeechContent` with `text` and `page_count`).
591
+
592
+ ### Twitter / X
593
+
594
+ Searches tweets or looks up user profiles via `twikit` (unofficial Twitter API client).
595
+
596
+ ```python
597
+ import asyncio
598
+ from pulse_engine.adapters.twitter import TwitterAdapter
599
+
600
+ # Reads TWITTER_COOKIES_SECRET_ID or TWITTER_COOKIES_PATH from env
601
+ adapter = TwitterAdapter()
602
+
603
+ tweets = asyncio.run(adapter.fetch_tweets(query="Budget 2025", count=50))
604
+ for t in tweets:
605
+ print(t.author, t.created_at, t.text[:80])
606
+
607
+ user_results = asyncio.run(adapter.fetch_user(username="FinanceMinIndia"))
608
+ ```
609
+
610
+ `TweetMetadata` fields: `tweet_id`, `text`, `author`, `created_at`, `like_count`, `retweet_count`, `reply_count`, `url`.
611
+ `TwitterUserResult` fields: `user_id`, `username`, `display_name`, `description`, `followers_count`, `following_count`, `tweet_count`, `verified`, `url`.
612
+
613
+ ### Secrets Manager Cookie Format
614
+
615
+ Cookie secrets stored in AWS Secrets Manager must use the **wrapper JSON** format:
616
+
617
+ ```json
618
+ {"YT_DLP_COOKIES": "<full Netscape cookie file content>"}
619
+ ```
620
+
621
+ ```json
622
+ {"TWITTER_COOKIES": "{\"auth_token\": \"...\", \"ct0\": \"...\"}"}
623
+ ```
624
+
625
+ The adapter fetches the secret, JSON-parses the wrapper, and extracts the inner value automatically. The wrapper key (`YT_DLP_COOKIES` or `TWITTER_COOKIES`, as shown above) must match the key name the adapter looks up.
626
+
627
+ ### Example Runner Scripts
628
+
629
+ Ready-to-run CLI scripts are provided under `examples/`:
630
+
631
+ | Script | What it does |
632
+ |--------|-------------|
633
+ | `examples/fetch_youtube.py` | Fetch channel videos + transcribe first result |
634
+ | `examples/fetch_speeches.py` | Scrape speech listing + extract PDF text |
635
+ | `examples/fetch_tweets.py` | Search tweets or look up a Twitter user |
636
+
637
+ ```bash
638
+ # YouTube
639
+ YT_API_KEY=... OPENAI_API_KEY=sk-... python examples/fetch_youtube.py --channel @BBCNews
640
+
641
+ # Speeches
642
+ python examples/fetch_speeches.py --pages 2
643
+
644
+ # Tweets
645
+ TWITTER_COOKIES_PATH=~/.twitter_cookies.json python examples/fetch_tweets.py --query "Budget 2025"
646
+ ```
647
+
648
+ ## Further Reading
649
+
650
+ - [Building a Product](docs/building-a-product.md) — step-by-step guide to creating a new product
651
+ - [Design Decisions](docs/design-decisions.md) — architectural decisions and rationale
652
+ - [Infrastructure](docs/infrastructure.md) — AWS deployment architecture
653
+ - [Library Usage](docs/pulse_engine_library.md) — topic splitting, LLM config, and embeddings
654
+