pulse-engine 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pulse_engine-0.2.0/PKG-INFO +654 -0
- pulse_engine-0.2.0/README.md +615 -0
- pulse_engine-0.2.0/pyproject.toml +116 -0
- pulse_engine-0.2.0/src/pulse_engine/__init__.py +0 -0
- pulse_engine-0.2.0/src/pulse_engine/adapters/__init__.py +58 -0
- pulse_engine-0.2.0/src/pulse_engine/adapters/audio_transcription.py +167 -0
- pulse_engine-0.2.0/src/pulse_engine/adapters/batcher.py +36 -0
- pulse_engine-0.2.0/src/pulse_engine/adapters/digital_news.py +128 -0
- pulse_engine-0.2.0/src/pulse_engine/adapters/digital_news_metadata.py +536 -0
- pulse_engine-0.2.0/src/pulse_engine/adapters/exceptions.py +10 -0
- pulse_engine-0.2.0/src/pulse_engine/adapters/models.py +134 -0
- pulse_engine-0.2.0/src/pulse_engine/adapters/opensearch_storage.py +160 -0
- pulse_engine-0.2.0/src/pulse_engine/adapters/speech_content.py +130 -0
- pulse_engine-0.2.0/src/pulse_engine/adapters/speech_metadata.py +374 -0
- pulse_engine-0.2.0/src/pulse_engine/adapters/twitter.py +423 -0
- pulse_engine-0.2.0/src/pulse_engine/adapters/youtube_downloader.py +186 -0
- pulse_engine-0.2.0/src/pulse_engine/adapters/youtube_metadata.py +261 -0
- pulse_engine-0.2.0/src/pulse_engine/api/__init__.py +0 -0
- pulse_engine-0.2.0/src/pulse_engine/api/v1/__init__.py +0 -0
- pulse_engine-0.2.0/src/pulse_engine/api/v1/auth.py +91 -0
- pulse_engine-0.2.0/src/pulse_engine/api/v1/health.py +62 -0
- pulse_engine-0.2.0/src/pulse_engine/api/v1/router.py +16 -0
- pulse_engine-0.2.0/src/pulse_engine/chain_recovery.py +131 -0
- pulse_engine-0.2.0/src/pulse_engine/cli/__init__.py +0 -0
- pulse_engine-0.2.0/src/pulse_engine/cli/main.py +169 -0
- pulse_engine-0.2.0/src/pulse_engine/cli/templates/cookiecutter.json +4 -0
- pulse_engine-0.2.0/src/pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/.gitignore +13 -0
- pulse_engine-0.2.0/src/pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/Dockerfile +32 -0
- pulse_engine-0.2.0/src/pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/pipeline.yaml +17 -0
- pulse_engine-0.2.0/src/pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/pyproject.toml +25 -0
- pulse_engine-0.2.0/src/pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/src/pulse_{{cookiecutter.product_slug}}/__init__.py +8 -0
- pulse_engine-0.2.0/src/pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/__init__.py +0 -0
- pulse_engine-0.2.0/src/pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/unit/__init__.py +0 -0
- pulse_engine-0.2.0/src/pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/unit/test_manifest.py +15 -0
- pulse_engine-0.2.0/src/pulse_engine/client.py +95 -0
- pulse_engine-0.2.0/src/pulse_engine/config.py +157 -0
- pulse_engine-0.2.0/src/pulse_engine/core/__init__.py +0 -0
- pulse_engine-0.2.0/src/pulse_engine/core/error_handlers.py +64 -0
- pulse_engine-0.2.0/src/pulse_engine/core/exceptions.py +67 -0
- pulse_engine-0.2.0/src/pulse_engine/core/job_token.py +109 -0
- pulse_engine-0.2.0/src/pulse_engine/core/logging.py +45 -0
- pulse_engine-0.2.0/src/pulse_engine/core/scope.py +23 -0
- pulse_engine-0.2.0/src/pulse_engine/core/security.py +130 -0
- pulse_engine-0.2.0/src/pulse_engine/database.py +30 -0
- pulse_engine-0.2.0/src/pulse_engine/dependencies.py +166 -0
- pulse_engine-0.2.0/src/pulse_engine/deployment/__init__.py +0 -0
- pulse_engine-0.2.0/src/pulse_engine/deployment/backend_deployment_repository.py +83 -0
- pulse_engine-0.2.0/src/pulse_engine/deployment/backends/__init__.py +0 -0
- pulse_engine-0.2.0/src/pulse_engine/deployment/backends/base.py +50 -0
- pulse_engine-0.2.0/src/pulse_engine/deployment/backends/exceptions.py +20 -0
- pulse_engine-0.2.0/src/pulse_engine/deployment/backends/native_lambda.py +125 -0
- pulse_engine-0.2.0/src/pulse_engine/deployment/backends/prefect_ecs.py +116 -0
- pulse_engine-0.2.0/src/pulse_engine/deployment/backends/prefect_k8s.py +131 -0
- pulse_engine-0.2.0/src/pulse_engine/deployment/backends/registry.py +50 -0
- pulse_engine-0.2.0/src/pulse_engine/deployment/infra_provisioner.py +285 -0
- pulse_engine-0.2.0/src/pulse_engine/deployment/job_launcher.py +178 -0
- pulse_engine-0.2.0/src/pulse_engine/deployment/models.py +48 -0
- pulse_engine-0.2.0/src/pulse_engine/deployment/repository.py +54 -0
- pulse_engine-0.2.0/src/pulse_engine/deployment/router.py +22 -0
- pulse_engine-0.2.0/src/pulse_engine/deployment/schemas.py +18 -0
- pulse_engine-0.2.0/src/pulse_engine/deployment/service.py +65 -0
- pulse_engine-0.2.0/src/pulse_engine/extractor/__init__.py +0 -0
- pulse_engine-0.2.0/src/pulse_engine/extractor/adapters/__init__.py +0 -0
- pulse_engine-0.2.0/src/pulse_engine/extractor/base.py +48 -0
- pulse_engine-0.2.0/src/pulse_engine/extractor/models.py +50 -0
- pulse_engine-0.2.0/src/pulse_engine/extractor/orchestrator/__init__.py +15 -0
- pulse_engine-0.2.0/src/pulse_engine/extractor/orchestrator/base.py +34 -0
- pulse_engine-0.2.0/src/pulse_engine/extractor/orchestrator/noop.py +37 -0
- pulse_engine-0.2.0/src/pulse_engine/extractor/orchestrator/prefect.py +163 -0
- pulse_engine-0.2.0/src/pulse_engine/extractor/repository.py +163 -0
- pulse_engine-0.2.0/src/pulse_engine/extractor/router.py +102 -0
- pulse_engine-0.2.0/src/pulse_engine/extractor/schemas.py +93 -0
- pulse_engine-0.2.0/src/pulse_engine/extractor/service.py +431 -0
- pulse_engine-0.2.0/src/pulse_engine/extractor/stage_models.py +36 -0
- pulse_engine-0.2.0/src/pulse_engine/extractor/stage_repository.py +109 -0
- pulse_engine-0.2.0/src/pulse_engine/main.py +195 -0
- pulse_engine-0.2.0/src/pulse_engine/mcp/__init__.py +0 -0
- pulse_engine-0.2.0/src/pulse_engine/mcp/__main__.py +5 -0
- pulse_engine-0.2.0/src/pulse_engine/mcp/server.py +108 -0
- pulse_engine-0.2.0/src/pulse_engine/mcp/tools_jobs.py +159 -0
- pulse_engine-0.2.0/src/pulse_engine/mcp/tools_kb.py +88 -0
- pulse_engine-0.2.0/src/pulse_engine/mcp/tools_modules.py +115 -0
- pulse_engine-0.2.0/src/pulse_engine/mcp/tools_pipelines.py +215 -0
- pulse_engine-0.2.0/src/pulse_engine/mcp/tools_processor.py +208 -0
- pulse_engine-0.2.0/src/pulse_engine/middleware/__init__.py +0 -0
- pulse_engine-0.2.0/src/pulse_engine/middleware/rate_limit.py +144 -0
- pulse_engine-0.2.0/src/pulse_engine/middleware/request_id.py +16 -0
- pulse_engine-0.2.0/src/pulse_engine/middleware/security_headers.py +25 -0
- pulse_engine-0.2.0/src/pulse_engine/middleware/tenant.py +90 -0
- pulse_engine-0.2.0/src/pulse_engine/pipeline/__init__.py +0 -0
- pulse_engine-0.2.0/src/pulse_engine/pipeline/config_parser.py +148 -0
- pulse_engine-0.2.0/src/pulse_engine/pipeline/expression.py +268 -0
- pulse_engine-0.2.0/src/pulse_engine/pipeline/models.py +98 -0
- pulse_engine-0.2.0/src/pulse_engine/pipeline/repositories.py +224 -0
- pulse_engine-0.2.0/src/pulse_engine/pipeline/router_modules.py +66 -0
- pulse_engine-0.2.0/src/pulse_engine/pipeline/router_pipelines.py +198 -0
- pulse_engine-0.2.0/src/pulse_engine/pipeline/schemas.py +200 -0
- pulse_engine-0.2.0/src/pulse_engine/pipeline/service.py +250 -0
- pulse_engine-0.2.0/src/pulse_engine/pipeline/translators/__init__.py +44 -0
- pulse_engine-0.2.0/src/pulse_engine/pipeline/translators/airflow_status.py +11 -0
- pulse_engine-0.2.0/src/pulse_engine/pipeline/translators/airflow_translator.py +22 -0
- pulse_engine-0.2.0/src/pulse_engine/pipeline/translators/base.py +42 -0
- pulse_engine-0.2.0/src/pulse_engine/pipeline/translators/prefect_status.py +93 -0
- pulse_engine-0.2.0/src/pulse_engine/pipeline/translators/prefect_translator.py +195 -0
- pulse_engine-0.2.0/src/pulse_engine/processor/__init__.py +0 -0
- pulse_engine-0.2.0/src/pulse_engine/processor/base.py +36 -0
- pulse_engine-0.2.0/src/pulse_engine/processor/core/__init__.py +0 -0
- pulse_engine-0.2.0/src/pulse_engine/processor/core/analysis.py +148 -0
- pulse_engine-0.2.0/src/pulse_engine/processor/core/chunking.py +158 -0
- pulse_engine-0.2.0/src/pulse_engine/processor/core/prompts.py +340 -0
- pulse_engine-0.2.0/src/pulse_engine/processor/core/topic_splitter.py +105 -0
- pulse_engine-0.2.0/src/pulse_engine/processor/defaults/__init__.py +11 -0
- pulse_engine-0.2.0/src/pulse_engine/processor/defaults/core_processor.py +12 -0
- pulse_engine-0.2.0/src/pulse_engine/processor/defaults/postprocessor.py +12 -0
- pulse_engine-0.2.0/src/pulse_engine/processor/defaults/preprocessor.py +12 -0
- pulse_engine-0.2.0/src/pulse_engine/processor/llm/__init__.py +0 -0
- pulse_engine-0.2.0/src/pulse_engine/processor/llm/provider.py +58 -0
- pulse_engine-0.2.0/src/pulse_engine/processor/ocr/gemini.py +52 -0
- pulse_engine-0.2.0/src/pulse_engine/processor/pipeline.py +107 -0
- pulse_engine-0.2.0/src/pulse_engine/processor/postprocessor/__init__.py +0 -0
- pulse_engine-0.2.0/src/pulse_engine/processor/postprocessor/embeddings.py +34 -0
- pulse_engine-0.2.0/src/pulse_engine/processor/postprocessor/tasks.py +180 -0
- pulse_engine-0.2.0/src/pulse_engine/processor/preprocessor/__init__.py +0 -0
- pulse_engine-0.2.0/src/pulse_engine/processor/preprocessor/tasks.py +71 -0
- pulse_engine-0.2.0/src/pulse_engine/processor/router.py +192 -0
- pulse_engine-0.2.0/src/pulse_engine/processor/schemas.py +167 -0
- pulse_engine-0.2.0/src/pulse_engine/registry.py +117 -0
- pulse_engine-0.2.0/src/pulse_engine/runners/__init__.py +0 -0
- pulse_engine-0.2.0/src/pulse_engine/runners/lambda_runner.py +26 -0
- pulse_engine-0.2.0/src/pulse_engine/runners/pipeline_runner.py +43 -0
- pulse_engine-0.2.0/src/pulse_engine/runners/prefect_pipeline_flow.py +904 -0
- pulse_engine-0.2.0/src/pulse_engine/runners/prefect_runner.py +33 -0
- pulse_engine-0.2.0/src/pulse_engine/s3.py +72 -0
- pulse_engine-0.2.0/src/pulse_engine/secrets.py +46 -0
- pulse_engine-0.2.0/src/pulse_engine/services/__init__.py +0 -0
- pulse_engine-0.2.0/src/pulse_engine/services/bootstrap.py +211 -0
- pulse_engine-0.2.0/src/pulse_engine/services/opensearch.py +84 -0
- pulse_engine-0.2.0/src/pulse_engine/storage/__init__.py +0 -0
- pulse_engine-0.2.0/src/pulse_engine/storage/connectors/__init__.py +0 -0
- pulse_engine-0.2.0/src/pulse_engine/storage/connectors/athena.py +226 -0
- pulse_engine-0.2.0/src/pulse_engine/storage/connectors/base.py +32 -0
- pulse_engine-0.2.0/src/pulse_engine/storage/connectors/opensearch.py +344 -0
- pulse_engine-0.2.0/src/pulse_engine/storage/knowledge_base.py +68 -0
- pulse_engine-0.2.0/src/pulse_engine/storage/router.py +78 -0
- pulse_engine-0.2.0/src/pulse_engine/storage/schemas.py +93 -0
- pulse_engine-0.2.0/src/pulse_engine/testing/__init__.py +13 -0
- pulse_engine-0.2.0/src/pulse_engine/testing/fixtures.py +50 -0
- pulse_engine-0.2.0/src/pulse_engine/testing/mocks.py +104 -0
- pulse_engine-0.2.0/src/pulse_engine/worker.py +53 -0
|
@@ -0,0 +1,654 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: pulse-engine
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Pulse Engine — Hybrid framework for building Pulse products
|
|
5
|
+
Author: Pulse Team
|
|
6
|
+
Requires-Python: >=3.11,<3.13
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
10
|
+
Requires-Dist: alembic (>=1.14.0,<2.0.0)
|
|
11
|
+
Requires-Dist: asyncpg (>=0.30.0,<0.31.0)
|
|
12
|
+
Requires-Dist: beautifulsoup4 (>=4.12.0,<5.0.0)
|
|
13
|
+
Requires-Dist: boto3 (>=1.35.0,<2.0.0)
|
|
14
|
+
Requires-Dist: celery[redis] (>=5.4.0,<6.0.0)
|
|
15
|
+
Requires-Dist: cookiecutter (>=2.6.0,<3.0.0)
|
|
16
|
+
Requires-Dist: curl-cffi (>=0.15.0,<0.16.0)
|
|
17
|
+
Requires-Dist: email-validator (>=2.3.0,<3.0.0)
|
|
18
|
+
Requires-Dist: fastapi (>=0.115.0,<0.116.0)
|
|
19
|
+
Requires-Dist: httpx (>=0.28.0,<0.29.0)
|
|
20
|
+
Requires-Dist: langchain (>=1.2.13,<2.0.0)
|
|
21
|
+
Requires-Dist: langchain-anthropic (>=1.4.0,<2.0.0)
|
|
22
|
+
Requires-Dist: langchain-openai (>=1.1.11,<2.0.0)
|
|
23
|
+
Requires-Dist: langdetect (>=1.0.9,<2.0.0)
|
|
24
|
+
Requires-Dist: mcp[cli] (>=1.0.0,<2.0.0)
|
|
25
|
+
Requires-Dist: opensearch-py[async] (>=3.1.0,<4.0.0)
|
|
26
|
+
Requires-Dist: pydantic-settings (>=2.7.0,<3.0.0)
|
|
27
|
+
Requires-Dist: pymupdf (>=1.27.2.3,<2.0.0.0)
|
|
28
|
+
Requires-Dist: python-jose[cryptography] (>=3.3.0,<4.0.0)
|
|
29
|
+
Requires-Dist: redis (>=5.0.0,<6.0.0)
|
|
30
|
+
Requires-Dist: sqlalchemy[asyncio] (>=2.0,<3.0)
|
|
31
|
+
Requires-Dist: structlog (>=24.4.0,<25.0.0)
|
|
32
|
+
Requires-Dist: tiktoken (>=0.8.0,<0.9.0)
|
|
33
|
+
Requires-Dist: twikit (>=2.3.3,<3.0.0)
|
|
34
|
+
Requires-Dist: typer (>=0.15.0,<0.16.0)
|
|
35
|
+
Requires-Dist: uvicorn[standard] (>=0.34.0,<0.35.0)
|
|
36
|
+
Requires-Dist: yt-dlp (>=2026.3.17,<2027.0.0)
|
|
37
|
+
Description-Content-Type: text/markdown
|
|
38
|
+
|
|
39
|
+
# Pulse Engine
|
|
40
|
+
|
|
41
|
+
Hybrid Python framework for building multi-tenant data products. A product installs the engine with `pip install pulse-engine`, declares a manifest, and gets a full FastAPI app with OpenSearch, Athena, Celery, Prefect, and MCP — out of the box.
|
|
42
|
+
|
|
43
|
+
## How It Works
|
|
44
|
+
|
|
45
|
+
```
|
|
46
|
+
┌──────────────────────────────────────────────────┐
|
|
47
|
+
│ Product (pip install pulse-engine) │
|
|
48
|
+
│ manifest = ProductManifest( │
|
|
49
|
+
│ extractors=[...], preprocessor=..., ... │
|
|
50
|
+
│ ) │
|
|
51
|
+
└──────────────┬───────────────────────────────────┘
|
|
52
|
+
│
|
|
53
|
+
┌──────────────▼───────────────────────────────────┐
|
|
54
|
+
│ pulse-engine │
|
|
55
|
+
│ Base ABCs · Default implementations · App │
|
|
56
|
+
│ factory · Storage connectors · Job lifecycle │
|
|
57
|
+
│ · CLI · Testing utilities │
|
|
58
|
+
└──────────────┬───────────────────────────────────┘
|
|
59
|
+
│
|
|
60
|
+
┌──────────────▼───────────────────────────────────┐
|
|
61
|
+
│ Shared Infrastructure │
|
|
62
|
+
│ Prefect · OpenSearch · Redis · PostgreSQL │
|
|
63
|
+
└──────────────────────────────────────────────────┘
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
Products customize behaviour by:
|
|
67
|
+
- Implementing `BaseExtractor` subclasses for data extraction
|
|
68
|
+
- Overriding pipeline stages (preprocessor, core, postprocessor) or using defaults
|
|
69
|
+
- Adding product-specific API routes, MCP tools, and Celery tasks
|
|
70
|
+
- Declaring everything in a `ProductManifest`
|
|
71
|
+
|
|
72
|
+
## Prerequisites
|
|
73
|
+
|
|
74
|
+
- Python 3.11-3.12
|
|
75
|
+
- [Poetry](https://python-poetry.org/docs/#installation)
|
|
76
|
+
- Docker & Docker Compose (for shared infrastructure)
|
|
77
|
+
|
|
78
|
+
## Quick Start
|
|
79
|
+
|
|
80
|
+
### For Engine Development
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
git clone <repo-url> && cd pulse-engine
|
|
84
|
+
|
|
85
|
+
# Install deps and pre-commit hooks
|
|
86
|
+
make install
|
|
87
|
+
|
|
88
|
+
# Copy env and configure
|
|
89
|
+
cp .env.example .env
|
|
90
|
+
|
|
91
|
+
# Run database migrations
|
|
92
|
+
make migrate
|
|
93
|
+
|
|
94
|
+
# Start the dev server
|
|
95
|
+
make run
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
### For Building a New Product
|
|
99
|
+
|
|
100
|
+
```bash
|
|
101
|
+
# Install the engine (stable / prod release)
|
|
102
|
+
pip install pulse-engine
|
|
103
|
+
|
|
104
|
+
# Or, instead of the stable release, install the latest dev build (pre-release, published on every push to `dev`)
|
|
105
|
+
pip install pulse-engine --pre
|
|
106
|
+
|
|
107
|
+
# Scaffold a new product
|
|
108
|
+
pulse init my-product
|
|
109
|
+
cd pulse-my-product
|
|
110
|
+
|
|
111
|
+
# Set up the product
|
|
112
|
+
make install
|
|
113
|
+
cp .env.example .env
|
|
114
|
+
|
|
115
|
+
# Validate and test
|
|
116
|
+
make validate
|
|
117
|
+
make test
|
|
118
|
+
|
|
119
|
+
# Run
|
|
120
|
+
make run
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
See **[docs/building-a-product.md](docs/building-a-product.md)** for the full guide, including how to add time-based filters (e.g. "last 2 days", "last 5 years") via the `config` dict in the trigger payload.
|
|
124
|
+
|
|
125
|
+
## Versioning
|
|
126
|
+
|
|
127
|
+
Every push to `dev` publishes a pre-release to PyPI, versioned as `x.y.z.devYYYYMMDDHHMMSS` (e.g. `0.2.0.dev20250506120000`). Every push to `prod` publishes a stable release. pip and Poetry skip pre-releases by default, so installing a dev build requires opting in explicitly, as shown below.
|
|
128
|
+
|
|
129
|
+
| Goal | pip | Poetry (`pyproject.toml`) |
|
|
130
|
+
|------|-----|---------------------------|
|
|
131
|
+
| Latest stable | `pip install pulse-engine` | `pulse-engine = ">=0.2.0"` |
|
|
132
|
+
| Latest dev build | `pip install pulse-engine --pre` | `pulse-engine = {version = ">=0.2.0", allow-prereleases = true}` |
|
|
133
|
+
| Specific dev snapshot | `pip install "pulse-engine==0.2.0.dev20250506120000"` | `pulse-engine = "==0.2.0.dev20250506120000"` |
|
|
134
|
+
|
|
135
|
+
## CLI
|
|
136
|
+
|
|
137
|
+
The `pulse` command is the primary interface:
|
|
138
|
+
|
|
139
|
+
| Command | Description |
|
|
140
|
+
|---------|-------------|
|
|
141
|
+
| `pulse init <name>` | Scaffold a new product from template |
|
|
142
|
+
| `pulse validate [module]` | Validate a product manifest |
|
|
143
|
+
| `pulse run` | Discover manifest and start FastAPI server |
|
|
144
|
+
| `pulse run-worker` | Discover manifest and start Celery worker |
|
|
145
|
+
| `pulse run-mcp` | Discover manifest and start MCP server |
|
|
146
|
+
|
|
147
|
+
## Product Manifest
|
|
148
|
+
|
|
149
|
+
Products declare their components via a `ProductManifest`:
|
|
150
|
+
|
|
151
|
+
```python
|
|
152
|
+
from pulse_engine.registry import ProductManifest
|
|
153
|
+
|
|
154
|
+
manifest = ProductManifest(
|
|
155
|
+
name="my-product",
|
|
156
|
+
version="0.1.0"
|
|
157
|
+
)
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
Products register via a `pyproject.toml` entry point:
|
|
161
|
+
|
|
162
|
+
```toml
|
|
163
|
+
[tool.poetry.plugins."pulse_engine.products"]
|
|
164
|
+
my_product = "pulse_my_product:manifest"
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
## Project Structure
|
|
168
|
+
|
|
169
|
+
```
|
|
170
|
+
src/pulse_engine/
|
|
171
|
+
├── main.py # App factory (create_app)
|
|
172
|
+
├── config.py # Settings (pydantic-settings)
|
|
173
|
+
├── registry.py # ProductManifest, validation, discovery
|
|
174
|
+
├── worker.py # Celery app factory
|
|
175
|
+
├── database.py # SQLAlchemy async setup
|
|
176
|
+
├── dependencies.py # FastAPI dependency injection
|
|
177
|
+
├── client.py # PulseEngineClient (container → engine HTTP)
|
|
178
|
+
├── s3.py # S3Stage (NDJSON inter-stage data exchange)
|
|
179
|
+
├── chain_recovery.py # Background task for stalled pipeline recovery
|
|
180
|
+
├── cli/
|
|
181
|
+
│ ├── main.py # pulse CLI (typer)
|
|
182
|
+
│ └── templates/ # Cookiecutter product template
|
|
183
|
+
├── api/v1/
|
|
184
|
+
│ ├── router.py # v1 router aggregation
|
|
185
|
+
│ └── health.py # Health check endpoint
|
|
186
|
+
├── core/
|
|
187
|
+
│ ├── security.py # JWT verification (Cognito + Job-scoped)
|
|
188
|
+
│ ├── job_token.py # Job-scoped JWT issuance & verification
|
|
189
|
+
│ ├── scope.py # require_scope() FastAPI dependency
|
|
190
|
+
│ ├── exceptions.py # Exception hierarchy
|
|
191
|
+
│ ├── error_handlers.py # Global error handlers
|
|
192
|
+
│ └── logging.py # Structured logging (structlog)
|
|
193
|
+
├── middleware/
|
|
194
|
+
│ ├── request_id.py # X-Request-ID middleware
|
|
195
|
+
│ ├── security_headers.py # Defensive HTTP security headers (CSP, HSTS, etc.)
|
|
196
|
+
│ ├── rate_limit.py # Sliding-window per-IP rate limiter (100 req/60 s)
|
|
197
|
+
│ └── tenant.py # Dual-token middleware (Cognito + Job JWT)
|
|
198
|
+
├── deployment/
|
|
199
|
+
│ ├── models.py # DeploymentModel ORM
|
|
200
|
+
│ ├── repository.py # DeploymentRepository
|
|
201
|
+
│ ├── service.py # DeploymentService
|
|
202
|
+
│ ├── router.py # POST /api/v1/deployments
|
|
203
|
+
│ └── schemas.py # Registration request/response
|
|
204
|
+
├── extractor/
|
|
205
|
+
│ ├── base.py # BaseExtractor ABC
|
|
206
|
+
│ ├── models.py # SQLAlchemy ORM: job_records
|
|
207
|
+
│ ├── stage_models.py # SQLAlchemy ORM: job_stages
|
|
208
|
+
│ ├── repository.py # JobRepository
|
|
209
|
+
│ ├── stage_repository.py # StageRepository
|
|
210
|
+
│ ├── service.py # JobService (stage-aware)
|
|
211
|
+
│ ├── router.py # /api/v1/jobs/ endpoints
|
|
212
|
+
│ ├── schemas.py # Pydantic models
|
|
213
|
+
│ └── orchestrator/
|
|
214
|
+
│ ├── base.py # BaseOrchestratorAdapter ABC
|
|
215
|
+
│ ├── prefect.py # PrefectAdapter (deployments + flow runs)
|
|
216
|
+
│ └── noop.py # NoopAdapter
|
|
217
|
+
├── processor/
|
|
218
|
+
│ ├── base.py # BasePreprocessor, BaseCoreProcessor,
|
|
219
|
+
│ │ # BasePostprocessor ABCs
|
|
220
|
+
│ ├── pipeline.py # Pluggable ProcessingPipeline
|
|
221
|
+
│ ├── router.py # /api/v1/process/ endpoints
|
|
222
|
+
│ ├── schemas.py # ProcessingContext, options
|
|
223
|
+
│ ├── defaults/ # Default stage implementations
|
|
224
|
+
│ ├── preprocessor/ # clean_html, normalize, detect_language
|
|
225
|
+
│ ├── core/ # chunking, NER, sentiment, topics
|
|
226
|
+
│ └── postprocessor/ # embeddings, dedup, quality scoring
|
|
227
|
+
├── storage/
|
|
228
|
+
│ ├── knowledge_base.py # KnowledgeBaseService
|
|
229
|
+
│ ├── router.py # /api/v1/kb/ endpoints
|
|
230
|
+
│ ├── schemas.py # Document, SearchQuery, etc.
|
|
231
|
+
│ └── connectors/
|
|
232
|
+
│ ├── base.py # BaseStorageConnector ABC
|
|
233
|
+
│ ├── opensearch.py # OpenSearch connector
|
|
234
|
+
│ └── athena.py # Athena connector
|
|
235
|
+
├── mcp/
|
|
236
|
+
│ ├── server.py # FastMCP instance
|
|
237
|
+
│ ├── tools_kb.py # KB tools (6)
|
|
238
|
+
│ ├── tools_jobs.py # Jobs tools (6)
|
|
239
|
+
│ ├── tools_processor.py # Processor tools (5)
|
|
240
|
+
│ ├── tools_pipelines.py # Pipeline tools (5)
|
|
241
|
+
│ └── tools_modules.py # Module registry tools (3)
|
|
242
|
+
├── services/
|
|
243
|
+
│ ├── bootstrap.py # ServiceContainer, bootstrap_services()
|
|
244
|
+
│ └── opensearch.py # OpenSearch client wrapper
|
|
245
|
+
└── testing/
|
|
246
|
+
├── fixtures.py # Reusable pytest fixtures
|
|
247
|
+
└── mocks.py # MockStorageConnector, MockExtractor, etc.
|
|
248
|
+
|
|
249
|
+
infra/
|
|
250
|
+
├── docker-compose.yml # Prefect, Redis, OpenSearch, PostgreSQL
|
|
251
|
+
└── terraform/ # AWS modules (networking, ECS, ECR, ALB)
|
|
252
|
+
|
|
253
|
+
tests/
|
|
254
|
+
├── unit/ # Unit tests
|
|
255
|
+
│ ├── framework/ # Manifest, pipeline, base class tests
|
|
256
|
+
│ ├── processor/
|
|
257
|
+
│ ├── storage/
|
|
258
|
+
│ ├── deployment/ # Deployment registration tests
|
|
259
|
+
│ └── extractor/
|
|
260
|
+
└── integration/ # Integration tests
|
|
261
|
+
├── api/
|
|
262
|
+
├── mcp/
|
|
263
|
+
└── pipelines/
|
|
264
|
+
```
|
|
265
|
+
|
|
266
|
+
## Code Quality
|
|
267
|
+
|
|
268
|
+
Pre-commit hooks run on every commit:
|
|
269
|
+
|
|
270
|
+
| Hook | What it checks |
|
|
271
|
+
|------|----------------|
|
|
272
|
+
| `trailing-whitespace` | No trailing whitespace |
|
|
273
|
+
| `end-of-file-fixer` | Files end with a newline |
|
|
274
|
+
| `check-yaml` | Valid YAML syntax |
|
|
275
|
+
| `ruff` | Linting (auto-fix enabled) |
|
|
276
|
+
| `ruff-format` | Code formatting |
|
|
277
|
+
| `mypy` | Strict static type-checking |
|
|
278
|
+
|
|
279
|
+
```bash
|
|
280
|
+
make lint # run all hooks manually
|
|
281
|
+
```
|
|
282
|
+
|
|
283
|
+
## CI/CD
|
|
284
|
+
|
|
285
|
+
### PR Checks (`pr-checks.yml`)
|
|
286
|
+
|
|
287
|
+
Runs on every pull request to `dev`, `uat`, `prod`:
|
|
288
|
+
- **lint** — ruff check + format
|
|
289
|
+
- **typecheck** — mypy strict
|
|
290
|
+
- **test** — unit tests with coverage
|
|
291
|
+
- **trivy** — vulnerability scan
|
|
292
|
+
|
|
293
|
+
### Deploy (`deploy.yml`)
|
|
294
|
+
|
|
295
|
+
Runs on push to `dev`, `uat`, `prod`:
|
|
296
|
+
|
|
297
|
+
| Branch | PyPI Target | Infrastructure |
|
|
298
|
+
|--------|-------------|----------------|
|
|
299
|
+
| `dev` | PyPI (`.dev` suffix) | VM via docker-compose |
|
|
300
|
+
| `uat` | PyPI (`.dev` suffix) | VM via docker-compose |
|
|
301
|
+
| `prod` | PyPI (stable) | ECS cluster |
|
|
302
|
+
|
|
303
|
+
The pipeline: test → publish to PyPI → build Docker image → push to ECR → deploy → health check.
|
|
304
|
+
|
|
305
|
+
Terraform runs separately on `infra/terraform/**` changes: plan → apply → sync outputs to GitHub Secrets (pipeline infra vars auto-flow to `.env` on next deploy).
|
|
306
|
+
|
|
307
|
+
### Required Secrets (per GitHub environment)
|
|
308
|
+
|
|
309
|
+
| Secret | Description |
|
|
310
|
+
|--------|-------------|
|
|
311
|
+
| `AWS_ROLE_ARN` | OIDC role for GitHub → AWS auth |
|
|
312
|
+
| `ECR_REPOSITORY_URL` | ECR repository URL |
|
|
313
|
+
| `PYPI_TOKEN` | PyPI API token |
|
|
314
|
+
|
|
315
|
+
## MCP Server
|
|
316
|
+
|
|
317
|
+
Exposes 25 tools for AI agents via the Model Context Protocol:
|
|
318
|
+
|
|
319
|
+
| Category | Tools |
|
|
320
|
+
|---|---|
|
|
321
|
+
| **Jobs** (6) | `jobs_register`, `jobs_get`, `jobs_list`, `jobs_push_status`, `jobs_cancel`, `jobs_delete` |
|
|
322
|
+
| **Knowledge Base** (6) | `kb_store_documents`, `kb_retrieve_document`, `kb_search`, `kb_delete_document`, `kb_get_stats`, `kb_run_query` |
|
|
323
|
+
| **Processor** (5) | `process_pipeline`, `process_preprocess`, `process_analyze`, `process_postprocess`, `process_chunk` |
|
|
324
|
+
| **Pipelines** (5) | `pipelines_trigger`, `pipelines_status`, `pipelines_list`, `pipelines_cancel`, `pipelines_steps` |
|
|
325
|
+
| **Modules** (3) | `modules_register`, `modules_list`, `modules_delete` |
|
|
326
|
+
|
|
327
|
+
```bash
|
|
328
|
+
pulse run-mcp
|
|
329
|
+
```
|
|
330
|
+
|
|
331
|
+
Products register additional tools via `mcp_tool_modules` in the manifest.
|
|
332
|
+
|
|
333
|
+
## Environment Variables
|
|
334
|
+
|
|
335
|
+
### Core
|
|
336
|
+
|
|
337
|
+
| Variable | Required | Default | Description |
|
|
338
|
+
|----------|----------|---------|-------------|
|
|
339
|
+
| `APP_ENV` | No | `development` | Environment name (`development`, `production`, etc.) |
|
|
340
|
+
| `APP_VERSION` | No | `0.1.0` | Application version |
|
|
341
|
+
| `LOG_LEVEL` | No | `INFO` | Logging level |
|
|
342
|
+
| `AWS_REGION` | Yes | `ap-south-1` | AWS region |
|
|
343
|
+
| `AWS_ACCESS_KEY_ID` | No | — | AWS credentials (use IAM role in production) |
|
|
344
|
+
| `AWS_SECRET_ACCESS_KEY` | No | — | AWS credentials (use IAM role in production) |
|
|
345
|
+
|
|
346
|
+
### Authentication (Cognito)
|
|
347
|
+
|
|
348
|
+
| Variable | Required | Default | Description |
|
|
349
|
+
|----------|----------|---------|-------------|
|
|
350
|
+
| `COGNITO_USER_POOL_ID` | Yes | — | Cognito User Pool ID |
|
|
351
|
+
| `COGNITO_APP_CLIENT_ID` | Yes | — | Cognito App Client ID |
|
|
352
|
+
| `COGNITO_APP_CLIENT_SECRET` | No | — | Client secret (required if app client has one) |
|
|
353
|
+
|
|
354
|
+
### OpenSearch
|
|
355
|
+
|
|
356
|
+
| Variable | Required | Default | Description |
|
|
357
|
+
|----------|----------|---------|-------------|
|
|
358
|
+
| `OPENSEARCH_URL` | Yes | — | OpenSearch endpoint |
|
|
359
|
+
| `OPENSEARCH_USERNAME` | No | — | Basic-auth username (AWS managed domains) |
|
|
360
|
+
| `OPENSEARCH_PASSWORD` | No | — | Basic-auth password |
|
|
361
|
+
| `OPENSEARCH_USE_SSL` | No | `true` | Enable TLS |
|
|
362
|
+
| `OPENSEARCH_VERIFY_CERTS` | No | `true` | Verify TLS certificates |
|
|
363
|
+
| `OPENSEARCH_INDEX_PREFIX` | No | `pulse_kb` | Index name prefix per tenant |
|
|
364
|
+
| `EMBEDDING_DIMENSION` | No | `1536` | Vector dimension for kNN indexes |
|
|
365
|
+
|
|
366
|
+
### Database & Cache
|
|
367
|
+
|
|
368
|
+
| Variable | Required | Default | Description |
|
|
369
|
+
|----------|----------|---------|-------------|
|
|
370
|
+
| `DATABASE_URL` | Yes | — | Async PostgreSQL DSN (`postgresql+asyncpg://...`) |
|
|
371
|
+
| `REDIS_URL` | No | `redis://localhost:6379/0` | Redis URL (enables Celery) |
|
|
372
|
+
| `CELERY_BROKER_URL` | No | — | Celery broker (defaults to `REDIS_URL`) |
|
|
373
|
+
| `CELERY_RESULT_BACKEND` | No | — | Celery result backend (defaults to `REDIS_URL`) |
|
|
374
|
+
|
|
375
|
+
### Athena
|
|
376
|
+
|
|
377
|
+
| Variable | Required | Default | Description |
|
|
378
|
+
|----------|----------|---------|-------------|
|
|
379
|
+
| `ATHENA_AWS_ACCESS_KEY_ID` | No | — | Athena-specific AWS credentials |
|
|
380
|
+
| `ATHENA_AWS_SECRET_ACCESS_KEY` | No | — | Athena-specific AWS credentials |
|
|
381
|
+
| `ATHENA_OUTPUT_LOCATION` | Yes* | — | S3 URI for Athena query results |
|
|
382
|
+
| `ATHENA_WORKGROUP` | No | `primary` | Athena workgroup |
|
|
383
|
+
| `ATHENA_QUERY_TIMEOUT_SECONDS` | No | `60` | Athena query timeout |
|
|
384
|
+
|
|
385
|
+
### Orchestrator (Prefect)
|
|
386
|
+
|
|
387
|
+
| Variable | Required | Default | Description |
|
|
388
|
+
|----------|----------|---------|-------------|
|
|
389
|
+
| `PULSE_ORCHESTRATOR_BACKEND` | No | `none` | `prefect` or `none` |
|
|
390
|
+
| `PREFECT_API_URL` | No | — | Prefect API endpoint |
|
|
391
|
+
| `PREFECT_API_KEY` | No | — | Prefect Cloud API key |
|
|
392
|
+
| `PREFECT_ECS_WORK_POOL_NAME` | No | `products-worker-pool` | ECS work pool name |
|
|
393
|
+
| `PREFECT_LAMBDA_WORK_POOL_NAME` | No | `lambda-worker-pool` | Lambda work pool name |
|
|
394
|
+
| `PREFECT_LAMBDA_FUNCTION_NAME_TEMPLATE` | No | `{product}-{stage}` | Lambda function name pattern |
|
|
395
|
+
| `PREFECT_K8S_WORK_POOL_NAME` | No | `k8s-worker-pool` | Kubernetes work pool name |
|
|
396
|
+
| `PREFECT_K8S_NAMESPACE` | No | `pulse-jobs` | Kubernetes namespace |
|
|
397
|
+
| `PREFECT_K8S_DEFAULT_CPU` | No | `500m` | Default CPU request |
|
|
398
|
+
| `PREFECT_K8S_DEFAULT_MEMORY` | No | `1Gi` | Default memory request |
|
|
399
|
+
|
|
400
|
+
### LLM & Embeddings
|
|
401
|
+
|
|
402
|
+
| Variable | Required | Default | Description |
|
|
403
|
+
|----------|----------|---------|-------------|
|
|
404
|
+
| `PULSE_LLM_PROVIDER` | No | `openai` | LLM provider |
|
|
405
|
+
| `PULSE_LLM_MODEL` | No | `gpt-4o-mini` | LLM model ID |
|
|
406
|
+
| `PULSE_LLM_API_KEY` | No | — | LLM API key (also used as embedding fallback) |
|
|
407
|
+
| `PULSE_LLM_TEMPERATURE` | No | `0.0` | LLM sampling temperature |
|
|
408
|
+
| `PULSE_EMBEDDING_PROVIDER` | No | `openai` | Embedding provider |
|
|
409
|
+
| `PULSE_OPENAI_EMBEDDING_MODEL` | No | `text-embedding-3-small` | OpenAI embedding model |
|
|
410
|
+
| `PULSE_OPENAI_API_KEY` | No | — | OpenAI API key (overrides `PULSE_LLM_API_KEY`) |
|
|
411
|
+
|
|
412
|
+
### Pipeline & Jobs
|
|
413
|
+
|
|
414
|
+
| Variable | Required | Default | Description |
|
|
415
|
+
|----------|----------|---------|-------------|
|
|
416
|
+
| `PULSE_ENGINE_URL` | No | — | Public URL containers use for callbacks |
|
|
417
|
+
| `PULSE_JOB_TOKEN_SECRET` | No | — | HMAC secret for job-scoped JWTs |
|
|
418
|
+
| `PULSE_S3_BUCKET` | No | — | S3 bucket for inter-stage NDJSON data |
|
|
419
|
+
| `PULSE_CHAIN_GRACE_PERIOD_SECONDS` | No | `300` | Seconds before chain recovery auto-triggers |
|
|
420
|
+
| `PULSE_MAX_CONCURRENT_JOBS_PER_TENANT` | No | `10` | Max concurrent jobs per tenant |
|
|
421
|
+
| `PULSE_DEFAULT_CHUNK_SIZE` | No | `512` | Default chunk token size |
|
|
422
|
+
| `PULSE_DEFAULT_CHUNK_STRATEGY` | No | `token_count` | Default chunking strategy |
|
|
423
|
+
| `PULSE_DEDUP_SIMILARITY_THRESHOLD` | No | `0.95` | Cosine similarity dedup threshold |
|
|
424
|
+
|
|
425
|
+
### Pipeline Infrastructure (from Terraform)
|
|
426
|
+
|
|
427
|
+
These are auto-synced from Terraform outputs to GitHub Secrets, then written to `.env` at deploy time:
|
|
428
|
+
|
|
429
|
+
| Variable | Description |
|
|
430
|
+
|----------|-------------|
|
|
431
|
+
| `PIPELINE_TASK_DEFINITION` | ECS task definition family for pipeline steps |
|
|
432
|
+
| `PIPELINE_CLUSTER_NAME` | ECS cluster for pipeline step tasks |
|
|
433
|
+
| `PIPELINE_EXECUTION_ROLE_ARN` | ECS task execution role (ECR pull, logs, secrets) |
|
|
434
|
+
| `PIPELINE_TASK_ROLE_ARN` | ECS task role (S3, Lambda invoke, ECS dispatch) |
|
|
435
|
+
| `PIPELINE_LOG_GROUP` | CloudWatch log group for ECS pipeline steps |
|
|
436
|
+
| `PIPELINE_SUBNETS` | Comma-separated private subnet IDs |
|
|
437
|
+
| `PIPELINE_SECURITY_GROUPS` | Comma-separated security group IDs |
|
|
438
|
+
| `LAMBDA_EXECUTION_ROLE_ARN` | Lambda execution role for pipeline functions |
|
|
439
|
+
| `LAMBDA_SUBNETS` | Comma-separated subnet IDs for Lambda VPC config |
|
|
440
|
+
| `LAMBDA_SECURITY_GROUPS` | Comma-separated security group IDs for Lambda |
|
|
441
|
+
| `LAMBDA_LOG_GROUP` | CloudWatch log group for Lambda pipeline steps |
|
|
442
|
+
|
|
443
|
+
### Data Source Adapters
|
|
444
|
+
|
|
445
|
+
| Variable | Required | Default | Description |
|
|
446
|
+
|----------|----------|---------|-------------|
|
|
447
|
+
| `YT_DLP_COOKIES_SECRET_ID` | No | — | Secrets Manager secret ID for YouTube cookies (Netscape format). Required for age-restricted or member-only videos. |
|
|
448
|
+
| `YT_DLP_PLAYER_CLIENTS` | No | `tv_embedded,web` | Comma-separated yt-dlp player client override. |
|
|
449
|
+
| `TWITTER_COOKIES_SECRET_ID` | No | — | Secrets Manager secret ID for Twitter cookies (JSON format). Takes precedence over `TWITTER_COOKIES_PATH`. |
|
|
450
|
+
| `TWITTER_COOKIES_PATH` | No | — | Local filesystem path to Twitter cookies JSON file. Used when Secrets Manager is not configured. |
|
|
451
|
+
| `OPENAI_API_KEY` | No | — | OpenAI API key for Whisper transcription in `YouTubeAudioAdapter`. |
|
|
452
|
+
|
|
453
|
+
### MCP Server
|
|
454
|
+
|
|
455
|
+
| Variable | Required | Default | Description |
|
|
456
|
+
|----------|----------|---------|-------------|
|
|
457
|
+
| `MCP_TRANSPORT` | No | `sse` | Transport mode: `sse` or `stdio` |
|
|
458
|
+
| `MCP_SSE_HOST` | No | `127.0.0.1` | MCP SSE server host |
|
|
459
|
+
| `MCP_SSE_PORT` | No | `8001` | MCP SSE server port |
|
|
460
|
+
|
|
461
|
+
## API Authentication
|
|
462
|
+
|
|
463
|
+
All endpoints (except `/api/v1/health` and `/api/v1/auth/login`) require a JWT token.
|
|
464
|
+
|
|
465
|
+
> **Rate limits** — enforced per IP address:
|
|
466
|
+
> - **Login** (`POST /api/v1/auth/login`): 5 attempts per 60 seconds. Returns `429` with `Retry-After: 60` on breach.
|
|
467
|
+
> - **Global**: 100 requests per 60 seconds across all endpoints. Responses include `X-RateLimit-Limit` and `X-RateLimit-Remaining` headers.
|
|
468
|
+
>
|
|
469
|
+
> **Note** — Swagger UI (`/docs`) and ReDoc (`/redoc`) are disabled in production (`APP_ENV=production`).
|
|
470
|
+
|
|
471
|
+
### Get a Token
|
|
472
|
+
|
|
473
|
+
```bash
|
|
474
|
+
curl -X POST https://api.dev.pulse.mananalabs.ai/api/v1/auth/login \
|
|
475
|
+
-H "Content-Type: application/json" \
|
|
476
|
+
-d "{\"email\": \"$PULSE_AUTH_EMAIL\", \"password\": \"$PULSE_AUTH_PASSWORD\"}"
|
|
477
|
+
```
|
|
478
|
+
|
|
479
|
+
Response:
|
|
480
|
+
|
|
481
|
+
```json
|
|
482
|
+
{
|
|
483
|
+
"id_token": "eyJ...",
|
|
484
|
+
"access_token": "eyJ...",
|
|
485
|
+
"refresh_token": "eyJ...",
|
|
486
|
+
"expires_in": 3600,
|
|
487
|
+
"token_type": "Bearer",
|
|
488
|
+
"tenant_id": "tenant-dev-001",
|
|
489
|
+
"email": "dev@pulse-engine.com"
|
|
490
|
+
}
|
|
491
|
+
```
|
|
492
|
+
|
|
493
|
+
### Use the Token
|
|
494
|
+
|
|
495
|
+
Pass the `id_token` as a Bearer token in the `Authorization` header:
|
|
496
|
+
|
|
497
|
+
```bash
|
|
498
|
+
curl -H "Authorization: Bearer <id_token>" \
|
|
499
|
+
https://api.dev.pulse.mananalabs.ai/api/v1/kb/stats
|
|
500
|
+
```
|
|
501
|
+
|
|
502
|
+
Tokens expire after **1 hour**. Call the login endpoint again to get a new one.
|
|
503
|
+
|
|
504
|
+
## API Documentation
|
|
505
|
+
|
|
506
|
+
- Swagger UI: https://api.dev.pulse.mananalabs.ai/docs
|
|
507
|
+
- ReDoc: https://api.dev.pulse.mananalabs.ai/redoc
|
|
508
|
+
|
|
509
|
+
## Library Usage
|
|
510
|
+
|
|
511
|
+
Pulse Engine can also be used as a standalone library for content processing:
|
|
512
|
+
|
|
513
|
+
```python
|
|
514
|
+
from pulse_engine.processor.core.topic_splitter import TopicSplitter
|
|
515
|
+
|
|
516
|
+
splitter = TopicSplitter(provider="openai", api_key="sk-...")
|
|
517
|
+
result = splitter.split([
|
|
518
|
+
(1, "Hi there"),
|
|
519
|
+
(2, "Let's discuss Q1 metrics"),
|
|
520
|
+
(3, "Revenue grew 20%"),
|
|
521
|
+
])
|
|
522
|
+
print(result.topic_shifts)
|
|
523
|
+
```
|
|
524
|
+
|
|
525
|
+
See **[docs/pulse_engine_library.md](docs/pulse_engine_library.md)** for full documentation on the Topic Splitter, LLM configuration, and configurable embeddings.
|
|
526
|
+
|
|
527
|
+
## Data Source Adapters
|
|
528
|
+
|
|
529
|
+
`pulse_engine.adapters` provides standalone data-fetching utilities. Each adapter is an independently importable class with no dependency on the Pulse pipeline — use them in any Python context.
|
|
530
|
+
|
|
531
|
+
All adapter data models are plain `@dataclass` instances defined in `pulse_engine.adapters.models`. Credentials are never passed as constructor parameters (except where noted); they are read from environment variables.
|
|
532
|
+
|
|
533
|
+
### YouTube Metadata
|
|
534
|
+
|
|
535
|
+
Fetches video metadata from a YouTube channel via the YouTube Data API v3.
|
|
536
|
+
|
|
537
|
+
```python
|
|
538
|
+
import asyncio
|
|
539
|
+
from pulse_engine.adapters.youtube_metadata import YouTubeMetadataAdapter
|
|
540
|
+
|
|
541
|
+
adapter = YouTubeMetadataAdapter(channel_name="@BBCNews", api_key="YOUR_YT_API_KEY")
|
|
542
|
+
videos = asyncio.run(adapter.fetch(max_results=50))
|
|
543
|
+
for v in videos:
|
|
544
|
+
print(v.title, v.published_at, v.url)
|
|
545
|
+
```
|
|
546
|
+
|
|
547
|
+
Returns `list[VideoMetadata]` with fields: `video_id`, `title`, `description`, `published_at`, `channel_id`, `channel_name`, `thumbnail_url`, `duration`, `view_count`, `url`.
|
|
548
|
+
|
|
549
|
+
### YouTube Audio (Download + Transcription)
|
|
550
|
+
|
|
551
|
+
Downloads audio from a YouTube video and transcribes it with OpenAI Whisper.
|
|
552
|
+
|
|
553
|
+
```python
|
|
554
|
+
import asyncio
|
|
555
|
+
from pulse_engine.adapters.youtube_metadata import YouTubeMetadataAdapter
|
|
556
|
+
from pulse_engine.adapters.youtube_audio import YouTubeAudioAdapter
|
|
557
|
+
|
|
558
|
+
meta_adapter = YouTubeMetadataAdapter(channel_name="@BBCNews", api_key="YOUR_YT_API_KEY")
|
|
559
|
+
audio_adapter = YouTubeAudioAdapter(openai_api_key="sk-...")
|
|
560
|
+
|
|
561
|
+
videos = asyncio.run(meta_adapter.fetch(max_results=5))
|
|
562
|
+
audio = asyncio.run(audio_adapter.fetch(videos[0]))
|
|
563
|
+
print(audio.transcript)
|
|
564
|
+
```
|
|
565
|
+
|
|
566
|
+
Returns `VideoAudio` with fields: `video_id`, `title`, `transcript`, `segments` (`list[TranscriptSegment]`), `language`, `duration_seconds`.
|
|
567
|
+
|
|
568
|
+
Set `YT_DLP_COOKIES_SECRET_ID` to the Secrets Manager secret ID that holds your YouTube cookies to enable downloading age-restricted videos. See **Secrets Manager Cookie Format** below.
|
|
569
|
+
|
|
570
|
+
### Speeches (inc.in)
|
|
571
|
+
|
|
572
|
+
Scrapes speech metadata and extracts full text from PDFs published on inc.in.
|
|
573
|
+
|
|
574
|
+
```python
|
|
575
|
+
import asyncio
|
|
576
|
+
from pulse_engine.adapters.speech_metadata import SpeechMetadataAdapter
|
|
577
|
+
from pulse_engine.adapters.speech_content import SpeechContentAdapter
|
|
578
|
+
|
|
579
|
+
meta_adapter = SpeechMetadataAdapter(base_url="https://www.inc.in/en/media/speeches")
|
|
580
|
+
content_adapter = SpeechContentAdapter()
|
|
581
|
+
|
|
582
|
+
speeches = asyncio.run(meta_adapter.fetch(pages=3))
|
|
583
|
+
results = asyncio.run(content_adapter.fetch(speeches))
|
|
584
|
+
for r in results:
|
|
585
|
+
print(r.metadata.title, r.content.page_count, "pages")
|
|
586
|
+
print(r.content.text[:500])
|
|
587
|
+
```
|
|
588
|
+
|
|
589
|
+
`SpeechMetadata` fields: `title`, `speaker`, `date`, `pdf_url`, `source_url`.
|
|
590
|
+
`SpeechResult` fields: `metadata` (`SpeechMetadata`), `content` (`SpeechContent` with `text` and `page_count`).
|
|
591
|
+
|
|
592
|
+
### Twitter / X
|
|
593
|
+
|
|
594
|
+
Searches tweets or looks up user profiles via `twikit` (unofficial Twitter API client).
|
|
595
|
+
|
|
596
|
+
```python
|
|
597
|
+
import asyncio
|
|
598
|
+
from pulse_engine.adapters.twitter import TwitterAdapter
|
|
599
|
+
|
|
600
|
+
# Reads TWITTER_COOKIES_SECRET_ID or TWITTER_COOKIES_PATH from env
|
|
601
|
+
adapter = TwitterAdapter()
|
|
602
|
+
|
|
603
|
+
tweets = asyncio.run(adapter.fetch_tweets(query="Budget 2025", count=50))
|
|
604
|
+
for t in tweets:
|
|
605
|
+
print(t.author, t.created_at, t.text[:80])
|
|
606
|
+
|
|
607
|
+
user_results = asyncio.run(adapter.fetch_user(username="FinanceMinIndia"))
|
|
608
|
+
```
|
|
609
|
+
|
|
610
|
+
`TweetMetadata` fields: `tweet_id`, `text`, `author`, `created_at`, `like_count`, `retweet_count`, `reply_count`, `url`.
|
|
611
|
+
`TwitterUserResult` fields: `user_id`, `username`, `display_name`, `description`, `followers_count`, `following_count`, `tweet_count`, `verified`, `url`.
|
|
612
|
+
|
|
613
|
+
### Secrets Manager Cookie Format
|
|
614
|
+
|
|
615
|
+
Cookie secrets stored in AWS Secrets Manager must use the **wrapper JSON** format:
|
|
616
|
+
|
|
617
|
+
```json
|
|
618
|
+
{"YT_DLP_COOKIES": "<full Netscape cookie file content>"}
|
|
619
|
+
```
|
|
620
|
+
|
|
621
|
+
```json
|
|
622
|
+
{"TWITTER_COOKIES": "{\"auth_token\": \"...\", \"ct0\": \"...\"}"}
|
|
623
|
+
```
|
|
624
|
+
|
|
625
|
+
The adapter fetches the secret, parses the wrapper JSON, and extracts the inner value automatically. The wrapper key must match the name shown above (`YT_DLP_COOKIES` for YouTube, `TWITTER_COOKIES` for Twitter).
|
|
626
|
+
|
|
627
|
+
### Example Runner Scripts
|
|
628
|
+
|
|
629
|
+
Ready-to-run CLI scripts are provided under `examples/`:
|
|
630
|
+
|
|
631
|
+
| Script | What it does |
|
|
632
|
+
|--------|-------------|
|
|
633
|
+
| `examples/fetch_youtube.py` | Fetch channel videos + transcribe first result |
|
|
634
|
+
| `examples/fetch_speeches.py` | Scrape speech listing + extract PDF text |
|
|
635
|
+
| `examples/fetch_tweets.py` | Search tweets or look up a Twitter user |
|
|
636
|
+
|
|
637
|
+
```bash
|
|
638
|
+
# YouTube
|
|
639
|
+
YT_API_KEY=... OPENAI_API_KEY=sk-... python examples/fetch_youtube.py --channel @BBCNews
|
|
640
|
+
|
|
641
|
+
# Speeches
|
|
642
|
+
python examples/fetch_speeches.py --pages 2
|
|
643
|
+
|
|
644
|
+
# Tweets
|
|
645
|
+
TWITTER_COOKIES_PATH=~/.twitter_cookies.json python examples/fetch_tweets.py --query "Budget 2025"
|
|
646
|
+
```
|
|
647
|
+
|
|
648
|
+
## Further Reading
|
|
649
|
+
|
|
650
|
+
- [Building a Product](docs/building-a-product.md) — step-by-step guide to creating a new product
|
|
651
|
+
- [Design Decisions](docs/design-decisions.md) — architectural decisions and rationale
|
|
652
|
+
- [Infrastructure](docs/infrastructure.md) — AWS deployment architecture
|
|
653
|
+
- [Library Usage](docs/pulse_engine_library.md) — topic splitting, LLM config, and embeddings
|
|
654
|
+
|