pulse-engine 0.2.0.dev20260407065251__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pulse_engine-0.2.0.dev20260407065251/PKG-INFO +563 -0
- pulse_engine-0.2.0.dev20260407065251/README.md +528 -0
- pulse_engine-0.2.0.dev20260407065251/pyproject.toml +105 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/__init__.py +0 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/api/__init__.py +0 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/api/v1/__init__.py +0 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/api/v1/auth.py +91 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/api/v1/health.py +62 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/api/v1/router.py +16 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/chain_recovery.py +131 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/cli/__init__.py +0 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/cli/main.py +169 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/cli/templates/cookiecutter.json +4 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/.gitignore +13 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/Dockerfile +32 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/pipeline.yaml +17 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/pyproject.toml +25 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/src/pulse_{{cookiecutter.product_slug}}/__init__.py +8 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/__init__.py +0 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/unit/__init__.py +0 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/unit/test_manifest.py +15 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/client.py +95 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/config.py +153 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/core/__init__.py +0 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/core/error_handlers.py +64 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/core/exceptions.py +67 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/core/job_token.py +109 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/core/logging.py +45 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/core/scope.py +23 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/core/security.py +130 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/database.py +30 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/dependencies.py +166 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/deployment/__init__.py +0 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/deployment/backend_deployment_repository.py +83 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/deployment/backends/__init__.py +0 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/deployment/backends/base.py +50 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/deployment/backends/exceptions.py +20 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/deployment/backends/native_lambda.py +125 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/deployment/backends/prefect_ecs.py +116 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/deployment/backends/prefect_k8s.py +131 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/deployment/backends/registry.py +50 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/deployment/infra_provisioner.py +278 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/deployment/job_launcher.py +178 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/deployment/models.py +48 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/deployment/repository.py +54 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/deployment/router.py +22 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/deployment/schemas.py +18 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/deployment/service.py +65 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/extractor/__init__.py +0 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/extractor/base.py +48 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/extractor/models.py +50 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/extractor/orchestrator/__init__.py +15 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/extractor/orchestrator/base.py +34 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/extractor/orchestrator/noop.py +37 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/extractor/orchestrator/prefect.py +163 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/extractor/repository.py +163 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/extractor/router.py +102 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/extractor/schemas.py +93 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/extractor/service.py +431 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/extractor/stage_models.py +36 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/extractor/stage_repository.py +109 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/main.py +195 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/mcp/__init__.py +0 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/mcp/__main__.py +5 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/mcp/server.py +103 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/mcp/tools_jobs.py +159 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/mcp/tools_kb.py +88 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/mcp/tools_processor.py +208 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/middleware/__init__.py +0 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/middleware/rate_limit.py +144 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/middleware/request_id.py +16 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/middleware/security_headers.py +25 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/middleware/tenant.py +90 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/pipeline/__init__.py +0 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/pipeline/config_parser.py +120 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/pipeline/models.py +67 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/pipeline/repositories.py +153 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/pipeline/router_modules.py +66 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/pipeline/router_pipelines.py +186 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/pipeline/schemas.py +139 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/pipeline/service.py +158 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/pipeline/translators/__init__.py +44 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/pipeline/translators/airflow_status.py +11 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/pipeline/translators/airflow_translator.py +23 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/pipeline/translators/base.py +43 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/pipeline/translators/prefect_status.py +93 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/pipeline/translators/prefect_translator.py +135 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/processor/__init__.py +0 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/processor/base.py +36 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/processor/core/__init__.py +0 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/processor/core/analysis.py +148 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/processor/core/chunking.py +158 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/processor/core/prompts.py +340 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/processor/core/topic_splitter.py +105 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/processor/defaults/__init__.py +11 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/processor/defaults/core_processor.py +12 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/processor/defaults/postprocessor.py +12 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/processor/defaults/preprocessor.py +12 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/processor/llm/__init__.py +0 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/processor/llm/provider.py +58 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/processor/ocr/gemini.py +52 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/processor/pipeline.py +107 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/processor/postprocessor/__init__.py +0 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/processor/postprocessor/embeddings.py +34 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/processor/postprocessor/tasks.py +180 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/processor/preprocessor/__init__.py +0 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/processor/preprocessor/tasks.py +71 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/processor/router.py +192 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/processor/schemas.py +167 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/registry.py +117 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/runners/__init__.py +0 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/runners/lambda_runner.py +26 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/runners/pipeline_runner.py +43 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/runners/prefect_pipeline_flow.py +677 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/runners/prefect_runner.py +33 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/s3.py +72 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/services/__init__.py +0 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/services/bootstrap.py +210 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/services/opensearch.py +84 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/storage/__init__.py +0 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/storage/connectors/__init__.py +0 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/storage/connectors/athena.py +226 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/storage/connectors/base.py +32 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/storage/connectors/opensearch.py +344 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/storage/knowledge_base.py +68 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/storage/router.py +78 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/storage/schemas.py +93 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/testing/__init__.py +13 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/testing/fixtures.py +50 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/testing/mocks.py +104 -0
- pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/worker.py +53 -0
|
@@ -0,0 +1,563 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: pulse-engine
|
|
3
|
+
Version: 0.2.0.dev20260407065251
|
|
4
|
+
Summary: Pulse Engine — Hybrid framework for building Pulse products
|
|
5
|
+
Author: Pulse Team
|
|
6
|
+
Requires-Python: >=3.11,<3.13
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
10
|
+
Requires-Dist: alembic (>=1.14.0,<2.0.0)
|
|
11
|
+
Requires-Dist: asyncpg (>=0.30.0,<0.31.0)
|
|
12
|
+
Requires-Dist: beautifulsoup4 (>=4.12.0,<5.0.0)
|
|
13
|
+
Requires-Dist: boto3 (>=1.35.0,<2.0.0)
|
|
14
|
+
Requires-Dist: celery[redis] (>=5.4.0,<6.0.0)
|
|
15
|
+
Requires-Dist: cookiecutter (>=2.6.0,<3.0.0)
|
|
16
|
+
Requires-Dist: email-validator (>=2.3.0,<3.0.0)
|
|
17
|
+
Requires-Dist: fastapi (>=0.115.0,<0.116.0)
|
|
18
|
+
Requires-Dist: httpx (>=0.28.0,<0.29.0)
|
|
19
|
+
Requires-Dist: langchain (>=1.2.13,<2.0.0)
|
|
20
|
+
Requires-Dist: langchain-anthropic (>=1.4.0,<2.0.0)
|
|
21
|
+
Requires-Dist: langchain-openai (>=1.1.11,<2.0.0)
|
|
22
|
+
Requires-Dist: langdetect (>=1.0.9,<2.0.0)
|
|
23
|
+
Requires-Dist: mcp[cli] (>=1.0.0,<2.0.0)
|
|
24
|
+
Requires-Dist: opensearch-py[async] (>=3.1.0,<4.0.0)
|
|
25
|
+
Requires-Dist: pydantic-settings (>=2.7.0,<3.0.0)
|
|
26
|
+
Requires-Dist: python-jose[cryptography] (>=3.3.0,<4.0.0)
|
|
27
|
+
Requires-Dist: redis (>=5.0.0,<6.0.0)
|
|
28
|
+
Requires-Dist: sqlalchemy[asyncio] (>=2.0,<3.0)
|
|
29
|
+
Requires-Dist: structlog (>=24.4.0,<25.0.0)
|
|
30
|
+
Requires-Dist: tiktoken (>=0.8.0,<0.9.0)
|
|
31
|
+
Requires-Dist: typer (>=0.15.0,<0.16.0)
|
|
32
|
+
Requires-Dist: uvicorn[standard] (>=0.34.0,<0.35.0)
|
|
33
|
+
Description-Content-Type: text/markdown
|
|
34
|
+
|
|
35
|
+
# Pulse Engine
|
|
36
|
+
|
|
37
|
+
Hybrid Python framework for building multi-tenant data products. A product installs the engine (`pip install pulse-engine`), declares a manifest, and gets a full FastAPI app with OpenSearch, Athena, Celery, Prefect, and MCP — out of the box.
|
|
38
|
+
|
|
39
|
+
## How It Works
|
|
40
|
+
|
|
41
|
+
```
|
|
42
|
+
┌──────────────────────────────────────────────────┐
|
|
43
|
+
│ Product (pip install pulse-engine) │
|
|
44
|
+
│ manifest = ProductManifest( │
|
|
45
|
+
│ extractors=[...], preprocessor=..., ... │
|
|
46
|
+
│ ) │
|
|
47
|
+
└──────────────┬───────────────────────────────────┘
|
|
48
|
+
│
|
|
49
|
+
┌──────────────▼───────────────────────────────────┐
|
|
50
|
+
│ pulse-core-engine │
|
|
51
|
+
│ Base ABCs · Default implementations · App │
|
|
52
|
+
│ factory · Storage connectors · Job lifecycle │
|
|
53
|
+
│ · CLI · Testing utilities │
|
|
54
|
+
└──────────────┬───────────────────────────────────┘
|
|
55
|
+
│
|
|
56
|
+
┌──────────────▼───────────────────────────────────┐
|
|
57
|
+
│ Shared Infrastructure │
|
|
58
|
+
│ Prefect · OpenSearch · Redis · PostgreSQL │
|
|
59
|
+
└──────────────────────────────────────────────────┘
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
Products customize behaviour by:
|
|
63
|
+
- Implementing `BaseExtractor` subclasses for data extraction
|
|
64
|
+
- Overriding pipeline stages (preprocessor, core, postprocessor) or using defaults
|
|
65
|
+
- Adding product-specific API routes, MCP tools, and Celery tasks
|
|
66
|
+
- Declaring everything in a `ProductManifest`
|
|
67
|
+
|
|
68
|
+
## Prerequisites
|
|
69
|
+
|
|
70
|
+
- Python 3.11-3.12
|
|
71
|
+
- [Poetry](https://python-poetry.org/docs/#installation)
|
|
72
|
+
- Docker & Docker Compose (for shared infrastructure)
|
|
73
|
+
|
|
74
|
+
## Quick Start
|
|
75
|
+
|
|
76
|
+
### For Engine Development
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
git clone <repo-url> && cd pulse-core-engine
|
|
80
|
+
|
|
81
|
+
# Install deps and pre-commit hooks
|
|
82
|
+
make install
|
|
83
|
+
|
|
84
|
+
# Copy env and configure
|
|
85
|
+
cp .env.example .env
|
|
86
|
+
|
|
87
|
+
# Run database migrations
|
|
88
|
+
make migrate
|
|
89
|
+
|
|
90
|
+
# Start the dev server
|
|
91
|
+
make run
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
### For Building a New Product
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
# Install the engine
|
|
98
|
+
pip install pulse-engine
|
|
99
|
+
|
|
100
|
+
# Scaffold a new product
|
|
101
|
+
pulse init my-product
|
|
102
|
+
cd pulse-my-product
|
|
103
|
+
|
|
104
|
+
# Set up the product
|
|
105
|
+
make install
|
|
106
|
+
cp .env.example .env
|
|
107
|
+
|
|
108
|
+
# Validate and test
|
|
109
|
+
make validate
|
|
110
|
+
make test
|
|
111
|
+
|
|
112
|
+
# Run
|
|
113
|
+
make run
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
See **[docs/building-a-product.md](docs/building-a-product.md)** for the full guide.
|
|
117
|
+
|
|
118
|
+
## CLI
|
|
119
|
+
|
|
120
|
+
The `pulse` command is the primary interface:
|
|
121
|
+
|
|
122
|
+
| Command | Description |
|
|
123
|
+
|---------|-------------|
|
|
124
|
+
| `pulse init <name>` | Scaffold a new product from template |
|
|
125
|
+
| `pulse validate [module]` | Validate a product manifest |
|
|
126
|
+
| `pulse run` | Discover manifest and start FastAPI server |
|
|
127
|
+
| `pulse run-worker` | Discover manifest and start Celery worker |
|
|
128
|
+
| `pulse run-mcp` | Discover manifest and start MCP server |
|
|
129
|
+
|
|
130
|
+
## Product Manifest
|
|
131
|
+
|
|
132
|
+
Products declare their components via a `ProductManifest`:
|
|
133
|
+
|
|
134
|
+
```python
|
|
135
|
+
from pulse_engine.registry import ProductManifest
|
|
136
|
+
|
|
137
|
+
manifest = ProductManifest(
|
|
138
|
+
name="my-product",
|
|
139
|
+
version="0.1.0",
|
|
140
|
+
extractors=[MyExtractor], # data extraction classes
|
|
141
|
+
preprocessor=..., # ... = default, None = skip
|
|
142
|
+
core_processor=..., # or pass a custom instance to override the default
|
|
143
|
+
postprocessor=None, # skip postprocessing
|
|
144
|
+
routers=[my_router], # FastAPI routers
|
|
145
|
+
mcp_tool_modules=["my_pkg.mcp"], # MCP tool modules
|
|
146
|
+
celery_task_modules=["my_pkg.tasks"],
|
|
147
|
+
athena_database="my_db",
|
|
148
|
+
)
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
Products register via a `pyproject.toml` entry point:
|
|
152
|
+
|
|
153
|
+
```toml
|
|
154
|
+
[tool.poetry.plugins."pulse_engine.products"]
|
|
155
|
+
my_product = "pulse_my_product:manifest"
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
## Project Structure
|
|
159
|
+
|
|
160
|
+
```
|
|
161
|
+
src/pulse_engine/
|
|
162
|
+
├── main.py # App factory (create_app)
|
|
163
|
+
├── config.py # Settings (pydantic-settings)
|
|
164
|
+
├── registry.py # ProductManifest, validation, discovery
|
|
165
|
+
├── worker.py # Celery app factory
|
|
166
|
+
├── database.py # SQLAlchemy async setup
|
|
167
|
+
├── dependencies.py # FastAPI dependency injection
|
|
168
|
+
├── client.py # PulseEngineClient (container → engine HTTP)
|
|
169
|
+
├── s3.py # S3Stage (NDJSON inter-stage data exchange)
|
|
170
|
+
├── chain_recovery.py # Background task for stalled pipeline recovery
|
|
171
|
+
├── cli/
|
|
172
|
+
│ ├── main.py # pulse CLI (typer)
|
|
173
|
+
│ └── templates/ # Cookiecutter product template
|
|
174
|
+
├── api/v1/
|
|
175
|
+
│ ├── router.py # v1 router aggregation
|
|
176
|
+
│ └── health.py # Health check endpoint
|
|
177
|
+
├── core/
|
|
178
|
+
│ ├── security.py # JWT verification (Cognito + Job-scoped)
|
|
179
|
+
│ ├── job_token.py # Job-scoped JWT issuance & verification
|
|
180
|
+
│ ├── scope.py # require_scope() FastAPI dependency
|
|
181
|
+
│ ├── exceptions.py # Exception hierarchy
|
|
182
|
+
│ ├── error_handlers.py # Global error handlers
|
|
183
|
+
│ └── logging.py # Structured logging (structlog)
|
|
184
|
+
├── middleware/
|
|
185
|
+
│ ├── request_id.py # X-Request-ID middleware
|
|
186
|
+
│ ├── security_headers.py # Defensive HTTP security headers (CSP, HSTS, etc.)
|
|
187
|
+
│ ├── rate_limit.py # Sliding-window per-IP rate limiter (100 req/60 s)
|
|
188
|
+
│ └── tenant.py # Dual-token middleware (Cognito + Job JWT)
|
|
189
|
+
├── deployment/
|
|
190
|
+
│ ├── models.py # DeploymentModel ORM
|
|
191
|
+
│ ├── repository.py # DeploymentRepository
|
|
192
|
+
│ ├── service.py # DeploymentService
|
|
193
|
+
│ ├── router.py # POST /api/v1/deployments
|
|
194
|
+
│ └── schemas.py # Registration request/response
|
|
195
|
+
├── extractor/
|
|
196
|
+
│ ├── base.py # BaseExtractor ABC
|
|
197
|
+
│ ├── models.py # SQLAlchemy ORM: job_records
|
|
198
|
+
│ ├── stage_models.py # SQLAlchemy ORM: job_stages
|
|
199
|
+
│ ├── repository.py # JobRepository
|
|
200
|
+
│ ├── stage_repository.py # StageRepository
|
|
201
|
+
│ ├── service.py # JobService (stage-aware)
|
|
202
|
+
│ ├── router.py # /api/v1/jobs/ endpoints
|
|
203
|
+
│ ├── schemas.py # Pydantic models
|
|
204
|
+
│ └── orchestrator/
|
|
205
|
+
│ ├── base.py # BaseOrchestratorAdapter ABC
|
|
206
|
+
│ ├── prefect.py # PrefectAdapter (deployments + flow runs)
|
|
207
|
+
│ └── noop.py # NoopAdapter
|
|
208
|
+
├── processor/
|
|
209
|
+
│ ├── base.py # BasePreprocessor, BaseCoreProcessor,
|
|
210
|
+
│ │ # BasePostprocessor ABCs
|
|
211
|
+
│ ├── pipeline.py # Pluggable ProcessingPipeline
|
|
212
|
+
│ ├── router.py # /api/v1/process/ endpoints
|
|
213
|
+
│ ├── schemas.py # ProcessingContext, options
|
|
214
|
+
│ ├── defaults/ # Default stage implementations
|
|
215
|
+
│ ├── preprocessor/ # clean_html, normalize, detect_language
|
|
216
|
+
│ ├── core/ # chunking, NER, sentiment, topics
|
|
217
|
+
│ └── postprocessor/ # embeddings, dedup, quality scoring
|
|
218
|
+
├── storage/
|
|
219
|
+
│ ├── knowledge_base.py # KnowledgeBaseService
|
|
220
|
+
│ ├── router.py # /api/v1/kb/ endpoints
|
|
221
|
+
│ ├── schemas.py # Document, SearchQuery, etc.
|
|
222
|
+
│ └── connectors/
|
|
223
|
+
│ ├── base.py # BaseStorageConnector ABC
|
|
224
|
+
│ ├── opensearch.py # OpenSearch connector
|
|
225
|
+
│ └── athena.py # Athena connector
|
|
226
|
+
├── mcp/
|
|
227
|
+
│ ├── server.py # FastMCP instance
|
|
228
|
+
│ ├── tools_kb.py # KB tools
|
|
229
|
+
│ ├── tools_jobs.py # Jobs tools
|
|
230
|
+
│ └── tools_processor.py # Processor tools
|
|
231
|
+
├── services/
|
|
232
|
+
│ ├── bootstrap.py # ServiceContainer, bootstrap_services()
|
|
233
|
+
│ └── opensearch.py # OpenSearch client wrapper
|
|
234
|
+
└── testing/
|
|
235
|
+
├── fixtures.py # Reusable pytest fixtures
|
|
236
|
+
└── mocks.py # MockStorageConnector, MockExtractor, etc.
|
|
237
|
+
|
|
238
|
+
infra/
|
|
239
|
+
├── docker-compose.yml # Prefect, Redis, OpenSearch, PostgreSQL
|
|
240
|
+
└── terraform/ # AWS modules (networking, ECS, ECR, ALB)
|
|
241
|
+
|
|
242
|
+
tests/
|
|
243
|
+
├── unit/ # Unit tests
|
|
244
|
+
│ ├── framework/ # Manifest, pipeline, base class tests
|
|
245
|
+
│ ├── processor/
|
|
246
|
+
│ ├── storage/
|
|
247
|
+
│ ├── deployment/ # Deployment registration tests
|
|
248
|
+
│ └── extractor/
|
|
249
|
+
└── integration/ # Integration tests
|
|
250
|
+
├── api/
|
|
251
|
+
├── mcp/
|
|
252
|
+
└── pipelines/
|
|
253
|
+
```
|
|
254
|
+
|
|
255
|
+
## Base Classes
|
|
256
|
+
|
|
257
|
+
Products extend these ABCs:
|
|
258
|
+
|
|
259
|
+
| ABC | Module | Purpose |
|
|
260
|
+
|-----|--------|---------|
|
|
261
|
+
| `BaseExtractor` | `pulse_engine.extractor.base` | Data extraction from external sources |
|
|
262
|
+
| `BasePreprocessor` | `pulse_engine.processor.base` | Content cleaning and normalization |
|
|
263
|
+
| `BaseCoreProcessor` | `pulse_engine.processor.base` | Chunking, NER, sentiment, topics |
|
|
264
|
+
| `BasePostprocessor` | `pulse_engine.processor.base` | Embeddings, dedup, storage formatting |
|
|
265
|
+
| `BaseStorageConnector` | `pulse_engine.storage.connectors.base` | Custom storage backends |
|
|
266
|
+
| `BaseOrchestratorAdapter` | `pulse_engine.extractor.orchestrator.base` | Custom orchestrator integrations |
|
|
267
|
+
|
|
268
|
+
## Testing
|
|
269
|
+
|
|
270
|
+
```bash
|
|
271
|
+
make test # full suite with coverage
|
|
272
|
+
make test-unit # unit tests only
|
|
273
|
+
make test-integration # integration tests only
|
|
274
|
+
```
|
|
275
|
+
|
|
276
|
+
Products import engine test fixtures in their `conftest.py`:
|
|
277
|
+
|
|
278
|
+
```python
|
|
279
|
+
from pulse_engine.testing.fixtures import * # noqa: F401, F403
|
|
280
|
+
```
|
|
281
|
+
|
|
282
|
+
Available fixtures: `mock_storage_connector`, `mock_orchestrator`, `mock_extractor`, `kb_service`, `processing_pipeline`.
|
|
283
|
+
|
|
284
|
+
## Code Quality
|
|
285
|
+
|
|
286
|
+
Pre-commit hooks run on every commit:
|
|
287
|
+
|
|
288
|
+
| Hook | What it checks |
|
|
289
|
+
|------|----------------|
|
|
290
|
+
| `trailing-whitespace` | No trailing whitespace |
|
|
291
|
+
| `end-of-file-fixer` | Files end with a newline |
|
|
292
|
+
| `check-yaml` | Valid YAML syntax |
|
|
293
|
+
| `ruff` | Linting (auto-fix enabled) |
|
|
294
|
+
| `ruff-format` | Code formatting |
|
|
295
|
+
| `mypy` | Strict static type-checking |
|
|
296
|
+
|
|
297
|
+
```bash
|
|
298
|
+
make lint # run all hooks manually
|
|
299
|
+
```
|
|
300
|
+
|
|
301
|
+
## CI/CD
|
|
302
|
+
|
|
303
|
+
### PR Checks (`pr-checks.yml`)
|
|
304
|
+
|
|
305
|
+
Runs on every pull request to `dev`, `uat`, `prod`:
|
|
306
|
+
- **lint** — ruff check + format
|
|
307
|
+
- **typecheck** — mypy strict
|
|
308
|
+
- **test** — unit tests with coverage
|
|
309
|
+
- **trivy** — vulnerability scan
|
|
310
|
+
|
|
311
|
+
### Deploy (`deploy.yml`)
|
|
312
|
+
|
|
313
|
+
Runs on push to `dev`, `uat`, `prod`:
|
|
314
|
+
|
|
315
|
+
| Branch | PyPI Target | Infrastructure |
|
|
316
|
+
|--------|-------------|----------------|
|
|
317
|
+
| `dev` | TestPyPI | dev ECS cluster |
|
|
318
|
+
| `uat` | TestPyPI | uat ECS cluster |
|
|
319
|
+
| `prod` | PyPI | prod ECS cluster |
|
|
320
|
+
|
|
321
|
+
The pipeline: test → publish to PyPI/TestPyPI → build the Docker image → push to ECR → deploy to ECS → wait for stability.
|
|
322
|
+
|
|
323
|
+
### Required Secrets (per GitHub environment)
|
|
324
|
+
|
|
325
|
+
| Secret | Description |
|
|
326
|
+
|--------|-------------|
|
|
327
|
+
| `AWS_ROLE_ARN` | OIDC role for GitHub → AWS auth |
|
|
328
|
+
| `ECR_REPOSITORY_URL` | ECR repository URL |
|
|
329
|
+
| `PYPI_TOKEN` | PyPI API token (prod only) |
|
|
330
|
+
| `TEST_PYPI_TOKEN` | TestPyPI API token (dev/uat) |
|
|
331
|
+
|
|
332
|
+
## MCP Server
|
|
333
|
+
|
|
334
|
+
Exposes KB, Jobs, and Processor as MCP tools for AI agents:
|
|
335
|
+
|
|
336
|
+
```bash
|
|
337
|
+
pulse run-mcp
|
|
338
|
+
```
|
|
339
|
+
|
|
340
|
+
Products register additional tools via `mcp_tool_modules` in the manifest.
|
|
341
|
+
|
|
342
|
+
## Environment Variables
|
|
343
|
+
|
|
344
|
+
### Core
|
|
345
|
+
|
|
346
|
+
| Variable | Required | Default | Description |
|
|
347
|
+
|----------|----------|---------|-------------|
|
|
348
|
+
| `APP_ENV` | No | `development` | Environment name (`development`, `production`, etc.) |
|
|
349
|
+
| `APP_VERSION` | No | `0.1.0` | Application version |
|
|
350
|
+
| `LOG_LEVEL` | No | `INFO` | Logging level |
|
|
351
|
+
| `AWS_REGION` | No | `ap-south-1` | AWS region |
|
|
352
|
+
| `AWS_ACCESS_KEY_ID` | No | — | AWS credentials (use IAM role in production) |
|
|
353
|
+
| `AWS_SECRET_ACCESS_KEY` | No | — | AWS credentials (use IAM role in production) |
|
|
354
|
+
|
|
355
|
+
### Authentication (Cognito)
|
|
356
|
+
|
|
357
|
+
| Variable | Required | Default | Description |
|
|
358
|
+
|----------|----------|---------|-------------|
|
|
359
|
+
| `COGNITO_USER_POOL_ID` | Yes | — | Cognito User Pool ID |
|
|
360
|
+
| `COGNITO_APP_CLIENT_ID` | Yes | — | Cognito App Client ID |
|
|
361
|
+
| `COGNITO_APP_CLIENT_SECRET` | No | — | Client secret (required if app client has one) |
|
|
362
|
+
|
|
363
|
+
### OpenSearch
|
|
364
|
+
|
|
365
|
+
| Variable | Required | Default | Description |
|
|
366
|
+
|----------|----------|---------|-------------|
|
|
367
|
+
| `OPENSEARCH_URL` | Yes | — | OpenSearch endpoint |
|
|
368
|
+
| `OPENSEARCH_USERNAME` | No | — | Basic-auth username (AWS managed domains) |
|
|
369
|
+
| `OPENSEARCH_PASSWORD` | No | — | Basic-auth password |
|
|
370
|
+
| `OPENSEARCH_USE_SSL` | No | `true` | Enable TLS |
|
|
371
|
+
| `OPENSEARCH_VERIFY_CERTS` | No | `true` | Verify TLS certificates |
|
|
372
|
+
| `OPENSEARCH_INDEX_PREFIX` | No | `pulse_kb` | Index name prefix per tenant |
|
|
373
|
+
| `EMBEDDING_DIMENSION` | No | `1536` | Vector dimension for kNN indexes |
|
|
374
|
+
|
|
375
|
+
### Database & Cache
|
|
376
|
+
|
|
377
|
+
| Variable | Required | Default | Description |
|
|
378
|
+
|----------|----------|---------|-------------|
|
|
379
|
+
| `DATABASE_URL` | Yes | — | Async PostgreSQL DSN (`postgresql+asyncpg://...`) |
|
|
380
|
+
| `REDIS_URL` | No | `redis://localhost:6379/0` | Redis URL (enables Celery) |
|
|
381
|
+
| `CELERY_BROKER_URL` | No | — | Celery broker (defaults to `REDIS_URL`) |
|
|
382
|
+
| `CELERY_RESULT_BACKEND` | No | — | Celery result backend (defaults to `REDIS_URL`) |
|
|
383
|
+
|
|
384
|
+
### Athena
|
|
385
|
+
|
|
386
|
+
| Variable | Required | Default | Description |
|
|
387
|
+
|----------|----------|---------|-------------|
|
|
388
|
+
| `ATHENA_AWS_ACCESS_KEY_ID` | No | — | Athena-specific AWS credentials |
|
|
389
|
+
| `ATHENA_AWS_SECRET_ACCESS_KEY` | No | — | Athena-specific AWS credentials |
|
|
390
|
+
| `ATHENA_OUTPUT_LOCATION` | Yes* | — | S3 URI for Athena query results |
|
|
391
|
+
| `ATHENA_WORKGROUP` | No | `primary` | Athena workgroup |
|
|
392
|
+
| `ATHENA_QUERY_TIMEOUT_SECONDS` | No | `60` | Athena query timeout |
|
|
393
|
+
|
|
394
|
+
### Orchestrator (Prefect)
|
|
395
|
+
|
|
396
|
+
| Variable | Required | Default | Description |
|
|
397
|
+
|----------|----------|---------|-------------|
|
|
398
|
+
| `PULSE_ORCHESTRATOR_BACKEND` | No | `none` | `prefect` or `none` |
|
|
399
|
+
| `PREFECT_API_URL` | No | — | Prefect API endpoint |
|
|
400
|
+
| `PREFECT_API_KEY` | No | — | Prefect Cloud API key |
|
|
401
|
+
| `PREFECT_ECS_WORK_POOL_NAME` | No | `products-worker-pool` | ECS work pool name |
|
|
402
|
+
| `PREFECT_LAMBDA_WORK_POOL_NAME` | No | `lambda-worker-pool` | Lambda work pool name |
|
|
403
|
+
| `PREFECT_LAMBDA_FUNCTION_NAME_TEMPLATE` | No | `{product}-{stage}` | Lambda function name pattern |
|
|
404
|
+
| `PREFECT_K8S_WORK_POOL_NAME` | No | `k8s-worker-pool` | Kubernetes work pool name |
|
|
405
|
+
| `PREFECT_K8S_NAMESPACE` | No | `pulse-jobs` | Kubernetes namespace |
|
|
406
|
+
| `PREFECT_K8S_DEFAULT_CPU` | No | `500m` | Default CPU request |
|
|
407
|
+
| `PREFECT_K8S_DEFAULT_MEMORY` | No | `1Gi` | Default memory request |
|
|
408
|
+
|
|
409
|
+
### LLM & Embeddings
|
|
410
|
+
|
|
411
|
+
| Variable | Required | Default | Description |
|
|
412
|
+
|----------|----------|---------|-------------|
|
|
413
|
+
| `PULSE_LLM_PROVIDER` | No | `openai` | LLM provider |
|
|
414
|
+
| `PULSE_LLM_MODEL` | No | `gpt-4o-mini` | LLM model ID |
|
|
415
|
+
| `PULSE_LLM_API_KEY` | No | — | LLM API key (also used as embedding fallback) |
|
|
416
|
+
| `PULSE_LLM_TEMPERATURE` | No | `0.0` | LLM sampling temperature |
|
|
417
|
+
| `PULSE_EMBEDDING_PROVIDER` | No | `openai` | Embedding provider |
|
|
418
|
+
| `PULSE_OPENAI_EMBEDDING_MODEL` | No | `text-embedding-3-small` | OpenAI embedding model |
|
|
419
|
+
| `PULSE_OPENAI_API_KEY` | No | — | OpenAI API key (overrides `PULSE_LLM_API_KEY`) |
|
|
420
|
+
|
|
421
|
+
### Pipeline & Jobs
|
|
422
|
+
|
|
423
|
+
| Variable | Required | Default | Description |
|
|
424
|
+
|----------|----------|---------|-------------|
|
|
425
|
+
| `PULSE_ENGINE_URL` | No | — | Public URL containers use for callbacks |
|
|
426
|
+
| `PULSE_JOB_TOKEN_SECRET` | No | — | HMAC secret for job-scoped JWTs |
|
|
427
|
+
| `PULSE_S3_BUCKET` | No | — | S3 bucket for inter-stage NDJSON data |
|
|
428
|
+
| `PULSE_CHAIN_GRACE_PERIOD_SECONDS` | No | `300` | Seconds before chain recovery auto-triggers |
|
|
429
|
+
| `PULSE_MAX_CONCURRENT_JOBS_PER_TENANT` | No | `10` | Max concurrent jobs per tenant |
|
|
430
|
+
| `PULSE_DEFAULT_CHUNK_SIZE` | No | `512` | Default chunk token size |
|
|
431
|
+
| `PULSE_DEFAULT_CHUNK_STRATEGY` | No | `token_count` | Default chunking strategy |
|
|
432
|
+
| `PULSE_DEDUP_SIMILARITY_THRESHOLD` | No | `0.95` | Cosine similarity dedup threshold |
|
|
433
|
+
|
|
434
|
+
### MCP Server
|
|
435
|
+
|
|
436
|
+
| Variable | Required | Default | Description |
|
|
437
|
+
|----------|----------|---------|-------------|
|
|
438
|
+
| `MCP_TRANSPORT` | No | `sse` | Transport mode: `sse` or `stdio` |
|
|
439
|
+
| `MCP_SSE_HOST` | No | `127.0.0.1` | MCP SSE server host |
|
|
440
|
+
| `MCP_SSE_PORT` | No | `8001` | MCP SSE server port |
|
|
441
|
+
|
|
442
|
+
## API Authentication
|
|
443
|
+
|
|
444
|
+
All endpoints (except `/api/v1/health` and `/api/v1/auth/login`) require a JWT token.
|
|
445
|
+
|
|
446
|
+
> **Rate limits** — enforced per IP address:
|
|
447
|
+
> - **Login** (`POST /api/v1/auth/login`): 5 attempts per 60 seconds. Returns `429` with `Retry-After: 60` on breach.
|
|
448
|
+
> - **Global**: 100 requests per 60 seconds across all endpoints. Responses include `X-RateLimit-Limit` and `X-RateLimit-Remaining` headers.
|
|
449
|
+
>
|
|
450
|
+
> **Note** — Swagger UI (`/docs`) and ReDoc (`/redoc`) are disabled in production (`APP_ENV=production`).
|
|
451
|
+
|
|
452
|
+
### Get a Token
|
|
453
|
+
|
|
454
|
+
```bash
|
|
455
|
+
curl -X POST https://api.dev.pulse.mananalabs.ai/api/v1/auth/login \
|
|
456
|
+
-H "Content-Type: application/json" \
|
|
457
|
+
-d '{"email": "dev@pulse-engine.com", "password": "<your-password>"}'
|
|
458
|
+
```
|
|
459
|
+
|
|
460
|
+
Response:
|
|
461
|
+
|
|
462
|
+
```json
|
|
463
|
+
{
|
|
464
|
+
"id_token": "eyJ...",
|
|
465
|
+
"access_token": "eyJ...",
|
|
466
|
+
"refresh_token": "eyJ...",
|
|
467
|
+
"expires_in": 3600,
|
|
468
|
+
"token_type": "Bearer",
|
|
469
|
+
"tenant_id": "tenant-dev-001",
|
|
470
|
+
"email": "dev@pulse-engine.com"
|
|
471
|
+
}
|
|
472
|
+
```
|
|
473
|
+
|
|
474
|
+
### Use the Token
|
|
475
|
+
|
|
476
|
+
Pass the `id_token` as a Bearer token in the `Authorization` header:
|
|
477
|
+
|
|
478
|
+
```bash
|
|
479
|
+
curl -H "Authorization: Bearer <id_token>" \
|
|
480
|
+
https://api.dev.pulse.mananalabs.ai/api/v1/kb/stats
|
|
481
|
+
```
|
|
482
|
+
|
|
483
|
+
Tokens expire after **1 hour**. Call the login endpoint again to get a new one.
|
|
484
|
+
|
|
485
|
+
## API Documentation
|
|
486
|
+
|
|
487
|
+
- Swagger UI: https://api.dev.pulse.mananalabs.ai/docs
|
|
488
|
+
- ReDoc: https://api.dev.pulse.mananalabs.ai/redoc
|
|
489
|
+
|
|
490
|
+
## Library Usage
|
|
491
|
+
|
|
492
|
+
Pulse Engine can also be used as a standalone library for content processing:
|
|
493
|
+
|
|
494
|
+
```python
|
|
495
|
+
from pulse_engine.processor.core.topic_splitter import TopicSplitter
|
|
496
|
+
|
|
497
|
+
splitter = TopicSplitter(provider="openai", api_key="sk-...")
|
|
498
|
+
result = splitter.split([
|
|
499
|
+
(1, "Hi there"),
|
|
500
|
+
(2, "Let's discuss Q1 metrics"),
|
|
501
|
+
(3, "Revenue grew 20%"),
|
|
502
|
+
])
|
|
503
|
+
print(result.topic_shifts)
|
|
504
|
+
```
|
|
505
|
+
|
|
506
|
+
See **[docs/pulse_engine_library.md](docs/pulse_engine_library.md)** for full documentation on the Topic Splitter, LLM configuration, and configurable embeddings.
|
|
507
|
+
|
|
508
|
+
## OCR Module
|
|
509
|
+
|
|
510
|
+
Extract text from PDFs and images using Google Gemini's vision capabilities:
|
|
511
|
+
|
|
512
|
+
```python
|
|
513
|
+
from pulse_engine.processor.ocr.gemini import GeminiOCRProvider
|
|
514
|
+
from pulse_engine.processor.schemas import OCRInput
|
|
515
|
+
|
|
516
|
+
# Create provider
|
|
517
|
+
provider = GeminiOCRProvider()
|
|
518
|
+
|
|
519
|
+
# Option 1: Extract from file path
|
|
520
|
+
ocr_input = OCRInput(
|
|
521
|
+
file_path="/path/to/document.pdf",
|
|
522
|
+
mime_type="application/pdf",
|
|
523
|
+
prompt="Extract all text and structure as JSON",
|
|
524
|
+
temperature=0.0,
|
|
525
|
+
model="gemini-2.0-flash",
|
|
526
|
+
api_key="your-gemini-api-key",
|
|
527
|
+
max_output_tokens=4096,
|
|
528
|
+
)
|
|
529
|
+
|
|
530
|
+
# Option 2: Extract from bytes
|
|
531
|
+
ocr_input = OCRInput(
|
|
532
|
+
file_bytes=open("document.pdf", "rb").read(),
|
|
533
|
+
mime_type="application/pdf",
|
|
534
|
+
prompt="Extract invoice line items as JSON",
|
|
535
|
+
api_key="your-gemini-api-key",
|
|
536
|
+
)
|
|
537
|
+
|
|
538
|
+
# Extract
|
|
539
|
+
response = provider.extract(ocr_input)
|
|
540
|
+
print(response.text) # Raw JSON response from Gemini
|
|
541
|
+
```
|
|
542
|
+
|
|
543
|
+
**OCRInput parameters:**
|
|
544
|
+
- `file_path` or `file_bytes`: PDF/image content, given either as a path on disk or as raw bytes (provide one of the two)
|
|
545
|
+
- `mime_type`: `application/pdf`, `image/png`, `image/jpeg`, `image/webp` (default: `application/pdf`)
|
|
546
|
+
- `prompt`: Instructions for extraction (e.g., "Extract all text as JSON")
|
|
547
|
+
- `temperature`: LLM randomness, 0.0-1.0 (default: `0.1`)
|
|
548
|
+
- `model`: Gemini model ID (default: `gemini-2.5-flash`)
|
|
549
|
+
- `api_key`: Google Gemini API key
|
|
550
|
+
- `max_output_tokens`: Max response length (default: `65536`)
|
|
551
|
+
|
|
552
|
+
**Response:**
|
|
553
|
+
`provider.extract()` returns the raw Gemini response object; its `.text` property contains the extraction result.
|
|
554
|
+
|
|
555
|
+
|
|
556
|
+
|
|
557
|
+
## Further Reading
|
|
558
|
+
|
|
559
|
+
- [Building a Product](docs/building-a-product.md) — step-by-step guide to creating a new product
|
|
560
|
+
- [Design Decisions](docs/design-decisions.md) — architectural decisions and rationale
|
|
561
|
+
- [Infrastructure](docs/infrastructure.md) — AWS deployment architecture
|
|
562
|
+
- [Library Usage](docs/pulse_engine_library.md) — topic splitting, LLM config, and embeddings
|
|
563
|
+
|