pulse-engine 0.2.0.dev20260407065251__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. pulse_engine-0.2.0.dev20260407065251/PKG-INFO +563 -0
  2. pulse_engine-0.2.0.dev20260407065251/README.md +528 -0
  3. pulse_engine-0.2.0.dev20260407065251/pyproject.toml +105 -0
  4. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/__init__.py +0 -0
  5. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/api/__init__.py +0 -0
  6. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/api/v1/__init__.py +0 -0
  7. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/api/v1/auth.py +91 -0
  8. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/api/v1/health.py +62 -0
  9. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/api/v1/router.py +16 -0
  10. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/chain_recovery.py +131 -0
  11. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/cli/__init__.py +0 -0
  12. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/cli/main.py +169 -0
  13. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/cli/templates/cookiecutter.json +4 -0
  14. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/.gitignore +13 -0
  15. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/Dockerfile +32 -0
  16. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/pipeline.yaml +17 -0
  17. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/pyproject.toml +25 -0
  18. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/src/pulse_{{cookiecutter.product_slug}}/__init__.py +8 -0
  19. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/__init__.py +0 -0
  20. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/unit/__init__.py +0 -0
  21. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/unit/test_manifest.py +15 -0
  22. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/client.py +95 -0
  23. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/config.py +153 -0
  24. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/core/__init__.py +0 -0
  25. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/core/error_handlers.py +64 -0
  26. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/core/exceptions.py +67 -0
  27. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/core/job_token.py +109 -0
  28. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/core/logging.py +45 -0
  29. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/core/scope.py +23 -0
  30. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/core/security.py +130 -0
  31. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/database.py +30 -0
  32. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/dependencies.py +166 -0
  33. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/deployment/__init__.py +0 -0
  34. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/deployment/backend_deployment_repository.py +83 -0
  35. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/deployment/backends/__init__.py +0 -0
  36. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/deployment/backends/base.py +50 -0
  37. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/deployment/backends/exceptions.py +20 -0
  38. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/deployment/backends/native_lambda.py +125 -0
  39. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/deployment/backends/prefect_ecs.py +116 -0
  40. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/deployment/backends/prefect_k8s.py +131 -0
  41. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/deployment/backends/registry.py +50 -0
  42. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/deployment/infra_provisioner.py +278 -0
  43. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/deployment/job_launcher.py +178 -0
  44. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/deployment/models.py +48 -0
  45. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/deployment/repository.py +54 -0
  46. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/deployment/router.py +22 -0
  47. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/deployment/schemas.py +18 -0
  48. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/deployment/service.py +65 -0
  49. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/extractor/__init__.py +0 -0
  50. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/extractor/base.py +48 -0
  51. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/extractor/models.py +50 -0
  52. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/extractor/orchestrator/__init__.py +15 -0
  53. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/extractor/orchestrator/base.py +34 -0
  54. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/extractor/orchestrator/noop.py +37 -0
  55. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/extractor/orchestrator/prefect.py +163 -0
  56. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/extractor/repository.py +163 -0
  57. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/extractor/router.py +102 -0
  58. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/extractor/schemas.py +93 -0
  59. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/extractor/service.py +431 -0
  60. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/extractor/stage_models.py +36 -0
  61. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/extractor/stage_repository.py +109 -0
  62. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/main.py +195 -0
  63. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/mcp/__init__.py +0 -0
  64. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/mcp/__main__.py +5 -0
  65. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/mcp/server.py +103 -0
  66. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/mcp/tools_jobs.py +159 -0
  67. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/mcp/tools_kb.py +88 -0
  68. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/mcp/tools_processor.py +208 -0
  69. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/middleware/__init__.py +0 -0
  70. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/middleware/rate_limit.py +144 -0
  71. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/middleware/request_id.py +16 -0
  72. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/middleware/security_headers.py +25 -0
  73. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/middleware/tenant.py +90 -0
  74. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/pipeline/__init__.py +0 -0
  75. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/pipeline/config_parser.py +120 -0
  76. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/pipeline/models.py +67 -0
  77. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/pipeline/repositories.py +153 -0
  78. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/pipeline/router_modules.py +66 -0
  79. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/pipeline/router_pipelines.py +186 -0
  80. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/pipeline/schemas.py +139 -0
  81. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/pipeline/service.py +158 -0
  82. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/pipeline/translators/__init__.py +44 -0
  83. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/pipeline/translators/airflow_status.py +11 -0
  84. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/pipeline/translators/airflow_translator.py +23 -0
  85. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/pipeline/translators/base.py +43 -0
  86. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/pipeline/translators/prefect_status.py +93 -0
  87. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/pipeline/translators/prefect_translator.py +135 -0
  88. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/processor/__init__.py +0 -0
  89. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/processor/base.py +36 -0
  90. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/processor/core/__init__.py +0 -0
  91. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/processor/core/analysis.py +148 -0
  92. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/processor/core/chunking.py +158 -0
  93. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/processor/core/prompts.py +340 -0
  94. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/processor/core/topic_splitter.py +105 -0
  95. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/processor/defaults/__init__.py +11 -0
  96. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/processor/defaults/core_processor.py +12 -0
  97. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/processor/defaults/postprocessor.py +12 -0
  98. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/processor/defaults/preprocessor.py +12 -0
  99. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/processor/llm/__init__.py +0 -0
  100. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/processor/llm/provider.py +58 -0
  101. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/processor/ocr/gemini.py +52 -0
  102. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/processor/pipeline.py +107 -0
  103. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/processor/postprocessor/__init__.py +0 -0
  104. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/processor/postprocessor/embeddings.py +34 -0
  105. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/processor/postprocessor/tasks.py +180 -0
  106. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/processor/preprocessor/__init__.py +0 -0
  107. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/processor/preprocessor/tasks.py +71 -0
  108. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/processor/router.py +192 -0
  109. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/processor/schemas.py +167 -0
  110. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/registry.py +117 -0
  111. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/runners/__init__.py +0 -0
  112. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/runners/lambda_runner.py +26 -0
  113. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/runners/pipeline_runner.py +43 -0
  114. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/runners/prefect_pipeline_flow.py +677 -0
  115. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/runners/prefect_runner.py +33 -0
  116. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/s3.py +72 -0
  117. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/services/__init__.py +0 -0
  118. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/services/bootstrap.py +210 -0
  119. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/services/opensearch.py +84 -0
  120. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/storage/__init__.py +0 -0
  121. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/storage/connectors/__init__.py +0 -0
  122. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/storage/connectors/athena.py +226 -0
  123. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/storage/connectors/base.py +32 -0
  124. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/storage/connectors/opensearch.py +344 -0
  125. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/storage/knowledge_base.py +68 -0
  126. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/storage/router.py +78 -0
  127. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/storage/schemas.py +93 -0
  128. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/testing/__init__.py +13 -0
  129. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/testing/fixtures.py +50 -0
  130. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/testing/mocks.py +104 -0
  131. pulse_engine-0.2.0.dev20260407065251/src/pulse_engine/worker.py +53 -0
@@ -0,0 +1,563 @@
1
+ Metadata-Version: 2.1
2
+ Name: pulse-engine
3
+ Version: 0.2.0.dev20260407065251
4
+ Summary: Pulse Engine — Hybrid framework for building Pulse products
5
+ Author: Pulse Team
6
+ Requires-Python: >=3.11,<3.13
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: Programming Language :: Python :: 3.11
9
+ Classifier: Programming Language :: Python :: 3.12
10
+ Requires-Dist: alembic (>=1.14.0,<2.0.0)
11
+ Requires-Dist: asyncpg (>=0.30.0,<0.31.0)
12
+ Requires-Dist: beautifulsoup4 (>=4.12.0,<5.0.0)
13
+ Requires-Dist: boto3 (>=1.35.0,<2.0.0)
14
+ Requires-Dist: celery[redis] (>=5.4.0,<6.0.0)
15
+ Requires-Dist: cookiecutter (>=2.6.0,<3.0.0)
16
+ Requires-Dist: email-validator (>=2.3.0,<3.0.0)
17
+ Requires-Dist: fastapi (>=0.115.0,<0.116.0)
18
+ Requires-Dist: httpx (>=0.28.0,<0.29.0)
19
+ Requires-Dist: langchain (>=1.2.13,<2.0.0)
20
+ Requires-Dist: langchain-anthropic (>=1.4.0,<2.0.0)
21
+ Requires-Dist: langchain-openai (>=1.1.11,<2.0.0)
22
+ Requires-Dist: langdetect (>=1.0.9,<2.0.0)
23
+ Requires-Dist: mcp[cli] (>=1.0.0,<2.0.0)
24
+ Requires-Dist: opensearch-py[async] (>=3.1.0,<4.0.0)
25
+ Requires-Dist: pydantic-settings (>=2.7.0,<3.0.0)
26
+ Requires-Dist: python-jose[cryptography] (>=3.3.0,<4.0.0)
27
+ Requires-Dist: redis (>=5.0.0,<6.0.0)
28
+ Requires-Dist: sqlalchemy[asyncio] (>=2.0,<3.0)
29
+ Requires-Dist: structlog (>=24.4.0,<25.0.0)
30
+ Requires-Dist: tiktoken (>=0.8.0,<0.9.0)
31
+ Requires-Dist: typer (>=0.15.0,<0.16.0)
32
+ Requires-Dist: uvicorn[standard] (>=0.34.0,<0.35.0)
33
+ Description-Content-Type: text/markdown
34
+
35
+ # Pulse Engine
36
+
37
+ Hybrid Python framework for building multi-tenant data products. A product runs `pip install pulse-engine`, declares a manifest, and gets a full FastAPI app with OpenSearch, Athena, Celery, Prefect, and MCP out of the box.
38
+
39
+ ## How It Works
40
+
41
+ ```
42
+ ┌──────────────────────────────────────────────────┐
43
+ │ Product (pip install pulse-engine) │
44
+ │ manifest = ProductManifest( │
45
+ │ extractors=[...], preprocessor=..., ... │
46
+ │ ) │
47
+ └──────────────┬───────────────────────────────────┘
48
+
49
+ ┌──────────────▼───────────────────────────────────┐
50
+ │ pulse-core-engine │
51
+ │ Base ABCs · Default implementations · App │
52
+ │ factory · Storage connectors · Job lifecycle │
53
+ │ · CLI · Testing utilities │
54
+ └──────────────┬───────────────────────────────────┘
55
+
56
+ ┌──────────────▼───────────────────────────────────┐
57
+ │ Shared Infrastructure │
58
+ │ Prefect · OpenSearch · Redis · PostgreSQL │
59
+ └──────────────────────────────────────────────────┘
60
+ ```
61
+
62
+ Products customize behaviour by:
63
+ - Implementing `BaseExtractor` subclasses for data extraction
64
+ - Overriding pipeline stages (preprocessor, core, postprocessor) or using defaults
65
+ - Adding product-specific API routes, MCP tools, and Celery tasks
66
+ - Declaring everything in a `ProductManifest`
67
+
68
+ ## Prerequisites
69
+
70
+ - Python 3.11-3.12
71
+ - [Poetry](https://python-poetry.org/docs/#installation)
72
+ - Docker & Docker Compose (for shared infrastructure)
73
+
74
+ ## Quick Start
75
+
76
+ ### For Engine Development
77
+
78
+ ```bash
79
+ git clone <repo-url> && cd pulse-core-engine
80
+
81
+ # Install deps and pre-commit hooks
82
+ make install
83
+
84
+ # Copy env and configure
85
+ cp .env.example .env
86
+
87
+ # Run database migrations
88
+ make migrate
89
+
90
+ # Start the dev server
91
+ make run
92
+ ```
93
+
94
+ ### For Building a New Product
95
+
96
+ ```bash
97
+ # Install the engine
98
+ pip install pulse-engine
99
+
100
+ # Scaffold a new product
101
+ pulse init my-product
102
+ cd pulse-my-product
103
+
104
+ # Set up the product
105
+ make install
106
+ cp .env.example .env
107
+
108
+ # Validate and test
109
+ make validate
110
+ make test
111
+
112
+ # Run
113
+ make run
114
+ ```
115
+
116
+ See **[docs/building-a-product.md](docs/building-a-product.md)** for the full guide.
117
+
118
+ ## CLI
119
+
120
+ The `pulse` command is the primary interface:
121
+
122
+ | Command | Description |
123
+ |---------|-------------|
124
+ | `pulse init <name>` | Scaffold a new product from template |
125
+ | `pulse validate [module]` | Validate a product manifest |
126
+ | `pulse run` | Discover manifest and start FastAPI server |
127
+ | `pulse run-worker` | Discover manifest and start Celery worker |
128
+ | `pulse run-mcp` | Discover manifest and start MCP server |
129
+
130
+ ## Product Manifest
131
+
132
+ Products declare their components via a `ProductManifest`:
133
+
134
+ ```python
135
+ from pulse_engine.registry import ProductManifest
136
+
137
+ manifest = ProductManifest(
138
+ name="my-product",
139
+ version="0.1.0",
140
+ extractors=[MyExtractor], # data extraction classes
141
+ preprocessor=..., # ... = default, None = skip
142
+ core_processor=..., # ... = default; pass a custom instance to override
143
+ postprocessor=None, # skip postprocessing
144
+ routers=[my_router], # FastAPI routers
145
+ mcp_tool_modules=["my_pkg.mcp"], # MCP tool modules
146
+ celery_task_modules=["my_pkg.tasks"],
147
+ athena_database="my_db",
148
+ )
149
+ ```
150
+
151
+ Products register via a `pyproject.toml` entry point:
152
+
153
+ ```toml
154
+ [tool.poetry.plugins."pulse_engine.products"]
155
+ my_product = "pulse_my_product:manifest"
156
+ ```
157
+
158
+ ## Project Structure
159
+
160
+ ```
161
+ src/pulse_engine/
162
+ ├── main.py # App factory (create_app)
163
+ ├── config.py # Settings (pydantic-settings)
164
+ ├── registry.py # ProductManifest, validation, discovery
165
+ ├── worker.py # Celery app factory
166
+ ├── database.py # SQLAlchemy async setup
167
+ ├── dependencies.py # FastAPI dependency injection
168
+ ├── client.py # PulseEngineClient (container → engine HTTP)
169
+ ├── s3.py # S3Stage (NDJSON inter-stage data exchange)
170
+ ├── chain_recovery.py # Background task for stalled pipeline recovery
171
+ ├── cli/
172
+ │ ├── main.py # pulse CLI (typer)
173
+ │ └── templates/ # Cookiecutter product template
174
+ ├── api/v1/
175
+ │ ├── router.py # v1 router aggregation
176
+ │ └── health.py # Health check endpoint
177
+ ├── core/
178
+ │ ├── security.py # JWT verification (Cognito + Job-scoped)
179
+ │ ├── job_token.py # Job-scoped JWT issuance & verification
180
+ │ ├── scope.py # require_scope() FastAPI dependency
181
+ │ ├── exceptions.py # Exception hierarchy
182
+ │ ├── error_handlers.py # Global error handlers
183
+ │ └── logging.py # Structured logging (structlog)
184
+ ├── middleware/
185
+ │ ├── request_id.py # X-Request-ID middleware
186
+ │ ├── security_headers.py # Defensive HTTP security headers (CSP, HSTS, etc.)
187
+ │ ├── rate_limit.py # Sliding-window per-IP rate limiter (100 req/60 s)
188
+ │ └── tenant.py # Dual-token middleware (Cognito + Job JWT)
189
+ ├── deployment/
190
+ │ ├── models.py # DeploymentModel ORM
191
+ │ ├── repository.py # DeploymentRepository
192
+ │ ├── service.py # DeploymentService
193
+ │ ├── router.py # POST /api/v1/deployments
194
+ │ └── schemas.py # Registration request/response
195
+ ├── extractor/
196
+ │ ├── base.py # BaseExtractor ABC
197
+ │ ├── models.py # SQLAlchemy ORM: job_records
198
+ │ ├── stage_models.py # SQLAlchemy ORM: job_stages
199
+ │ ├── repository.py # JobRepository
200
+ │ ├── stage_repository.py # StageRepository
201
+ │ ├── service.py # JobService (stage-aware)
202
+ │ ├── router.py # /api/v1/jobs/ endpoints
203
+ │ ├── schemas.py # Pydantic models
204
+ │ └── orchestrator/
205
+ │ ├── base.py # BaseOrchestratorAdapter ABC
206
+ │ ├── prefect.py # PrefectAdapter (deployments + flow runs)
207
+ │ └── noop.py # NoopAdapter
208
+ ├── processor/
209
+ │ ├── base.py # BasePreprocessor, BaseCoreProcessor,
210
+ │ │ # BasePostprocessor ABCs
211
+ │ ├── pipeline.py # Pluggable ProcessingPipeline
212
+ │ ├── router.py # /api/v1/process/ endpoints
213
+ │ ├── schemas.py # ProcessingContext, options
214
+ │ ├── defaults/ # Default stage implementations
215
+ │ ├── preprocessor/ # clean_html, normalize, detect_language
216
+ │ ├── core/ # chunking, NER, sentiment, topics
217
+ │ └── postprocessor/ # embeddings, dedup, quality scoring
218
+ ├── storage/
219
+ │ ├── knowledge_base.py # KnowledgeBaseService
220
+ │ ├── router.py # /api/v1/kb/ endpoints
221
+ │ ├── schemas.py # Document, SearchQuery, etc.
222
+ │ └── connectors/
223
+ │ ├── base.py # BaseStorageConnector ABC
224
+ │ ├── opensearch.py # OpenSearch connector
225
+ │ └── athena.py # Athena connector
226
+ ├── mcp/
227
+ │ ├── server.py # FastMCP instance
228
+ │ ├── tools_kb.py # KB tools
229
+ │ ├── tools_jobs.py # Jobs tools
230
+ │ └── tools_processor.py # Processor tools
231
+ ├── services/
232
+ │ ├── bootstrap.py # ServiceContainer, bootstrap_services()
233
+ │ └── opensearch.py # OpenSearch client wrapper
234
+ └── testing/
235
+ ├── fixtures.py # Reusable pytest fixtures
236
+ └── mocks.py # MockStorageConnector, MockExtractor, etc.
237
+
238
+ infra/
239
+ ├── docker-compose.yml # Prefect, Redis, OpenSearch, PostgreSQL
240
+ └── terraform/ # AWS modules (networking, ECS, ECR, ALB)
241
+
242
+ tests/
243
+ ├── unit/ # Unit tests
244
+ │ ├── framework/ # Manifest, pipeline, base class tests
245
+ │ ├── processor/
246
+ │ ├── storage/
247
+ │ ├── deployment/ # Deployment registration tests
248
+ │ └── extractor/
249
+ └── integration/ # Integration tests
250
+ ├── api/
251
+ ├── mcp/
252
+ └── pipelines/
253
+ ```
254
+
255
+ ## Base Classes
256
+
257
+ Products extend these ABCs:
258
+
259
+ | ABC | Module | Purpose |
260
+ |-----|--------|---------|
261
+ | `BaseExtractor` | `pulse_engine.extractor.base` | Data extraction from external sources |
262
+ | `BasePreprocessor` | `pulse_engine.processor.base` | Content cleaning and normalization |
263
+ | `BaseCoreProcessor` | `pulse_engine.processor.base` | Chunking, NER, sentiment, topics |
264
+ | `BasePostprocessor` | `pulse_engine.processor.base` | Embeddings, dedup, storage formatting |
265
+ | `BaseStorageConnector` | `pulse_engine.storage.connectors.base` | Custom storage backends |
266
+ | `BaseOrchestratorAdapter` | `pulse_engine.extractor.orchestrator.base` | Custom orchestrator integrations |
267
+
268
+ ## Testing
269
+
270
+ ```bash
271
+ make test # full suite with coverage
272
+ make test-unit # unit tests only
273
+ make test-integration # integration tests only
274
+ ```
275
+
276
+ Products import engine test fixtures in their `conftest.py`:
277
+
278
+ ```python
279
+ from pulse_engine.testing.fixtures import * # noqa: F401, F403
280
+ ```
281
+
282
+ Available fixtures: `mock_storage_connector`, `mock_orchestrator`, `mock_extractor`, `kb_service`, `processing_pipeline`.
283
+
284
+ ## Code Quality
285
+
286
+ Pre-commit hooks run on every commit:
287
+
288
+ | Hook | What it checks |
289
+ |------|----------------|
290
+ | `trailing-whitespace` | No trailing whitespace |
291
+ | `end-of-file-fixer` | Files end with a newline |
292
+ | `check-yaml` | Valid YAML syntax |
293
+ | `ruff` | Linting (auto-fix enabled) |
294
+ | `ruff-format` | Code formatting |
295
+ | `mypy` | Strict static type-checking |
296
+
297
+ ```bash
298
+ make lint # run all hooks manually
299
+ ```
300
+
301
+ ## CI/CD
302
+
303
+ ### PR Checks (`pr-checks.yml`)
304
+
305
+ Runs on every pull request targeting `dev`, `uat`, or `prod`:
306
+ - **lint** — ruff check + format
307
+ - **typecheck** — mypy strict
308
+ - **test** — unit tests with coverage
309
+ - **trivy** — vulnerability scan
310
+
311
+ ### Deploy (`deploy.yml`)
312
+
313
+ Runs on every push to `dev`, `uat`, or `prod`:
314
+
315
+ | Branch | PyPI Target | Infrastructure |
316
+ |--------|-------------|----------------|
317
+ | `dev` | TestPyPI | dev ECS cluster |
318
+ | `uat` | TestPyPI | uat ECS cluster |
319
+ | `prod` | PyPI | prod ECS cluster |
320
+
321
+ The pipeline runs these steps in order: test → publish to PyPI/TestPyPI → build the Docker image → push it to ECR → deploy to ECS → wait for the service to stabilize.
322
+
323
+ ### Required Secrets (per GitHub environment)
324
+
325
+ | Secret | Description |
326
+ |--------|-------------|
327
+ | `AWS_ROLE_ARN` | OIDC role for GitHub → AWS auth |
328
+ | `ECR_REPOSITORY_URL` | ECR repository URL |
329
+ | `PYPI_TOKEN` | PyPI API token (prod only) |
330
+ | `TEST_PYPI_TOKEN` | TestPyPI API token (dev/uat) |
331
+
332
+ ## MCP Server
333
+
334
+ Exposes KB, Jobs, and Processor as MCP tools for AI agents:
335
+
336
+ ```bash
337
+ pulse run-mcp
338
+ ```
339
+
340
+ Products register additional tools via `mcp_tool_modules` in the manifest.
341
+
342
+ ## Environment Variables
343
+
344
+ ### Core
345
+
346
+ | Variable | Required | Default | Description |
347
+ |----------|----------|---------|-------------|
348
+ | `APP_ENV` | No | `development` | Environment name (`development`, `production`, etc.) |
349
+ | `APP_VERSION` | No | `0.1.0` | Application version |
350
+ | `LOG_LEVEL` | No | `INFO` | Logging level |
351
+ | `AWS_REGION` | Yes | `ap-south-1` | AWS region |
352
+ | `AWS_ACCESS_KEY_ID` | No | — | AWS credentials (use IAM role in production) |
353
+ | `AWS_SECRET_ACCESS_KEY` | No | — | AWS credentials (use IAM role in production) |
354
+
355
+ ### Authentication (Cognito)
356
+
357
+ | Variable | Required | Default | Description |
358
+ |----------|----------|---------|-------------|
359
+ | `COGNITO_USER_POOL_ID` | Yes | — | Cognito User Pool ID |
360
+ | `COGNITO_APP_CLIENT_ID` | Yes | — | Cognito App Client ID |
361
+ | `COGNITO_APP_CLIENT_SECRET` | No | — | Client secret (required if app client has one) |
362
+
363
+ ### OpenSearch
364
+
365
+ | Variable | Required | Default | Description |
366
+ |----------|----------|---------|-------------|
367
+ | `OPENSEARCH_URL` | Yes | — | OpenSearch endpoint |
368
+ | `OPENSEARCH_USERNAME` | No | — | Basic-auth username (AWS managed domains) |
369
+ | `OPENSEARCH_PASSWORD` | No | — | Basic-auth password |
370
+ | `OPENSEARCH_USE_SSL` | No | `true` | Enable TLS |
371
+ | `OPENSEARCH_VERIFY_CERTS` | No | `true` | Verify TLS certificates |
372
+ | `OPENSEARCH_INDEX_PREFIX` | No | `pulse_kb` | Index name prefix per tenant |
373
+ | `EMBEDDING_DIMENSION` | No | `1536` | Vector dimension for kNN indexes |
374
+
375
+ ### Database & Cache
376
+
377
+ | Variable | Required | Default | Description |
378
+ |----------|----------|---------|-------------|
379
+ | `DATABASE_URL` | Yes | — | Async PostgreSQL DSN (`postgresql+asyncpg://...`) |
380
+ | `REDIS_URL` | No | `redis://localhost:6379/0` | Redis URL (enables Celery) |
381
+ | `CELERY_BROKER_URL` | No | — | Celery broker (defaults to `REDIS_URL`) |
382
+ | `CELERY_RESULT_BACKEND` | No | — | Celery result backend (defaults to `REDIS_URL`) |
383
+
384
+ ### Athena
385
+
386
+ | Variable | Required | Default | Description |
387
+ |----------|----------|---------|-------------|
388
+ | `ATHENA_AWS_ACCESS_KEY_ID` | No | — | Athena-specific AWS credentials |
389
+ | `ATHENA_AWS_SECRET_ACCESS_KEY` | No | — | Athena-specific AWS credentials |
390
+ | `ATHENA_OUTPUT_LOCATION` | Yes* | — | S3 URI for Athena query results |
391
+ | `ATHENA_WORKGROUP` | No | `primary` | Athena workgroup |
392
+ | `ATHENA_QUERY_TIMEOUT_SECONDS` | No | `60` | Athena query timeout |
393
+
394
+ ### Orchestrator (Prefect)
395
+
396
+ | Variable | Required | Default | Description |
397
+ |----------|----------|---------|-------------|
398
+ | `PULSE_ORCHESTRATOR_BACKEND` | No | `none` | `prefect` or `none` |
399
+ | `PREFECT_API_URL` | No | — | Prefect API endpoint |
400
+ | `PREFECT_API_KEY` | No | — | Prefect Cloud API key |
401
+ | `PREFECT_ECS_WORK_POOL_NAME` | No | `products-worker-pool` | ECS work pool name |
402
+ | `PREFECT_LAMBDA_WORK_POOL_NAME` | No | `lambda-worker-pool` | Lambda work pool name |
403
+ | `PREFECT_LAMBDA_FUNCTION_NAME_TEMPLATE` | No | `{product}-{stage}` | Lambda function name pattern |
404
+ | `PREFECT_K8S_WORK_POOL_NAME` | No | `k8s-worker-pool` | Kubernetes work pool name |
405
+ | `PREFECT_K8S_NAMESPACE` | No | `pulse-jobs` | Kubernetes namespace |
406
+ | `PREFECT_K8S_DEFAULT_CPU` | No | `500m` | Default CPU request |
407
+ | `PREFECT_K8S_DEFAULT_MEMORY` | No | `1Gi` | Default memory request |
408
+
409
+ ### LLM & Embeddings
410
+
411
+ | Variable | Required | Default | Description |
412
+ |----------|----------|---------|-------------|
413
+ | `PULSE_LLM_PROVIDER` | No | `openai` | LLM provider |
414
+ | `PULSE_LLM_MODEL` | No | `gpt-4o-mini` | LLM model ID |
415
+ | `PULSE_LLM_API_KEY` | No | — | LLM API key (also used as embedding fallback) |
416
+ | `PULSE_LLM_TEMPERATURE` | No | `0.0` | LLM sampling temperature |
417
+ | `PULSE_EMBEDDING_PROVIDER` | No | `openai` | Embedding provider |
418
+ | `PULSE_OPENAI_EMBEDDING_MODEL` | No | `text-embedding-3-small` | OpenAI embedding model |
419
+ | `PULSE_OPENAI_API_KEY` | No | — | OpenAI API key (overrides `PULSE_LLM_API_KEY`) |
420
+
421
+ ### Pipeline & Jobs
422
+
423
+ | Variable | Required | Default | Description |
424
+ |----------|----------|---------|-------------|
425
+ | `PULSE_ENGINE_URL` | No | — | Public URL containers use for callbacks |
426
+ | `PULSE_JOB_TOKEN_SECRET` | No | — | HMAC secret for job-scoped JWTs |
427
+ | `PULSE_S3_BUCKET` | No | — | S3 bucket for inter-stage NDJSON data |
428
+ | `PULSE_CHAIN_GRACE_PERIOD_SECONDS` | No | `300` | Seconds before chain recovery auto-triggers |
429
+ | `PULSE_MAX_CONCURRENT_JOBS_PER_TENANT` | No | `10` | Max concurrent jobs per tenant |
430
+ | `PULSE_DEFAULT_CHUNK_SIZE` | No | `512` | Default chunk token size |
431
+ | `PULSE_DEFAULT_CHUNK_STRATEGY` | No | `token_count` | Default chunking strategy |
432
+ | `PULSE_DEDUP_SIMILARITY_THRESHOLD` | No | `0.95` | Cosine similarity dedup threshold |
433
+
434
+ ### MCP Server
435
+
436
+ | Variable | Required | Default | Description |
437
+ |----------|----------|---------|-------------|
438
+ | `MCP_TRANSPORT` | No | `sse` | Transport mode: `sse` or `stdio` |
439
+ | `MCP_SSE_HOST` | No | `127.0.0.1` | MCP SSE server host |
440
+ | `MCP_SSE_PORT` | No | `8001` | MCP SSE server port |
441
+
442
+ ## API Authentication
443
+
444
+ All endpoints (except `/api/v1/health` and `/api/v1/auth/login`) require a JWT token.
445
+
446
+ > **Rate limits** — enforced per IP address:
447
+ > - **Login** (`POST /api/v1/auth/login`): 5 attempts per 60 seconds. Returns `429` with `Retry-After: 60` on breach.
448
+ > - **Global**: 100 requests per 60 seconds across all endpoints. Responses include `X-RateLimit-Limit` and `X-RateLimit-Remaining` headers.
449
+ >
450
+ > **Note** — Swagger UI (`/docs`) and ReDoc (`/redoc`) are disabled in production (`APP_ENV=production`).
451
+
452
+ ### Get a Token
453
+
454
+ ```bash
455
+ curl -X POST https://api.dev.pulse.mananalabs.ai/api/v1/auth/login \
456
+ -H "Content-Type: application/json" \
457
+ -d '{"email": "<your-email>", "password": "<your-password>"}'
458
+ ```
459
+
460
+ Response:
461
+
462
+ ```json
463
+ {
464
+ "id_token": "eyJ...",
465
+ "access_token": "eyJ...",
466
+ "refresh_token": "eyJ...",
467
+ "expires_in": 3600,
468
+ "token_type": "Bearer",
469
+ "tenant_id": "tenant-dev-001",
470
+ "email": "dev@pulse-engine.com"
471
+ }
472
+ ```
473
+
474
+ ### Use the Token
475
+
476
+ Pass the `id_token` as a Bearer token in the `Authorization` header:
477
+
478
+ ```bash
479
+ curl -H "Authorization: Bearer <id_token>" \
480
+ https://api.dev.pulse.mananalabs.ai/api/v1/kb/stats
481
+ ```
482
+
483
+ Tokens expire after **1 hour**. Call the login endpoint again to get a new one.
484
+
485
+ ## API Documentation
486
+
487
+ - Swagger UI: https://api.dev.pulse.mananalabs.ai/docs
488
+ - ReDoc: https://api.dev.pulse.mananalabs.ai/redoc
489
+
490
+ ## Library Usage
491
+
492
+ Pulse Engine can also be used as a standalone library for content processing:
493
+
494
+ ```python
495
+ from pulse_engine.processor.core.topic_splitter import TopicSplitter
496
+
497
+ splitter = TopicSplitter(provider="openai", api_key="sk-...")
498
+ result = splitter.split([
499
+ (1, "Hi there"),
500
+ (2, "Let's discuss Q1 metrics"),
501
+ (3, "Revenue grew 20%"),
502
+ ])
503
+ print(result.topic_shifts)
504
+ ```
505
+
506
+ See **[docs/pulse_engine_library.md](docs/pulse_engine_library.md)** for full documentation on the Topic Splitter, LLM configuration, and configurable embeddings.
507
+
508
+ ## OCR Module
509
+
510
+ Extract text from PDFs and images using Google Gemini's vision capabilities:
511
+
512
+ ```python
513
+ from pulse_engine.processor.ocr.gemini import GeminiOCRProvider
514
+ from pulse_engine.processor.schemas import OCRInput
515
+
516
+ # Create provider
517
+ provider = GeminiOCRProvider()
518
+
519
+ # Option 1: Extract from file path
520
+ ocr_input = OCRInput(
521
+ file_path="/path/to/document.pdf",
522
+ mime_type="application/pdf",
523
+ prompt="Extract all text and structure as JSON",
524
+ temperature=0.0,
525
+ model="gemini-2.0-flash",
526
+ api_key="your-gemini-api-key",
527
+ max_output_tokens=4096,
528
+ )
529
+
530
+ # Option 2: Extract from bytes
531
+ ocr_input = OCRInput(
532
+ file_bytes=open("document.pdf", "rb").read(),
533
+ mime_type="application/pdf",
534
+ prompt="Extract invoice line items as JSON",
535
+ api_key="your-gemini-api-key",
536
+ )
537
+
538
+ # Extract
539
+ response = provider.extract(ocr_input)
540
+ print(response.text) # Raw JSON response from Gemini
541
+ ```
542
+
543
+ **OCRInput parameters:**
544
+ - `file_path` or `file_bytes`: PDF/image content (provide one)
545
+ - `mime_type`: `application/pdf`, `image/png`, `image/jpeg`, `image/webp` (default: `application/pdf`)
546
+ - `prompt`: Instructions for extraction (e.g., "Extract all text as JSON")
547
+ - `temperature`: LLM randomness, 0.0-1.0 (default: `0.1`)
548
+ - `model`: Gemini model ID (default: `gemini-2.5-flash`)
549
+ - `api_key`: Google Gemini API key
550
+ - `max_output_tokens`: Max response length (default: `65536`)
551
+
552
+ **Response:**
553
+ Returns the raw Gemini response object; its `.text` property contains the extraction result.
554
+
555
+
556
+
557
+ ## Further Reading
558
+
559
+ - [Building a Product](docs/building-a-product.md) — step-by-step guide to creating a new product
560
+ - [Design Decisions](docs/design-decisions.md) — architectural decisions and rationale
561
+ - [Infrastructure](docs/infrastructure.md) — AWS deployment architecture
562
+ - [Library Usage](docs/pulse_engine_library.md) — topic splitting, LLM config, and embeddings
563
+