@pgflow/core 0.0.5-prealpha.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120) hide show
  1. package/LICENSE.md +660 -0
  2. package/README.md +373 -0
  3. package/__tests__/mocks/index.ts +1 -0
  4. package/__tests__/mocks/postgres.ts +37 -0
  5. package/__tests__/types/PgflowSqlClient.test-d.ts +59 -0
  6. package/dist/LICENSE.md +660 -0
  7. package/dist/README.md +373 -0
  8. package/dist/index.js +54 -0
  9. package/docs/options_for_flow_and_steps.md +75 -0
  10. package/docs/pgflow-blob-reference-system.md +179 -0
  11. package/eslint.config.cjs +22 -0
  12. package/example-flow.mermaid +5 -0
  13. package/example-flow.svg +1 -0
  14. package/flow-lifecycle.mermaid +83 -0
  15. package/flow-lifecycle.svg +1 -0
  16. package/out-tsc/vitest/__tests__/mocks/index.d.ts +2 -0
  17. package/out-tsc/vitest/__tests__/mocks/index.d.ts.map +1 -0
  18. package/out-tsc/vitest/__tests__/mocks/postgres.d.ts +15 -0
  19. package/out-tsc/vitest/__tests__/mocks/postgres.d.ts.map +1 -0
  20. package/out-tsc/vitest/__tests__/types/PgflowSqlClient.test-d.d.ts +2 -0
  21. package/out-tsc/vitest/__tests__/types/PgflowSqlClient.test-d.d.ts.map +1 -0
  22. package/out-tsc/vitest/tsconfig.spec.tsbuildinfo +1 -0
  23. package/out-tsc/vitest/vite.config.d.ts +3 -0
  24. package/out-tsc/vitest/vite.config.d.ts.map +1 -0
  25. package/package.json +28 -0
  26. package/pkgs/core/dist/index.js +54 -0
  27. package/pkgs/core/dist/pkgs/core/LICENSE.md +660 -0
  28. package/pkgs/core/dist/pkgs/core/README.md +373 -0
  29. package/pkgs/dsl/dist/index.js +123 -0
  30. package/pkgs/dsl/dist/pkgs/dsl/README.md +11 -0
  31. package/project.json +125 -0
  32. package/prompts/architect.md +87 -0
  33. package/prompts/condition.md +33 -0
  34. package/prompts/declarative_sql.md +15 -0
  35. package/prompts/deps_in_payloads.md +20 -0
  36. package/prompts/dsl-multi-arg.ts +48 -0
  37. package/prompts/dsl-options.md +39 -0
  38. package/prompts/dsl-single-arg.ts +51 -0
  39. package/prompts/dsl-two-arg.ts +61 -0
  40. package/prompts/dsl.md +119 -0
  41. package/prompts/fanout_steps.md +1 -0
  42. package/prompts/json_schemas.md +36 -0
  43. package/prompts/one_shot.md +286 -0
  44. package/prompts/pgtap.md +229 -0
  45. package/prompts/sdk.md +59 -0
  46. package/prompts/step_types.md +62 -0
  47. package/prompts/versioning.md +16 -0
  48. package/queries/fail_permanently.sql +17 -0
  49. package/queries/fail_task.sql +21 -0
  50. package/queries/sequential.sql +47 -0
  51. package/queries/two_roots_left_right.sql +59 -0
  52. package/schema.svg +1 -0
  53. package/scripts/colorize-pgtap-output.awk +72 -0
  54. package/scripts/run-test-with-colors +5 -0
  55. package/scripts/watch-test +7 -0
  56. package/src/PgflowSqlClient.ts +85 -0
  57. package/src/database-types.ts +759 -0
  58. package/src/index.ts +3 -0
  59. package/src/types.ts +103 -0
  60. package/supabase/config.toml +32 -0
  61. package/supabase/migrations/000000_schema.sql +150 -0
  62. package/supabase/migrations/000005_create_flow.sql +29 -0
  63. package/supabase/migrations/000010_add_step.sql +48 -0
  64. package/supabase/migrations/000015_start_ready_steps.sql +45 -0
  65. package/supabase/migrations/000020_start_flow.sql +46 -0
  66. package/supabase/migrations/000030_read_with_poll_backport.sql +70 -0
  67. package/supabase/migrations/000040_poll_for_tasks.sql +100 -0
  68. package/supabase/migrations/000045_maybe_complete_run.sql +30 -0
  69. package/supabase/migrations/000050_complete_task.sql +98 -0
  70. package/supabase/migrations/000055_calculate_retry_delay.sql +11 -0
  71. package/supabase/migrations/000060_fail_task.sql +124 -0
  72. package/supabase/migrations/000_edge_worker_initial.sql +86 -0
  73. package/supabase/seed.sql +202 -0
  74. package/supabase/tests/add_step/basic_step_addition.test.sql +29 -0
  75. package/supabase/tests/add_step/circular_dependency.test.sql +21 -0
  76. package/supabase/tests/add_step/flow_isolation.test.sql +26 -0
  77. package/supabase/tests/add_step/idempotent_step_addition.test.sql +20 -0
  78. package/supabase/tests/add_step/invalid_step_slug.test.sql +16 -0
  79. package/supabase/tests/add_step/nonexistent_dependency.test.sql +16 -0
  80. package/supabase/tests/add_step/nonexistent_flow.test.sql +13 -0
  81. package/supabase/tests/add_step/options.test.sql +66 -0
  82. package/supabase/tests/add_step/step_with_dependency.test.sql +36 -0
  83. package/supabase/tests/add_step/step_with_multiple_dependencies.test.sql +46 -0
  84. package/supabase/tests/complete_task/archives_message.test.sql +67 -0
  85. package/supabase/tests/complete_task/completes_run_if_no_more_remaining_steps.test.sql +62 -0
  86. package/supabase/tests/complete_task/completes_task_and_updates_dependents.test.sql +64 -0
  87. package/supabase/tests/complete_task/decrements_remaining_steps_if_completing_step.test.sql +62 -0
  88. package/supabase/tests/complete_task/saves_output_when_completing_run.test.sql +57 -0
  89. package/supabase/tests/create_flow/flow_creation.test.sql +27 -0
  90. package/supabase/tests/create_flow/idempotency_and_duplicates.test.sql +26 -0
  91. package/supabase/tests/create_flow/invalid_slug.test.sql +13 -0
  92. package/supabase/tests/create_flow/options.test.sql +57 -0
  93. package/supabase/tests/fail_task/exponential_backoff.test.sql +70 -0
  94. package/supabase/tests/fail_task/mark_as_failed_if_no_retries_available.test.sql +49 -0
  95. package/supabase/tests/fail_task/respects_flow_retry_settings.test.sql +48 -0
  96. package/supabase/tests/fail_task/respects_step_retry_settings.test.sql +48 -0
  97. package/supabase/tests/fail_task/retry_task_if_retries_available.test.sql +39 -0
  98. package/supabase/tests/is_valid_slug.test.sql +72 -0
  99. package/supabase/tests/poll_for_tasks/builds_proper_input_from_deps_outputs.test.sql +35 -0
  100. package/supabase/tests/poll_for_tasks/hides_messages.test.sql +35 -0
  101. package/supabase/tests/poll_for_tasks/increments_attempts_count.test.sql +35 -0
  102. package/supabase/tests/poll_for_tasks/multiple_task_processing.test.sql +24 -0
  103. package/supabase/tests/poll_for_tasks/polls_only_queued_tasks.test.sql +35 -0
  104. package/supabase/tests/poll_for_tasks/reads_messages.test.sql +38 -0
  105. package/supabase/tests/poll_for_tasks/returns_no_tasks_if_no_step_task_for_message.test.sql +34 -0
  106. package/supabase/tests/poll_for_tasks/returns_no_tasks_if_queue_is_empty.test.sql +19 -0
  107. package/supabase/tests/poll_for_tasks/returns_no_tasks_when_qty_set_to_0.test.sql +22 -0
  108. package/supabase/tests/poll_for_tasks/sets_vt_delay_based_on_opt_timeout.test.sql +41 -0
  109. package/supabase/tests/poll_for_tasks/tasks_reapppear_if_not_processed_in_time.test.sql +59 -0
  110. package/supabase/tests/start_flow/creates_run.test.sql +24 -0
  111. package/supabase/tests/start_flow/creates_step_states_for_all_steps.test.sql +25 -0
  112. package/supabase/tests/start_flow/creates_step_tasks_only_for_root_steps.test.sql +54 -0
  113. package/supabase/tests/start_flow/returns_run.test.sql +24 -0
  114. package/supabase/tests/start_flow/sends_messages_on_the_queue.test.sql +50 -0
  115. package/supabase/tests/start_flow/starts_only_root_steps.test.sql +21 -0
  116. package/supabase/tests/step_dsl_is_idempotent.test.sql +34 -0
  117. package/tsconfig.json +16 -0
  118. package/tsconfig.lib.json +26 -0
  119. package/tsconfig.spec.json +35 -0
  120. package/vite.config.ts +57 -0
package/dist/README.md ADDED
@@ -0,0 +1,373 @@
1
+ # pgflow SQL Core
2
+
3
+ PostgreSQL-native workflow engine for defining, managing, and tracking DAG-based workflows directly in your database.
4
+
5
+ > [!NOTE]
6
+ > This project is licensed under the [AGPL v3](./LICENSE.md) license and is part of the **pgflow** stack.
7
+ > See [LICENSING_OVERVIEW.md](../../LICENSING_OVERVIEW.md) in the root of this monorepo for more details.
8
+
9
+ ## Table of Contents
10
+
11
+ - [Overview](#overview)
12
+ - [Key Features](#key-features)
13
+ - [Architecture](#architecture)
14
+ - [Schema Design](#schema-design)
15
+ - [Execution Model](#execution-model)
16
+ - [Example Flow and its life](#example-flow-and-its-life)
17
+ - [Defining a Workflow](#defining-a-workflow)
18
+ - [Starting a Workflow Run](#starting-a-workflow-run)
19
+ - [Workflow Execution](#workflow-execution)
20
+ - [Task Polling](#task-polling)
21
+ - [Task Completion](#task-completion)
22
+ - [Error Handling](#error-handling)
23
+ - [Retries and Timeouts](#retries-and-timeouts)
24
+ - [TypeScript Flow DSL](#typescript-flow-dsl)
25
+ - [Overview](#overview-1)
26
+ - [Type Inference System](#type-inference-system)
27
+ - [Basic Example](#basic-example)
28
+ - [How Payload Types Are Built](#how-payload-types-are-built)
29
+ - [Benefits of Automatic Type Inference](#benefits-of-automatic-type-inference)
30
+ - [Data Flow](#data-flow)
31
+ - [Input and Output Handling](#input-and-output-handling)
32
+ - [Run Completion](#run-completion)
33
+
34
+ ## Overview
35
+
36
+ The pgflow SQL Core provides the data model, state machine, and transactional functions for workflow management. It treats workflows as Directed Acyclic Graphs (DAGs) of steps, each step being a simple state machine.
37
+
38
+ This package focuses on:
39
+
40
+ - Defining and storing workflow shapes
41
+ - Managing workflow state transitions
42
+ - Exposing transactional functions for workflow operations
43
+ - Providing APIs for task polling and status updates
44
+
45
+ The actual execution of workflow tasks is handled by the [Edge Worker](../edge-worker/README.md), which calls back to the SQL Core to acknowledge task completion or failure.
46
+
47
+ ## Key Features
48
+
49
+ - **Declarative Workflows**: Define flows and steps via SQL tables
50
+ - **Dependency Management**: Explicit step dependencies with atomic transitions
51
+ - **Configurable Behavior**: Per-flow and per-step options for timeouts, retries, and delays
52
+ - **Queue Integration**: Built on pgmq for reliable task processing
53
+ - **Transactional Guarantees**: All state transitions are ACID-compliant
54
+
55
+ ## Architecture
56
+
57
+ ### Schema Design
58
+
59
+ [Schema ERD Diagram (click to enlarge)](./schema.svg)
60
+
61
+ <a href="./schema.svg">
62
+ <img src="./schema.svg" alt="Schema ERD Diagram" width="25%" height="25%">
63
+ </a>
64
+
65
+ ---
66
+
67
+ The schema consists of two main categories of tables:
68
+
69
+ #### Static definition tables
70
+
71
+ - `flows` (just an identity for the workflow with some global options)
72
+ - `steps` (DAG nodes belonging to particular `flows`, with option overrides)
73
+ - `deps` (DAG edges between `steps`)
74
+
75
+ #### Runtime state tables
76
+
77
+ - `runs` (execution instances of `flows`)
78
+ - `step_states` (states of individual `steps` within a `run`)
79
+ - `step_tasks` (units of work for individual `steps` within a `run`, so we can have fanouts)
80
+
81
+ ### Execution Model
82
+
83
+ The SQL Core handles the workflow lifecycle through these key operations:
84
+
85
+ 1. **Definition**: Workflows are defined using `create_flow` and `add_step`
86
+ 2. **Instantiation**: Workflow instances are started with `start_flow`, creating a new run
87
+ 3. **Task Management**: The [Edge Worker](../edge-worker/README.md) polls for available tasks using `poll_for_tasks`
88
+ 4. **State Transitions**: When the Edge Worker reports back using `complete_task` or `fail_task`, the SQL Core handles state transitions and schedules dependent steps
89
+
90
+ [Flow lifecycle diagram (click to enlarge)](./flow-lifecycle.svg)
91
+
92
+ <a href="./flow-lifecycle.svg"><img src="./flow-lifecycle.svg" alt="Flow Lifecycle" width="25%" height="25%"></a>
93
+
94
+ ## Example flow and its life
95
+
96
+ Let's walk through creating and running a workflow that fetches a website,
97
+ does summarization and sentiment analysis in parallel steps
98
+ and saves the results to a database.
99
+
100
+ ![example flow graph](./example-flow.svg)
101
+
102
+ ### Defining a Workflow
103
+
104
+ Workflows are defined using two SQL functions: `create_flow` and `add_step`.
105
+
106
+ In this example, we'll create a workflow with:
107
+ - `website` as the entry point ("root step")
108
+ - `sentiment` and `summary` as parallel steps that depend on `website`
109
+ - `saveToDb` as the final step, depending on both parallel steps
110
+
111
+ ```sql
112
+ -- Define workflow with parallel steps
113
+ SELECT pgflow.create_flow('analyze_website');
114
+ SELECT pgflow.add_step('analyze_website', 'website');
115
+ SELECT pgflow.add_step('analyze_website', 'sentiment', deps_slugs => ARRAY['website']);
116
+ SELECT pgflow.add_step('analyze_website', 'summary', deps_slugs => ARRAY['website']);
117
+ SELECT pgflow.add_step('analyze_website', 'saveToDb', deps_slugs => ARRAY['sentiment', 'summary']);
118
+ ```
119
+
120
+ > [!WARNING]
121
+ > You need to call `add_step` in topological order, which is enforced by foreign key constraints.
122
+
123
+ > [!NOTE]
124
+ > You can have multiple "root steps" in a workflow. You can even create a root-steps-only workflow
125
+ > to process a single input in parallel, because at the end, all of the outputs from steps
126
+ > that do not have dependents ("final steps") are aggregated and saved as the run's `output`.
127
+
128
+ ### Starting a Workflow Run
129
+
130
+ To start a workflow, call `start_flow` with a flow slug and input arguments:
131
+
132
+ ```sql
133
+ SELECT * FROM pgflow.start_flow(
134
+ flow_slug => 'analyze_website',
135
+ input => '{"url": "https://example.com"}'::jsonb
136
+ );
137
+
138
+ -- run_id | flow_slug | status | input | output | remaining_steps
139
+ -- ------------+-----------------+---------+--------------------------------+--------+-----------------
140
+ -- <run uuid> | analyze_website | started | {"url": "https://example.com"} | [NULL] | 4
141
+ ```
142
+
143
+ When a workflow starts:
144
+ - A new `run` record is created
145
+ - Initial states for all steps are created
146
+ - Root steps are marked as `started`
147
+ - Tasks are created for root steps
148
+ - Messages are enqueued on PGMQ for worker processing
149
+
150
+ > [!NOTE]
151
+ > The `input` argument must be a valid JSONB value: string, number, boolean, array, object or null.
152
+
153
+ ### Workflow Execution
154
+
155
+ #### Task Polling
156
+
157
+ The Edge Worker continuously polls for available tasks using the `poll_for_tasks` function:
158
+
159
+ ```sql
160
+ SELECT * FROM pgflow.poll_for_tasks(
161
+ queue_name => 'analyze_website',
162
+ vt => 60, -- visibility timeout in seconds
163
+ qty => 5 -- maximum number of tasks to fetch
164
+ );
165
+ ```
166
+
167
+ When a task is polled:
168
+
169
+ 1. The message is hidden from other workers for the specified timeout period
170
+ 2. The task's attempts counter is incremented for retry tracking
171
+ 3. An input object is built by combining the run input with outputs from completed dependency steps
172
+ 4. Task metadata and input are returned to the worker
173
+
174
+ This process happens in a single transaction to ensure reliability. The worker then executes the appropriate handler function based on the task metadata.
175
+
176
+ #### Task Completion
177
+
178
+ After successful processing, the worker acknowledges completion:
179
+
180
+ ```sql
181
+ SELECT pgflow.complete_task(
182
+ run_id => '<run_uuid>',
183
+ step_slug => 'website',
184
+ task_index => 0, -- we will have multiple tasks for a step in the future
185
+ output => '{"content": "HTML content", "status": 200}'::jsonb
186
+ );
187
+ ```
188
+
189
+ When a task completes:
190
+ 1. The task status is updated to 'completed' and the output is saved
191
+ 2. The message is archived in PGMQ
192
+ 3. The step state is updated to 'completed'
193
+ 4. Dependent steps with all dependencies completed are automatically started
194
+ 5. The run's remaining_steps counter is decremented
195
+ 6. If all steps are completed, the run is marked as completed with aggregated outputs
196
+
197
+ #### Error Handling
198
+
199
+ If a task fails, the worker acknowledges this using `fail_task`:
200
+
201
+ ```sql
202
+ SELECT pgflow.fail_task(
203
+ run_id => '<run_uuid>',
204
+ step_slug => 'website',
205
+ task_index => 0,
206
+ error_message => 'Connection timeout when fetching URL'::text
207
+ );
208
+ ```
209
+
210
+ The system handles failures by:
211
+
212
+ 1. Checking if retry attempts are available
213
+ 2. For available retries:
214
+ - Keeping the task in 'queued' status
215
+ - Applying exponential backoff for visibility
216
+ - Preventing processing until the visibility timeout expires
217
+ 3. When retries are exhausted:
218
+ - Marking the task as 'failed'
219
+ - Marking the step as 'failed'
220
+ - Marking the run as 'failed'
221
+ - Archiving the message in PGMQ
222
+ - Notifying workers to abort pending tasks (future feature)
223
+
224
+ #### Retries and Timeouts
225
+
226
+ Retry behavior can be configured at both the flow and step level:
227
+
228
+ ```sql
229
+ -- Flow-level defaults
230
+ SELECT pgflow.create_flow(
231
+ flow_slug => 'analyze_website',
232
+ max_attempts => 3, -- Maximum retry attempts (including first attempt)
233
+ base_delay => 5, -- Base delay in seconds for exponential backoff
234
+ timeout => 60 -- Task timeout in seconds
235
+ );
236
+
237
+ -- Step-level overrides
238
+ SELECT pgflow.add_step(
239
+ flow_slug => 'analyze_website',
240
+ step_slug => 'sentiment',
241
+ deps_slugs => ARRAY['website']::text[],
242
+ max_attempts => 5, -- Override max attempts for this step
243
+ base_delay => 2, -- Override base delay for exponential backoff
244
+ timeout => 30 -- Override timeout for this step
245
+ );
246
+ ```
247
+
248
+ The system applies exponential backoff for retries using the formula:
249
+ ```
250
+ delay = base_delay * (2 ^ attempts_count)
251
+ ```
252
+
253
+ Timeouts are enforced by setting the message visibility timeout to the step's timeout value plus a small buffer. If a worker doesn't acknowledge completion or failure within this period, the task becomes visible again and can be retried.
254
+
255
+ ## TypeScript Flow DSL
256
+
257
+ > [!NOTE]
258
+ > TypeScript Flow DSL is a Work In Progress and is not ready yet!
259
+
260
+ ### Overview
261
+
262
+ While the SQL Core engine handles workflow definitions and state management, the primary way to define and work with your workflow logic is via the Flow DSL in TypeScript. This DSL offers a fluent API that makes it straightforward to outline the steps in your flow with full type safety.
263
+
264
+ ### Type Inference System
265
+
266
+ The most powerful feature of the Flow DSL is its **automatic type inference system**:
267
+
268
+ 1. You only need to annotate the initial Flow input type
269
+ 2. The return type of each step is automatically inferred from your handler function
270
+ 3. These return types become available in the payload of dependent steps
271
+ 4. The TypeScript compiler builds a complete type graph matching your workflow DAG
272
+
273
+ This means you get full IDE autocompletion and type checking throughout your workflow without manual type annotations.
274
+
275
+ ### Basic Example
276
+
277
+ Here's an example that matches our website analysis workflow:
278
+
279
+ ```ts
280
+ // Provide a type for the input of the Flow
281
+ type Input = {
282
+ url: string;
283
+ };
284
+
285
+ const AnalyzeWebsite = new Flow<Input>({
286
+ slug: "analyze_website",
287
+ maxAttempts: 3,
288
+ baseDelay: 5,
289
+ timeout: 10,
290
+ })
291
+ .step({ slug: "website" }, async (input) => await scrapeWebsite(input.run.url))
292
+ .step(
293
+ { slug: "sentiment", dependsOn: ["website"], timeout: 30, maxAttempts: 5 },
294
+ async (input) => await analyzeSentiment(input.website.content)
295
+ )
296
+ .step(
297
+ { slug: "summary", dependsOn: ["website"] },
298
+ async (input) => await summarizeWithAI(input.website.content)
299
+ )
300
+ .step(
301
+ { slug: "saveToDb", dependsOn: ["sentiment", "summary"] },
302
+ async (input) =>
303
+ await saveToDb({
304
+ websiteUrl: input.run.url,
305
+ sentiment: input.sentiment.score,
306
+ summary: input.summary,
307
+ }).status
308
+ );
309
+ ```
310
+
311
+ ### How Payload Types Are Built
312
+
313
+ The payload object for each step is constructed dynamically based on:
314
+
315
+ 1. **The `run` property**: Always contains the original workflow input
316
+ 2. **Dependency outputs**: Each dependency's output is available under a key matching the dependency's slug
317
+ 3. **DAG structure**: Only outputs from direct dependencies are included in the payload
318
+
319
+ This means your step handlers receive exactly the data they need, properly typed, without any manual type declarations beyond the initial Flow input type.
320
+
321
+ ### Benefits of Automatic Type Inference
322
+
323
+ - **Refactoring safety**: Change a step's output, and TypeScript will flag all dependent steps that need updates
324
+ - **Discoverability**: IDE autocompletion shows exactly what data is available in each step
325
+ - **Error prevention**: Catch typos and type mismatches at compile time, not runtime
326
+ - **Documentation**: The types themselves serve as living documentation of your workflow's data flow
327
+
328
+ ## Data Flow
329
+
330
+ ### Input and Output Handling
331
+
332
+ Handlers in pgflow **must return** JSON-serializable values that are captured and saved when `complete_task` is called. These outputs become available as inputs to dependent steps, allowing data to flow through your workflow pipeline.
333
+
334
+ When a step is executed, it receives an input object where:
335
+ - Each key is a step_slug of a completed dependency
336
+ - Each value is that step's output
337
+ - A special "run" key contains the original workflow input
338
+
339
+ #### Example: `sentiment`
340
+
341
+ When the `sentiment` step runs, it receives:
342
+
343
+ ```json
344
+ {
345
+ "run": {"url": "https://example.com"},
346
+ "website": {"content": "HTML content", "status": 200}
347
+ }
348
+ ```
349
+
350
+ #### Example: `saveToDb`
351
+
352
+ The `saveToDb` step depends on both `sentiment` and `summary`:
353
+
354
+ ```json
355
+ {
356
+ "run": {"url": "https://example.com"},
357
+ "sentiment": {"score": 0.85, "label": "positive"},
358
+ "summary": "This website discusses various topics related to technology and innovation."
359
+ }
360
+ ```
361
+
362
+ ### Run Completion
363
+
364
+ When all steps in a run are completed, the run status is automatically updated to 'completed' and its output is set. The output is an aggregation of all the outputs from final steps (steps that have no dependents):
365
+
366
+ ```sql
367
+ -- Example of a completed run with output
368
+ SELECT run_id, status, output FROM pgflow.runs WHERE run_id = '<run_uuid>';
369
+
370
+ -- run_id | status | output
371
+ -- ------------+-----------+-----------------------------------------------------
372
+ -- <run uuid> | completed | {"saveToDb": {"status": "success"}}
373
+ ```
package/dist/index.js ADDED
@@ -0,0 +1,54 @@
1
+ // pkgs/core/src/PgflowSqlClient.ts
2
+ var PgflowSqlClient = class { // Thin client that invokes the pgflow SQL functions through the injected `sql` tagged-template function.
3
+ constructor(sql) { // sql: tagged-template query function; it must also expose .json(), which the methods below call
4
+ this.sql = sql;
5
+ }
6
+ async pollForTasks(queueName, batchSize = 20, visibilityTimeout = 2, maxPollSeconds = 5, pollIntervalMs = 200) { // Calls pgflow.poll_for_tasks and returns the query result; batchSize is passed as qty and visibilityTimeout as vt.
7
+ return await this.sql`
8
+ SELECT *
9
+ FROM pgflow.poll_for_tasks(
10
+ queue_name => ${queueName},
11
+ vt => ${visibilityTimeout},
12
+ qty => ${batchSize},
13
+ max_poll_seconds => ${maxPollSeconds},
14
+ poll_interval_ms => ${pollIntervalMs}
15
+ );
16
+ `;
17
+ }
18
+ async completeTask(stepTask, output) { // Calls pgflow.complete_task with task_index hard-coded to 0. NOTE(review): `output || null` below also turns 0, "" and false into null - confirm this is intended.
19
+ await this.sql`
20
+ SELECT pgflow.complete_task(
21
+ run_id => ${stepTask.run_id}::uuid,
22
+ step_slug => ${stepTask.step_slug}::text,
23
+ task_index => ${0}::int,
24
+ output => ${this.sql.json(output || null)}::jsonb
25
+ );
26
+ `;
27
+ }
28
+ async failTask(stepTask, error) { // Calls pgflow.fail_task with task_index hard-coded to 0 and the error reduced to a string.
29
+ const errorString = typeof error === "string" ? error : error instanceof Error ? error.message : JSON.stringify(error); // strings pass through, Error instances contribute .message, anything else is JSON-stringified
30
+ await this.sql`
31
+ SELECT pgflow.fail_task(
32
+ run_id => ${stepTask.run_id}::uuid,
33
+ step_slug => ${stepTask.step_slug}::text,
34
+ task_index => ${0}::int,
35
+ error_message => ${errorString}::text
36
+ );
37
+ `;
38
+ }
39
+ async startFlow(flow, input) { // Calls pgflow.start_flow with flow.slug and the JSON-encoded input; returns the first result row.
40
+ const results = await this.sql`
41
+ SELECT * FROM pgflow.start_flow(${flow.slug}::text, ${this.sql.json(
42
+ input
43
+ )}::jsonb);
44
+ `;
45
+ if (results.length === 0) { // no row returned: treated as a failure to start the flow
46
+ throw new Error(`Failed to start flow ${flow.slug}`);
47
+ }
48
+ const [flowRun] = results;
49
+ return flowRun;
50
+ }
51
+ };
52
+ export {
53
+ PgflowSqlClient
54
+ };
@@ -0,0 +1,75 @@
1
+ :::
2
+
3
+ 1. MVP will allow only 1:1 queue:flow mappings, because it simplifies a lot.
4
+ Queue is created when creating a flow, in pgflow.create_flow() function.
5
+
6
+ 2. No, we will start with either static delay or simple exponential and
7
+ I do not plan to expand on it further.
8
+
9
+ The backoff will be calculated from the retry attempts, which can be deduced
10
+ from pgmq's "read_ct" counter.
11
+
12
+ No need for jitter for now.
13
+
14
+ 3. **Execution Timeouts**
15
+ I'm not sure about flow timeouts, I think for MVP we should probably
16
+ skip them and I'm not even sure if I want to have them for the steps too.
17
+
18
+ I would be definitely adding them in future.
19
+
20
+ 4. We want everything to be statically typed in TypeScript, because the main
21
+ way to define flows would be to use TS DSL.
22
+
23
+ So a condition must be a JSON-serializable object that will get saved
24
+ in a JSONB column.
25
+
26
+ My initial idea for conditions was to just provide a JSON object that we
27
+ will use to perform containment check on the step inputs using @> operator.
28
+
29
+ I am considering expanding it to more robust conditions, but they would need
30
+ to be defined in JSON-serializable way.
31
+
32
+ I do not want to have any SQL snippets in conditions, because I cannot
33
+ statically type them and they can fail at runtime.
34
+
35
+ No, conditions should probably only be able to reference step inputs.
36
+
37
+ 5. Yes it should be able to disable retries per step or override their params.
38
+ How should those circuit breakers work, and what would be the benefit of them?
39
+ What is a retry budget? Do we really have a distributed system? Everything
40
+ lives in a single postgres instance.
41
+
42
+ 6. No we do not want to support dependency resolution at runtime,
43
+ but conditions could be used to implement something similar.
44
+ Steps should not be able to add new steps during runtime,
45
+ but I plan to have fanout steps that will spawn either multiple tasks
46
+ or multiple subflows, one per the input array item.
47
+ Those are meant to be aggregated back to the output array when completed.
48
+
49
+ It's not for MVP tho!!
50
+
51
+ ### Cross-Cutting Concerns
52
+ 7. We will advise users to not put anything sensitive into flow options,
53
+ by writing docs and also by not giving handlers any ability to access the
54
+ step options.
55
+
56
+ We will provide a Context object that users can define in the Flow DSL,
57
+ that will be passed to the step handlers at runtime and will encourage
58
+ users to use this Context object to store sensitive data.
59
+
60
+ 8. What do you mean by tracked metadata for options and options affecting metrics?
61
+
62
+ 9. Versioning is a big problem for my project - I have a few ideas how to solve
63
+ it (basically topologically sort graph and hash it to create a version hash).
64
+ But for MVP we definitely don't need versioning - users must take care of
65
+ this on their own by just creating new flows if they change the shape of
66
+ the flow. There is no way and will be no way to UPDATE flows.
67
+
68
+ Graph shape should be immutable after creation, but i'm not sure about
69
+ the options - maybe it would be a good idea to allow updating retry configs,
70
+ because only running the flows in production can allow users to gather
71
+ enough data to make educated decisions about those params.
72
+
73
+ 10. No modification for flows/steps at all, so easy.
74
+
75
+ :::
@@ -0,0 +1,179 @@
1
+ # PgFlow Blob Reference System
2
+
3
+ ## Overview
4
+
5
+ PgFlow needs an efficient way to handle large data outputs from workflow steps. The Blob Reference System provides a solution by separating large data payloads from workflow control information while maintaining a seamless developer experience.
6
+
7
+ ## How It Works
8
+
9
+ ### Core Concept
10
+
11
+ When steps produce large outputs (e.g., HTML content from web scraping, binary data, large API responses), these outputs are stored separately in a dedicated blob storage table. The workflow state maintains references to these blobs rather than storing the actual large data.
12
+
13
+ ### Database Structure
14
+
15
+ The system uses a dedicated table for blob storage:
16
+
17
+ ```sql
18
+ CREATE TABLE pgflow.output_blobs (
19
+ id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
20
+ content JSONB NOT NULL,
21
+ created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
22
+ );
23
+ ```
24
+
25
+ ### Worker Task Structure
26
+
27
+ The `poll_for_tasks` function returns tasks with both regular inputs and blob references through a custom type:
28
+
29
+ ```sql
30
+ CREATE TYPE pgflow.step_task_record AS (
31
+ flow_slug TEXT,
32
+ run_id UUID,
33
+ step_slug TEXT,
34
+ input JSONB,
35
+ blobs_refs JSONB
36
+ );
37
+ ```
38
+
39
+ This design provides a clean separation between:
40
+
41
+ - `input`: Regular small data that can be directly included in the task
42
+ - `blobs_refs`: References to large data stored separately in the blob table
43
+
44
+ ### Example Return Value
45
+
46
+ A task returned by `poll_for_tasks` might look like:
47
+
48
+ ```json
49
+ {
50
+ "flow_slug": "my_flow",
51
+ "run_id": "1234-5678-90ab-cdef",
52
+ "step_slug": "my_step",
53
+ "input": {
54
+ "run": "run input",
55
+ "dependency_a": "dependency_a output"
56
+ },
57
+ "blobs_refs": {
58
+ "dependency_b": "<uuid to the blob saved for dependency_b which returned binary data>"
59
+ }
60
+ }
61
+ ```
62
+
63
+ In this example:
64
+
65
+ - `dependency_a` had a small output that's included directly in the `input` object
66
+ - `dependency_b` had a large output (possibly binary data) that's stored as a blob, with only a reference included
67
+
68
+ ### Queue Efficiency
69
+
70
+ A critical optimization in PgFlow is that the task queue only stores minimal task identification information:
71
+
72
+ - flow_slug
73
+ - run_id
74
+ - step_slug
75
+ - task_index
76
+
77
+ This lightweight approach keeps queue messages small and efficient. When a worker picks up a task, it uses these identifiers to:
78
+
79
+ 1. Call `poll_for_tasks` to get the full task data
80
+ 2. Receive both the regular `input` and `blobs_refs` in a single query result
81
+ 3. Fetch the actual blob content for any referenced blobs
82
+ 4. Combine all data to form the complete input for the task handler
83
+
84
+ ## Implementation Flow
85
+
86
+ ### Task Creation
87
+
88
+ 1. When a step completes, its output is analyzed:
89
+
90
+ - Outputs below the size threshold remain in the regular output JSONB
91
+ - Large outputs are stored in the `pgflow.output_blobs` table with a unique ID
92
+
93
+ 2. The `start_ready_steps` function:
94
+ - Creates task entries with references to any large blob data
95
+ - Enqueues only the task identifiers (not the actual data) in the task queue
96
+
97
+ ### Task Execution
98
+
99
+ 1. Worker picks up the task identifier from the queue
100
+ 2. Worker calls `poll_for_tasks` to get the task details
101
+ 3. `poll_for_tasks` returns:
102
+ - The `input` object with regular data
103
+ - The `blobs_refs` object with references to any large data outputs
104
+ 4. Worker fetches blob content for any references in `blobs_refs`
105
+ 5. Worker assembles the complete input (combining regular data and blob data) for the task handler
106
+ 6. Task handler executes with the complete data, unaware of the blob reference system
107
+
108
+ ### Example Processing Flow
109
+
110
+ For a web scraping workflow:
111
+
112
+ 1. `fetch-html` step returns a large HTML string (3MB)
113
+ 2. System detects the large output and:
114
+ - Stores HTML in `pgflow.output_blobs` with ID "abc-123"
115
+ - Records only the blob reference in the step's output
116
+ 3. When `parse-html` step is ready to run:
117
+ - Queue contains only the task identifier
118
+ - `poll_for_tasks` returns the task with:
119
+ ```json
120
+ {
121
+ "input": {
122
+ "run": { "url": "https://example.com" }
123
+ },
124
+ "blobs_refs": {
125
+ "fetch-html": "abc-123"
126
+ }
127
+ }
128
+ ```
129
+ 4. Worker:
130
+ - Detects the blob reference "abc-123" for "fetch-html"
131
+ - Fetches the actual HTML content from the blob table
132
+ - Provides the handler with complete input including the HTML content
133
+
134
+ ## Developer Experience
135
+
136
+ From a workflow developer's perspective, the blob reference system is completely transparent:
137
+
138
+ ```typescript
139
+ // Developer writes code as if all data is directly available
140
+ const parseHtmlHandler: StepHandler<ParseInput, ParseOutput> = async (
141
+ input
142
+ ) => {
143
+ // input.dependencies["fetch-html"] contains the full HTML content
144
+ // (the blob reference was automatically resolved)
145
+ const html = input.dependencies['fetch-html'];
146
+
147
+ // Process the HTML...
148
+ const title = extractTitle(html);
149
+ const links = extractLinks(html);
150
+
151
+ return { title, links };
152
+ };
153
+ ```
154
+
155
+ The developer never needs to:
156
+
157
+ - Manually resolve blob references
158
+ - Check if data is a reference or actual content
159
+ - Handle storage of large outputs differently
160
+
161
+ ## Benefits and Considerations
162
+
163
+ ### Benefits
164
+
165
+ 1. **Database Efficiency**: Large data is stored separately from workflow metadata
166
+ 2. **Queue Performance**: Queue messages remain small and consistent in size
167
+ 3. **Separation of Concerns**: Control flow data is separate from large payloads
168
+ 4. **Transparent to Developers**: No special code required to handle large data
169
+ 5. **Scalability**: Can handle arbitrary data sizes without affecting workflow system performance
170
+
171
+ ### Considerations
172
+
173
+ 1. **Query Optimization**: Ensure `poll_for_tasks` efficiently retrieves both regular data and blob references
174
+ 2. **Blob Lifecycle Management**: Implement cleanup for orphaned or expired blobs
175
+ 3. **Size Threshold Tuning**: Configure appropriate thresholds for when data should use blob storage
176
+
177
+ ## Conclusion
178
+
179
+ The Blob Reference System in PgFlow provides an elegant solution for handling large data in workflows. By splitting task data into regular inputs and blob references, the system maintains efficient database usage and queue performance while providing a seamless experience for workflow developers. The design ensures that large data is handled appropriately without requiring developers to write special code for blob resolution or storage.