@bubblelab/bubble-core 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE.txt +202 -0
- package/dist/bubble-bundle.d.ts +2021 -0
- package/dist/bubble-factory.d.ts +161 -0
- package/dist/bubble-factory.d.ts.map +1 -0
- package/dist/bubble-factory.js +426 -0
- package/dist/bubble-factory.js.map +1 -0
- package/dist/bubble-flow/bubble-flow-class.d.ts +19 -0
- package/dist/bubble-flow/bubble-flow-class.d.ts.map +1 -0
- package/dist/bubble-flow/bubble-flow-class.js +23 -0
- package/dist/bubble-flow/bubble-flow-class.js.map +1 -0
- package/dist/bubble-flow/sample/data-analyst-flow.d.ts +15 -0
- package/dist/bubble-flow/sample/data-analyst-flow.d.ts.map +1 -0
- package/dist/bubble-flow/sample/data-analyst-flow.js +63 -0
- package/dist/bubble-flow/sample/data-analyst-flow.js.map +1 -0
- package/dist/bubble-flow/sample/error-ts.d.ts +23 -0
- package/dist/bubble-flow/sample/error-ts.d.ts.map +1 -0
- package/dist/bubble-flow/sample/error-ts.js +31 -0
- package/dist/bubble-flow/sample/error-ts.js.map +1 -0
- package/dist/bubble-flow/sample/sanitytest.d.ts +10 -0
- package/dist/bubble-flow/sample/sanitytest.d.ts.map +1 -0
- package/dist/bubble-flow/sample/sanitytest.js +13 -0
- package/dist/bubble-flow/sample/sanitytest.js.map +1 -0
- package/dist/bubble-flow/sample/simple-webhook-2.d.ts +19 -0
- package/dist/bubble-flow/sample/simple-webhook-2.d.ts.map +1 -0
- package/dist/bubble-flow/sample/simple-webhook-2.js +23 -0
- package/dist/bubble-flow/sample/simple-webhook-2.js.map +1 -0
- package/dist/bubble-flow/sample/simple-webhook.d.ts +10 -0
- package/dist/bubble-flow/sample/simple-webhook.d.ts.map +1 -0
- package/dist/bubble-flow/sample/simple-webhook.js +18 -0
- package/dist/bubble-flow/sample/simple-webhook.js.map +1 -0
- package/dist/bubble-flow/sample/simplified-data-analysis.flow.d.ts +29 -0
- package/dist/bubble-flow/sample/simplified-data-analysis.flow.d.ts.map +1 -0
- package/dist/bubble-flow/sample/simplified-data-analysis.flow.js +150 -0
- package/dist/bubble-flow/sample/simplified-data-analysis.flow.js.map +1 -0
- package/dist/bubble-flow/sample/slack-v0.1.d.ts +10 -0
- package/dist/bubble-flow/sample/slack-v0.1.d.ts.map +1 -0
- package/dist/bubble-flow/sample/slack-v0.1.js +59 -0
- package/dist/bubble-flow/sample/slack-v0.1.js.map +1 -0
- package/dist/bubble-flow/sample/slackagenttest.d.ts +10 -0
- package/dist/bubble-flow/sample/slackagenttest.d.ts.map +1 -0
- package/dist/bubble-flow/sample/slackagenttest.js +59 -0
- package/dist/bubble-flow/sample/slackagenttest.js.map +1 -0
- package/dist/bubble-trigger/index.d.ts +2 -0
- package/dist/bubble-trigger/index.d.ts.map +1 -0
- package/dist/bubble-trigger/index.js +2 -0
- package/dist/bubble-trigger/index.js.map +1 -0
- package/dist/bubble-trigger/types.d.ts +87 -0
- package/dist/bubble-trigger/types.d.ts.map +1 -0
- package/dist/bubble-trigger/types.js +14 -0
- package/dist/bubble-trigger/types.js.map +1 -0
- package/dist/bubbles/service-bubble/ai-agent.d.ts +428 -0
- package/dist/bubbles/service-bubble/ai-agent.d.ts.map +1 -0
- package/dist/bubbles/service-bubble/ai-agent.js +881 -0
- package/dist/bubbles/service-bubble/ai-agent.js.map +1 -0
- package/dist/bubbles/service-bubble/gmail.d.ts +3073 -0
- package/dist/bubbles/service-bubble/gmail.d.ts.map +1 -0
- package/dist/bubbles/service-bubble/gmail.js +908 -0
- package/dist/bubbles/service-bubble/gmail.js.map +1 -0
- package/dist/bubbles/service-bubble/google-calendar.d.ts +3377 -0
- package/dist/bubbles/service-bubble/google-calendar.d.ts.map +1 -0
- package/dist/bubbles/service-bubble/google-calendar.js +527 -0
- package/dist/bubbles/service-bubble/google-calendar.js.map +1 -0
- package/dist/bubbles/service-bubble/google-drive.d.ts +1152 -0
- package/dist/bubbles/service-bubble/google-drive.d.ts.map +1 -0
- package/dist/bubbles/service-bubble/google-drive.js +943 -0
- package/dist/bubbles/service-bubble/google-drive.js.map +1 -0
- package/dist/bubbles/service-bubble/google-sheets.d.ts +1811 -0
- package/dist/bubbles/service-bubble/google-sheets.d.ts.map +1 -0
- package/dist/bubbles/service-bubble/google-sheets.js +904 -0
- package/dist/bubbles/service-bubble/google-sheets.js.map +1 -0
- package/dist/bubbles/service-bubble/hello-world.d.ts +74 -0
- package/dist/bubbles/service-bubble/hello-world.d.ts.map +1 -0
- package/dist/bubbles/service-bubble/hello-world.js +67 -0
- package/dist/bubbles/service-bubble/hello-world.js.map +1 -0
- package/dist/bubbles/service-bubble/http.d.ts +134 -0
- package/dist/bubbles/service-bubble/http.d.ts.map +1 -0
- package/dist/bubbles/service-bubble/http.js +184 -0
- package/dist/bubbles/service-bubble/http.js.map +1 -0
- package/dist/bubbles/service-bubble/postgresql.d.ts +180 -0
- package/dist/bubbles/service-bubble/postgresql.d.ts.map +1 -0
- package/dist/bubbles/service-bubble/postgresql.js +448 -0
- package/dist/bubbles/service-bubble/postgresql.js.map +1 -0
- package/dist/bubbles/service-bubble/resend.d.ts +301 -0
- package/dist/bubbles/service-bubble/resend.d.ts.map +1 -0
- package/dist/bubbles/service-bubble/resend.js +253 -0
- package/dist/bubbles/service-bubble/resend.js.map +1 -0
- package/dist/bubbles/service-bubble/slack.d.ts +5869 -0
- package/dist/bubbles/service-bubble/slack.d.ts.map +1 -0
- package/dist/bubbles/service-bubble/slack.js +1536 -0
- package/dist/bubbles/service-bubble/slack.js.map +1 -0
- package/dist/bubbles/service-bubble/storage.d.ts +571 -0
- package/dist/bubbles/service-bubble/storage.d.ts.map +1 -0
- package/dist/bubbles/service-bubble/storage.js +504 -0
- package/dist/bubbles/service-bubble/storage.js.map +1 -0
- package/dist/bubbles/tool-bubble/bubbleflow-validation-tool.d.ts +308 -0
- package/dist/bubbles/tool-bubble/bubbleflow-validation-tool.d.ts.map +1 -0
- package/dist/bubbles/tool-bubble/bubbleflow-validation-tool.js +285 -0
- package/dist/bubbles/tool-bubble/bubbleflow-validation-tool.js.map +1 -0
- package/dist/bubbles/tool-bubble/chart-js-tool.d.ts +416 -0
- package/dist/bubbles/tool-bubble/chart-js-tool.d.ts.map +1 -0
- package/dist/bubbles/tool-bubble/chart-js-tool.js +570 -0
- package/dist/bubbles/tool-bubble/chart-js-tool.js.map +1 -0
- package/dist/bubbles/tool-bubble/get-bubble-details-tool.d.ts +99 -0
- package/dist/bubbles/tool-bubble/get-bubble-details-tool.d.ts.map +1 -0
- package/dist/bubbles/tool-bubble/get-bubble-details-tool.js +645 -0
- package/dist/bubbles/tool-bubble/get-bubble-details-tool.js.map +1 -0
- package/dist/bubbles/tool-bubble/list-bubbles-tool.d.ts +112 -0
- package/dist/bubbles/tool-bubble/list-bubbles-tool.d.ts.map +1 -0
- package/dist/bubbles/tool-bubble/list-bubbles-tool.js +82 -0
- package/dist/bubbles/tool-bubble/list-bubbles-tool.js.map +1 -0
- package/dist/bubbles/tool-bubble/reddit-scrape-tool.d.ts +413 -0
- package/dist/bubbles/tool-bubble/reddit-scrape-tool.d.ts.map +1 -0
- package/dist/bubbles/tool-bubble/reddit-scrape-tool.js +327 -0
- package/dist/bubbles/tool-bubble/reddit-scrape-tool.js.map +1 -0
- package/dist/bubbles/tool-bubble/research-agent-tool.d.ts +122 -0
- package/dist/bubbles/tool-bubble/research-agent-tool.d.ts.map +1 -0
- package/dist/bubbles/tool-bubble/research-agent-tool.js +343 -0
- package/dist/bubbles/tool-bubble/research-agent-tool.js.map +1 -0
- package/dist/bubbles/tool-bubble/sql-query-tool.d.ts +131 -0
- package/dist/bubbles/tool-bubble/sql-query-tool.d.ts.map +1 -0
- package/dist/bubbles/tool-bubble/sql-query-tool.js +147 -0
- package/dist/bubbles/tool-bubble/sql-query-tool.js.map +1 -0
- package/dist/bubbles/tool-bubble/tool-template.d.ts +257 -0
- package/dist/bubbles/tool-bubble/tool-template.d.ts.map +1 -0
- package/dist/bubbles/tool-bubble/tool-template.js +238 -0
- package/dist/bubbles/tool-bubble/tool-template.js.map +1 -0
- package/dist/bubbles/tool-bubble/virtual-file-editor-example.d.ts +8 -0
- package/dist/bubbles/tool-bubble/virtual-file-editor-example.d.ts.map +1 -0
- package/dist/bubbles/tool-bubble/virtual-file-editor-example.js +65 -0
- package/dist/bubbles/tool-bubble/virtual-file-editor-example.js.map +1 -0
- package/dist/bubbles/tool-bubble/virtual-file-editor.tool.d.ts +125 -0
- package/dist/bubbles/tool-bubble/virtual-file-editor.tool.d.ts.map +1 -0
- package/dist/bubbles/tool-bubble/virtual-file-editor.tool.js +169 -0
- package/dist/bubbles/tool-bubble/virtual-file-editor.tool.js.map +1 -0
- package/dist/bubbles/tool-bubble/web-crawl-tool.d.ts +218 -0
- package/dist/bubbles/tool-bubble/web-crawl-tool.d.ts.map +1 -0
- package/dist/bubbles/tool-bubble/web-crawl-tool.js +255 -0
- package/dist/bubbles/tool-bubble/web-crawl-tool.js.map +1 -0
- package/dist/bubbles/tool-bubble/web-extract-tool.d.ts +134 -0
- package/dist/bubbles/tool-bubble/web-extract-tool.d.ts.map +1 -0
- package/dist/bubbles/tool-bubble/web-extract-tool.js +175 -0
- package/dist/bubbles/tool-bubble/web-extract-tool.js.map +1 -0
- package/dist/bubbles/tool-bubble/web-scrape-tool.d.ts +228 -0
- package/dist/bubbles/tool-bubble/web-scrape-tool.d.ts.map +1 -0
- package/dist/bubbles/tool-bubble/web-scrape-tool.js +214 -0
- package/dist/bubbles/tool-bubble/web-scrape-tool.js.map +1 -0
- package/dist/bubbles/tool-bubble/web-search-tool.d.ts +134 -0
- package/dist/bubbles/tool-bubble/web-search-tool.d.ts.map +1 -0
- package/dist/bubbles/tool-bubble/web-search-tool.js +155 -0
- package/dist/bubbles/tool-bubble/web-search-tool.js.map +1 -0
- package/dist/bubbles/workflow-bubble/bubbleflow-generator.workflow.d.ts +114 -0
- package/dist/bubbles/workflow-bubble/bubbleflow-generator.workflow.d.ts.map +1 -0
- package/dist/bubbles/workflow-bubble/bubbleflow-generator.workflow.js +777 -0
- package/dist/bubbles/workflow-bubble/bubbleflow-generator.workflow.js.map +1 -0
- package/dist/bubbles/workflow-bubble/bubblscript-generateor.workflow.d.ts +97 -0
- package/dist/bubbles/workflow-bubble/bubblscript-generateor.workflow.d.ts.map +1 -0
- package/dist/bubbles/workflow-bubble/bubblscript-generateor.workflow.js +327 -0
- package/dist/bubbles/workflow-bubble/bubblscript-generateor.workflow.js.map +1 -0
- package/dist/bubbles/workflow-bubble/database-analyzer.workflow.d.ts +303 -0
- package/dist/bubbles/workflow-bubble/database-analyzer.workflow.d.ts.map +1 -0
- package/dist/bubbles/workflow-bubble/database-analyzer.workflow.js +297 -0
- package/dist/bubbles/workflow-bubble/database-analyzer.workflow.js.map +1 -0
- package/dist/bubbles/workflow-bubble/file-editor-agent.workflow.d.ts +157 -0
- package/dist/bubbles/workflow-bubble/file-editor-agent.workflow.d.ts.map +1 -0
- package/dist/bubbles/workflow-bubble/file-editor-agent.workflow.js +310 -0
- package/dist/bubbles/workflow-bubble/file-editor-agent.workflow.js.map +1 -0
- package/dist/bubbles/workflow-bubble/generate-document.workflow.d.ts +543 -0
- package/dist/bubbles/workflow-bubble/generate-document.workflow.d.ts.map +1 -0
- package/dist/bubbles/workflow-bubble/generate-document.workflow.js +628 -0
- package/dist/bubbles/workflow-bubble/generate-document.workflow.js.map +1 -0
- package/dist/bubbles/workflow-bubble/parse-document.workflow.d.ts +679 -0
- package/dist/bubbles/workflow-bubble/parse-document.workflow.d.ts.map +1 -0
- package/dist/bubbles/workflow-bubble/parse-document.workflow.js +604 -0
- package/dist/bubbles/workflow-bubble/parse-document.workflow.js.map +1 -0
- package/dist/bubbles/workflow-bubble/pdf-form-operations.workflow.d.ts +1011 -0
- package/dist/bubbles/workflow-bubble/pdf-form-operations.workflow.d.ts.map +1 -0
- package/dist/bubbles/workflow-bubble/pdf-form-operations.workflow.js +841 -0
- package/dist/bubbles/workflow-bubble/pdf-form-operations.workflow.js.map +1 -0
- package/dist/bubbles/workflow-bubble/pdf-ocr.workflow.d.ts +883 -0
- package/dist/bubbles/workflow-bubble/pdf-ocr.workflow.d.ts.map +1 -0
- package/dist/bubbles/workflow-bubble/pdf-ocr.workflow.js +781 -0
- package/dist/bubbles/workflow-bubble/pdf-ocr.workflow.js.map +1 -0
- package/dist/bubbles/workflow-bubble/slack-data-assistant.workflow.d.ts +300 -0
- package/dist/bubbles/workflow-bubble/slack-data-assistant.workflow.d.ts.map +1 -0
- package/dist/bubbles/workflow-bubble/slack-data-assistant.workflow.js +508 -0
- package/dist/bubbles/workflow-bubble/slack-data-assistant.workflow.js.map +1 -0
- package/dist/bubbles/workflow-bubble/slack-formatter-agent.d.ts +731 -0
- package/dist/bubbles/workflow-bubble/slack-formatter-agent.d.ts.map +1 -0
- package/dist/bubbles/workflow-bubble/slack-formatter-agent.js +690 -0
- package/dist/bubbles/workflow-bubble/slack-formatter-agent.js.map +1 -0
- package/dist/bubbles/workflow-bubble/slack-notifier.workflow.d.ts +401 -0
- package/dist/bubbles/workflow-bubble/slack-notifier.workflow.d.ts.map +1 -0
- package/dist/bubbles/workflow-bubble/slack-notifier.workflow.js +382 -0
- package/dist/bubbles/workflow-bubble/slack-notifier.workflow.js.map +1 -0
- package/dist/bubbles/workflow-bubble/workflow-template.d.ts +144 -0
- package/dist/bubbles/workflow-bubble/workflow-template.d.ts.map +1 -0
- package/dist/bubbles/workflow-bubble/workflow-template.js +124 -0
- package/dist/bubbles/workflow-bubble/workflow-template.js.map +1 -0
- package/dist/index.d.ts +46 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +53 -0
- package/dist/index.js.map +1 -0
- package/dist/logging/BubbleLogger.d.ts +146 -0
- package/dist/logging/BubbleLogger.d.ts.map +1 -0
- package/dist/logging/BubbleLogger.js +472 -0
- package/dist/logging/BubbleLogger.js.map +1 -0
- package/dist/logging/StreamingBubbleLogger.d.ts +85 -0
- package/dist/logging/StreamingBubbleLogger.d.ts.map +1 -0
- package/dist/logging/StreamingBubbleLogger.js +340 -0
- package/dist/logging/StreamingBubbleLogger.js.map +1 -0
- package/dist/types/ai-models.d.ts +4 -0
- package/dist/types/ai-models.d.ts.map +1 -0
- package/dist/types/ai-models.js +14 -0
- package/dist/types/ai-models.js.map +1 -0
- package/dist/types/available-tools.d.ts +4 -0
- package/dist/types/available-tools.d.ts.map +1 -0
- package/dist/types/available-tools.js +19 -0
- package/dist/types/available-tools.js.map +1 -0
- package/dist/types/base-bubble-class.d.ts +47 -0
- package/dist/types/base-bubble-class.d.ts.map +1 -0
- package/dist/types/base-bubble-class.js +212 -0
- package/dist/types/base-bubble-class.js.map +1 -0
- package/dist/types/bubble-errors.d.ts +44 -0
- package/dist/types/bubble-errors.d.ts.map +1 -0
- package/dist/types/bubble-errors.js +51 -0
- package/dist/types/bubble-errors.js.map +1 -0
- package/dist/types/bubble.d.ts +73 -0
- package/dist/types/bubble.d.ts.map +1 -0
- package/dist/types/bubble.js +2 -0
- package/dist/types/bubble.js.map +1 -0
- package/dist/types/credentials.d.ts +6 -0
- package/dist/types/credentials.d.ts.map +1 -0
- package/dist/types/credentials.js +6 -0
- package/dist/types/credentials.js.map +1 -0
- package/dist/types/service-bubble-class.d.ts +31 -0
- package/dist/types/service-bubble-class.d.ts.map +1 -0
- package/dist/types/service-bubble-class.js +36 -0
- package/dist/types/service-bubble-class.js.map +1 -0
- package/dist/types/streaming-events.d.ts +18 -0
- package/dist/types/streaming-events.d.ts.map +1 -0
- package/dist/types/streaming-events.js +5 -0
- package/dist/types/streaming-events.js.map +1 -0
- package/dist/types/tool-bubble-class.d.ts +19 -0
- package/dist/types/tool-bubble-class.d.ts.map +1 -0
- package/dist/types/tool-bubble-class.js +48 -0
- package/dist/types/tool-bubble-class.js.map +1 -0
- package/dist/types/workflow-bubble-class.d.ts +25 -0
- package/dist/types/workflow-bubble-class.d.ts.map +1 -0
- package/dist/types/workflow-bubble-class.js +30 -0
- package/dist/types/workflow-bubble-class.js.map +1 -0
- package/dist/utils/bubbleflow-parser.d.ts +32 -0
- package/dist/utils/bubbleflow-parser.d.ts.map +1 -0
- package/dist/utils/bubbleflow-parser.js +332 -0
- package/dist/utils/bubbleflow-parser.js.map +1 -0
- package/dist/utils/bubbleflow-validation.d.ts +9 -0
- package/dist/utils/bubbleflow-validation.d.ts.map +1 -0
- package/dist/utils/bubbleflow-validation.js +116 -0
- package/dist/utils/bubbleflow-validation.js.map +1 -0
- package/dist/utils/json-parsing.d.ts +20 -0
- package/dist/utils/json-parsing.d.ts.map +1 -0
- package/dist/utils/json-parsing.js +394 -0
- package/dist/utils/json-parsing.js.map +1 -0
- package/dist/utils/mock-data-generator.d.ts +43 -0
- package/dist/utils/mock-data-generator.d.ts.map +1 -0
- package/dist/utils/mock-data-generator.js +312 -0
- package/dist/utils/mock-data-generator.js.map +1 -0
- package/dist/utils/param-helper.d.ts +2 -0
- package/dist/utils/param-helper.d.ts.map +1 -0
- package/dist/utils/param-helper.js +5 -0
- package/dist/utils/param-helper.js.map +1 -0
- package/dist/utils/source-bubble-parser.d.ts +31 -0
- package/dist/utils/source-bubble-parser.d.ts.map +1 -0
- package/dist/utils/source-bubble-parser.js +231 -0
- package/dist/utils/source-bubble-parser.js.map +1 -0
- package/package.json +63 -0
|
@@ -0,0 +1,781 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PDF OCR WORKFLOW
|
|
3
|
+
*
|
|
4
|
+
* A comprehensive workflow that converts PDF documents to images and passes them
|
|
5
|
+
* to an AI agent along with discovered form fields to parse and extract schema information.
|
|
6
|
+
*
|
|
7
|
+
* This workflow combines:
|
|
8
|
+
* 1. PDF field discovery using pdf-lib
|
|
9
|
+
* 2. PDF to images conversion using pdf-img-convert
|
|
10
|
+
* 3. AI agent analysis for schema parsing and field extraction
|
|
11
|
+
*
|
|
12
|
+
* Returns structured JSON containing field IDs from discovery and extracted field names
|
|
13
|
+
* with their values from AI analysis.
|
|
14
|
+
*/
|
|
15
|
+
import { z } from 'zod';
|
|
16
|
+
import { WorkflowBubble } from '../../types/workflow-bubble-class.js';
|
|
17
|
+
import { CredentialType } from '@bubblelab/shared-schemas';
|
|
18
|
+
import { PDFFormOperationsWorkflow } from './pdf-form-operations.workflow.js';
|
|
19
|
+
import { AIAgentBubble } from '../service-bubble/ai-agent.js';
|
|
20
|
+
import { AvailableModels } from '../../types/ai-models.js';
|
|
21
|
+
/**
 * System prompts for different modes
 */

/**
 * System prompt for identify mode.
 *
 * Instructs the model to cross-reference the page images with the discovered
 * form-field metadata and to answer with a JSON array of
 * `{ id, fieldName, confidence }` objects.
 *
 * The text is kept as a single template literal so that every line break and
 * blank line below is part of the prompt exactly as written.
 */
const IDENTIFY_MODE_PROMPT = `You are an expert OCR and form field extraction specialist. Analyze the provided PDF images and form field discovery data to extract structured information.

IMPORTANT: The form field IDs are numbered in natural reading order (left to right, top to bottom). Use this ordering to help identify what each field represents.

Your task:
1. Examine the PDF images to identify all visible text and form fields
2. Cross-reference with the discovered form field metadata (field IDs follow natural reading order)
3. Generate descriptive field names based on the PDF content, context, and field position
4. Return a JSON array with field information

Return format: JSON array of objects with:
- id: number (MUST use the exact ID from discovery data when available - these IDs are in natural reading order)
- fieldName: string (descriptive name based on PDF content, context, and field position)
- confidence: number (0.0-1.0, your confidence in the field identification)

Focus on generating meaningful, descriptive field names that accurately represent what each field is for based on the PDF context and natural reading order.`;
|
|
40
|
+
/**
 * System prompt for autofill mode.
 *
 * Extends the identify instructions with client information: the model must
 * return one entry per discovered field ID, echo the discovery field name in
 * `originalFieldName`, and use an empty string for any value it cannot
 * determine from the client information.
 *
 * The text is kept as a single template literal so that every line break and
 * blank line below is part of the prompt exactly as written.
 */
const AUTOFILL_MODE_PROMPT = `You are an expert OCR and form field extraction specialist with autofill capabilities. Analyze the provided PDF images, form field discovery data, and client information to extract and fill structured information.

IMPORTANT: The form field IDs are numbered in natural reading order (left to right, top to bottom). Use this ordering to help identify what each field represents and to match client information appropriately.


Your task:
1. Examine the PDF images to identify all visible text and form fields
2. Cross-reference with the discovered form field metadata (field IDs follow natural reading order)
3. Generate descriptive field names based on the PDF content, context, and field position
4. Use the provided client information to determine appropriate values for each field
5. Return a JSON array with ALL discovered fields (every single field ID must have an entry)

Return format: JSON array of objects with:
- id: number (MUST use the exact ID from discovery data when available - these IDs are in natural reading order)
- originalFieldName: string (MUST use the exact field name from discovery data when available, for precise field matching)
- fieldName: string (descriptive name based on PDF content, context, and natural reading order)
- value: string (appropriate value from client information, or empty string if not applicable)
- confidence: number (0.0-1.0, your confidence in the field identification and value assignment)

CRITICAL:
- You MUST return an entry for EVERY field ID from the discovery data - no field should be omitted
- For fields that match discovered form field metadata, you MUST include the originalFieldName exactly as provided in the discovery data. This is essential for proper form filling.

Rules for value assignment:
- Use the natural reading order (field ID sequence) to understand form structure and field relationships
- Only fill values that clearly match the client information provided
- Use empty string for fields where no appropriate value can be determined from client information
- Format values appropriately for the field type (dates, numbers, etc.)
- Be conservative - if unsure, use empty string rather than guessing
- EVERY discovered field ID must appear in your response, even if the value is empty

Focus on accuracy and appropriate value mapping based on the client information context and natural field ordering.`;
|
|
72
|
+
// The helpers below are factories rather than shared constants: each call
// returns brand-new schema objects, so the two union members never share a
// schema instance for these fields.

/** Builds the required, non-empty `pdfData` string schema. */
const createPdfDataSchema = () =>
  z
    .string()
    .min(1, 'PDF data is required')
    .describe('Base64 encoded PDF data');

/** Builds the `discoveryOptions` schema; the whole object defaults to `{}`. */
const createDiscoveryOptionsSchema = () =>
  z
    .object({
      targetPage: z
        .number()
        .positive()
        .optional()
        .describe('Extract fields from specific page only (default: all pages)'),
    })
    .default({})
    .describe('Options for PDF field discovery');

/**
 * Builds the `imageOptions` schema.
 * Defaults: png format, quality 0.8, 150 DPI; `pages` is left unset by default.
 */
const createImageOptionsSchema = () =>
  z
    .object({
      format: z
        .enum(['png', 'jpeg'])
        .default('png')
        .describe('Output image format'),
      quality: z
        .number()
        .min(0.1)
        .max(1.0)
        .default(0.8)
        .describe('JPEG quality (0.1-1.0, only for JPEG format)'),
      dpi: z
        .number()
        .min(72)
        .max(300)
        .default(150)
        .describe('Output DPI (dots per inch)'),
      pages: z
        .array(z.number().positive())
        .optional()
        .describe('Specific page numbers to convert (1-indexed). If not provided, converts all pages'),
    })
    .default({
      format: 'png',
      quality: 0.8,
      dpi: 150,
    })
    .describe('Options for PDF to images conversion');

/**
 * Builds the `aiOptions` schema.
 *
 * @param {string} modelDescription - Description attached to the `model` field;
 *   this is the only part of the AI options that differs between the two modes.
 */
const createAiOptionsSchema = (modelDescription) =>
  z
    .object({
      model: AvailableModels.default('google/gemini-2.5-flash').describe(modelDescription),
      temperature: z
        .number()
        .min(0)
        .max(2)
        .default(0.3)
        .describe('Temperature for AI responses (lower = more consistent)'),
      maxTokens: z
        .number()
        .positive()
        .default(50000)
        .describe('Maximum tokens for AI response'),
      jsonMode: z
        .boolean()
        .default(true)
        .describe('Enable JSON mode to ensure clean JSON output'),
    })
    .default({
      model: 'google/gemini-2.5-flash',
      temperature: 0.3,
      maxTokens: 50000,
      jsonMode: true,
    })
    .describe('AI agent configuration options');

/** Builds the optional `credentials` record keyed by `CredentialType`. */
const createCredentialsSchema = () =>
  z
    .record(z.nativeEnum(CredentialType), z.string())
    .optional()
    .describe('Credentials for AI model access (GOOGLE_GEMINI_CRED, OPENAI_CRED, etc.)');

/**
 * Parameters schema for PDF OCR workflow using discriminated union for different modes
 *
 * The `mode` literal selects the member: `'identify'` or `'autofill'`.
 * Autofill additionally requires a non-empty `clientInformation` string;
 * every other field has the same definition in both modes apart from the
 * description on `aiOptions.model`.
 */
const PDFOcrWorkflowParamsSchema = z.discriminatedUnion('mode', [
  // Identify mode - just identifies fields and generates descriptive names
  z.object({
    mode: z
      .literal('identify')
      .describe('Identify form fields and generate descriptive names'),
    pdfData: createPdfDataSchema(),
    discoveryOptions: createDiscoveryOptionsSchema(),
    imageOptions: createImageOptionsSchema(),
    aiOptions: createAiOptionsSchema('AI model to use for field identification'),
    credentials: createCredentialsSchema(),
  }),
  // Autofill mode - identifies fields and fills them based on client information
  z.object({
    mode: z
      .literal('autofill')
      .describe('Identify form fields and autofill with client information'),
    pdfData: createPdfDataSchema(),
    clientInformation: z
      .string()
      .min(1, 'Client information is required for autofill mode')
      .describe('Free text containing client information to use for autofilling form fields'),
    discoveryOptions: createDiscoveryOptionsSchema(),
    imageOptions: createImageOptionsSchema(),
    aiOptions: createAiOptionsSchema('AI model to use for field identification and autofill'),
    credentials: createCredentialsSchema(),
  }),
]);
|
|
239
|
+
// The helpers below are factories rather than shared constants: each call
// returns brand-new schema objects, so the two union members never share a
// schema instance for these fields.

/**
 * Builds the summary members present in both result modes, in their original
 * key order: `discoveryData`, `imageData`, `aiAnalysis`.
 */
const createResultSummaryShape = () => ({
  discoveryData: z
    .object({
      totalFields: z.number(),
      fieldsWithCoordinates: z.number(),
      pages: z.array(z.number()),
    })
    .describe('Summary of field discovery results'),
  imageData: z
    .object({
      totalPages: z.number(),
      convertedPages: z.number(),
      format: z.string(),
      dpi: z.number(),
    })
    .describe('Summary of image conversion results'),
  aiAnalysis: z
    .object({
      model: z.string(),
      iterations: z.number(),
      processingTime: z.number().optional(),
    })
    .describe('AI analysis metadata'),
});

/**
 * Builds the trailing `success` / `error` members present in both result modes.
 * Both keys are required by the schema; `error` is a plain string.
 */
const createOutcomeShape = () => ({
  success: z
    .boolean()
    .describe('Whether the workflow completed successfully'),
  error: z.string().describe('Error message if workflow failed'),
});

/**
 * Result schema for PDF OCR workflow using discriminated union for different modes
 *
 * The `mode` literal selects the member. Compared with identify results,
 * autofill results add `originalFieldName` (optional) and `value` to each
 * extracted field, plus the top-level `filledPdfData` and `fillResults`.
 */
const PDFOcrWorkflowResultSchema = z.discriminatedUnion('mode', [
  // Identify mode result
  z.object({
    mode: z.literal('identify').describe('Result from identify mode'),
    extractedFields: z
      .array(
        z.object({
          id: z.number().describe('Field ID from discovery or auto-generated'),
          fieldName: z
            .string()
            .describe('Descriptive name generated based on PDF content'),
          confidence: z
            .number()
            .min(0)
            .max(1)
            .describe('AI confidence in the field identification (0.0-1.0)'),
        }),
      )
      .describe('Array of identified fields with descriptive names'),
    ...createResultSummaryShape(),
    ...createOutcomeShape(),
  }),
  // Autofill mode result
  z.object({
    mode: z.literal('autofill').describe('Result from autofill mode'),
    extractedFields: z
      .array(
        z.object({
          id: z.number().describe('Field ID from discovery or auto-generated'),
          originalFieldName: z
            .string()
            .optional()
            .describe('Original field name from discovery for precise matching'),
          fieldName: z
            .string()
            .describe('Descriptive name generated based on PDF content'),
          value: z
            .string()
            .describe('Value to fill in the field based on client information'),
          confidence: z
            .number()
            .min(0)
            .max(1)
            .describe('AI confidence in the field identification and value assignment (0.0-1.0)'),
        }),
      )
      .describe('Array of identified fields with values for autofill'),
    filledPdfData: z.string().describe('Base64 encoded filled PDF data'),
    ...createResultSummaryShape(),
    fillResults: z
      .object({
        filledFields: z.number(),
        successfullyFilled: z.number(),
      })
      .describe('Summary of PDF filling results'),
    ...createOutcomeShape(),
  }),
]);
|
|
344
|
+
/**
 * PDF OCR Workflow
 *
 * Combines PDF field discovery, image conversion, and AI analysis for
 * comprehensive form field extraction.
 *
 * The workflow runs in one of two modes, selected by `params.mode`:
 * - `'identify'`: returns the fields the AI agent identified.
 * - `'autofill'`: additionally maps AI-proposed values onto the discovered
 *   form fields and runs a `fill` operation to produce a filled PDF.
 *
 * `performAction` never rejects for failures raised inside its pipeline;
 * it resolves with a result object whose `success` is `false` and whose
 * `error` carries the message.
 */
export class PDFOcrWorkflow extends WorkflowBubble {
    static type = 'workflow';
    static bubbleName = 'pdf-ocr-workflow';
    static schema = PDFOcrWorkflowParamsSchema;
    static resultSchema = PDFOcrWorkflowResultSchema;
    static shortDescription = 'PDF OCR workflow: identify fields or autofill forms using AI analysis';
    static longDescription = `
Comprehensive PDF OCR workflow with two modes for form field processing:

**Identify Mode:**
- Discovers and names form fields from PDF documents
- Returns field IDs, descriptive names, and confidence scores
- Useful for form schema generation and document understanding

**Autofill Mode:**
- Identifies form fields AND fills them using provided client information
- Returns field data with values plus a filled PDF
- Uses AI to intelligently map client data to appropriate form fields

Process:
1. Discover form fields using PyMuPDF (field names, types, coordinates)
2. Convert PDF pages to high-quality images using PyMuPDF
3. Send images + discovery data + client info (autofill mode) to AI agent
4. For autofill mode: Use PDF Form Operations to fill the form with AI-determined values

Features:
- Two distinct modes: identify vs autofill
- Cross-references visual analysis with form field metadata
- Supports both fillable PDFs and scanned documents
- Generates meaningful field names based on PDF content and context
- Intelligent value mapping from client information (autofill mode)
- Configurable image quality and AI model selection
- Returns confidence scores for field identification accuracy

Use cases:
- **Identify**: Form schema generation, document structure analysis
- **Autofill**: Automated form filling, client onboarding, data entry automation

Input: Base64 encoded PDF data + mode + client information (autofill mode)
Output: Mode-specific results with field data and optional filled PDF
`;
    static alias = 'pdf-ocr';
    /** Confidence assigned when the AI omits it or returns a non-numeric value. */
    static #DEFAULT_CONFIDENCE = 0.8;
    /** Offset added to the array index when an ID has to be generated. */
    static #GENERATED_ID_OFFSET = 1000;
    /** User message sent to the AI agent in identify mode. */
    static #IDENTIFY_MESSAGE = `Please analyze these PDF pages and identify all form fields.

Please return a JSON array of field objects as specified in the system prompt. Focus on:
1. Identifying all text fields, checkboxes, and form elements
2. Generating descriptive field names based on labels, context, and purpose
3. Cross-referencing with any discovered form field metadata provided
4. Providing confidence scores for field identification

Return only the JSON array, no additional text or formatting.`;
    /** User message sent to the AI agent in every mode other than identify. */
    static #AUTOFILL_MESSAGE = `Please analyze these PDF pages and identify all form fields, then fill them using the provided client information.

Please return a JSON array of field objects as specified in the system prompt. Focus on:
1. Identifying all text fields, checkboxes, and form elements
2. Generating descriptive field names based on labels, context, and purpose
3. Cross-referencing with any discovered form field metadata provided
4. Using the client information to determine appropriate values for each field
5. Providing confidence scores for field identification and value assignment

Return only the JSON array, no additional text or formatting.`;
    constructor(params, context) {
        super(params, context);
    }
    /**
     * Runs the pipeline: field discovery, image conversion, AI analysis,
     * response parsing and (autofill mode only) PDF filling.
     *
     * @returns {Promise<object>} Mode-specific result. On failure the same
     *   shape is returned with zeroed summaries, `success: false` and the
     *   error message in `error`.
     */
    async performAction() {
        const startTime = Date.now();
        console.log('[PDFOcrWorkflow] Starting comprehensive PDF OCR analysis');
        console.log('[PDFOcrWorkflow] PDF data length:', this.params.pdfData.length);
        const { mode, pdfData, credentials, imageOptions, aiOptions } = this.params;
        // The original code tests both literals separately, so keep two flags
        // rather than deriving one from the other.
        const isIdentify = mode === 'identify';
        const isAutofill = mode === 'autofill';
        // Defaults that are reported back in both the success and error results.
        const imageFormat = imageOptions?.format || 'png';
        const imageDpi = imageOptions?.dpi || 150;
        const modelName = aiOptions?.model || 'google/gemini-2.5-flash';
        try {
            // Step 1: Discover form fields
            console.log('[PDFOcrWorkflow] Step 1: Discovering form fields...');
            const discoveryResult = await new PDFFormOperationsWorkflow({
                operation: 'discover',
                pdfData,
                targetPage: this.params.discoveryOptions?.targetPage,
                credentials,
            }, this.context).action();
            if (!discoveryResult.success) {
                throw new Error(`Field discovery failed: ${discoveryResult.error}`);
            }
            console.log(`[PDFOcrWorkflow] Discovered ${discoveryResult.data?.totalFields} fields`);
            // Step 2: Convert PDF to images
            console.log('[PDFOcrWorkflow] Step 2: Converting PDF to images...');
            const imageResult = await new PDFFormOperationsWorkflow({
                operation: 'convert-to-images',
                pdfData,
                format: imageFormat,
                quality: imageOptions?.quality || 0.8,
                dpi: imageDpi,
                pages: imageOptions?.pages,
                credentials,
            }, this.context).action();
            if (!imageResult.success) {
                throw new Error(`Image conversion failed: ${imageResult.error}`);
            }
            console.log(`[PDFOcrWorkflow] Converted ${imageResult.data?.convertedPages} pages to images`);
            // Step 3: Prepare data for AI analysis
            console.log('[PDFOcrWorkflow] Step 3: Preparing data for AI analysis...');
            const imageInputs = imageResult.data?.images?.map((image) => ({
                type: 'base64',
                data: image.imageData,
                mimeType: image.format === 'png' ? 'image/png' : 'image/jpeg',
                description: `Page ${image.pageNumber} - PDF form field extraction`,
            })) || [];
            const fieldsData = discoveryResult.data?.fields || [];
            const discoveryContext = fieldsData.length > 0
                ? `\nDiscovered form fields metadata:\n${fieldsData
                    .map((field) => `- ID: ${field.id}, Name: "${field.name}", Type: ${field.type}, Page: ${field.page}, Value: "${field.current_value}", Coordinates: (${field.x}, ${field.y}, ${field.width}x${field.height})`)
                    .join('\n')}`
                : '\nNo structured form fields discovered. Perform pure OCR analysis of the images.';
            const basePrompt = isIdentify ? IDENTIFY_MODE_PROMPT : AUTOFILL_MODE_PROMPT;
            // Client information is only appended for autofill mode.
            const clientContext = isAutofill
                ? `\n\nClient Information:\n${this.params.clientInformation}\n\nUse this information to fill appropriate field values.`
                : '';
            const enhancedPrompt = basePrompt + discoveryContext + clientContext;
            // Step 4: AI analysis
            console.log('[PDFOcrWorkflow] Step 4: Performing AI analysis...');
            const aiResult = await new AIAgentBubble({
                message: isIdentify
                    ? PDFOcrWorkflow.#IDENTIFY_MESSAGE
                    : PDFOcrWorkflow.#AUTOFILL_MESSAGE,
                images: imageInputs,
                systemPrompt: enhancedPrompt,
                model: {
                    model: modelName,
                    // `??` so that an explicit temperature of 0 is honoured.
                    temperature: aiOptions?.temperature ?? 0.3,
                    maxTokens: aiOptions?.maxTokens || 50000,
                    jsonMode: aiOptions?.jsonMode ?? true,
                },
                credentials,
                tools: [], // No tools needed for this analysis
                maxIterations: 3,
            }, this.context).action();
            if (!aiResult.success) {
                throw new Error(`AI analysis failed: ${aiResult.error}`);
            }
            console.log('[PDFOcrWorkflow] AI analysis completed');
            // Step 5: Parse AI response and structure results
            console.log('[PDFOcrWorkflow] Step 5: Processing AI results...');
            const extractedFields = PDFOcrWorkflow.#parseExtractedFields(aiResult.data?.response, isAutofill);
            const processingTime = Date.now() - startTime;
            console.log(`[PDFOcrWorkflow] Extracted ${extractedFields.length} fields`);
            console.log(`[PDFOcrWorkflow] Total processing time: ${processingTime}ms`);
            // Step 6 (autofill only): fill the PDF with the extracted values
            let filledPdfData = '';
            let fillResults = { filledFields: 0, successfullyFilled: 0 };
            if (isAutofill) {
                console.log('[PDFOcrWorkflow] Step 6: Filling PDF with extracted values...');
                const fieldValues = PDFOcrWorkflow.#mapValuesToDiscoveredFields(extractedFields, fieldsData);
                const fillCount = Object.keys(fieldValues).length;
                if (fillCount > 0) {
                    console.log(`[PDFOcrWorkflow] Attempting to fill ${fillCount} fields`);
                    const fillResult = await new PDFFormOperationsWorkflow({
                        operation: 'fill',
                        pdfData,
                        fieldValues,
                        credentials,
                    }, this.context).action();
                    if (fillResult.success && fillResult.data) {
                        filledPdfData = fillResult.data.filledPdfData;
                        fillResults = {
                            filledFields: fillCount,
                            successfullyFilled: fillResult.data.filledFields,
                        };
                        console.log(`[PDFOcrWorkflow] Successfully filled ${fillResults.successfullyFilled} fields`);
                    }
                    else {
                        console.warn(`[PDFOcrWorkflow] PDF filling failed: ${fillResult.error}`);
                        // Fall back to original PDF
                        filledPdfData = pdfData;
                    }
                }
                else {
                    console.log('[PDFOcrWorkflow] No field values found for filling, returning original PDF');
                    filledPdfData = pdfData;
                }
            }
            return PDFOcrWorkflow.#buildResult({
                isIdentify,
                extractedFields,
                filledPdfData,
                fillResults,
                discoveryData: {
                    totalFields: discoveryResult.data?.totalFields || 0,
                    fieldsWithCoordinates: fieldsData.filter((f) => f.x !== 0 || f.y !== 0).length,
                    pages: [...new Set(fieldsData.map((f) => f.page))],
                },
                imageData: {
                    totalPages: imageResult.data?.totalPages || 0,
                    convertedPages: imageResult.data?.convertedPages || 0,
                    format: imageFormat,
                    dpi: imageDpi,
                },
                aiAnalysis: {
                    model: modelName,
                    iterations: aiResult.data?.iterations || 0,
                    processingTime,
                },
                success: true,
                error: '',
            });
        }
        catch (error) {
            const processingTime = Date.now() - startTime;
            console.error('[PDFOcrWorkflow] Workflow failed:', error);
            // Same shape as the success result, with zeroed summaries.
            return PDFOcrWorkflow.#buildResult({
                isIdentify,
                extractedFields: [],
                filledPdfData: '',
                fillResults: { filledFields: 0, successfullyFilled: 0 },
                discoveryData: { totalFields: 0, fieldsWithCoordinates: 0, pages: [] },
                imageData: {
                    totalPages: 0,
                    convertedPages: 0,
                    format: imageFormat,
                    dpi: imageDpi,
                },
                aiAnalysis: { model: modelName, iterations: 0, processingTime },
                success: false,
                error: error instanceof Error
                    ? error.message
                    : 'Unknown error during PDF OCR workflow',
            });
        }
    }
    /**
     * Assembles the mode-specific result object. Identify results omit
     * `filledPdfData` and `fillResults`; every other mode includes them.
     */
    static #buildResult({ isIdentify, extractedFields, filledPdfData, fillResults, discoveryData, imageData, aiAnalysis, success, error, }) {
        if (isIdentify) {
            return {
                mode: 'identify',
                extractedFields,
                discoveryData,
                imageData,
                aiAnalysis,
                success,
                error,
            };
        }
        return {
            mode: 'autofill',
            extractedFields,
            filledPdfData,
            discoveryData,
            imageData,
            aiAnalysis,
            fillResults,
            success,
            error,
        };
    }
    /**
     * Clamps an AI-reported confidence into [0, 1]. Numbers and numeric
     * strings are accepted; anything missing or non-numeric yields the
     * default. A reported confidence of 0 is kept as 0.
     */
    static #normalizeConfidence(raw) {
        const candidate = typeof raw === 'string' && raw.trim() !== '' ? Number(raw) : raw;
        if (typeof candidate !== 'number' || Number.isNaN(candidate)) {
            return PDFOcrWorkflow.#DEFAULT_CONFIDENCE;
        }
        return Math.min(Math.max(candidate, 0), 1);
    }
    /** Appends the autofill-only keys to a field when values are requested. */
    static #withAutofillKeys(baseField, includeValues, originalFieldName, value) {
        return includeValues ? { ...baseField, originalFieldName, value } : baseField;
    }
    /**
     * Converts the AI agent's response text into a list of field objects.
     *
     * A JSON array is mapped entry by entry; a JSON object is converted one
     * field per key; any other JSON value yields an empty list. Text that is
     * not valid JSON is handed to the line-based fallback.
     *
     * @param {string|undefined} response Raw response text from the agent.
     * @param {boolean} includeValues Whether to add `originalFieldName` and `value`.
     * @returns {object[]} Extracted field objects.
     */
    static #parseExtractedFields(response, includeValues) {
        let parsed;
        // Only the parse itself is guarded, so that a problem while mapping
        // entries is not mistaken for malformed JSON.
        try {
            parsed = JSON.parse(response || '[]');
        }
        catch {
            console.warn('[PDFOcrWorkflow] Failed to parse AI response as JSON, attempting text extraction');
            return PDFOcrWorkflow.#extractFieldsFromText(response || '', includeValues);
        }
        const generatedId = (index) => index + PDFOcrWorkflow.#GENERATED_ID_OFFSET;
        if (Array.isArray(parsed)) {
            // Optional chaining tolerates null or primitive entries in the array.
            return parsed.map((field, index) => PDFOcrWorkflow.#withAutofillKeys({
                id: field?.id ?? generatedId(index), // keep a provided ID, including 0
                fieldName: field?.fieldName || field?.name || `field_${index + 1}`,
                confidence: PDFOcrWorkflow.#normalizeConfidence(field?.confidence),
            }, includeValues, field?.originalFieldName, field?.value || ''));
        }
        console.warn('[PDFOcrWorkflow] AI response was not an array, attempting to extract fields from object');
        if (typeof parsed !== 'object' || parsed === null) {
            return [];
        }
        // Only the keys are used; the object's values are not carried over.
        return Object.keys(parsed).map((key, index) => PDFOcrWorkflow.#withAutofillKeys({
            id: generatedId(index),
            fieldName: key,
            confidence: 0.7, // Lower confidence for converted data
        }, includeValues, undefined, ''));
    }
    /**
     * Fallback for non-JSON responses: every non-blank line becomes a field.
     * Lines shaped like `name: value` (optionally bulleted) use the name and
     * value; other lines get a generated name and an empty value.
     */
    static #extractFieldsFromText(text, includeValues) {
        return text
            .split('\n')
            .filter((line) => line.trim().length > 0)
            .map((line, index) => {
            const match = line.match(/^[•\-*]?\s*(.+?):\s*(.+)$/);
            const baseField = match
                ? {
                    id: index + PDFOcrWorkflow.#GENERATED_ID_OFFSET,
                    fieldName: match[1].trim(),
                    confidence: 0.6, // Lower confidence for regex-extracted data
                }
                : {
                    id: index + PDFOcrWorkflow.#GENERATED_ID_OFFSET,
                    fieldName: `extracted_text_${index + 1}`,
                    confidence: 0.5,
                };
            const value = match ? (match[2]?.trim() || '') : '';
            return PDFOcrWorkflow.#withAutofillKeys(baseField, includeValues, undefined, value);
        });
    }
    /**
     * Builds the `{ discoveredFieldName: value }` map passed to the fill
     * operation. Each extracted field with a truthy value is matched against
     * the discovered fields by, in order: `originalFieldName`, exact
     * `fieldName`, then a case- and whitespace-insensitive `fieldName`.
     * Fields without a match are logged and skipped.
     */
    static #mapValuesToDiscoveredFields(extractedFields, discoveredFields) {
        // String() guards against a non-string name coming back from the AI.
        const normalize = (name) => String(name).toLowerCase().replace(/\s+/g, '');
        const fieldValues = {};
        // NOTE(review): the DEBUG logs below print client-supplied values to
        // the console; consider redacting them.
        for (const field of extractedFields) {
            if (!('value' in field) || !field.value) {
                continue;
            }
            let matchingDiscoveredField = null;
            // First try: Use originalFieldName if available (most precise)
            if (field.originalFieldName) {
                matchingDiscoveredField = discoveredFields.find((f) => f.name === field.originalFieldName);
                if (matchingDiscoveredField) {
                    console.log(`[PDFOcrWorkflow] DEBUG: Direct match via originalFieldName: "${field.originalFieldName}" = "${field.value}"`);
                }
            }
            // Second try: Match by exact field name
            if (!matchingDiscoveredField) {
                matchingDiscoveredField = discoveredFields.find((f) => f.name === field.fieldName);
                if (matchingDiscoveredField) {
                    console.log(`[PDFOcrWorkflow] DEBUG: Exact match via fieldName: "${field.fieldName}" -> "${matchingDiscoveredField.name}" = "${field.value}"`);
                }
            }
            // Third try: Fuzzy match by normalized field name (remove spaces, lowercase)
            if (!matchingDiscoveredField) {
                const normalizedFieldName = normalize(field.fieldName);
                matchingDiscoveredField = discoveredFields.find((f) => typeof f.name === 'string' && normalize(f.name) === normalizedFieldName);
                if (matchingDiscoveredField) {
                    console.log(`[PDFOcrWorkflow] DEBUG: Fuzzy match: "${field.fieldName}" -> "${matchingDiscoveredField.name}" = "${field.value}"`);
                }
            }
            if (matchingDiscoveredField) {
                fieldValues[matchingDiscoveredField.name] = field.value;
            }
            else {
                console.log(`[PDFOcrWorkflow] DEBUG: No match found for AI field: "${field.fieldName}" (originalFieldName: "${field.originalFieldName}", value: "${field.value}")`);
            }
        }
        return fieldValues;
    }
}
|
|
781
|
+
//# sourceMappingURL=pdf-ocr.workflow.js.map
|