npm - @talonic/docs - Versions diffs - 0.20.13 → 0.20.14 - Mend

@talonic/docs 0.20.13 → 0.20.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (2)

package/dist/content.js +149 -41
package/package.json +1 -1

package/dist/content.js CHANGED

@@ -574,11 +574,11 @@ var sections = [
     faq: [
       {
         question: "What is the Field Registry?",
-        answer: "The Field Registry is a unified knowledge graph of all canonical fields discovered across your documents, organized by tier, clustered semantically, and enriched with master extraction instructions."
+        answer: "The Field Registry is a unified knowledge graph of all canonical fields discovered across your documents, organized by tier, clustered semantically, and enriched with master extraction instructions. Fields progress through three tiers as they mature: Tier 3 (emerging, newly discovered), Tier 2 (established, promoted after repeated occurrence), and Tier 1 (universal, core fields present across most document types). Each tier transition triggers instruction synthesis so the platform learns the optimal way to extract that field."
       },
       {
         question: "What is provenance in Talonic?",
-        answer: "Provenance is per-cell metadata that tracks which pipeline phase filled the value, the confidence score, an AI reasoning trace, and source references back to the original document."
+        answer: "Provenance is per-cell metadata that tracks which pipeline phase filled the value, the confidence score, an AI reasoning trace, and source references back to the original document. You can inspect provenance by hovering any cell in the job results grid to see its confidence score, then clicking to expand the full provenance panel. The panel shows which strategy resolved the value, the raw source text it was derived from, and the AI reasoning chain when applicable."
       },
       {
         question: "How do Cases form?",
@@ -737,11 +737,11 @@ var sections = [
     faq: [
       {
         question: "What is the fastest way to get started with Talonic?",
-        answer: "Upload documents in Sources, then go to Structuring > Runs > New to create your first extraction job. Results appear progressively as each phase completes."
+        answer: "Upload documents in Sources, then go to Structuring > Runs > New to create your first extraction job. Results appear progressively as each phase completes. For a single document, use the quick extract shortcut (Cmd+J / Ctrl+J) to upload and process from any page without navigating to Sources first. Most users see their first structured output within two to three minutes of uploading."
       },
       {
         question: "How is the Talonic platform organized?",
-        answer: "The platform is organized into three primary sections: Sources (document ingest), Structuring (processing & validation), and Outputs (delivery to downstream systems)."
+        answer: "The platform is organized into three primary sections: Sources (document ingest), Structuring (processing & validation), and Outputs (delivery to downstream systems). Sources handles all document ingestion \u2014 manual uploads, cloud connectors, email inboxes, and API ingestion. Structuring is where you define schemas, run extraction jobs, review results, and approve output. Outputs manages delivery bindings that push approved data to webhooks, SFTP, cloud storage, and other downstream systems."
       },
       {
         question: "Do I need to define a schema before processing documents?",
@@ -1182,7 +1182,7 @@ var sections3 = [
       },
       {
         question: "How does Talonic handle image files?",
-        answer: "Image files (PNG, JPG, JPEG, GIF, WEBP) are sent to AI for multimodal visual extraction."
+        answer: "Image files (PNG, JPG, JPEG, GIF, WEBP) are sent to AI for multimodal visual extraction. The AI model sees the image directly and extracts data visually, which is useful for photos of receipts, scanned handwritten notes, or diagrams. If an image was previously OCR'd and produced meaningful Markdown (more than 100 characters), the system uses the Markdown extraction path instead, which enables richer quality metrics and confidence scoring."
       },
       {
         question: "How does Talonic handle large PDF files?",
@@ -1511,7 +1511,7 @@ var sections3 = [
       },
       {
         question: "Can routing rules fully automate my document processing pipeline?",
-        answer: "Yes. By combining routing rules with source connectors and delivery bindings, you can create a fully automated pipeline: documents arrive from a connected source, routing rules assign schemas and trigger extraction jobs, and delivery bindings push approved results to downstream systems."
+        answer: "Yes. By combining routing rules with source connectors and delivery bindings, you can create a fully automated pipeline: documents arrive from a connected source, routing rules assign schemas and trigger extraction jobs, and delivery bindings push approved results to downstream systems. For example, a Google Drive folder receiving weekly invoices can be connected as a source with a routing rule that auto-assigns your Invoice schema and triggers extraction. A delivery binding then pushes approved results to your ERP via webhook \u2014 zero manual steps required."
       }
     ],
     mentions: ["routing rules", "auto-assign", "schema assignment", "document workflows"]
@@ -2212,6 +2212,20 @@ var sections5 = [
         type: "paragraph",
         text: "When configuring a field, start with the basics \u2014 name, type, and registry mapping \u2014 then layer on advanced features as needed. For example, add a **format constraint** to enforce a date pattern, attach a **reference table** for code lookups, or define **capture submoves** to control the exact extraction sequence. Features compose independently, so you can mix and match without conflicts."
       },
+      {
+        type: "list",
+        ordered: false,
+        items: [
+          "**Format constraint** \u2014 Regex validation with configurable mismatch behavior (clear, flag, or replace).",
+          "**Modifiers** \u2014 Post-processing pipeline: format (date/number conversion), alias (value mapping), max_length (truncation).",
+          "**Constraints** \u2014 Validation rules: required, enum, date-format, length, cross-field expressions.",
+          "**Bypass strategy** \u2014 Skip AI extraction: constant value, deterministic ID generator, or reference table lookup.",
+          "**Reference table** \u2014 Key-value pairs for code mapping with a 3-tier lookup cascade (normalization, fuzzy, AI).",
+          "**Manual instruction** \u2014 User-written extraction directive that overrides the AI-synthesized master instruction.",
+          "**Capture submoves** \u2014 Ordered extraction sequence: match (field matching), compute (calculation), reason (LLM inference).",
+          "**Output name** \u2014 Remap the field name in delivery and export output without changing the internal schema name."
+        ]
+      },
       {
         type: "paragraph",
         text: "The **modifier pipeline** runs in a fixed order during Phase 4 of the extraction pipeline: format transforms first (converting dates or numbers to your target format), then alias mapping (replacing values using a lookup), and finally max_length truncation. Constraint evaluation happens after all modifiers have been applied, so constraints validate the final transformed value, not the raw extraction."
@@ -2369,6 +2383,10 @@ var sections5 = [
         type: "paragraph",
         text: "Reference tables are used in two pipeline stages. In **Phase 1**, the lookup cascade runs as part of the resolve step, mapping extracted labels to codes without any AI calls (Tier 1 and Tier 2). In **Phase 3**, the cascade runs again on values produced by Phase 2's AI extraction, normalizing free-text AI output to your canonical codes. This two-pass approach ensures maximum code coverage across the entire pipeline."
       },
+      {
+        type: "paragraph",
+        text: 'For example, consider a "Contract Type" field with a reference table mapping codes to labels: `std_master` = "Master Agreement", `std_service` = "Service Agreement", `std_nda` = "Non-Disclosure Agreement". When the AI extracts "Frame Agreement" from a document, the Phase 3 lookup cascade normalizes it: Tier 1 finds no exact match, Tier 2 fuzzy matching scores "Frame Agreement" against "Master Agreement" at ~0.65 (below the threshold), so Tier 3 AI fallback maps it to `std_master` at 0.50 confidence. Adding "Frame Agreement" as a synonym pointing to `std_master` would promote this to a Tier 1 match (0.95 confidence) in future runs.'
+      },
       {
         type: "paragraph",
         text: 'For best results, include common variations and abbreviations as separate value entries all pointing to the same key. For example, if your code is `US`, add values for "United States", "USA", "U.S.A.", and "United States of America". The more variations you cover, the more values resolve at Tier 1 (highest confidence) without falling through to fuzzy or AI matching.'
@@ -2441,15 +2459,15 @@ var sections5 = [
     faq: [
       {
         question: "How does schema versioning work?",
-        answer: "Templates use a workshop system with three states: Live (published, read-only), Workshop (mutable draft), and Version History (timeline with diffs). Breaking changes like field removals or type changes are detected on promotion. Every published version is immutable, creating a complete audit trail of how your schema evolved over time."
+        answer: "Templates use a workshop system with three states: Live (published, read-only), Workshop (mutable draft), and Version History (timeline with diffs). Breaking changes like field removals or type changes are detected on promotion. Every published version is immutable, creating a complete audit trail of how your schema evolved over time. The diff view highlights added fields, removed fields, type changes, and updated instructions between any two versions."
       },
       {
         question: "What are breaking changes in a schema?",
-        answer: "Breaking changes include field removals and data type changes. The system detects and warns about these when promoting a draft to live, helping you avoid unintended downstream impacts. If a downstream delivery binding depends on a specific field, the warning helps you assess the impact before committing the change."
+        answer: "Breaking changes include field removals and data type changes. The system detects and warns about these when promoting a draft to live, helping you avoid unintended downstream impacts. If a downstream delivery binding depends on a specific field, the warning helps you assess the impact before committing the change. Always run a Test Extraction on representative documents before publishing a draft that includes breaking changes."
       },
       {
         question: "Can I revert to a previous schema version?",
-        answer: "Version history is append-only, so you cannot revert directly. However, you can review any previous version in the timeline, compare it with the current live version using the diff view, and manually re-add fields or settings that were changed. This design ensures that every historical job result always references the exact schema version that produced it."
+        answer: "Version history is append-only, so you cannot revert directly. However, you can review any previous version in the timeline, compare it with the current live version using the diff view, and manually re-add fields or settings that were changed. This design ensures that every historical job result always references the exact schema version that produced it. For safe iteration, always use the Workshop draft to test changes via Test Extraction before publishing a new version."
       }
     ],
     mentions: ["versioning", "drafts", "workshop", "live version", "breaking changes"]
@@ -2554,6 +2572,10 @@ var sections5 = [
           }
         ]
       },
+      {
+        type: "paragraph",
+        text: 'For example, to configure date formatting for a European accounting system: set `date_format` to `DD.MM.YYYY` so dates render as `15.03.2025` instead of the default `YYYY/MM/DD`. Pair this with `number_locale: "de-DE"` for comma-decimal formatting (`1.234,56`) and `delimiter: ";"` so CSV files open correctly in Excel on European locale machines. Save this configuration as a shared dialect named "EU Accounting" and attach it to every schema that feeds into that system \u2014 all future exports and deliveries will use consistent formatting without per-schema configuration.'
+      },
       {
         type: "paragraph",
         text: "When working with international data, configure the dialect to match your downstream system requirements. For example, set **number_locale** to `fr-FR` for European comma-decimal formatting, switch the **delimiter** to semicolon for CSV compatibility, and choose **UTF-8-BOM** encoding if your data will be opened in Excel. Creating a shared dialect and reusing it across schemas ensures consistent formatting across all your exports."
@@ -2640,6 +2662,17 @@ var sections5 = [
         type: "paragraph",
         text: 'Use bypass strategies for fields whose values are known ahead of time or can be derived without reading the document. For example, set a **constant** of `"USD"` for a currency field that is always the same, or use a **generator** to produce a deterministic ID for each row. Fields with bypass strategies skip the AI extraction phase entirely, reducing processing time and credit usage.'
       },
+      {
+        type: "list",
+        ordered: false,
+        items: [
+          "**none** \u2014 Use when a field should always be blank. Useful for placeholder columns in your output that will be populated by a downstream system.",
+          '**constant** \u2014 Use when the value never varies across documents (e.g., currency `"USD"`, data source `"talonic"`, processing status `"pending"`).',
+          "**generator (deterministic-id)** \u2014 Use when you need a unique, reproducible identifier for each row. Produces a hash-based ID from entity attributes.",
+          "**generator (context-fallback)** \u2014 Use when the value can be derived from other fields in the schema without reading the document.",
+          "**reference** \u2014 Use when the value should be looked up from a reference table using a `key_expression` that references another schema field (e.g., map supplier name to ERP vendor code)."
+        ]
+      },
       {
         type: "paragraph",
         text: "The **reference** bypass strategy is particularly powerful for enrichment fields. Define a `key_expression` that references another field in the schema (e.g., the supplier name), and the system will automatically look up the corresponding code from your reference table without any AI involvement. This is ideal for mapping extracted entity names to internal system identifiers, ERP codes, or classification labels."
@@ -2737,15 +2770,15 @@ var sections5 = [
     faq: [
       {
         question: "What are format constraints?",
-        answer: "Format constraints apply regex-based validation to schema fields, evaluated post-extraction in Phase 4. Mismatch behaviors: empty (clear), flag (amber dot), or constant (replace with a fixed value)."
+        answer: 'Format constraints apply regex-based validation to schema fields, evaluated post-extraction in Phase 4 after all transforms have been applied. Mismatch behaviors: empty (clear the cell, the default), flag (keep the value but show an amber dot in the results grid), or constant (replace with a fixed value like "INVALID" or "N/A"). The constraint validates the final transformed value, not the raw extraction.'
       },
       {
         question: "Are original values preserved when format constraints clear a cell?",
-        answer: "Yes. Original values are always preserved for audit in the original_extractions table, regardless of the mismatch behavior applied."
+        answer: "Yes. Original values are always preserved for audit in the original_extractions table, regardless of the mismatch behavior applied. This means you can always review what the AI originally extracted before the constraint was applied, giving you full visibility into the extraction pipeline."
       },
       {
         question: "Can I use case-insensitive regex patterns?",
-        answer: "Yes. Use the (?i) inline flag at the start of your pattern for case-insensitive matching. The evaluator supports standard JavaScript regex syntax with inline flags."
+        answer: "Yes. Use the (?i) inline flag at the start of your pattern for case-insensitive matching. The evaluator supports standard JavaScript regex syntax including character classes, alternation, and lookahead assertions. ReDoS protection is built in \u2014 nested quantifiers are rejected and input is capped at 1,000 characters."
       }
     ],
     mentions: [
@@ -2855,11 +2888,11 @@ var sections6 = [
     faq: [
       {
         question: "What are the four phases of the extraction pipeline?",
-        answer: "Phase 1: Resolve (graph matches, ~30% of cells), Phase 2: Agent (AI strategies), Phase 3: Validation (cross-field checks), and Phase 4: Re-read (targeted gap filling)."
+        answer: "Phase 1: Resolve (graph matches and deterministic lookups, fills 30-80% of cells depending on registry maturity). Phase 2: Agent (AI extraction for remaining gaps, grouped into batches of 10 fields per call). Phase 3: Validation (cross-field checks and reference table re-normalization of AI output). Phase 4: Re-read (targeted gap filling with full grid context, plus deterministic transforms and format constraint evaluation)."
       },
       {
         question: "Can I see results before all phases complete?",
-        answer: "Yes. Results are visible as each phase completes. The fill rate increases progressively through the pipeline."
+        answer: "Yes. The grid is flushed to the database after each phase, enabling progressive rendering in the UI. You can watch cells fill in real time and begin reviewing Phase 1 results while Phase 2 is still running. The phase timeline on the job detail page shows which phase is active and the cumulative fill rate at each stage."
       },
       {
         question: "Why does the pipeline use multiple phases instead of a single AI call?",
@@ -2922,6 +2955,10 @@ var sections6 = [
         type: "paragraph",
         text: "The resolution strategies execute in a fixed order: registry transfer first, then raw extraction mapping, then the 3-tier lookup cascade, and finally deterministic compute (formulas like `Total = Unit Price x Quantity`). Each strategy only attempts to fill cells that are still empty after the previous strategy ran. This ordering ensures that the highest-confidence method always gets priority."
       },
+      {
+        type: "paragraph",
+        text: `For example, consider an invoice with a "Vendor Name" field. The system first checks the Field Registry for a direct transfer \u2014 if "Vendor Name" was extracted from a previous document and promoted to Tier 1, it resolves instantly at 0.85+ confidence. If no registry match exists, the raw extraction mapping looks for a semantically equivalent field in the document's extracted data (e.g., "supplier_name"). If that also misses, the 3-tier lookup cascade checks the reference table: exact normalization first (0.95), then fuzzy token overlap (~0.70), then AI fallback (0.50). The fourth strategy, deterministic compute, applies to formula-derived fields rather than to a text field like this one. Only if all four strategies leave the cell empty does it pass to Phase 2 for AI extraction.`
+      },
       {
         type: "callout",
         text: "Phase 1 fill rates improve over time as your Field Registry grows. The more documents you process, the richer the registry becomes, and the more cells Phase 1 can resolve without AI \u2014 reducing both cost and latency for every subsequent job."
@@ -3004,7 +3041,7 @@ var sections6 = [
       },
       {
         type: "paragraph",
-        text: "Phase 2 processes documents with grouped extraction calls \u2014 schema fields are divided into batches of up to 10 fields per call to balance extraction quality with throughput. For each document, the agent sends the document text along with the schema field definitions and any already-resolved values from Phase 1 as context. This context-aware approach means the AI can use related values (like a contract start date) to more accurately extract dependent values (like the end date)."
+        text: 'Phase 2 processes documents with grouped extraction calls \u2014 schema fields are divided into batches of up to 10 fields per call to balance extraction quality with throughput. For each document, the agent sends the document text along with the schema field definitions and any already-resolved values from Phase 1 as context. This context-aware approach means the AI can use related values (like a contract start date) to more accurately extract dependent values (like the end date). For example, if Phase 1 resolved "Contract Start Date" to 2025-01-15 via a registry transfer, and the "Contract End Date" cell is still empty, the agent receives the start date as context and can search the document for a corresponding end date with higher precision \u2014 producing a more accurate result than extracting the end date in isolation.'
       },
       {
         type: "paragraph",
@@ -3098,6 +3135,10 @@ var sections6 = [
         type: "paragraph",
         text: "Validation flags are designed to surface the most impactful issues first. The **low_confidence_outlier** flag is particularly useful \u2014 it highlights cells where the system is uncertain in an otherwise high-confidence row, pointing you to the exact cells most likely to contain errors. For large runs with hundreds of documents, filtering by flags and reviewing those cells first can reduce your review time by 80% or more."
       },
+      {
+        type: "paragraph",
+        text: "What gets flagged and why depends on cross-field relationships, not just individual values. A **date_sanity** flag fires when temporal fields contradict each other \u2014 for example, a contract end date that falls before the start date, or a signature date after the effective date. An **amount_mismatch** flag fires when a computed total deviates by more than 20% from the product of its component values (e.g., monthly rent times term length versus total contract value). The **unexpected_empty** flag fires when a field that appears in over 80% of documents in your registry is missing from this particular document, suggesting the AI may have missed it rather than it being genuinely absent."
+      },
       {
         type: "callout",
         text: "Validation flags never modify cell values. They are purely informational annotations that help you prioritize review. The actual cell value and confidence score remain unchanged by Phase 3 flagging."
@@ -3166,15 +3207,15 @@ var sections6 = [
     faq: [
       {
         question: "What does Phase 4 Re-read do?",
-        answer: "Phase 4 performs context-aware gap filling by re-reading the original document with field instructions and full grid context for each empty or low-confidence cell."
+        answer: "Phase 4 performs context-aware gap filling by re-reading the original document with field instructions and full grid context for each empty or low-confidence cell. Because it has access to all values resolved in earlier phases, it can use surrounding data as clues \u2014 for example, using a resolved start date to locate the corresponding end date more accurately."
       },
       {
         question: "Can Phase 4 overwrite high-confidence values?",
-        answer: "No. Phase 4 respects the confidence gate \u2014 it can only fill empty cells or upgrade cells below the confidence threshold. High-confidence values from earlier phases are permanently protected."
+        answer: "No. Phase 4 respects the confidence gate \u2014 it can only fill empty cells or upgrade cells below the confidence threshold. High-confidence values from earlier phases are permanently protected. This is the single most important pipeline rule, ensuring that reliable lookup results are never replaced by lower-confidence AI extractions."
       },
       {
         question: "What else happens in Phase 4 besides gap filling?",
-        answer: "Phase 4 also applies deterministic transforms (ISO codes, dates, units), evaluates format constraints (regex validation), and runs the modifier pipeline (format, alias, max_length). Original values are preserved for audit."
+        answer: "Phase 4 also applies deterministic transforms (ISO codes, dates, units), evaluates format constraints (regex validation), and runs the modifier pipeline in a fixed order: format transforms first, then alias mapping, then max_length truncation. Constraint evaluation happens after all modifiers. Original values are always preserved in the original_extractions table for audit, regardless of whether constraints clear, flag, or replace them."
       }
     ],
     mentions: ["Phase 4", "re-read", "gap filling", "confidence gate", "targeted extraction"]
@@ -3221,15 +3262,15 @@ var sections6 = [
     faq: [
       {
         question: "What do the colored dots in the results grid mean?",
-        answer: "Each dot indicates how a cell was resolved: blue = graph match, purple = computed, teal = agent transfer, indigo = agent extract, amber = lookup."
+        answer: "Each dot indicates how a cell was resolved: blue = graph match (Phase 1 registry transfer, highest reliability), purple = computed (deterministic formula), teal = agent transfer (copy from equivalent field), indigo = agent extract (AI read from document), amber = lookup result or format flag. A grid dominated by blue and purple dots typically requires minimal review."
       },
       {
         question: "Can I export extraction results?",
-        answer: "Yes. Use CSV export from the job detail page. You can export clean data only or full data with metadata including confidence scores and resolution types."
+        answer: "Yes. Use CSV export from the job detail page. The clean export includes only extracted values, ready for direct import into downstream systems. The full export includes metadata columns for each field: confidence score, resolution type, phase number, and reasoning trace \u2014 useful for audit trails or analyzing extraction performance across your document corpus."
       },
       {
         question: "What is the most efficient way to review a large extraction run?",
-        answer: "Start with the Flagged filter to address cells with validation warnings, low confidence, or format mismatches. Then spot-check a random sample of Clean rows. Focus corrections on recurring field-level patterns rather than individual cells."
+        answer: "Start with the Flagged filter to address cells with validation warnings, low confidence, or format mismatches. Then spot-check a random sample of Clean rows. Focus corrections on recurring field-level patterns rather than individual cells. If you find a field that is consistently wrong, update its manual instruction or reference table in the schema rather than correcting cells one by one \u2014 this improves future runs as well."
       }
     ],
     mentions: [
@@ -3290,6 +3331,17 @@ var sections6 = [
         type: "paragraph",
         text: "Confidence scores follow predictable patterns by resolution type. Graph matches from Phase 1 typically score 0.7-0.95 because they are derived from verified registry data. Reference table lookups score 0.95 for exact normalization matches, ~0.70 for fuzzy matches, and 0.50 for AI fallback. Agent-derived values from Phase 2 generally score 0.5-0.9 depending on the clarity of the source document and the specificity of the extraction instruction."
       },
+      {
+        type: "list",
+        ordered: false,
+        items: [
+          "**0.90-0.95** \u2014 Tier 1 lookup or exact registry transfer. Highest reliability; safe to trust without review in most workflows.",
+          "**0.70-0.89** \u2014 Strong graph match or fuzzy registry transfer. Generally reliable; spot-check a sample to validate.",
+          "**0.50-0.69** \u2014 AI extraction or fuzzy lookup result. Review recommended; the system found a plausible value but certainty is moderate.",
+          "**0.30-0.49** \u2014 Low-confidence AI extraction. The source document was ambiguous or the field instruction was vague. Always review manually.",
+          "**Below 0.30** \u2014 Very low confidence. The value is likely a best guess. Consider updating the schema instruction or adding a reference table to improve future runs."
+        ]
+      },
       {
         type: "paragraph",
         text: "Use confidence scores to set your review threshold. Cells above 0.8 are generally reliable and can be trusted without manual verification for most use cases. Cells between 0.5 and 0.8 warrant a quick check. Cells below 0.5 should always be reviewed manually. You can use the full CSV export to filter and sort by confidence, making it easy to batch-review low-confidence cells efficiently."
@@ -3363,15 +3415,15 @@ var sections6 = [
     faq: [
       {
         question: "How do I correct an extracted value?",
-        answer: "Click any cell in the results grid to edit its value. Choose propagation scope: this_document_only (single cell) or all_similar (same field + method across all documents)."
+        answer: "Click any cell in the results grid to edit its value. Choose propagation scope: this_document_only (single cell) or all_similar (same field + method across all documents). When using all_similar, the system shows a preview count of how many cells will be affected before you confirm \u2014 always verify this count to avoid unintended bulk changes."
       },
       {
         question: "Do corrections improve future extractions?",
-        answer: "Yes. Corrections feed back as training signals for future runs, helping the system learn from your corrections and improve accuracy over time."
+        answer: "Yes. Corrections feed back as training signals for future runs, helping the system learn from your corrections and improve accuracy over time. For maximum impact, correct the root cause rather than individual symptoms \u2014 update the schema field instruction or reference table so that future runs resolve correctly without manual intervention."
       },
       {
         question: "Is there an audit trail for corrections?",
-        answer: "Yes. Every correction logs the original value, the corrected value, the user who made the change, and the timestamp. This audit history is preserved and included in full metadata CSV exports."
+        answer: "Yes. Every correction logs the original value, the corrected value, the user who made the change, and the timestamp. This audit history is preserved even after subsequent jobs run and is included in full metadata CSV exports. Downstream systems can use this data to distinguish between AI-extracted and human-corrected values."
       }
     ],
     mentions: [
@@ -3786,11 +3838,11 @@ var sections7 = [
     faq: [
       {
         question: "What anomalies does Talonic detect?",
-        answer: "Five structural patterns: validation clusters, field conflicts, duplicate key divergence, missing document types, and value reuse. Each is surfaced as a dismissable card on the case detail page."
+        answer: "Five structural patterns: validation clusters (D1), field conflicts (D2), duplicate key divergence (D3), missing document types (D4), and value reuse (D5). Each is surfaced as a dismissable card on the case detail page. D2 and D3 are the highest-value detectors for procurement and financial workflows \u2014 they catch contradictory values across related documents, such as mismatched amounts between an invoice and its corresponding purchase order."
       },
       {
         question: "Do anomalies update automatically when cases change?",
-        answer: "Yes. The detection engine re-runs whenever case membership changes \u2014 documents added or removed, cases merged or split. Anomaly badges in the case header update in real time."
+        answer: "Yes. The detection engine re-runs whenever case membership changes \u2014 documents added or removed, cases merged or split. Anomaly badges in the case header update in real time. Each detector operates independently, so a single case can trigger multiple anomaly types simultaneously. This continuous re-evaluation ensures that anomalies stay current as your document corpus evolves."
       },
       {
         question: "Can I dismiss anomalies?",
@@ -4011,11 +4063,11 @@ var sections8 = [
       },
       {
         question: "Why should I use assemblies for production data?",
-        answer: "Assemblies provide a single audit trail from source documents through extraction, resolution, and validation to the final output, making them the recommended approach for production datasets."
+        answer: "Assemblies provide a single audit trail from source documents through extraction, resolution, and validation to the final output, making them the recommended approach for production datasets. Unlike ad-hoc exports, assemblies are versioned and reproducible \u2014 you can regenerate the same output shape from different document sets without reconfiguring columns or transforms. Previous versions are retained automatically, so you can compare outputs across time periods and demonstrate compliance with audit requirements."
       },
       {
         question: "Can an assembly pull from multiple sources?",
-        answer: "Yes. An assembly can combine documents from any number of sources \u2014 uploaded files, connected drives, email attachments, and more \u2014 into a single structured dataset."
+        answer: "Yes. An assembly can combine documents from any number of sources \u2014 uploaded files, connected drives, email attachments, and more \u2014 into a single structured dataset. This is particularly useful for cross-functional reporting where data arrives through different channels. For example, you can combine invoices from a Google Drive connector, purchase orders uploaded manually, and contracts ingested via the API into a single unified procurement dataset."
       }
     ],
     mentions: [
@@ -4499,7 +4551,7 @@ var sections10 = [
       },
       {
         type: "paragraph",
-        text: "Every delivery flows through a five-stage pipeline. Producers are stateless \u2014 they only publish typed events into an outbox and never interact with destinations or bindings directly. A background poller drains the outbox every 5 seconds, matches events against active bindings, and enqueues delivery jobs for processing:"
+        text: "Every delivery flows through a five-stage pipeline. Producers are stateless \u2014 they only publish typed events into an outbox and never interact with destinations or bindings directly. A background poller drains the outbox every 5 seconds (configurable via `delivery.poll_interval_ms`), claiming up to 50 rows per tick using `FOR UPDATE SKIP LOCKED` for safe multi-instance operation. When the BullMQ queue depth exceeds the backpressure threshold (default 10,000), the poller pauses until the queue drains, preventing memory exhaustion under burst load. Matched events are enqueued as delivery jobs processed by workers (default concurrency: 10):"
       },
       {
         type: "param-table",
@@ -4633,6 +4685,10 @@ var sections10 = [
         type: "paragraph",
         text: "A single destination can back multiple bindings. For example, one S3 bucket destination can receive both `document.extracted` and `result.approved` events through separate bindings, each with its own serializer and field map. This keeps your destination inventory small while supporting diverse routing requirements."
       },
+      {
+        type: "paragraph",
+        text: 'For example, to set up a webhook destination via the API: `POST /v1/delivery/destinations` with a body containing `name`, `type: "webhook"`, `config: { url: "https://ops.example.com/talonic" }`, and optionally `auth_config`, `signing_secret`, and `payload_cap_bytes`. The response returns the destination ID, which you then reference when creating a binding. After creation, call `POST /v1/delivery/destinations/:id/test` to verify the connection end-to-end before routing live events to it.'
+      },
       {
         type: "paragraph",
         text: "For best results, always run a live-ping test after creating a destination. The test exercises the full transport envelope \u2014 SSRF validation, payload cap, and authentication \u2014 with a tiny test payload, so you catch configuration errors before real events start flowing. OAuth-based destinations (Google Drive, Google Sheets) require connecting your account first via the OAuth flow in the dashboard."
@@ -4688,7 +4744,7 @@ var sections10 = [
       },
       {
         type: "paragraph",
-        text: "The compatibility triangle is enforced on every create and update. The backend checks that your chosen serializer supports the deliverable resolver's output shape, and that the connector accepts the serializer's format. If any predicate fails, the binding is rejected with a descriptive error \u2014 you never end up with a binding that cannot deliver."
+        text: "The compatibility triangle is enforced on every create and update via six predicates. The backend checks that: (1) the `signal_filter` is well-formed with a known event type and valid match values, (2) the `deliverable_type` resolves to a registered resolver, (3) the `serializer_format` resolves to a registered serializer, (4) the serializer supports the resolver's output shape, (5) the connector's supported serializer list includes the chosen format, and (6) the resolver's compatible signals include the signal filter's event type. If any predicate fails, the binding is rejected with a descriptive error \u2014 you never end up with a binding that cannot deliver."
       },
       {
         type: "paragraph",
@@ -4805,7 +4861,7 @@ var sections10 = [
       },
       {
         type: "paragraph",
-        text: "Signals are typed events emitted by the platform when meaningful state changes occur. Document-level signals fire on extraction success or failure. Run-level signals fire when a job completes across dataspace, structuring, resolution, or extraction runs. Result-level signals fire when a reviewer approves, rejects, or flags a record."
+        text: "Signals are typed events emitted by the platform when meaningful state changes occur. They fall into four categories. **Document signals** (`document.extracted`, `document.extraction_failed`) fire on extraction success or failure for individual documents. **Run signals** (`run.dataspace.completed`, `run.structuring.completed`, `run.resolution.completed`, `run.extraction.completed`) fire when a job run completes across the four pipeline domains. **Result signals** (`result.approved`, `result.rejected`, `result.flagged`) fire when a reviewer takes action on a record. **Meta-signals** (`delivery.item.completed`, `delivery.item.failed`) fire when a delivery attempt itself succeeds or fails, enabling self-monitoring workflows."
       },
       {
         type: "paragraph",
@@ -4883,15 +4939,15 @@ var sections10 = [
     faq: [
       {
         question: "How is delivery history tracked?",
-        answer: "Every delivery attempt writes a row to /v1/delivery/items with status, HTTP code, error code, and request/response bodies. The log is strictly append-only \u2014 nothing is ever mutated."
+        answer: "Every delivery attempt writes a row to /v1/delivery/items with status, HTTP code, error code, and request/response bodies (truncated to 10 KB each). The log is strictly append-only \u2014 nothing is ever mutated. You can filter items by binding_id, destination_id, or status to narrow results when debugging a specific integration."
       },
       {
         question: "What is the dead letter queue (DLQ)?",
-        answer: "Terminal failures (retry ladder exhausted or permanent 4xx) escalate to /v1/delivery/dlq. DLQ entries are fully replayable \u2014 replay enqueues a fresh attempt with a new idempotency key."
+        answer: "Terminal failures (retry ladder exhausted or permanent 4xx) escalate to /v1/delivery/dlq. DLQ entries are fully replayable \u2014 replay enqueues a fresh attempt while preserving the deterministic idempotency key, so receivers that deduplicate on the key will not process the same delivery twice. Destinations returning authentication errors are automatically disabled to prevent further failed attempts."
       },
       {
         question: "How long are request and response bodies retained?",
-        answer: "Request and response bodies are cleaned up after the configured retention period (default 30 days). Row metadata \u2014 status, HTTP code, error code, and duration \u2014 is retained indefinitely for audit purposes."
+        answer: "Request and response bodies are cleaned up after the configured retention period (default 30 days) by a daily cleanup job that runs at 03:00 server time. Row metadata \u2014 status, HTTP code, error code, and duration \u2014 is retained indefinitely for audit purposes. Configure the retention period via the delivery.item_body_retention_days setting in pipeline.yaml."
       }
     ],
     mentions: [
@@ -4964,7 +5020,7 @@ var sections11 = [
       },
       {
         question: "When should I use a shared dialect vs an inline dialect?",
-        answer: "Use shared dialects for workspace-wide defaults that apply to most schemas. Use inline dialects only when a specific schema needs different formatting \u2014 for example, a schema that outputs dates in a different format for a particular downstream system."
+        answer: "Use shared dialects for workspace-wide defaults that apply to most schemas. Use inline dialects only when a specific schema needs different formatting \u2014 for example, a schema that outputs dates in DD/MM/YYYY for a European ERP while the rest of your workspace uses YYYY-MM-DD. Inline overrides apply only to that one schema, so they do not affect any other output. If you find yourself overriding the same setting in multiple schemas, consider updating the shared dialect instead."
       },
       {
         question: "Do shared dialects affect the extraction process?",
@@ -5036,7 +5092,7 @@ var sections11 = [
       },
       {
         question: "How does the lookup cascade work?",
-        answer: "The platform tries three tiers: first, exact string normalization (whitespace and case normalization). If that fails, token-based fuzzy matching. If the fuzzy match is below the confidence threshold, a Haiku LLM call resolves the ambiguity."
+        answer: "The platform tries three tiers in sequence. First, exact string normalization strips whitespace and normalizes casing to find a direct match. If no exact match is found, token-based fuzzy matching compares individual tokens against all reference values and scores similarity. If the best fuzzy match falls below the confidence threshold, a Haiku LLM call evaluates the ambiguous value in context against the top candidates and selects the most likely match. This three-tier approach balances speed and accuracy \u2014 most lookups resolve in the first two tiers without any LLM cost."
       },
       {
         question: "What happens when I update a reference primitive?",
@@ -5081,7 +5137,10 @@ var sections11 = [
         items: [
           "**Schema changes** \u2014 field additions, removals, mapping updates, and format constraint modifications.",
           "**Shared dialect changes** \u2014 date format, number locale, delimiter, and encoding updates.",
-          "**Reference primitive changes** \u2014 new versions of lookup tables and key-value modifications."
+          "**Reference primitive changes** \u2014 new versions of lookup tables and key-value modifications.",
+          "**Delivery binding changes** \u2014 modifications to outbound delivery destinations, field maps, or signal filters.",
+          "**Routing rule changes** \u2014 additions or modifications to document routing rules that assign schemas automatically.",
+          "**Format constraint changes** \u2014 regex pattern updates or fallback behavior modifications on schema fields."
         ]
       },
       {
@@ -5162,7 +5221,9 @@ var sections12 = [
           "**Extracted values** \u2014 finds specific data points across all processed documents.",
           "**Field names** \u2014 searches the Field Registry for canonical field definitions.",
           "**Schema names** \u2014 locates generated and template schemas by title.",
-          "**Sources** \u2014 matches source connection names and configurations."
+          "**Sources** \u2014 matches source connection names and configurations.",
+          "**Matching configurations** \u2014 finds matching configs and reference datasets by name.",
+          "**Delivery bindings** \u2014 locates delivery pipeline bindings and destination configurations."
         ]
       }
     ],
@@ -5229,6 +5290,10 @@ var sections12 = [
       {
         type: "paragraph",
         text: 'For best results, save your most common filter combinations as presets. Most teams create presets for categories like "high-value invoices this quarter," "documents missing key fields," or "recently failed extractions." Presets appear as one-click buttons on the Documents page, eliminating the need to rebuild complex filter conditions from scratch each time.'
+      },
+      {
+        type: "paragraph",
+        text: 'For example, to find all high-value invoices from a specific vendor, build a filter with `vendor_name eq "Acme Corp"` AND `document_type eq "Invoice"` AND `total_amount gt 5000`. The field autocomplete ensures you are filtering on valid extracted fields, and the materialized index returns results instantly even across thousands of documents. Save this as a preset called "Acme high-value invoices" for one-click access when you need to review that vendor\'s billing history.'
       }
     ],
     related: [
@@ -5350,6 +5415,10 @@ var sections13 = [
       {
         question: "Can I have multiple API keys?",
         answer: "Yes. You can create as many API keys as needed. Best practice is to create separate keys for each integration so you can rotate or revoke them independently without disrupting other services."
+      },
+      {
+        question: "What are best practices for API key management?",
+        answer: "Store keys in a secrets manager rather than source code or environment files checked into version control. Create one key per integration so each can be rotated independently. Use the narrowest scope possible \u2014 a read-only dashboard needs only the read scope, not extract or write. Rotate keys on a regular schedule and immediately revoke any key that may have been exposed. Monitor API usage per key to detect anomalies early."
       }
     ],
     mentions: ["API keys", "tlnc_", "SHA-256", "Bearer token", "scopes"]
@@ -5727,6 +5796,16 @@ var sections14 = [
         variant: "info",
         text: "Domain matching streamlines onboarding for larger teams. When a new user signs up with an email address matching your organization's domain (e.g., `@yourcompany.com`), they are automatically associated with your org in a **pending** state. An admin must approve them before they gain access."
       },
+      {
+        type: "list",
+        ordered: false,
+        items: [
+          "**Viewer** \u2014 read-only access to documents, extraction results, schemas, and reports. Cannot create, edit, or delete any resources.",
+          "**Member** \u2014 full CRUD access to documents, schemas, jobs, matching configurations, and delivery bindings. Cannot manage team members or workspace settings.",
+          "**Admin** \u2014 all Member permissions plus team management (approve/reject members, change roles), workspace settings (shared dialects, reference primitives, change review), and routing rules.",
+          "**Owner** \u2014 all Admin permissions plus billing management, API key generation and revocation, organization-level settings, and the ability to transfer ownership."
+        ]
+      },
       {
         type: "list",
         ordered: true,
@@ -5861,7 +5940,7 @@ var sections14 = [
       },
       {
         question: "How can I reduce my usage costs?",
-        answer: "Use batch mode for non-urgent documents to cut extraction costs by 50%. Review the per-feature breakdown to identify your highest-cost operations, and use the daily cost chart to spot and investigate usage spikes."
+        answer: "Use batch mode for non-urgent documents to cut extraction costs by 50%. Review the per-feature breakdown to identify your highest-cost operations, and use the daily cost chart to spot and investigate usage spikes. Additionally, invest in building your Field Registry \u2014 as more fields reach Tier 1 and Tier 2, values are resolved via deterministic lookup instead of LLM calls, which reduces per-document extraction cost over time. Leverage routing rules to assign schemas automatically, which avoids manual re-extractions and wasted processing."
       }
     ],
     mentions: [
@@ -6014,7 +6093,7 @@ var sections14 = [
       },
       {
         question: "What does the quick extract shortcut do?",
-        answer: "Cmd+J / Ctrl+J opens the quick extract interface, allowing you to upload and process a document directly from any page. It provides a streamlined drag-and-drop area that immediately processes the uploaded file and displays extraction results."
+        answer: "Cmd+J / Ctrl+J opens the quick extract interface, allowing you to upload and process a document directly from any page. It provides a streamlined drag-and-drop area that immediately processes the uploaded file and displays extraction results. This is the fastest path from receiving a document to seeing structured data \u2014 ideal for one-off documents that arrive via email or chat and need immediate attention without navigating to the upload page."
       },
       {
         question: "Do shortcuts work inside modals or overlays?",
@@ -6134,6 +6213,17 @@ var sections15 = [
       {
         type: "paragraph",
         text: "You can also enable batch mode on a per-source basis. When a source connection has the batch processing toggle enabled, all documents ingested through that source are automatically routed to the batch queue. This is ideal for source connections that handle non-urgent, high-volume ingestion \u2014 such as a shared drive that collects documents overnight."
+      },
+      {
+        type: "list",
+        ordered: false,
+        items: [
+          "**Included in batch:** Stage 2 Claude extraction, markdown pre-processing, field parsing, quality metrics computation, extraction metadata, and all post-processing that does not require LLM calls.",
+          "**Excluded from batch:** LLM-based quality passes (field estimation, verification, cross-reference enrichment) are skipped to preserve cost savings.",
+          "**Excluded from batch:** Image-only documents (PNG, JPG) are automatically routed to real-time processing because the batch payload is text-only.",
+          "**Fallback behavior:** Parse failures in batch mode are retried through the real-time extraction path \u2014 never as a new batch \u2014 to maintain the 48-hour SLA.",
+          "**Minimum threshold:** Batches require at least 100 items (a provider requirement). Uploads below this threshold fall back to real-time processing with a warning."
+        ]
       }
     ],
     related: [
@@ -6187,6 +6277,10 @@ var sections15 = [
         type: "paragraph",
         text: "The batch detail view shows individual items within a batch, including which documents are included, their current processing state, and any errors that occurred. Use this view to verify that a specific document was included in the expected batch and to troubleshoot items that failed to parse."
       },
+      {
+        type: "paragraph",
+        text: "For example, after uploading 500 invoices in batch mode, navigate to `/sources/batches` to check progress. You will see a batch in **accumulating** status collecting items until the 15-minute timer fires. Once submitted, the status changes to **submitted** and the platform polls the provider hourly. Click the batch row to see each document's individual state \u2014 if 3 items show parse errors, those documents were automatically retried via the real-time path while the remaining 497 completed normally. When the batch transitions to **completed**, all results have been applied and documents are ready for review."
+      },
       {
         type: "paragraph",
         text: "The platform includes built-in crash recovery for batch processing. If the application restarts while a batch is in a transient `processing` state, the recovery logic automatically reverts it to `submitted` so the next polling cycle can retry. This means batch jobs are resilient to infrastructure disruptions without requiring manual intervention."
@@ -6306,7 +6400,7 @@ var sections16 = [
       },
       {
         question: "How is reference data used?",
-        answer: "Reference datasets are used by the matching engine for field-to-field comparisons and by reference strategies in schemas for code mapping and value resolution."
+        answer: "Reference datasets serve two purposes. First, the matching engine uses them for field-to-field comparisons \u2014 comparing extracted document values against reference rows using weighted strategies (exact, fuzzy, date_range, numeric_range). Second, reference strategies in schemas use them for code mapping and value resolution, translating labels found in documents into canonical codes defined in the reference dataset."
       },
       {
         question: "Can I import reference data from a database?",
@@ -6379,6 +6473,16 @@ var sections16 = [
         type: "callout",
         variant: "info",
         text: "Use **AI strategy generation** when setting up matching for the first time. The platform analyzes your schema fields and reference data columns, then suggests which fields to compare and which strategy to use for each. You can review and adjust the suggestions before saving."
+      },
+      {
+        type: "list",
+        ordered: false,
+        items: [
+          "**exact** \u2014 case-insensitive string comparison. Best for unique identifiers like PO numbers, invoice IDs, and reference codes where values should match verbatim.",
+          "**fuzzy** \u2014 token-based similarity with a configurable threshold (0-100%). Handles misspellings, abbreviations, and word reordering. Ideal for company names, addresses, and descriptions.",
+          "**date_range** \u2014 matches dates within a configurable tolerance window (e.g., +/- 7 days). Useful when documents report dates with slight offsets, such as invoice date vs. received date.",
+          "**numeric_range** \u2014 matches numbers within a percentage or absolute tolerance. Handles rounding differences in amounts, quantities, and prices across systems."
+        ]
       }
     ],
     related: [
@@ -6527,6 +6631,10 @@ var sections16 = [
         type: "callout",
         variant: "info",
         text: "You can **approve or reject** individual match results. Approved matches can be used downstream in delivery pipelines. Rejected matches are excluded from future consideration for that document."
+      },
+      {
+        type: "paragraph",
+        text: 'Consider a practical example: you receive an invoice from "Acme Corp" with a total of $12,450 dated 2025-03-15. The matching engine returns the top candidate as "ACME Corporation" in your reference data with a confidence score of 87%. The evidence view shows the vendor name scored 92% via fuzzy match (handling "Corp" vs "Corporation"), the amount scored 100% via exact match, and the date scored 78% via date_range because the reference shows a PO date of 2025-03-10 \u2014 within the 7-day tolerance. You can quickly verify the match is correct and approve it, sending the linked record downstream.'
       }
     ],
     related: [

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@talonic/docs",
-  "version": "0.20.13",
+  "version": "0.20.14",
   "description": "Talonic documentation components — API Reference & Platform Guide",
   "license": "UNLICENSED",
   "private": false,