convex-cms 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/commands/admin.d.ts +16 -0
- package/dist/cli/commands/admin.d.ts.map +1 -0
- package/dist/cli/commands/admin.js +88 -0
- package/dist/cli/commands/admin.js.map +1 -0
- package/dist/cli/index.d.ts +3 -0
- package/dist/cli/index.d.ts.map +1 -0
- package/dist/cli/index.js +18 -0
- package/dist/cli/index.js.map +1 -0
- package/dist/cli/utils/detectConvexUrl.d.ts +13 -0
- package/dist/cli/utils/detectConvexUrl.d.ts.map +1 -0
- package/dist/cli/utils/detectConvexUrl.js +48 -0
- package/dist/cli/utils/detectConvexUrl.js.map +1 -0
- package/dist/cli/utils/openBrowser.d.ts +7 -0
- package/dist/cli/utils/openBrowser.d.ts.map +1 -0
- package/dist/cli/utils/openBrowser.js +17 -0
- package/dist/cli/utils/openBrowser.js.map +1 -0
- package/dist/client/admin-config.d.ts +126 -0
- package/dist/client/admin-config.d.ts.map +1 -0
- package/dist/client/admin-config.js +117 -0
- package/dist/client/admin-config.js.map +1 -0
- package/dist/client/adminApi.d.ts +2273 -0
- package/dist/client/adminApi.d.ts.map +1 -0
- package/dist/client/adminApi.js +716 -0
- package/dist/client/adminApi.js.map +1 -0
- package/dist/client/agentTools.d.ts +933 -0
- package/dist/client/agentTools.d.ts.map +1 -0
- package/dist/client/agentTools.js +1004 -0
- package/dist/client/agentTools.js.map +1 -0
- package/dist/client/argTypes.d.ts +212 -0
- package/dist/client/argTypes.d.ts.map +1 -0
- package/dist/client/argTypes.js +5 -0
- package/dist/client/argTypes.js.map +1 -0
- package/dist/client/field-types.d.ts +55 -0
- package/dist/client/field-types.d.ts.map +1 -0
- package/dist/client/field-types.js +152 -0
- package/dist/client/field-types.js.map +1 -0
- package/dist/client/index.d.ts +189 -0
- package/dist/client/index.d.ts.map +1 -0
- package/dist/client/index.js +668 -0
- package/dist/client/index.js.map +1 -0
- package/dist/client/queryBuilder.d.ts +765 -0
- package/dist/client/queryBuilder.d.ts.map +1 -0
- package/dist/client/queryBuilder.js +970 -0
- package/dist/client/queryBuilder.js.map +1 -0
- package/dist/client/schema/codegen.d.ts +128 -0
- package/dist/client/schema/codegen.d.ts.map +1 -0
- package/dist/client/schema/codegen.js +318 -0
- package/dist/client/schema/codegen.js.map +1 -0
- package/dist/client/schema/defineContentType.d.ts +221 -0
- package/dist/client/schema/defineContentType.d.ts.map +1 -0
- package/dist/client/schema/defineContentType.js +380 -0
- package/dist/client/schema/defineContentType.js.map +1 -0
- package/dist/client/schema/index.d.ts +85 -0
- package/dist/client/schema/index.d.ts.map +1 -0
- package/dist/client/schema/index.js +92 -0
- package/dist/client/schema/index.js.map +1 -0
- package/dist/client/schema/schemaDrift.d.ts +199 -0
- package/dist/client/schema/schemaDrift.d.ts.map +1 -0
- package/dist/client/schema/schemaDrift.js +340 -0
- package/dist/client/schema/schemaDrift.js.map +1 -0
- package/dist/client/schema/typedClient.d.ts +401 -0
- package/dist/client/schema/typedClient.d.ts.map +1 -0
- package/dist/client/schema/typedClient.js +269 -0
- package/dist/client/schema/typedClient.js.map +1 -0
- package/dist/client/schema/types.d.ts +477 -0
- package/dist/client/schema/types.d.ts.map +1 -0
- package/dist/client/schema/types.js +39 -0
- package/dist/client/schema/types.js.map +1 -0
- package/dist/client/types.d.ts +449 -0
- package/dist/client/types.d.ts.map +1 -0
- package/dist/client/types.js +149 -0
- package/dist/client/types.js.map +1 -0
- package/dist/client/workflows.d.ts +51 -0
- package/dist/client/workflows.d.ts.map +1 -0
- package/dist/client/workflows.js +103 -0
- package/dist/client/workflows.js.map +1 -0
- package/dist/client/wrapper.d.ts +2198 -0
- package/dist/client/wrapper.d.ts.map +1 -0
- package/dist/client/wrapper.js +2651 -0
- package/dist/client/wrapper.js.map +1 -0
- package/dist/component/_generated/api.d.ts +124 -0
- package/dist/component/_generated/api.d.ts.map +1 -0
- package/dist/component/_generated/api.js +31 -0
- package/dist/component/_generated/api.js.map +1 -0
- package/dist/component/_generated/component.d.ts +4321 -0
- package/dist/component/_generated/component.d.ts.map +1 -0
- package/dist/component/_generated/component.js +11 -0
- package/dist/component/_generated/component.js.map +1 -0
- package/dist/component/_generated/dataModel.d.ts +46 -0
- package/dist/component/_generated/dataModel.d.ts.map +1 -0
- package/dist/component/_generated/dataModel.js +11 -0
- package/dist/component/_generated/dataModel.js.map +1 -0
- package/dist/component/_generated/server.d.ts +121 -0
- package/dist/component/_generated/server.d.ts.map +1 -0
- package/dist/component/_generated/server.js +78 -0
- package/dist/component/_generated/server.js.map +1 -0
- package/dist/component/auditLog.d.ts +410 -0
- package/dist/component/auditLog.d.ts.map +1 -0
- package/dist/component/auditLog.js +607 -0
- package/dist/component/auditLog.js.map +1 -0
- package/dist/component/authorization.d.ts +323 -0
- package/dist/component/authorization.d.ts.map +1 -0
- package/dist/component/authorization.js +464 -0
- package/dist/component/authorization.js.map +1 -0
- package/dist/component/authorizationHooks.d.ts +184 -0
- package/dist/component/authorizationHooks.d.ts.map +1 -0
- package/dist/component/authorizationHooks.js +521 -0
- package/dist/component/authorizationHooks.js.map +1 -0
- package/dist/component/bulkOperations.d.ts +200 -0
- package/dist/component/bulkOperations.d.ts.map +1 -0
- package/dist/component/bulkOperations.js +568 -0
- package/dist/component/bulkOperations.js.map +1 -0
- package/dist/component/contentEntries.d.ts +719 -0
- package/dist/component/contentEntries.d.ts.map +1 -0
- package/dist/component/contentEntries.js +1617 -0
- package/dist/component/contentEntries.js.map +1 -0
- package/dist/component/contentEntryMutations.d.ts +505 -0
- package/dist/component/contentEntryMutations.d.ts.map +1 -0
- package/dist/component/contentEntryMutations.js +1009 -0
- package/dist/component/contentEntryMutations.js.map +1 -0
- package/dist/component/contentEntryValidation.d.ts +115 -0
- package/dist/component/contentEntryValidation.d.ts.map +1 -0
- package/dist/component/contentEntryValidation.js +546 -0
- package/dist/component/contentEntryValidation.js.map +1 -0
- package/dist/component/contentLock.d.ts +328 -0
- package/dist/component/contentLock.d.ts.map +1 -0
- package/dist/component/contentLock.js +471 -0
- package/dist/component/contentLock.js.map +1 -0
- package/dist/component/contentTypeMigration.d.ts +411 -0
- package/dist/component/contentTypeMigration.d.ts.map +1 -0
- package/dist/component/contentTypeMigration.js +805 -0
- package/dist/component/contentTypeMigration.js.map +1 -0
- package/dist/component/contentTypeMutations.d.ts +975 -0
- package/dist/component/contentTypeMutations.d.ts.map +1 -0
- package/dist/component/contentTypeMutations.js +768 -0
- package/dist/component/contentTypeMutations.js.map +1 -0
- package/dist/component/contentTypes.d.ts +538 -0
- package/dist/component/contentTypes.d.ts.map +1 -0
- package/dist/component/contentTypes.js +304 -0
- package/dist/component/contentTypes.js.map +1 -0
- package/dist/component/convex.config.d.ts +42 -0
- package/dist/component/convex.config.d.ts.map +1 -0
- package/dist/component/convex.config.js +43 -0
- package/dist/component/convex.config.js.map +1 -0
- package/dist/component/documentTypes.d.ts +186 -0
- package/dist/component/documentTypes.d.ts.map +1 -0
- package/dist/component/documentTypes.js +23 -0
- package/dist/component/documentTypes.js.map +1 -0
- package/dist/component/eventEmitter.d.ts +281 -0
- package/dist/component/eventEmitter.d.ts.map +1 -0
- package/dist/component/eventEmitter.js +300 -0
- package/dist/component/eventEmitter.js.map +1 -0
- package/dist/component/exportImport.d.ts +1120 -0
- package/dist/component/exportImport.d.ts.map +1 -0
- package/dist/component/exportImport.js +931 -0
- package/dist/component/exportImport.js.map +1 -0
- package/dist/component/index.d.ts +28 -0
- package/dist/component/index.d.ts.map +1 -0
- package/dist/component/index.js +142 -0
- package/dist/component/index.js.map +1 -0
- package/dist/component/lib/deepReferenceResolver.d.ts +252 -0
- package/dist/component/lib/deepReferenceResolver.d.ts.map +1 -0
- package/dist/component/lib/deepReferenceResolver.js +601 -0
- package/dist/component/lib/deepReferenceResolver.js.map +1 -0
- package/dist/component/lib/errors.d.ts +306 -0
- package/dist/component/lib/errors.d.ts.map +1 -0
- package/dist/component/lib/errors.js +407 -0
- package/dist/component/lib/errors.js.map +1 -0
- package/dist/component/lib/index.d.ts +10 -0
- package/dist/component/lib/index.d.ts.map +1 -0
- package/dist/component/lib/index.js +33 -0
- package/dist/component/lib/index.js.map +1 -0
- package/dist/component/lib/mediaReferenceResolver.d.ts +217 -0
- package/dist/component/lib/mediaReferenceResolver.d.ts.map +1 -0
- package/dist/component/lib/mediaReferenceResolver.js +326 -0
- package/dist/component/lib/mediaReferenceResolver.js.map +1 -0
- package/dist/component/lib/metadataExtractor.d.ts +245 -0
- package/dist/component/lib/metadataExtractor.d.ts.map +1 -0
- package/dist/component/lib/metadataExtractor.js +548 -0
- package/dist/component/lib/metadataExtractor.js.map +1 -0
- package/dist/component/lib/mutationAuth.d.ts +95 -0
- package/dist/component/lib/mutationAuth.d.ts.map +1 -0
- package/dist/component/lib/mutationAuth.js +146 -0
- package/dist/component/lib/mutationAuth.js.map +1 -0
- package/dist/component/lib/queries.d.ts +17 -0
- package/dist/component/lib/queries.d.ts.map +1 -0
- package/dist/component/lib/queries.js +49 -0
- package/dist/component/lib/queries.js.map +1 -0
- package/dist/component/lib/ragContentChunker.d.ts +423 -0
- package/dist/component/lib/ragContentChunker.d.ts.map +1 -0
- package/dist/component/lib/ragContentChunker.js +897 -0
- package/dist/component/lib/ragContentChunker.js.map +1 -0
- package/dist/component/lib/referenceResolver.d.ts +175 -0
- package/dist/component/lib/referenceResolver.d.ts.map +1 -0
- package/dist/component/lib/referenceResolver.js +293 -0
- package/dist/component/lib/referenceResolver.js.map +1 -0
- package/dist/component/lib/slugGenerator.d.ts +71 -0
- package/dist/component/lib/slugGenerator.d.ts.map +1 -0
- package/dist/component/lib/slugGenerator.js +207 -0
- package/dist/component/lib/slugGenerator.js.map +1 -0
- package/dist/component/lib/slugUniqueness.d.ts +131 -0
- package/dist/component/lib/slugUniqueness.d.ts.map +1 -0
- package/dist/component/lib/slugUniqueness.js +229 -0
- package/dist/component/lib/slugUniqueness.js.map +1 -0
- package/dist/component/lib/softDelete.d.ts +18 -0
- package/dist/component/lib/softDelete.d.ts.map +1 -0
- package/dist/component/lib/softDelete.js +29 -0
- package/dist/component/lib/softDelete.js.map +1 -0
- package/dist/component/localeFallbackChain.d.ts +410 -0
- package/dist/component/localeFallbackChain.d.ts.map +1 -0
- package/dist/component/localeFallbackChain.js +467 -0
- package/dist/component/localeFallbackChain.js.map +1 -0
- package/dist/component/localeFields.d.ts +508 -0
- package/dist/component/localeFields.d.ts.map +1 -0
- package/dist/component/localeFields.js +592 -0
- package/dist/component/localeFields.js.map +1 -0
- package/dist/component/mediaAssetMutations.d.ts +235 -0
- package/dist/component/mediaAssetMutations.d.ts.map +1 -0
- package/dist/component/mediaAssetMutations.js +558 -0
- package/dist/component/mediaAssetMutations.js.map +1 -0
- package/dist/component/mediaAssets.d.ts +168 -0
- package/dist/component/mediaAssets.d.ts.map +1 -0
- package/dist/component/mediaAssets.js +618 -0
- package/dist/component/mediaAssets.js.map +1 -0
- package/dist/component/mediaFolderMutations.d.ts +642 -0
- package/dist/component/mediaFolderMutations.d.ts.map +1 -0
- package/dist/component/mediaFolderMutations.js +849 -0
- package/dist/component/mediaFolderMutations.js.map +1 -0
- package/dist/component/mediaUploadMutations.d.ts +136 -0
- package/dist/component/mediaUploadMutations.d.ts.map +1 -0
- package/dist/component/mediaUploadMutations.js +205 -0
- package/dist/component/mediaUploadMutations.js.map +1 -0
- package/dist/component/mediaVariantMutations.d.ts +468 -0
- package/dist/component/mediaVariantMutations.d.ts.map +1 -0
- package/dist/component/mediaVariantMutations.js +737 -0
- package/dist/component/mediaVariantMutations.js.map +1 -0
- package/dist/component/mediaVariants.d.ts +525 -0
- package/dist/component/mediaVariants.d.ts.map +1 -0
- package/dist/component/mediaVariants.js +661 -0
- package/dist/component/mediaVariants.js.map +1 -0
- package/dist/component/ragContentIndexer.d.ts +595 -0
- package/dist/component/ragContentIndexer.d.ts.map +1 -0
- package/dist/component/ragContentIndexer.js +794 -0
- package/dist/component/ragContentIndexer.js.map +1 -0
- package/dist/component/rateLimitHooks.d.ts +266 -0
- package/dist/component/rateLimitHooks.d.ts.map +1 -0
- package/dist/component/rateLimitHooks.js +412 -0
- package/dist/component/rateLimitHooks.js.map +1 -0
- package/dist/component/roles.d.ts +649 -0
- package/dist/component/roles.d.ts.map +1 -0
- package/dist/component/roles.js +884 -0
- package/dist/component/roles.js.map +1 -0
- package/dist/component/scheduledPublish.d.ts +182 -0
- package/dist/component/scheduledPublish.d.ts.map +1 -0
- package/dist/component/scheduledPublish.js +304 -0
- package/dist/component/scheduledPublish.js.map +1 -0
- package/dist/component/schema.d.ts +4114 -0
- package/dist/component/schema.d.ts.map +1 -0
- package/dist/component/schema.js +469 -0
- package/dist/component/schema.js.map +1 -0
- package/dist/component/taxonomies.d.ts +476 -0
- package/dist/component/taxonomies.d.ts.map +1 -0
- package/dist/component/taxonomies.js +785 -0
- package/dist/component/taxonomies.js.map +1 -0
- package/dist/component/taxonomyMutations.d.ts +206 -0
- package/dist/component/taxonomyMutations.d.ts.map +1 -0
- package/dist/component/taxonomyMutations.js +1001 -0
- package/dist/component/taxonomyMutations.js.map +1 -0
- package/dist/component/trash.d.ts +265 -0
- package/dist/component/trash.d.ts.map +1 -0
- package/dist/component/trash.js +621 -0
- package/dist/component/trash.js.map +1 -0
- package/dist/component/types.d.ts +4 -0
- package/dist/component/types.d.ts.map +1 -0
- package/dist/component/types.js +2 -0
- package/dist/component/types.js.map +1 -0
- package/dist/component/userContext.d.ts +508 -0
- package/dist/component/userContext.d.ts.map +1 -0
- package/dist/component/userContext.js +615 -0
- package/dist/component/userContext.js.map +1 -0
- package/dist/component/validation.d.ts +387 -0
- package/dist/component/validation.d.ts.map +1 -0
- package/dist/component/validation.js +1052 -0
- package/dist/component/validation.js.map +1 -0
- package/dist/component/validators.d.ts +4645 -0
- package/dist/component/validators.d.ts.map +1 -0
- package/dist/component/validators.js +641 -0
- package/dist/component/validators.js.map +1 -0
- package/dist/component/versionMutations.d.ts +216 -0
- package/dist/component/versionMutations.d.ts.map +1 -0
- package/dist/component/versionMutations.js +321 -0
- package/dist/component/versionMutations.js.map +1 -0
- package/dist/component/webhookTrigger.d.ts +770 -0
- package/dist/component/webhookTrigger.d.ts.map +1 -0
- package/dist/component/webhookTrigger.js +1413 -0
- package/dist/component/webhookTrigger.js.map +1 -0
- package/dist/react/index.d.ts +316 -0
- package/dist/react/index.d.ts.map +1 -0
- package/dist/react/index.js +558 -0
- package/dist/react/index.js.map +1 -0
- package/dist/test.d.ts +2230 -0
- package/dist/test.d.ts.map +1 -0
- package/dist/test.js +1107 -0
- package/dist/test.js.map +1 -0
- package/package.json +95 -0
- package/src/cli/commands/admin.ts +104 -0
- package/src/cli/index.ts +21 -0
- package/src/cli/utils/detectConvexUrl.ts +54 -0
- package/src/cli/utils/openBrowser.ts +16 -0
- package/src/client/admin-config.ts +138 -0
- package/src/client/adminApi.ts +942 -0
- package/src/client/agentTools.ts +1311 -0
- package/src/client/argTypes.ts +316 -0
- package/src/client/field-types.ts +187 -0
- package/src/client/index.ts +1301 -0
- package/src/client/queryBuilder.ts +1100 -0
- package/src/client/schema/codegen.ts +500 -0
- package/src/client/schema/defineContentType.ts +501 -0
- package/src/client/schema/index.ts +169 -0
- package/src/client/schema/schemaDrift.ts +574 -0
- package/src/client/schema/typedClient.ts +688 -0
- package/src/client/schema/types.ts +666 -0
- package/src/client/types.ts +723 -0
- package/src/client/workflows.ts +141 -0
- package/src/client/wrapper.ts +4304 -0
- package/src/component/_generated/api.ts +140 -0
- package/src/component/_generated/component.ts +5029 -0
- package/src/component/_generated/dataModel.ts +60 -0
- package/src/component/_generated/server.ts +156 -0
- package/src/component/authorization.ts +647 -0
- package/src/component/authorizationHooks.ts +668 -0
- package/src/component/bulkOperations.ts +687 -0
- package/src/component/contentEntries.ts +1976 -0
- package/src/component/contentEntryMutations.ts +1223 -0
- package/src/component/contentEntryValidation.ts +707 -0
- package/src/component/contentLock.ts +550 -0
- package/src/component/contentTypeMigration.ts +1064 -0
- package/src/component/contentTypeMutations.ts +969 -0
- package/src/component/contentTypes.ts +346 -0
- package/src/component/convex.config.ts +44 -0
- package/src/component/documentTypes.ts +240 -0
- package/src/component/eventEmitter.ts +485 -0
- package/src/component/exportImport.ts +1169 -0
- package/src/component/index.ts +491 -0
- package/src/component/lib/deepReferenceResolver.ts +999 -0
- package/src/component/lib/errors.ts +816 -0
- package/src/component/lib/index.ts +145 -0
- package/src/component/lib/mediaReferenceResolver.ts +495 -0
- package/src/component/lib/metadataExtractor.ts +792 -0
- package/src/component/lib/mutationAuth.ts +199 -0
- package/src/component/lib/queries.ts +79 -0
- package/src/component/lib/ragContentChunker.ts +1371 -0
- package/src/component/lib/referenceResolver.ts +430 -0
- package/src/component/lib/slugGenerator.ts +262 -0
- package/src/component/lib/slugUniqueness.ts +333 -0
- package/src/component/lib/softDelete.ts +44 -0
- package/src/component/localeFallbackChain.ts +673 -0
- package/src/component/localeFields.ts +896 -0
- package/src/component/mediaAssetMutations.ts +725 -0
- package/src/component/mediaAssets.ts +932 -0
- package/src/component/mediaFolderMutations.ts +1046 -0
- package/src/component/mediaUploadMutations.ts +224 -0
- package/src/component/mediaVariantMutations.ts +900 -0
- package/src/component/mediaVariants.ts +793 -0
- package/src/component/ragContentIndexer.ts +1067 -0
- package/src/component/rateLimitHooks.ts +572 -0
- package/src/component/roles.ts +1360 -0
- package/src/component/scheduledPublish.ts +358 -0
- package/src/component/schema.ts +617 -0
- package/src/component/taxonomies.ts +949 -0
- package/src/component/taxonomyMutations.ts +1210 -0
- package/src/component/trash.ts +724 -0
- package/src/component/userContext.ts +898 -0
- package/src/component/validation.ts +1388 -0
- package/src/component/validators.ts +949 -0
- package/src/component/versionMutations.ts +392 -0
- package/src/component/webhookTrigger.ts +1922 -0
- package/src/react/index.ts +898 -0
- package/src/test.ts +1580 -0
|
@@ -0,0 +1,1371 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* RAG Content Chunker
|
|
3
|
+
*
|
|
4
|
+
* Utility to extract and structure content from CMS entries for @convex-dev/rag indexing.
|
|
5
|
+
* This module provides:
|
|
6
|
+
*
|
|
7
|
+
* 1. **Content Extraction**: Extracts text from various CMS field types (text, richText, json, etc.)
|
|
8
|
+
* 2. **Semantic Chunking**: Splits content into meaningful chunks optimized for embedding
|
|
9
|
+
* 3. **Metadata Tagging**: Attaches relevant metadata (content type, field source, locale, etc.)
|
|
10
|
+
* 4. **Reference Handling**: Processes embedded references and includes contextual information
|
|
11
|
+
*
|
|
12
|
+
* The output is designed to be directly compatible with @convex-dev/rag's `add()` function.
|
|
13
|
+
*
|
|
14
|
+
* @example
|
|
15
|
+
* ```typescript
|
|
16
|
+
* import { extractContentForRag, chunkContentEntry } from "@convex-cms/core/lib";
|
|
17
|
+
* import { rag } from "@convex-dev/rag";
|
|
18
|
+
*
|
|
19
|
+
* // In a Convex action
|
|
20
|
+
* const chunks = await chunkContentEntry(ctx, entry, contentType, {
|
|
21
|
+
* includeMetadata: true,
|
|
22
|
+
* chunkOptions: { maxCharsSoftLimit: 1000 },
|
|
23
|
+
* });
|
|
24
|
+
*
|
|
25
|
+
* await rag.add(ctx, {
|
|
26
|
+
* namespace: "cms-content",
|
|
27
|
+
* key: entry._id,
|
|
28
|
+
* chunks: chunks.map(c => c.text),
|
|
29
|
+
* title: chunks[0]?.metadata?.title,
|
|
30
|
+
* });
|
|
31
|
+
* ```
|
|
32
|
+
*
|
|
33
|
+
* @module
|
|
34
|
+
*/
|
|
35
|
+
|
|
36
|
+
// =============================================================================
|
|
37
|
+
// Type Definitions
|
|
38
|
+
// =============================================================================
|
|
39
|
+
|
|
40
|
+
/**
 * Shape of a single field definition as declared in a CMS content type schema.
 */
export interface FieldDefinition {
  /** Field name (the key under which the value is stored). */
  name: string;
  /** Human-readable label for the field. */
  label: string;
  /** Field type identifier (e.g. text, richText, json). */
  type: string;
  /** Whether the field is mandatory. */
  required: boolean;
  /** Optional flag marking the field as searchable. */
  searchable?: boolean;
  /** Optional flag marking the field as localized. */
  localized?: boolean;
  /** Optional free-form description of the field. */
  description?: string;
  /**
   * Optional type-specific settings. Two keys are named explicitly; any other
   * key is accepted with an `unknown` value.
   */
  options?: {
    allowedContentTypes?: Array<string>;
    multiple?: boolean;
    [key: string]: unknown;
  };
}
|
|
57
|
+
|
|
58
|
+
/**
 * Shape of a CMS content type as consumed by the chunker.
 */
export interface ContentTypeInfo {
  /** Content type document ID. */
  _id: string;
  /** Machine name of the content type. */
  name: string;
  /** Human-readable name of the content type. */
  displayName: string;
  /** Field definitions that make up the content type. */
  fields: Array<FieldDefinition>;
  /** Optional name of the field used as the entry title. */
  titleField?: string;
  /** Optional name of the field used as the entry slug. */
  slugField?: string;
}
|
|
69
|
+
|
|
70
|
+
/**
 * Shape of a CMS content entry as consumed by the chunker.
 */
export interface ContentEntryInfo {
  /** Entry document ID. */
  _id: string;
  /** ID of the content type this entry belongs to. */
  contentTypeId: string;
  /** URL slug of the entry. */
  slug: string;
  /** Publishing status of the entry. */
  status: string;
  /** Field values keyed by field name. */
  data: Record<string, unknown>;
  /** Optional locale code. */
  locale?: string;
  /** Version number of the entry. */
  version: number;
  /** Creation timestamp (numeric; presumably epoch milliseconds — confirm). */
  _creationTime: number;
  /** Optional numeric timestamp of the first publication. */
  firstPublishedAt?: number;
  /** Optional numeric timestamp of the most recent publication. */
  lastPublishedAt?: number;
}
|
|
85
|
+
|
|
86
|
+
/**
 * Minimal description of an already-resolved reference, used to enrich
 * chunk context.
 */
export interface ResolvedReferenceInfo {
  /** ID of the referenced item. */
  id: string;
  /** Name of the referenced item's content type. */
  contentTypeName: string;
  /** Optional title of the referenced item. */
  title?: string;
  /** Optional slug of the referenced item. */
  slug?: string;
}
|
|
95
|
+
|
|
96
|
+
/**
 * Descriptive metadata carried by every content chunk.
 * Intended to support filtering and relevance scoring at retrieval time.
 */
export interface ChunkMetadata {
  /** ID of the content entry the chunk was produced from. */
  entryId: string;
  /** Machine name of the content type (e.g., "blog_post"). */
  contentType: string;
  /** Display name of the content type (e.g., "Blog Post"). */
  contentTypeDisplayName: string;
  /** URL slug of the entry. */
  slug: string;
  /** Publishing status of the entry. */
  status: string;
  /** Locale code, present for localized content. */
  locale?: string;
  /** Name(s) of the field(s) the chunk text was taken from. */
  sourceFields: Array<string>;
  /** Zero-based position of this chunk within its entry. */
  chunkIndex: number;
  /** Number of chunks produced for the entry in total. */
  totalChunks: number;
  /** Title of the entry, when one is available. */
  title?: string;
  /** Entry creation time as an ISO timestamp. */
  createdAt: string;
  /** First publication time as an ISO timestamp. */
  firstPublishedAt?: string;
  /** Most recent publication time as an ISO timestamp. */
  lastPublishedAt?: string;
  /** Version number of the entry. */
  version: number;
  /** IDs of content entries referenced by the entry (relationship tracking). */
  referencedEntryIds?: Array<string>;
  /** IDs of media assets referenced by the entry. */
  referencedMediaIds?: Array<string>;
  /** Semantic classification of the chunk (heading, paragraph, list, ...). */
  semanticType?: ChunkSemanticType;
}
|
|
136
|
+
|
|
137
|
+
/**
 * Semantic classification assigned to a chunk.
 * Intended as an aid for relevance scoring and filtering.
 */
export type ChunkSemanticType =
  // Structural text elements
  | "title"
  | "heading"
  | "paragraph"
  | "list"
  | "quote"
  | "code"
  | "table"
  // Catch-all and non-rich-text classifications
  | "mixed"
  | "field_value";
|
|
151
|
+
|
|
152
|
+
/**
 * One unit of content prepared for RAG indexing.
 */
export interface ContentChunk {
  /** Text content of the chunk. */
  text: string;
  /** Metadata used for filtering and context. */
  metadata: ChunkMetadata;
  /** Optional alternative text to embed when it should differ from the display text. */
  embeddingText?: string;
}
|
|
163
|
+
|
|
164
|
+
/**
 * Tuning knobs for the chunking algorithm. Every option is optional.
 */
export interface ChunkOptions {
  /**
   * Minimum line count required before a chunk is created; guards against
   * very small chunks.
   * @default 1
   */
  minLines?: number;

  /**
   * Soft lower bound on chunk size in characters; the chunker tries to
   * produce chunks of at least this size.
   * @default 100
   */
  minCharsSoftLimit?: number;

  /**
   * Soft upper bound on chunk size in characters; the chunker tries to split
   * at a natural boundary before reaching it.
   * @default 1000
   */
  maxCharsSoftLimit?: number;

  /**
   * Hard upper bound on chunk size in characters; chunks are force-split
   * at this size.
   * @default 4000
   */
  maxCharsHardLimit?: number;

  /**
   * Primary delimiter used to split text into chunks.
   * @default "\n\n" (paragraph breaks)
   */
  delimiter?: string;

  /**
   * Secondary delimiters tried when the primary delimiter does not work.
   * @default ["\n", ". ", ", "]
   */
  fallbackDelimiters?: Array<string>;

  /**
   * Whether heading context is preserved: when true, the most recent heading
   * is included at the start of each chunk.
   * @default true
   */
  preserveHeadingContext?: boolean;

  /**
   * Number of characters shared between consecutive chunks for context
   * continuity.
   * @default 50
   */
  overlapChars?: number;
}
|
|
221
|
+
|
|
222
|
+
/**
 * Settings that control content extraction and chunking. Every option is optional.
 */
export interface RagExtractionOptions {
  /**
   * Attach metadata to each chunk.
   * @default true
   */
  includeMetadata?: boolean;

  /**
   * Names of the fields to extract from.
   * When not specified, all text-bearing fields are included.
   */
  includeFields?: Array<string>;

  /**
   * Names of the fields to leave out of extraction.
   */
  excludeFields?: Array<string>;

  /**
   * Extract text from rich text fields.
   * @default true
   */
  extractRichText?: boolean;

  /**
   * Extract text from JSON fields.
   * @default true
   */
  extractJson?: boolean;

  /**
   * Include reference context (titles of referenced entries).
   * Resolved references must be passed in for this to have an effect.
   * @default true
   */
  includeReferenceContext?: boolean;

  /**
   * Options forwarded to the chunking algorithm.
   */
  chunkOptions?: ChunkOptions;

  /**
   * Text placed before each chunk (e.g., entry context).
   * Supported placeholders: {contentType}, {title}, {slug}
   */
  chunkPrefix?: string;

  /**
   * Text placed after each chunk.
   */
  chunkSuffix?: string;

  /**
   * Produce an additional "summary" chunk built from key fields.
   * @default false
   */
  createSummaryChunk?: boolean;

  /**
   * Fields that make up the summary chunk.
   * Documented fallback: "title" plus the first searchable field
   * (the stored default value is an empty list).
   */
  summaryFields?: Array<string>;
}
|
|
290
|
+
|
|
291
|
+
/**
 * Outcome of the extraction step, before any chunking takes place.
 */
export interface ExtractedContent {
  /** Text of all fields combined. */
  fullText: string;
  /** Extracted text keyed by field name. */
  fieldTexts: Record<string, string>;
  /** Title of the entry, when one is available. */
  title?: string;
  /** IDs of content entries referenced in the content. */
  referencedEntryIds: Array<string>;
  /** IDs of media assets referenced in the content. */
  referencedMediaIds: Array<string>;
  /** Per-field source details kept for tracking purposes. */
  sourceInfo: {
    fieldName: string;
    fieldLabel: string;
    fieldType: string;
    charCount: number;
  }[];
}
|
|
313
|
+
|
|
314
|
+
// =============================================================================
|
|
315
|
+
// Default Configuration
|
|
316
|
+
// =============================================================================
|
|
317
|
+
|
|
318
|
+
/**
 * Fully-populated chunking defaults; every key of `ChunkOptions` has a value here.
 */
const DEFAULT_CHUNK_OPTIONS: Required<ChunkOptions> = {
  // Size thresholds (lines / characters)
  minLines: 1,
  minCharsSoftLimit: 100,
  maxCharsSoftLimit: 1000,
  maxCharsHardLimit: 4000,

  // Split points: paragraph break first, then line, sentence and clause breaks
  delimiter: "\n\n",
  fallbackDelimiters: ["\n", ". ", ", "],

  // Context carried across chunks
  preserveHeadingContext: true,
  overlapChars: 50,
};
|
|
328
|
+
|
|
329
|
+
/**
 * Fully-populated extraction defaults; every key of `RagExtractionOptions`
 * has a value here. `chunkOptions` points at the shared
 * `DEFAULT_CHUNK_OPTIONS` object (same reference, not a copy).
 */
const DEFAULT_EXTRACTION_OPTIONS: Required<RagExtractionOptions> = {
  includeMetadata: true,

  // Empty lists: no explicit include/exclude field names are configured
  includeFields: [],
  excludeFields: [],

  // Extraction toggles
  extractRichText: true,
  extractJson: true,
  includeReferenceContext: true,

  chunkOptions: DEFAULT_CHUNK_OPTIONS,

  // No text is added around chunks by default
  chunkPrefix: "",
  chunkSuffix: "",

  // Summary chunk is off by default
  createSummaryChunk: false,
  summaryFields: [],
};
|
|
342
|
+
|
|
343
|
+
// =============================================================================
|
|
344
|
+
// Text Extraction Functions
|
|
345
|
+
// =============================================================================
|
|
346
|
+
|
|
347
|
+
/**
 * Converts a rich text field value into plain text.
 *
 * Supported inputs:
 * - strings (HTML, Markdown or plain text), passed through `stripHtmlTags`
 * - ProseMirror/Tiptap documents (`{ type: "doc", content: [...] }`)
 * - arrays of blocks, each converted recursively and joined by blank lines
 * - standalone ProseMirror nodes (objects with a `content` array)
 * - generic objects carrying a string `text` or `content` property
 *
 * @param value - The rich text field value
 * @returns Plain text content, or an empty string when nothing can be extracted
 */
export function extractTextFromRichText(value: unknown): string {
  if (value === null || value === undefined) {
    return "";
  }

  // Strings may be HTML, Markdown or plain text; tag stripping covers all three.
  if (typeof value === "string") {
    return stripHtmlTags(value);
  }

  if (typeof value !== "object") {
    return "";
  }

  // Array of blocks: drop blocks that produced no text so they do not leave
  // runs of blank separators in the output.
  if (Array.isArray(value)) {
    return value
      .map((block) => extractTextFromRichText(block))
      .filter(Boolean)
      .join("\n\n");
  }

  const obj = value as Record<string, unknown>;

  // Full ProseMirror document.
  if (obj.type === "doc" && Array.isArray(obj.content)) {
    return extractTextFromProseMirrorDoc(obj);
  }

  // Generic object with direct text content.
  if (typeof obj.text === "string") {
    return obj.text;
  }
  if (typeof obj.content === "string") {
    return obj.content;
  }

  // Standalone node (e.g. a paragraph inside an array of blocks): wrap it in a
  // pseudo-document so the node-type handling of the doc extractor applies.
  if (Array.isArray(obj.content)) {
    return extractTextFromProseMirrorDoc({ content: [obj] });
  }

  return "";
}
|
|
395
|
+
|
|
396
|
+
/**
 * Extracts text from a ProseMirror document (or any node) by walking its
 * `content` array of block nodes.
 *
 * Paragraphs and headings are flattened, lists are rendered as "- " lines,
 * blockquotes are wrapped in double quotes, code blocks are concatenated,
 * horizontal rules are skipped and any other node with content is walked
 * recursively. Blocks that produce no text are omitted.
 *
 * @param doc - Node whose `content` array should be walked
 * @returns Block texts joined by blank lines, or "" if there is no content array
 */
function extractTextFromProseMirrorDoc(doc: Record<string, unknown>): string {
  const content = doc.content;
  if (!Array.isArray(content)) {
    return "";
  }

  const textParts: string[] = [];

  for (const node of content) {
    if (typeof node !== "object" || node === null) continue;

    const nodeObj = node as Record<string, unknown>;

    switch (nodeObj.type) {
      case "paragraph":
      case "heading":
        textParts.push(extractTextFromProseMirrorNode(nodeObj));
        break;

      case "bulletList":
      case "orderedList":
        textParts.push(extractTextFromProseMirrorList(nodeObj));
        break;

      case "blockquote": {
        const quoteText = extractTextFromProseMirrorDoc(nodeObj);
        // An empty quote would otherwise contribute a bare pair of quote marks.
        if (quoteText) {
          textParts.push(`"${quoteText}"`);
        }
        break;
      }

      case "codeBlock":
        if (Array.isArray(nodeObj.content)) {
          const codeText = nodeObj.content
            .map((child: unknown) => {
              // Tolerate malformed children instead of throwing on null.
              if (typeof child !== "object" || child === null) return "";
              const text = (child as { text?: unknown }).text;
              return typeof text === "string" ? text : "";
            })
            .join("");
          textParts.push(codeText);
        }
        break;

      case "horizontalRule":
        // Purely visual; contributes no text.
        break;

      default:
        // Unknown block type: fall back to walking its children.
        if (nodeObj.content) {
          textParts.push(extractTextFromProseMirrorDoc(nodeObj));
        }
    }
  }

  return textParts.filter(Boolean).join("\n\n");
}
|
|
453
|
+
|
|
454
|
+
/**
 * Extracts text from a ProseMirror node with inline content (for example a
 * paragraph or heading).
 *
 * Text nodes contribute their `text`, hard-break nodes contribute a newline so
 * the text on either side is not glued together, and any other inline node
 * with its own `content` is flattened recursively.
 *
 * @param node - Node whose inline `content` should be flattened
 * @returns Concatenated inline text, or "" if there is no content array
 */
function extractTextFromProseMirrorNode(node: Record<string, unknown>): string {
  const content = node.content;
  if (!Array.isArray(content)) {
    return "";
  }

  return content
    .map((child: unknown): string => {
      if (typeof child !== "object" || child === null) return "";
      const childObj = child as Record<string, unknown>;

      if (childObj.type === "text") {
        return typeof childObj.text === "string" ? childObj.text : "";
      }

      // Tiptap names the node "hardBreak"; the ProseMirror basic schema uses
      // "hard_break".
      if (childObj.type === "hardBreak" || childObj.type === "hard_break") {
        return "\n";
      }

      // Inline nodes that wrap further content.
      if (childObj.content) {
        return extractTextFromProseMirrorNode(childObj);
      }

      return "";
    })
    .join("");
}
|
|
481
|
+
|
|
482
|
+
/**
 * Extracts text from a ProseMirror list node (bullet or ordered).
 *
 * Each list item with text becomes one "- " prefixed line. Items that are not
 * objects, or that contain no text, are skipped so they do not produce blank
 * lines or bare "- " markers.
 *
 * @param list - The list node; its `content` holds the list items
 * @returns One line per non-empty item, joined by newlines
 */
function extractTextFromProseMirrorList(list: Record<string, unknown>): string {
  const items = list.content;
  if (!Array.isArray(items)) {
    return "";
  }

  const lines: string[] = [];

  for (const item of items) {
    if (typeof item !== "object" || item === null) continue;

    const itemText = extractTextFromProseMirrorDoc(
      item as Record<string, unknown>,
    );
    if (itemText) {
      lines.push(`- ${itemText}`);
    }
  }

  return lines.join("\n");
}
|
|
501
|
+
|
|
502
|
+
/** Replacement text for the HTML entities decoded by `stripHtmlTags`, keyed by entity name. */
const HTML_ENTITY_TEXT: Record<string, string> = {
  nbsp: " ",
  amp: "&",
  lt: "<",
  gt: ">",
  quot: '"',
  "#39": "'",
  mdash: "\u2014",
  ndash: "\u2013",
};

/**
 * Strips HTML tags from a string, preserving structure where possible.
 *
 * Block-level tags become line breaks, all remaining tags are removed, a small
 * set of common entities is decoded, and whitespace is normalised (lines are
 * trimmed and runs of three or more newlines collapse to one blank line).
 *
 * @param html - HTML, Markdown or plain text
 * @returns The text content with tags removed
 */
export function stripHtmlTags(html: string): string {
  if (!html) return "";

  // Block elements turn into line breaks so paragraphs and list items stay apart.
  const withBreaks = html
    .replace(/<\/?(p|div|br|h[1-6]|li|tr)[^>]*>/gi, "\n")
    .replace(/<\/?(ul|ol|table|blockquote)[^>]*>/gi, "\n\n");

  // Everything else that looks like a tag is dropped.
  const withoutTags = withBreaks.replace(/<[^>]*>/g, "");

  // Decode entities in a single pass: each entity is replaced exactly once, so
  // an escaped ampersand followed by "lt;" yields the literal entity text
  // rather than being decoded a second time into "<".
  const decoded = withoutTags.replace(
    /&(nbsp|amp|lt|gt|quot|#39|mdash|ndash);/g,
    (match: string, name: string) => HTML_ENTITY_TEXT[name] ?? match,
  );

  return decoded
    .split("\n")
    .map((line) => line.trim())
    .join("\n")
    .replace(/\n{3,}/g, "\n\n")
    .trim();
}
|
|
537
|
+
|
|
538
|
+
/**
 * Collects the textual content of an arbitrary JSON-like value.
 *
 * Strings are returned as-is, numbers and booleans are stringified, array
 * elements are joined with ", " and object values are joined with " ".
 * For objects, a fixed list of commonly textual keys is visited first; keys
 * starting with "_" or "$" are ignored among the remaining ones.
 *
 * @param value - The JSON field value
 * @param maxDepth - Remaining levels of nesting to descend into
 * @returns Extracted text content ("" when nothing textual is found)
 */
export function extractTextFromJson(
  value: unknown,
  maxDepth: number = 5,
): string {
  if (maxDepth <= 0 || value === null || value === undefined) {
    return "";
  }

  switch (typeof value) {
    case "string":
      return value;
    case "number":
    case "boolean":
      return String(value);
    case "object":
      break;
    default:
      return "";
  }

  const childDepth = maxDepth - 1;

  if (Array.isArray(value)) {
    const pieces: string[] = [];
    for (const element of value) {
      const piece = extractTextFromJson(element, childDepth);
      if (piece) pieces.push(piece);
    }
    return pieces.join(", ");
  }

  const record = value as Record<string, unknown>;
  const collected: string[] = [];
  const handled = new Set<string>();

  // Commonly textual keys come first so the most meaningful text leads.
  const preferredKeys = [
    "text",
    "content",
    "value",
    "label",
    "title",
    "name",
    "description",
  ];
  for (const name of preferredKeys) {
    if (!(name in record)) continue;
    const piece = extractTextFromJson(record[name], childDepth);
    if (piece) {
      collected.push(piece);
      handled.add(name);
    }
  }

  // Everything else, minus internal/system keys.
  for (const [name, nested] of Object.entries(record)) {
    if (handled.has(name)) continue;
    if (name.startsWith("_") || name.startsWith("$")) continue;
    const piece = extractTextFromJson(nested, childDepth);
    if (piece) collected.push(piece);
  }

  return collected.join(" ");
}
|
|
616
|
+
|
|
617
|
+
/**
 * Turns a select / multiSelect field value into text.
 *
 * A single string is returned unchanged; for an array, the string entries are
 * joined with ", ". Anything else yields an empty string.
 */
export function extractTextFromSelect(value: unknown): string {
  if (typeof value === "string") {
    return value;
  }
  if (!Array.isArray(value)) {
    return "";
  }

  const selected: string[] = [];
  for (const option of value) {
    if (typeof option === "string") {
      selected.push(option);
    }
  }
  return selected.join(", ");
}
|
|
635
|
+
|
|
636
|
+
// =============================================================================
|
|
637
|
+
// Content Extraction
|
|
638
|
+
// =============================================================================
|
|
639
|
+
|
|
640
|
+
/**
 * Extracts text content from a content entry, driven by the field definitions
 * of its content type.
 *
 * For every selected field the value is converted to text according to the
 * field type, reference and media IDs are collected, and the per-field texts
 * are combined into `fullText` (title first, then the remaining fields in
 * schema order, separated by blank lines).
 *
 * @param entry - The content entry to extract from
 * @param contentType - The content type definition
 * @param options - Extraction options, merged over the defaults
 * @param resolvedReferences - Optional map of resolved reference information
 * @returns Extracted content with metadata
 *
 * @example
 * ```typescript
 * const extracted = extractContent(entry, contentType, {
 *   includeFields: ["title", "content", "excerpt"],
 *   extractRichText: true,
 * });
 *
 * console.log(extracted.fullText);
 * // "My Blog Post\n\nThis is the main content...\n\nA brief excerpt."
 * ```
 */
export function extractContent(
  entry: ContentEntryInfo,
  contentType: ContentTypeInfo,
  options: Partial<RagExtractionOptions> = {},
  resolvedReferences?: Map<string, ResolvedReferenceInfo>,
): ExtractedContent {
  const opts = { ...DEFAULT_EXTRACTION_OPTIONS, ...options };
  const data = entry.data || {};

  const fieldTexts: Record<string, string> = {};
  const sourceInfo: ExtractedContent["sourceInfo"] = [];
  const referencedEntryIds: string[] = [];
  const referencedMediaIds: string[] = [];
  let title: string | undefined;

  // A field is selected when it passes the include list (if non-empty) and is
  // not named in the exclude list (if non-empty).
  const isSelected = (fieldName: string): boolean => {
    const included = opts.includeFields;
    if (included && included.length > 0 && !included.includes(fieldName)) {
      return false;
    }
    const excluded = opts.excludeFields;
    return !(excluded && excluded.length > 0 && excluded.includes(fieldName));
  };
  const fieldsToProcess = contentType.fields.filter((field) =>
    isSelected(field.name),
  );

  for (const field of fieldsToProcess) {
    const value = data[field.name];
    if (value === null || value === undefined) continue;

    const fieldType = field.type;
    let extractedText = "";

    if (fieldType === "text") {
      extractedText = typeof value === "string" ? value : String(value);
    } else if (fieldType === "richText") {
      if (opts.extractRichText) {
        extractedText = extractTextFromRichText(value);
      }
    } else if (fieldType === "json") {
      if (opts.extractJson) {
        extractedText = extractTextFromJson(value);
      }
    } else if (fieldType === "select" || fieldType === "multiSelect") {
      extractedText = extractTextFromSelect(value);
    } else if (fieldType === "reference") {
      // Always record the IDs; text is only produced when titles are known.
      const refIds = extractReferenceIds(value, field);
      referencedEntryIds.push(...refIds);

      if (opts.includeReferenceContext && resolvedReferences) {
        const refTitles: string[] = [];
        for (const id of refIds) {
          const ref = resolvedReferences.get(id);
          if (ref && ref.title) {
            refTitles.push(`[${ref.title}]`);
          }
        }
        if (refTitles.length > 0) {
          extractedText = `Referenced: ${refTitles.join(", ")}`;
        }
      }
    } else if (fieldType === "media") {
      // Media is tracked by ID only and contributes no text.
      referencedMediaIds.push(...extractMediaIds(value, field));
    } else if (
      fieldType === "number" ||
      fieldType === "boolean" ||
      fieldType === "date" ||
      fieldType === "datetime"
    ) {
      extractedText = formatFieldValue(value, field.type);
    } else if (typeof value === "string") {
      // Unknown field type: only plain strings are taken over.
      extractedText = value;
    }

    if (!extractedText) continue;

    fieldTexts[field.name] = extractedText;
    sourceInfo.push({
      fieldName: field.name,
      fieldLabel: field.label,
      fieldType: field.type,
      charCount: extractedText.length,
    });

    if (field.name === contentType.titleField) {
      title = extractedText;
    }
  }

  // Title leads, followed by every other field that produced text.
  const bodyParts = fieldsToProcess
    .filter((field) => field.name !== contentType.titleField)
    .map((field) => fieldTexts[field.name])
    .filter((text) => Boolean(text));
  const fullTextParts: string[] = title ? [title, ...bodyParts] : bodyParts;

  return {
    fullText: fullTextParts.join("\n\n"),
    fieldTexts,
    title,
    referencedEntryIds,
    referencedMediaIds,
    sourceInfo,
  };
}
|
|
815
|
+
|
|
816
|
+
/**
 * Collects the entry IDs stored in a reference field value.
 *
 * A string value is a single ID. An array is only read when the field is
 * declared with `options.multiple === true`, and only its string entries are
 * kept. Everything else yields an empty list.
 */
function extractReferenceIds(value: unknown, field: FieldDefinition): string[] {
  if (value == null) {
    return [];
  }

  const allowsMany = field.options?.multiple === true;

  if (typeof value === "string") {
    return [value];
  }

  return allowsMany && Array.isArray(value)
    ? value.filter((candidate) => typeof candidate === "string")
    : [];
}
|
|
836
|
+
|
|
837
|
+
/**
 * Collects the media IDs stored in a media field value.
 *
 * Multi-value fields (`options.multiple === true`) contribute the string
 * entries of their array; a plain string contributes itself; any other value
 * contributes nothing.
 */
function extractMediaIds(value: unknown, field: FieldDefinition): string[] {
  if (value == null) {
    return [];
  }

  const acceptsList = field.options?.multiple === true;
  const ids: string[] = [];

  if (acceptsList && Array.isArray(value)) {
    for (const candidate of value) {
      if (typeof candidate === "string") {
        ids.push(candidate);
      }
    }
  } else if (typeof value === "string") {
    ids.push(value);
  }

  return ids;
}
|
|
857
|
+
|
|
858
|
+
/**
 * Formats a scalar field value for text representation.
 *
 * - "boolean" values become "Yes" / "No" based on truthiness
 * - numeric "date" / "datetime" values are treated as timestamps and rendered
 *   as ISO 8601 strings; a number that is not a valid timestamp yields ""
 *   instead of throwing
 * - everything else is converted with `String()`
 *
 * @param value - The raw field value
 * @param fieldType - The field's declared type
 * @returns The text form of the value ("" for null/undefined)
 */
function formatFieldValue(value: unknown, fieldType: string): string {
  if (value === null || value === undefined) {
    return "";
  }

  switch (fieldType) {
    case "boolean":
      return value ? "Yes" : "No";

    case "date":
    case "datetime": {
      if (typeof value === "number") {
        const parsed = new Date(value);
        // toISOString() throws a RangeError on an invalid Date (NaN or an
        // out-of-range timestamp), so check validity first.
        return Number.isNaN(parsed.getTime()) ? "" : parsed.toISOString();
      }
      return String(value);
    }

    default:
      return String(value);
  }
}
|
|
887
|
+
|
|
888
|
+
// =============================================================================
|
|
889
|
+
// Text Chunking
|
|
890
|
+
// =============================================================================
|
|
891
|
+
|
|
892
|
+
/**
 * Splits text into semantic chunks optimized for embedding.
 *
 * The algorithm:
 * 1. Text at or below the soft limit is returned as a single chunk
 * 2. Otherwise the text is split on the primary delimiter (paragraph breaks)
 * 3. If that yields two segments or fewer, the first fallback delimiter that
 *    produces more segments is used instead
 * 4. Segments are accumulated until adding one would exceed the soft limit
 * 5. Any chunk above the hard limit is force-split
 * 6. Optionally the most recent heading is repeated at the start of the chunk
 *    that follows a split
 *
 * Segments from the primary split are rejoined with a blank line. Segments
 * from a fallback split are rejoined with that fallback delimiter, so that
 * splitting on e.g. ". " does not strip sentence punctuation from the output.
 *
 * @param text - The text to chunk
 * @param options - Chunking options, merged over the defaults
 * @returns Array of text chunks
 *
 * @example
 * ```typescript
 * const chunks = chunkText(longArticle, {
 *   maxCharsSoftLimit: 1000,
 *   preserveHeadingContext: true,
 * });
 * ```
 */
export function chunkText(
  text: string,
  options: Partial<ChunkOptions> = {},
): string[] {
  const opts = { ...DEFAULT_CHUNK_OPTIONS, ...options };

  if (!text || text.trim().length === 0) {
    return [];
  }

  // Small enough to be a single chunk.
  if (text.length <= opts.maxCharsSoftLimit) {
    return [text.trim()];
  }

  const chunks: string[] = [];
  let currentHeading: string | null = null;

  // Primary split; its segments are rejoined as paragraphs.
  let segments = text.split(opts.delimiter);
  let joiner = "\n\n";

  // Too few segments to work with: try the fallback delimiters in order.
  // (The text is already known to exceed the soft limit at this point.)
  if (segments.length <= 2) {
    for (const fallback of opts.fallbackDelimiters) {
      const fallbackSegments = text.split(fallback);
      if (fallbackSegments.length > segments.length) {
        segments = fallbackSegments;
        // Rejoin with the delimiter that was split on so it is not lost.
        joiner = fallback;
        break;
      }
    }
  }

  // Headings only make sense for line/paragraph segments. Fragments produced
  // by an in-line delimiter (". ", ", ") have lost their end punctuation and
  // would otherwise be mistaken for headings.
  const trackHeadings = opts.preserveHeadingContext && joiner.includes("\n");

  let currentChunk = "";

  for (const segment of segments) {
    const trimmedSegment = segment.trim();
    if (!trimmedSegment) continue;

    const isHeading = detectHeading(trimmedSegment);
    if (isHeading && trackHeadings) {
      currentHeading = trimmedSegment;
    }

    const potentialChunk = currentChunk
      ? `${currentChunk}${joiner}${trimmedSegment}`
      : trimmedSegment;

    if (potentialChunk.length > opts.maxCharsSoftLimit && currentChunk) {
      // Adding the segment would overflow: emit what has been collected so far.
      chunks.push(finalizeChunk(currentChunk, currentHeading, opts));

      // Start the next chunk, repeating the heading for context unless the
      // segment is itself the heading.
      currentChunk =
        trackHeadings && currentHeading && !isHeading
          ? `${currentHeading}\n\n${trimmedSegment}`
          : trimmedSegment;
    } else {
      currentChunk = potentialChunk;
    }

    // A single segment can exceed the hard limit on its own.
    if (currentChunk.length > opts.maxCharsHardLimit) {
      const subChunks = forceSplitText(currentChunk, opts);
      chunks.push(...subChunks.slice(0, -1));
      currentChunk = subChunks[subChunks.length - 1] || "";
    }
  }

  // Flush whatever is left.
  if (currentChunk.trim()) {
    chunks.push(finalizeChunk(currentChunk, null, opts));
  }

  return chunks;
}
|
|
992
|
+
|
|
993
|
+
/**
 * Heuristically decides whether a text segment is a heading.
 *
 * A segment qualifies when it is shorter than 100 characters, does not end in
 * sentence punctuation (".", "!" or "?"), and matches one of the heading
 * shapes below.
 */
function detectHeading(text: string): boolean {
  const candidate = text.trim();

  if (candidate.length >= 100) return false;
  if (/[.!?]$/.test(candidate)) return false;

  const headingShapes = [
    /^#{1,6}\s/, // Markdown headings
    /^[A-Z][\w\s]+:?$/, // Title Case lines
    /^\d+\.\s+[A-Z]/, // Numbered sections
  ];

  return headingShapes.some((shape) => shape.test(candidate));
}
|
|
1014
|
+
|
|
1015
|
+
/**
 * Normalises a chunk before it is emitted.
 *
 * Currently this only trims surrounding whitespace; the heading and options
 * arguments are accepted but not used.
 */
function finalizeChunk(
  chunk: string,
  _heading: string | null,
  _opts: Required<ChunkOptions>,
): string {
  const cleaned = chunk.trim();
  return cleaned;
}
|
|
1025
|
+
|
|
1026
|
+
/**
 * Force-splits text that exceeds the hard limit.
 *
 * Pieces are cut at roughly the soft limit, preferring the last sentence end
 * (". ") and then the last space before the cut, as long as that boundary lies
 * beyond `minCharsSoftLimit`. The cut position never exceeds the hard limit
 * and always advances by at least one character, so every emitted piece fits
 * the hard limit and the loop terminates even for a non-positive soft limit.
 *
 * @param text - Text longer than the hard limit
 * @param opts - Resolved chunk options
 * @returns The pieces in order; the last one holds the remainder
 */
function forceSplitText(text: string, opts: Required<ChunkOptions>): string[] {
  const hardLimit = Math.max(1, opts.maxCharsHardLimit);
  // Aim for the soft limit, but never above the hard limit; a non-positive
  // soft limit falls back to the hard limit so each cut makes progress.
  const preferredSize =
    opts.maxCharsSoftLimit > 0 ? opts.maxCharsSoftLimit : hardLimit;
  const targetSize = Math.min(preferredSize, hardLimit);
  const minIndex = Math.max(0, opts.minCharsSoftLimit);

  const chunks: string[] = [];
  let remaining = text;

  while (remaining.length > hardLimit) {
    let splitPoint = targetSize;

    // Sentence boundary: search from splitPoint - 1 so that including the
    // period keeps the piece within the target size.
    const sentenceEnd = remaining.lastIndexOf(". ", splitPoint - 1);
    if (sentenceEnd > minIndex) {
      splitPoint = sentenceEnd + 1;
    } else {
      // Otherwise fall back to a word boundary.
      const spacePoint = remaining.lastIndexOf(" ", splitPoint);
      if (spacePoint > minIndex) {
        splitPoint = spacePoint;
      }
    }

    const piece = remaining.slice(0, splitPoint).trim();
    if (piece) {
      chunks.push(piece);
    }
    remaining = remaining.slice(splitPoint).trim();
  }

  if (remaining) {
    chunks.push(remaining);
  }

  return chunks;
}
|
|
1059
|
+
|
|
1060
|
+
// =============================================================================
|
|
1061
|
+
// Main API Functions
|
|
1062
|
+
// =============================================================================
|
|
1063
|
+
|
|
1064
|
+
/**
 * Processes a content entry into chunks ready for RAG indexing.
 *
 * This is the main function to use for preparing CMS content for @convex-dev/rag.
 * It extracts the entry's text, wraps it in the configured prefix/suffix,
 * chunks the result and attaches metadata to every chunk. When
 * `createSummaryChunk` is enabled, a summary chunk is inserted at the front
 * and all chunk indices are renumbered.
 *
 * The prefix may contain the placeholders `{contentType}`, `{title}` and
 * `{slug}`. Every occurrence is replaced, and the substituted values are
 * inserted literally ("$" sequences in a title are not interpreted).
 *
 * @param entry - The content entry to process
 * @param contentType - The content type definition
 * @param options - Extraction and chunking options
 * @param resolvedReferences - Optional map of resolved references for context
 * @returns Array of content chunks with metadata (empty when no text was extracted)
 *
 * @example
 * ```typescript
 * // In a Convex action
 * export const indexEntry = action({
 *   args: { entryId: v.id("contentEntries") },
 *   handler: async (ctx, { entryId }) => {
 *     const entry = await ctx.runQuery(api.contentEntries.get, { id: entryId });
 *     const contentType = await ctx.runQuery(api.contentTypes.get, {
 *       id: entry.contentTypeId
 *     });
 *
 *     const chunks = chunkContentEntry(entry, contentType, {
 *       chunkOptions: { maxCharsSoftLimit: 800 },
 *       includeMetadata: true,
 *     });
 *
 *     // Add to RAG index
 *     await rag.add(ctx, {
 *       namespace: `cms:${contentType.name}`,
 *       key: entryId,
 *       chunks: chunks.map(c => c.text),
 *       title: entry.data.title,
 *     });
 *
 *     return { indexed: chunks.length };
 *   },
 * });
 * ```
 */
export function chunkContentEntry(
  entry: ContentEntryInfo,
  contentType: ContentTypeInfo,
  options: Partial<RagExtractionOptions> = {},
  resolvedReferences?: Map<string, ResolvedReferenceInfo>,
): ContentChunk[] {
  const opts = { ...DEFAULT_EXTRACTION_OPTIONS, ...options };

  // Extract content from the entry
  const extracted = extractContent(
    entry,
    contentType,
    opts,
    resolvedReferences,
  );

  if (!extracted.fullText) {
    return [];
  }

  // Apply prefix/suffix to full text before chunking
  let textToChunk = extracted.fullText;
  if (opts.chunkPrefix) {
    const placeholderValues: Record<string, string> = {
      contentType: contentType.displayName,
      title: extracted.title || entry.slug,
      slug: entry.slug,
    };
    // A single pass with a replacer function replaces every occurrence,
    // inserts values literally (no "$&"/"$1" pattern expansion) and never
    // re-scans substituted values for further placeholders.
    const prefix = opts.chunkPrefix.replace(
      /\{(contentType|title|slug)\}/g,
      (_match: string, key: string) => String(placeholderValues[key]),
    );
    textToChunk = `${prefix}\n\n${textToChunk}`;
  }
  if (opts.chunkSuffix) {
    textToChunk = `${textToChunk}\n\n${opts.chunkSuffix}`;
  }

  // Chunk the text
  const textChunks = chunkText(textToChunk, opts.chunkOptions);

  // Build content chunks with metadata
  const chunks: ContentChunk[] = textChunks.map((text, index) => {
    const metadata: ChunkMetadata = {
      entryId: entry._id,
      contentType: contentType.name,
      contentTypeDisplayName: contentType.displayName,
      slug: entry.slug,
      status: entry.status,
      locale: entry.locale,
      sourceFields: extracted.sourceInfo.map((s) => s.fieldName),
      chunkIndex: index,
      totalChunks: textChunks.length,
      title: extracted.title,
      createdAt: new Date(entry._creationTime).toISOString(),
      firstPublishedAt: entry.firstPublishedAt
        ? new Date(entry.firstPublishedAt).toISOString()
        : undefined,
      lastPublishedAt: entry.lastPublishedAt
        ? new Date(entry.lastPublishedAt).toISOString()
        : undefined,
      version: entry.version,
      referencedEntryIds:
        extracted.referencedEntryIds.length > 0
          ? extracted.referencedEntryIds
          : undefined,
      referencedMediaIds:
        extracted.referencedMediaIds.length > 0
          ? extracted.referencedMediaIds
          : undefined,
      semanticType: detectSemanticType(text),
    };

    // Metadata is always attached: the summary renumbering below relies on it,
    // so `includeMetadata` has no effect on the shape returned here.
    return { text, metadata };
  });

  // Optionally create a summary chunk
  if (opts.createSummaryChunk && chunks.length > 0) {
    const summaryChunk = createSummaryChunk(
      entry,
      contentType,
      extracted,
      chunks.length,
    );
    chunks.unshift(summaryChunk);

    // Renumber so indices and totals account for the inserted summary.
    chunks.forEach((chunk, index) => {
      chunk.metadata.chunkIndex = index;
      chunk.metadata.totalChunks = chunks.length;
    });
  }

  return chunks;
}
|
|
1195
|
+
|
|
1196
|
+
/**
 * Classifies a chunk by the shape of its text.
 *
 * Checks run in order: single-line title/heading, list, quote, code, and
 * finally "mixed" (contains a blank line) or "paragraph".
 */
function detectSemanticType(text: string): ChunkSemanticType {
  const body = text.trim();

  // A single line without sentence punctuation that is either a Markdown
  // heading or shorter than 100 characters counts as a title/heading.
  const isSingleLine = !body.includes("\n");
  const endsLikeSentence = /[.!?]$/.test(body);
  const headingCandidate = /^#{1,6}\s/.test(body) || body.length < 100;
  if (isSingleLine && !endsLikeSentence && headingCandidate) {
    return body.length < 20 ? "title" : "heading";
  }

  // Bullet or numbered markers at the start of any line.
  if (/^[-*]\s/m.test(body) || /^\d+\.\s/m.test(body)) {
    return "list";
  }

  // Leading quote mark or Markdown blockquote marker.
  if (body.startsWith('"') || body.startsWith(">")) {
    return "quote";
  }

  // Fenced code, or any line starting with four whitespace characters.
  if (body.startsWith("```") || /^\s{4}/m.test(body)) {
    return "code";
  }

  return body.includes("\n\n") ? "mixed" : "paragraph";
}
|
|
1231
|
+
|
|
1232
|
+
/**
 * Creates a summary chunk describing the entry as a whole.
 *
 * The text lists the title (when known), the content type, the status, the
 * last publication date (when set) and an excerpt of up to 200 characters
 * taken from the first extracted field. An ellipsis is appended only when the
 * excerpt was actually truncated.
 *
 * @param entry - The content entry being summarised
 * @param contentType - The entry's content type
 * @param extracted - Extraction result for the entry
 * @param totalChunks - Number of regular chunks; the summary adds one to it
 * @returns The summary chunk, positioned at index 0
 */
function createSummaryChunk(
  entry: ContentEntryInfo,
  contentType: ContentTypeInfo,
  extracted: ExtractedContent,
  totalChunks: number,
): ContentChunk {
  const summaryParts: string[] = [];

  if (extracted.title) {
    summaryParts.push(`Title: ${extracted.title}`);
  }

  summaryParts.push(`Type: ${contentType.displayName}`);
  summaryParts.push(`Status: ${entry.status}`);

  if (entry.lastPublishedAt) {
    // NOTE(review): toLocaleDateString() output depends on the runtime's
    // locale and time zone — confirm this is acceptable for indexed text.
    summaryParts.push(
      `Published: ${new Date(entry.lastPublishedAt).toLocaleDateString()}`,
    );
  }

  // Brief excerpt from the first field that produced text.
  const maxExcerptChars = 200;
  const firstField = Object.keys(extracted.fieldTexts)[0];
  const firstText = firstField ? extracted.fieldTexts[firstField] : undefined;
  if (firstText) {
    // Compare against the full text length: a text of exactly 200 characters
    // is not truncated and must not get an ellipsis.
    const excerpt =
      firstText.length > maxExcerptChars
        ? `${firstText.slice(0, maxExcerptChars)}...`
        : firstText;
    summaryParts.push(`Summary: ${excerpt}`);
  }

  return {
    text: summaryParts.join("\n"),
    metadata: {
      entryId: entry._id,
      contentType: contentType.name,
      contentTypeDisplayName: contentType.displayName,
      slug: entry.slug,
      status: entry.status,
      locale: entry.locale,
      sourceFields: ["_summary"],
      chunkIndex: 0,
      totalChunks: totalChunks + 1,
      title: extracted.title,
      createdAt: new Date(entry._creationTime).toISOString(),
      firstPublishedAt: entry.firstPublishedAt
        ? new Date(entry.firstPublishedAt).toISOString()
        : undefined,
      lastPublishedAt: entry.lastPublishedAt
        ? new Date(entry.lastPublishedAt).toISOString()
        : undefined,
      version: entry.version,
      semanticType: "field_value",
    },
  };
}
|
|
1293
|
+
|
|
1294
|
+
// =============================================================================
|
|
1295
|
+
// Batch Processing Utilities
|
|
1296
|
+
// =============================================================================
|
|
1297
|
+
|
|
1298
|
+
/**
 * Chunks a batch of content entries in one call, e.g. for bulk indexing.
 *
 * Each entry's content type is looked up by `entry.contentTypeId`. Entries
 * whose content type is missing from `contentTypes` are reported with
 * `console.warn` and left out of the result.
 *
 * @param entries - Array of content entries
 * @param contentTypes - Map of content type ID to content type
 * @param options - Extraction options
 * @returns Map of entry ID to chunks
 */
export function chunkMultipleEntries(
  entries: ContentEntryInfo[],
  contentTypes: Map<string, ContentTypeInfo>,
  options: Partial<RagExtractionOptions> = {},
): Map<string, ContentChunk[]> {
  const chunksByEntryId = new Map<string, ContentChunk[]>();

  for (const entry of entries) {
    const matchedType = contentTypes.get(entry.contentTypeId);

    if (matchedType) {
      const entryChunks = chunkContentEntry(entry, matchedType, options);
      chunksByEntryId.set(entry._id, entryChunks);
    } else {
      console.warn(`Content type not found for entry ${entry._id}`);
    }
  }

  return chunksByEntryId;
}
|
|
1328
|
+
|
|
1329
|
+
/**
 * Runs the chunker over the given entries and reports aggregate sizes,
 * which helps when estimating indexing costs.
 *
 * Entries whose content type is missing from `contentTypes` contribute no
 * chunks or characters, but they are still counted in `totalEntries` and in
 * the denominator of `averageChunksPerEntry`. Both averages are 0 when their
 * denominator is 0.
 *
 * @param entries - Array of content entries
 * @param contentTypes - Map of content type ID to content type
 * @param options - Extraction options
 * @returns Entry, chunk and character totals plus the two averages
 */
export function estimateChunkingStats(
  entries: ContentEntryInfo[],
  contentTypes: Map<string, ContentTypeInfo>,
  options: Partial<RagExtractionOptions> = {},
): {
  totalEntries: number;
  totalChunks: number;
  totalCharacters: number;
  averageChunksPerEntry: number;
  averageCharsPerChunk: number;
} {
  let chunkCount = 0;
  let charCount = 0;

  for (const entry of entries) {
    const matchedType = contentTypes.get(entry.contentTypeId);

    if (matchedType) {
      const entryChunks = chunkContentEntry(entry, matchedType, options);
      chunkCount += entryChunks.length;
      for (const chunk of entryChunks) {
        charCount += chunk.text.length;
      }
    }
  }

  const entryCount = entries.length;
  const chunksPerEntry = entryCount > 0 ? chunkCount / entryCount : 0;
  const charsPerChunk = chunkCount > 0 ? charCount / chunkCount : 0;

  return {
    totalEntries: entryCount,
    totalChunks: chunkCount,
    totalCharacters: charCount,
    averageChunksPerEntry: chunksPerEntry,
    averageCharsPerChunk: charsPerChunk,
  };
}
|
|
1366
|
+
|
|
1367
|
+
// =============================================================================
|
|
1368
|
+
// Exports
|
|
1369
|
+
// =============================================================================
|
|
1370
|
+
|
|
1371
|
+
// Exposes two module-scope bindings as named exports.
// NOTE(review): judging by their names these are the default chunking and
// extraction option objects — confirm at their declarations.
export { DEFAULT_CHUNK_OPTIONS, DEFAULT_EXTRACTION_OPTIONS };
|