convex-cms 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (379) hide show
  1. package/dist/cli/commands/admin.d.ts +16 -0
  2. package/dist/cli/commands/admin.d.ts.map +1 -0
  3. package/dist/cli/commands/admin.js +88 -0
  4. package/dist/cli/commands/admin.js.map +1 -0
  5. package/dist/cli/index.d.ts +3 -0
  6. package/dist/cli/index.d.ts.map +1 -0
  7. package/dist/cli/index.js +18 -0
  8. package/dist/cli/index.js.map +1 -0
  9. package/dist/cli/utils/detectConvexUrl.d.ts +13 -0
  10. package/dist/cli/utils/detectConvexUrl.d.ts.map +1 -0
  11. package/dist/cli/utils/detectConvexUrl.js +48 -0
  12. package/dist/cli/utils/detectConvexUrl.js.map +1 -0
  13. package/dist/cli/utils/openBrowser.d.ts +7 -0
  14. package/dist/cli/utils/openBrowser.d.ts.map +1 -0
  15. package/dist/cli/utils/openBrowser.js +17 -0
  16. package/dist/cli/utils/openBrowser.js.map +1 -0
  17. package/dist/client/admin-config.d.ts +126 -0
  18. package/dist/client/admin-config.d.ts.map +1 -0
  19. package/dist/client/admin-config.js +117 -0
  20. package/dist/client/admin-config.js.map +1 -0
  21. package/dist/client/adminApi.d.ts +2273 -0
  22. package/dist/client/adminApi.d.ts.map +1 -0
  23. package/dist/client/adminApi.js +716 -0
  24. package/dist/client/adminApi.js.map +1 -0
  25. package/dist/client/agentTools.d.ts +933 -0
  26. package/dist/client/agentTools.d.ts.map +1 -0
  27. package/dist/client/agentTools.js +1004 -0
  28. package/dist/client/agentTools.js.map +1 -0
  29. package/dist/client/argTypes.d.ts +212 -0
  30. package/dist/client/argTypes.d.ts.map +1 -0
  31. package/dist/client/argTypes.js +5 -0
  32. package/dist/client/argTypes.js.map +1 -0
  33. package/dist/client/field-types.d.ts +55 -0
  34. package/dist/client/field-types.d.ts.map +1 -0
  35. package/dist/client/field-types.js +152 -0
  36. package/dist/client/field-types.js.map +1 -0
  37. package/dist/client/index.d.ts +189 -0
  38. package/dist/client/index.d.ts.map +1 -0
  39. package/dist/client/index.js +668 -0
  40. package/dist/client/index.js.map +1 -0
  41. package/dist/client/queryBuilder.d.ts +765 -0
  42. package/dist/client/queryBuilder.d.ts.map +1 -0
  43. package/dist/client/queryBuilder.js +970 -0
  44. package/dist/client/queryBuilder.js.map +1 -0
  45. package/dist/client/schema/codegen.d.ts +128 -0
  46. package/dist/client/schema/codegen.d.ts.map +1 -0
  47. package/dist/client/schema/codegen.js +318 -0
  48. package/dist/client/schema/codegen.js.map +1 -0
  49. package/dist/client/schema/defineContentType.d.ts +221 -0
  50. package/dist/client/schema/defineContentType.d.ts.map +1 -0
  51. package/dist/client/schema/defineContentType.js +380 -0
  52. package/dist/client/schema/defineContentType.js.map +1 -0
  53. package/dist/client/schema/index.d.ts +85 -0
  54. package/dist/client/schema/index.d.ts.map +1 -0
  55. package/dist/client/schema/index.js +92 -0
  56. package/dist/client/schema/index.js.map +1 -0
  57. package/dist/client/schema/schemaDrift.d.ts +199 -0
  58. package/dist/client/schema/schemaDrift.d.ts.map +1 -0
  59. package/dist/client/schema/schemaDrift.js +340 -0
  60. package/dist/client/schema/schemaDrift.js.map +1 -0
  61. package/dist/client/schema/typedClient.d.ts +401 -0
  62. package/dist/client/schema/typedClient.d.ts.map +1 -0
  63. package/dist/client/schema/typedClient.js +269 -0
  64. package/dist/client/schema/typedClient.js.map +1 -0
  65. package/dist/client/schema/types.d.ts +477 -0
  66. package/dist/client/schema/types.d.ts.map +1 -0
  67. package/dist/client/schema/types.js +39 -0
  68. package/dist/client/schema/types.js.map +1 -0
  69. package/dist/client/types.d.ts +449 -0
  70. package/dist/client/types.d.ts.map +1 -0
  71. package/dist/client/types.js +149 -0
  72. package/dist/client/types.js.map +1 -0
  73. package/dist/client/workflows.d.ts +51 -0
  74. package/dist/client/workflows.d.ts.map +1 -0
  75. package/dist/client/workflows.js +103 -0
  76. package/dist/client/workflows.js.map +1 -0
  77. package/dist/client/wrapper.d.ts +2198 -0
  78. package/dist/client/wrapper.d.ts.map +1 -0
  79. package/dist/client/wrapper.js +2651 -0
  80. package/dist/client/wrapper.js.map +1 -0
  81. package/dist/component/_generated/api.d.ts +124 -0
  82. package/dist/component/_generated/api.d.ts.map +1 -0
  83. package/dist/component/_generated/api.js +31 -0
  84. package/dist/component/_generated/api.js.map +1 -0
  85. package/dist/component/_generated/component.d.ts +4321 -0
  86. package/dist/component/_generated/component.d.ts.map +1 -0
  87. package/dist/component/_generated/component.js +11 -0
  88. package/dist/component/_generated/component.js.map +1 -0
  89. package/dist/component/_generated/dataModel.d.ts +46 -0
  90. package/dist/component/_generated/dataModel.d.ts.map +1 -0
  91. package/dist/component/_generated/dataModel.js +11 -0
  92. package/dist/component/_generated/dataModel.js.map +1 -0
  93. package/dist/component/_generated/server.d.ts +121 -0
  94. package/dist/component/_generated/server.d.ts.map +1 -0
  95. package/dist/component/_generated/server.js +78 -0
  96. package/dist/component/_generated/server.js.map +1 -0
  97. package/dist/component/auditLog.d.ts +410 -0
  98. package/dist/component/auditLog.d.ts.map +1 -0
  99. package/dist/component/auditLog.js +607 -0
  100. package/dist/component/auditLog.js.map +1 -0
  101. package/dist/component/authorization.d.ts +323 -0
  102. package/dist/component/authorization.d.ts.map +1 -0
  103. package/dist/component/authorization.js +464 -0
  104. package/dist/component/authorization.js.map +1 -0
  105. package/dist/component/authorizationHooks.d.ts +184 -0
  106. package/dist/component/authorizationHooks.d.ts.map +1 -0
  107. package/dist/component/authorizationHooks.js +521 -0
  108. package/dist/component/authorizationHooks.js.map +1 -0
  109. package/dist/component/bulkOperations.d.ts +200 -0
  110. package/dist/component/bulkOperations.d.ts.map +1 -0
  111. package/dist/component/bulkOperations.js +568 -0
  112. package/dist/component/bulkOperations.js.map +1 -0
  113. package/dist/component/contentEntries.d.ts +719 -0
  114. package/dist/component/contentEntries.d.ts.map +1 -0
  115. package/dist/component/contentEntries.js +1617 -0
  116. package/dist/component/contentEntries.js.map +1 -0
  117. package/dist/component/contentEntryMutations.d.ts +505 -0
  118. package/dist/component/contentEntryMutations.d.ts.map +1 -0
  119. package/dist/component/contentEntryMutations.js +1009 -0
  120. package/dist/component/contentEntryMutations.js.map +1 -0
  121. package/dist/component/contentEntryValidation.d.ts +115 -0
  122. package/dist/component/contentEntryValidation.d.ts.map +1 -0
  123. package/dist/component/contentEntryValidation.js +546 -0
  124. package/dist/component/contentEntryValidation.js.map +1 -0
  125. package/dist/component/contentLock.d.ts +328 -0
  126. package/dist/component/contentLock.d.ts.map +1 -0
  127. package/dist/component/contentLock.js +471 -0
  128. package/dist/component/contentLock.js.map +1 -0
  129. package/dist/component/contentTypeMigration.d.ts +411 -0
  130. package/dist/component/contentTypeMigration.d.ts.map +1 -0
  131. package/dist/component/contentTypeMigration.js +805 -0
  132. package/dist/component/contentTypeMigration.js.map +1 -0
  133. package/dist/component/contentTypeMutations.d.ts +975 -0
  134. package/dist/component/contentTypeMutations.d.ts.map +1 -0
  135. package/dist/component/contentTypeMutations.js +768 -0
  136. package/dist/component/contentTypeMutations.js.map +1 -0
  137. package/dist/component/contentTypes.d.ts +538 -0
  138. package/dist/component/contentTypes.d.ts.map +1 -0
  139. package/dist/component/contentTypes.js +304 -0
  140. package/dist/component/contentTypes.js.map +1 -0
  141. package/dist/component/convex.config.d.ts +42 -0
  142. package/dist/component/convex.config.d.ts.map +1 -0
  143. package/dist/component/convex.config.js +43 -0
  144. package/dist/component/convex.config.js.map +1 -0
  145. package/dist/component/documentTypes.d.ts +186 -0
  146. package/dist/component/documentTypes.d.ts.map +1 -0
  147. package/dist/component/documentTypes.js +23 -0
  148. package/dist/component/documentTypes.js.map +1 -0
  149. package/dist/component/eventEmitter.d.ts +281 -0
  150. package/dist/component/eventEmitter.d.ts.map +1 -0
  151. package/dist/component/eventEmitter.js +300 -0
  152. package/dist/component/eventEmitter.js.map +1 -0
  153. package/dist/component/exportImport.d.ts +1120 -0
  154. package/dist/component/exportImport.d.ts.map +1 -0
  155. package/dist/component/exportImport.js +931 -0
  156. package/dist/component/exportImport.js.map +1 -0
  157. package/dist/component/index.d.ts +28 -0
  158. package/dist/component/index.d.ts.map +1 -0
  159. package/dist/component/index.js +142 -0
  160. package/dist/component/index.js.map +1 -0
  161. package/dist/component/lib/deepReferenceResolver.d.ts +252 -0
  162. package/dist/component/lib/deepReferenceResolver.d.ts.map +1 -0
  163. package/dist/component/lib/deepReferenceResolver.js +601 -0
  164. package/dist/component/lib/deepReferenceResolver.js.map +1 -0
  165. package/dist/component/lib/errors.d.ts +306 -0
  166. package/dist/component/lib/errors.d.ts.map +1 -0
  167. package/dist/component/lib/errors.js +407 -0
  168. package/dist/component/lib/errors.js.map +1 -0
  169. package/dist/component/lib/index.d.ts +10 -0
  170. package/dist/component/lib/index.d.ts.map +1 -0
  171. package/dist/component/lib/index.js +33 -0
  172. package/dist/component/lib/index.js.map +1 -0
  173. package/dist/component/lib/mediaReferenceResolver.d.ts +217 -0
  174. package/dist/component/lib/mediaReferenceResolver.d.ts.map +1 -0
  175. package/dist/component/lib/mediaReferenceResolver.js +326 -0
  176. package/dist/component/lib/mediaReferenceResolver.js.map +1 -0
  177. package/dist/component/lib/metadataExtractor.d.ts +245 -0
  178. package/dist/component/lib/metadataExtractor.d.ts.map +1 -0
  179. package/dist/component/lib/metadataExtractor.js +548 -0
  180. package/dist/component/lib/metadataExtractor.js.map +1 -0
  181. package/dist/component/lib/mutationAuth.d.ts +95 -0
  182. package/dist/component/lib/mutationAuth.d.ts.map +1 -0
  183. package/dist/component/lib/mutationAuth.js +146 -0
  184. package/dist/component/lib/mutationAuth.js.map +1 -0
  185. package/dist/component/lib/queries.d.ts +17 -0
  186. package/dist/component/lib/queries.d.ts.map +1 -0
  187. package/dist/component/lib/queries.js +49 -0
  188. package/dist/component/lib/queries.js.map +1 -0
  189. package/dist/component/lib/ragContentChunker.d.ts +423 -0
  190. package/dist/component/lib/ragContentChunker.d.ts.map +1 -0
  191. package/dist/component/lib/ragContentChunker.js +897 -0
  192. package/dist/component/lib/ragContentChunker.js.map +1 -0
  193. package/dist/component/lib/referenceResolver.d.ts +175 -0
  194. package/dist/component/lib/referenceResolver.d.ts.map +1 -0
  195. package/dist/component/lib/referenceResolver.js +293 -0
  196. package/dist/component/lib/referenceResolver.js.map +1 -0
  197. package/dist/component/lib/slugGenerator.d.ts +71 -0
  198. package/dist/component/lib/slugGenerator.d.ts.map +1 -0
  199. package/dist/component/lib/slugGenerator.js +207 -0
  200. package/dist/component/lib/slugGenerator.js.map +1 -0
  201. package/dist/component/lib/slugUniqueness.d.ts +131 -0
  202. package/dist/component/lib/slugUniqueness.d.ts.map +1 -0
  203. package/dist/component/lib/slugUniqueness.js +229 -0
  204. package/dist/component/lib/slugUniqueness.js.map +1 -0
  205. package/dist/component/lib/softDelete.d.ts +18 -0
  206. package/dist/component/lib/softDelete.d.ts.map +1 -0
  207. package/dist/component/lib/softDelete.js +29 -0
  208. package/dist/component/lib/softDelete.js.map +1 -0
  209. package/dist/component/localeFallbackChain.d.ts +410 -0
  210. package/dist/component/localeFallbackChain.d.ts.map +1 -0
  211. package/dist/component/localeFallbackChain.js +467 -0
  212. package/dist/component/localeFallbackChain.js.map +1 -0
  213. package/dist/component/localeFields.d.ts +508 -0
  214. package/dist/component/localeFields.d.ts.map +1 -0
  215. package/dist/component/localeFields.js +592 -0
  216. package/dist/component/localeFields.js.map +1 -0
  217. package/dist/component/mediaAssetMutations.d.ts +235 -0
  218. package/dist/component/mediaAssetMutations.d.ts.map +1 -0
  219. package/dist/component/mediaAssetMutations.js +558 -0
  220. package/dist/component/mediaAssetMutations.js.map +1 -0
  221. package/dist/component/mediaAssets.d.ts +168 -0
  222. package/dist/component/mediaAssets.d.ts.map +1 -0
  223. package/dist/component/mediaAssets.js +618 -0
  224. package/dist/component/mediaAssets.js.map +1 -0
  225. package/dist/component/mediaFolderMutations.d.ts +642 -0
  226. package/dist/component/mediaFolderMutations.d.ts.map +1 -0
  227. package/dist/component/mediaFolderMutations.js +849 -0
  228. package/dist/component/mediaFolderMutations.js.map +1 -0
  229. package/dist/component/mediaUploadMutations.d.ts +136 -0
  230. package/dist/component/mediaUploadMutations.d.ts.map +1 -0
  231. package/dist/component/mediaUploadMutations.js +205 -0
  232. package/dist/component/mediaUploadMutations.js.map +1 -0
  233. package/dist/component/mediaVariantMutations.d.ts +468 -0
  234. package/dist/component/mediaVariantMutations.d.ts.map +1 -0
  235. package/dist/component/mediaVariantMutations.js +737 -0
  236. package/dist/component/mediaVariantMutations.js.map +1 -0
  237. package/dist/component/mediaVariants.d.ts +525 -0
  238. package/dist/component/mediaVariants.d.ts.map +1 -0
  239. package/dist/component/mediaVariants.js +661 -0
  240. package/dist/component/mediaVariants.js.map +1 -0
  241. package/dist/component/ragContentIndexer.d.ts +595 -0
  242. package/dist/component/ragContentIndexer.d.ts.map +1 -0
  243. package/dist/component/ragContentIndexer.js +794 -0
  244. package/dist/component/ragContentIndexer.js.map +1 -0
  245. package/dist/component/rateLimitHooks.d.ts +266 -0
  246. package/dist/component/rateLimitHooks.d.ts.map +1 -0
  247. package/dist/component/rateLimitHooks.js +412 -0
  248. package/dist/component/rateLimitHooks.js.map +1 -0
  249. package/dist/component/roles.d.ts +649 -0
  250. package/dist/component/roles.d.ts.map +1 -0
  251. package/dist/component/roles.js +884 -0
  252. package/dist/component/roles.js.map +1 -0
  253. package/dist/component/scheduledPublish.d.ts +182 -0
  254. package/dist/component/scheduledPublish.d.ts.map +1 -0
  255. package/dist/component/scheduledPublish.js +304 -0
  256. package/dist/component/scheduledPublish.js.map +1 -0
  257. package/dist/component/schema.d.ts +4114 -0
  258. package/dist/component/schema.d.ts.map +1 -0
  259. package/dist/component/schema.js +469 -0
  260. package/dist/component/schema.js.map +1 -0
  261. package/dist/component/taxonomies.d.ts +476 -0
  262. package/dist/component/taxonomies.d.ts.map +1 -0
  263. package/dist/component/taxonomies.js +785 -0
  264. package/dist/component/taxonomies.js.map +1 -0
  265. package/dist/component/taxonomyMutations.d.ts +206 -0
  266. package/dist/component/taxonomyMutations.d.ts.map +1 -0
  267. package/dist/component/taxonomyMutations.js +1001 -0
  268. package/dist/component/taxonomyMutations.js.map +1 -0
  269. package/dist/component/trash.d.ts +265 -0
  270. package/dist/component/trash.d.ts.map +1 -0
  271. package/dist/component/trash.js +621 -0
  272. package/dist/component/trash.js.map +1 -0
  273. package/dist/component/types.d.ts +4 -0
  274. package/dist/component/types.d.ts.map +1 -0
  275. package/dist/component/types.js +2 -0
  276. package/dist/component/types.js.map +1 -0
  277. package/dist/component/userContext.d.ts +508 -0
  278. package/dist/component/userContext.d.ts.map +1 -0
  279. package/dist/component/userContext.js +615 -0
  280. package/dist/component/userContext.js.map +1 -0
  281. package/dist/component/validation.d.ts +387 -0
  282. package/dist/component/validation.d.ts.map +1 -0
  283. package/dist/component/validation.js +1052 -0
  284. package/dist/component/validation.js.map +1 -0
  285. package/dist/component/validators.d.ts +4645 -0
  286. package/dist/component/validators.d.ts.map +1 -0
  287. package/dist/component/validators.js +641 -0
  288. package/dist/component/validators.js.map +1 -0
  289. package/dist/component/versionMutations.d.ts +216 -0
  290. package/dist/component/versionMutations.d.ts.map +1 -0
  291. package/dist/component/versionMutations.js +321 -0
  292. package/dist/component/versionMutations.js.map +1 -0
  293. package/dist/component/webhookTrigger.d.ts +770 -0
  294. package/dist/component/webhookTrigger.d.ts.map +1 -0
  295. package/dist/component/webhookTrigger.js +1413 -0
  296. package/dist/component/webhookTrigger.js.map +1 -0
  297. package/dist/react/index.d.ts +316 -0
  298. package/dist/react/index.d.ts.map +1 -0
  299. package/dist/react/index.js +558 -0
  300. package/dist/react/index.js.map +1 -0
  301. package/dist/test.d.ts +2230 -0
  302. package/dist/test.d.ts.map +1 -0
  303. package/dist/test.js +1107 -0
  304. package/dist/test.js.map +1 -0
  305. package/package.json +95 -0
  306. package/src/cli/commands/admin.ts +104 -0
  307. package/src/cli/index.ts +21 -0
  308. package/src/cli/utils/detectConvexUrl.ts +54 -0
  309. package/src/cli/utils/openBrowser.ts +16 -0
  310. package/src/client/admin-config.ts +138 -0
  311. package/src/client/adminApi.ts +942 -0
  312. package/src/client/agentTools.ts +1311 -0
  313. package/src/client/argTypes.ts +316 -0
  314. package/src/client/field-types.ts +187 -0
  315. package/src/client/index.ts +1301 -0
  316. package/src/client/queryBuilder.ts +1100 -0
  317. package/src/client/schema/codegen.ts +500 -0
  318. package/src/client/schema/defineContentType.ts +501 -0
  319. package/src/client/schema/index.ts +169 -0
  320. package/src/client/schema/schemaDrift.ts +574 -0
  321. package/src/client/schema/typedClient.ts +688 -0
  322. package/src/client/schema/types.ts +666 -0
  323. package/src/client/types.ts +723 -0
  324. package/src/client/workflows.ts +141 -0
  325. package/src/client/wrapper.ts +4304 -0
  326. package/src/component/_generated/api.ts +140 -0
  327. package/src/component/_generated/component.ts +5029 -0
  328. package/src/component/_generated/dataModel.ts +60 -0
  329. package/src/component/_generated/server.ts +156 -0
  330. package/src/component/authorization.ts +647 -0
  331. package/src/component/authorizationHooks.ts +668 -0
  332. package/src/component/bulkOperations.ts +687 -0
  333. package/src/component/contentEntries.ts +1976 -0
  334. package/src/component/contentEntryMutations.ts +1223 -0
  335. package/src/component/contentEntryValidation.ts +707 -0
  336. package/src/component/contentLock.ts +550 -0
  337. package/src/component/contentTypeMigration.ts +1064 -0
  338. package/src/component/contentTypeMutations.ts +969 -0
  339. package/src/component/contentTypes.ts +346 -0
  340. package/src/component/convex.config.ts +44 -0
  341. package/src/component/documentTypes.ts +240 -0
  342. package/src/component/eventEmitter.ts +485 -0
  343. package/src/component/exportImport.ts +1169 -0
  344. package/src/component/index.ts +491 -0
  345. package/src/component/lib/deepReferenceResolver.ts +999 -0
  346. package/src/component/lib/errors.ts +816 -0
  347. package/src/component/lib/index.ts +145 -0
  348. package/src/component/lib/mediaReferenceResolver.ts +495 -0
  349. package/src/component/lib/metadataExtractor.ts +792 -0
  350. package/src/component/lib/mutationAuth.ts +199 -0
  351. package/src/component/lib/queries.ts +79 -0
  352. package/src/component/lib/ragContentChunker.ts +1371 -0
  353. package/src/component/lib/referenceResolver.ts +430 -0
  354. package/src/component/lib/slugGenerator.ts +262 -0
  355. package/src/component/lib/slugUniqueness.ts +333 -0
  356. package/src/component/lib/softDelete.ts +44 -0
  357. package/src/component/localeFallbackChain.ts +673 -0
  358. package/src/component/localeFields.ts +896 -0
  359. package/src/component/mediaAssetMutations.ts +725 -0
  360. package/src/component/mediaAssets.ts +932 -0
  361. package/src/component/mediaFolderMutations.ts +1046 -0
  362. package/src/component/mediaUploadMutations.ts +224 -0
  363. package/src/component/mediaVariantMutations.ts +900 -0
  364. package/src/component/mediaVariants.ts +793 -0
  365. package/src/component/ragContentIndexer.ts +1067 -0
  366. package/src/component/rateLimitHooks.ts +572 -0
  367. package/src/component/roles.ts +1360 -0
  368. package/src/component/scheduledPublish.ts +358 -0
  369. package/src/component/schema.ts +617 -0
  370. package/src/component/taxonomies.ts +949 -0
  371. package/src/component/taxonomyMutations.ts +1210 -0
  372. package/src/component/trash.ts +724 -0
  373. package/src/component/userContext.ts +898 -0
  374. package/src/component/validation.ts +1388 -0
  375. package/src/component/validators.ts +949 -0
  376. package/src/component/versionMutations.ts +392 -0
  377. package/src/component/webhookTrigger.ts +1922 -0
  378. package/src/react/index.ts +898 -0
  379. package/src/test.ts +1580 -0
@@ -0,0 +1,1371 @@
1
+ /**
2
+ * RAG Content Chunker
3
+ *
4
+ * Utility to extract and structure content from CMS entries for @convex-dev/rag indexing.
5
+ * This module provides:
6
+ *
7
+ * 1. **Content Extraction**: Extracts text from various CMS field types (text, richText, json, etc.)
8
+ * 2. **Semantic Chunking**: Splits content into meaningful chunks optimized for embedding
9
+ * 3. **Metadata Tagging**: Attaches relevant metadata (content type, field source, locale, etc.)
10
+ * 4. **Reference Handling**: Processes embedded references and includes contextual information
11
+ *
12
+ * The output is designed to be directly compatible with @convex-dev/rag's `add()` function.
13
+ *
14
+ * @example
15
+ * ```typescript
16
+ * import { extractContentForRag, chunkContentEntry } from "@convex-cms/core/lib";
17
+ * import { rag } from "@convex-dev/rag";
18
+ *
19
+ * // In a Convex action
20
+ * const chunks = await chunkContentEntry(ctx, entry, contentType, {
21
+ * includeMetadata: true,
22
+ * chunkOptions: { maxCharsSoftLimit: 1000 },
23
+ * });
24
+ *
25
+ * await rag.add(ctx, {
26
+ * namespace: "cms-content",
27
+ * key: entry._id,
28
+ * chunks: chunks.map(c => c.text),
29
+ * title: chunks[0]?.metadata?.title,
30
+ * });
31
+ * ```
32
+ *
33
+ * @module
34
+ */
35
+
36
+ // =============================================================================
37
+ // Type Definitions
38
+ // =============================================================================
39
+
40
+ /**
41
+ * Field definition structure from the CMS schema.
+ *
+ * Describes one field of a content type. `name`, `label`, `type` and
+ * `required` are mandatory; every other member is optional. `options` is an
+ * open-ended bag: besides its two typed keys it accepts any further string
+ * key with an `unknown` value.
42
+ */
43
+ export interface FieldDefinition {
44
+ name: string; // presumably the key under which the value lives in an entry's `data` — confirm
45
+ label: string; // presumably the human-readable caption for the field — confirm
46
+ type: string; // field type identifier; the module header names text, richText and json as examples
47
+ required: boolean;
48
+ searchable?: boolean;
49
+ localized?: boolean;
50
+ description?: string;
51
+ options?: { // type-specific settings; only the two keys below are typed
52
+ allowedContentTypes?: string[]; // presumably only meaningful for reference-style fields — confirm
53
+ multiple?: boolean;
54
+ [key: string]: unknown; // any other option is allowed and must be narrowed before use
55
+ };
56
+ }
57
+
58
+ /**
59
+ * Content type structure from the CMS.
+ *
+ * Carries the identifier, the two names and the ordered list of
+ * `FieldDefinition`s of a content type. `titleField` and `slugField` are
+ * optional strings.
60
+ */
61
+ export interface ContentTypeInfo {
62
+ _id: string;
63
+ name: string; // `ChunkMetadata.contentType` documents a value such as "blog_post" for the type name
64
+ displayName: string; // `ChunkMetadata.contentTypeDisplayName` documents a value such as "Blog Post"
65
+ fields: FieldDefinition[];
66
+ titleField?: string; // presumably the `name` of the field that supplies the entry title — confirm
67
+ slugField?: string; // presumably the `name` of the field that supplies the slug — confirm
68
+ }
69
+
70
+ /**
71
+ * Content entry structure from the CMS.
+ *
+ * Field values live in `data`, an untyped record whose values must be
+ * narrowed before use. The three time members are numbers here, whereas
+ * `ChunkMetadata` exposes the corresponding values as ISO strings.
72
+ */
73
+ export interface ContentEntryInfo {
74
+ _id: string;
75
+ contentTypeId: string; // presumably matches `ContentTypeInfo._id` — confirm
76
+ slug: string;
77
+ status: string;
78
+ data: Record<string, unknown>; // presumably keyed by `FieldDefinition.name` — confirm
79
+ locale?: string;
80
+ version: number;
81
+ _creationTime: number; // numeric timestamp; NOTE(review): presumably epoch milliseconds — confirm
82
+ firstPublishedAt?: number; // optional numeric timestamp
83
+ lastPublishedAt?: number; // optional numeric timestamp
84
+ }
85
+
86
+ /**
87
+ * A resolved reference for context enrichment.
+ *
+ * Minimal description of a referenced entry: its `id` and content type name
+ * are always present, `title` and `slug` only when available.
88
+ */
89
+ export interface ResolvedReferenceInfo {
90
+ id: string; // presumably the referenced entry's `_id` — confirm
91
+ contentTypeName: string;
92
+ title?: string;
93
+ slug?: string;
94
+ }
95
+
96
+ /**
97
+ * Metadata attached to each content chunk.
98
+ * This metadata helps with filtering and relevance scoring during retrieval.
+ *
+ * Timestamps are ISO strings here, while `ContentEntryInfo` carries the
+ * corresponding values as numbers. `locale`, `title`, the two publish
+ * timestamps, both reference-ID lists and `semanticType` are optional.
99
+ */
100
+ export interface ChunkMetadata {
101
+ /** The content entry ID this chunk came from */
102
+ entryId: string;
103
+ /** The content type name (e.g., "blog_post") */
104
+ contentType: string;
105
+ /** The content type display name (e.g., "Blog Post") */
106
+ contentTypeDisplayName: string;
107
+ /** The entry's URL slug */
108
+ slug: string;
109
+ /** Publishing status of the entry */
110
+ status: string;
111
+ /** Locale code, if the content is localized */
112
+ locale?: string;
113
+ /** The field name(s) this chunk was extracted from */
114
+ sourceFields: string[];
115
+ /** The chunk index within the entry (0-based) */
116
+ chunkIndex: number;
117
+ /** Total number of chunks for this entry */
118
+ totalChunks: number;
119
+ /** The entry's title (if available) */
120
+ title?: string;
121
+ /** ISO timestamp when the entry was created */
122
+ createdAt: string;
123
+ /** ISO timestamp when the entry was first published */
124
+ firstPublishedAt?: string;
125
+ /** ISO timestamp when the entry was last published */
126
+ lastPublishedAt?: string;
127
+ /** Version number of the entry */
128
+ version: number;
129
+ /** IDs of referenced content entries (for relationship tracking) */
130
+ referencedEntryIds?: string[];
131
+ /** IDs of referenced media assets */
132
+ referencedMediaIds?: string[];
133
+ /** Semantic type of the chunk (heading, paragraph, list, etc.) */
134
+ semanticType?: ChunkSemanticType;
135
+ }
136
+
137
+ /**
138
+ * Semantic type classification for chunks.
139
+ * Helps with relevance scoring and filtering.
+ *
+ * A closed union of nine string literals; it is the type of the optional
+ * `ChunkMetadata.semanticType` member.
140
+ */
141
+ export type ChunkSemanticType =
142
+ | "title"
143
+ | "heading"
144
+ | "paragraph"
145
+ | "list"
146
+ | "quote"
147
+ | "code"
148
+ | "table"
149
+ | "mixed"
150
+ | "field_value";
151
+
152
+ /**
153
+ * A single content chunk ready for RAG indexing.
+ *
+ * NOTE(review): `metadata` is non-optional in this type although
+ * `RagExtractionOptions.includeMetadata` can be set to false — confirm what
+ * producers emit in that case.
154
+ */
155
+ export interface ContentChunk {
156
+ /** The text content of the chunk */
157
+ text: string;
158
+ /** Metadata for filtering and context */
159
+ metadata: ChunkMetadata;
160
+ /** Optional custom embedding text (if different from display text) */
161
+ embeddingText?: string;
162
+ }
163
+
164
+ /**
165
+ * Options for the chunking algorithm.
+ *
+ * Every member is optional. The `@default` values documented below are the
+ * ones stored in `DEFAULT_CHUNK_OPTIONS` in this file.
166
+ */
167
+ export interface ChunkOptions {
168
+ /**
169
+ * Minimum number of lines before creating a chunk.
170
+ * Helps avoid very small chunks.
171
+ * @default 1
172
+ */
173
+ minLines?: number;
174
+
175
+ /**
176
+ * Soft minimum character limit for chunks.
177
+ * Chunker will try to create chunks at least this size.
178
+ * @default 100
179
+ */
180
+ minCharsSoftLimit?: number;
181
+
182
+ /**
183
+ * Soft maximum character limit for chunks.
184
+ * Chunker will try to split at natural boundaries before this limit.
185
+ * @default 1000
186
+ */
187
+ maxCharsSoftLimit?: number;
188
+
189
+ /**
190
+ * Hard maximum character limit for chunks.
191
+ * Chunks will be force-split at this limit.
192
+ * @default 4000
193
+ */
194
+ maxCharsHardLimit?: number;
195
+
196
+ /**
197
+ * Primary delimiter for splitting text into chunks.
198
+ * @default "\n\n" (paragraph breaks)
199
+ */
200
+ delimiter?: string;
201
+
202
+ /**
203
+ * Secondary delimiters to try when the primary delimiter doesn't work.
204
+ * @default ["\n", ". ", ", "]
205
+ */
206
+ fallbackDelimiters?: string[];
207
+
208
+ /**
209
+ * Whether to preserve heading context in each chunk.
210
+ * When true, includes the most recent heading at the start of each chunk.
211
+ * @default true
212
+ */
213
+ preserveHeadingContext?: boolean;
214
+
215
+ /**
216
+ * Overlap characters between chunks for context continuity.
217
+ * @default 50
218
+ */
219
+ overlapChars?: number;
220
+ }
221
+
222
+ /**
223
+ * Options for content extraction and chunking.
+ *
+ * Every member is optional. `DEFAULT_EXTRACTION_OPTIONS` in this file holds
+ * the fallback values: the boolean defaults documented below, empty arrays
+ * for `includeFields`, `excludeFields` and `summaryFields`, empty strings
+ * for `chunkPrefix` and `chunkSuffix`, and `DEFAULT_CHUNK_OPTIONS` for
+ * `chunkOptions`.
224
+ */
225
+ export interface RagExtractionOptions {
226
+ /**
227
+ * Whether to include metadata with each chunk.
228
+ * @default true
229
+ */
230
+ includeMetadata?: boolean;
231
+
232
+ /**
233
+ * Field names to include in extraction.
234
+ * If not specified, all text-bearing fields are included.
+ * NOTE(review): the stored default is `[]`; presumably an empty list is
+ * treated the same as "not specified" — confirm at the use site.
235
+ */
236
+ includeFields?: string[];
237
+
238
+ /**
239
+ * Field names to exclude from extraction.
240
+ */
241
+ excludeFields?: string[];
242
+
243
+ /**
244
+ * Whether to extract text from rich text fields.
245
+ * @default true
246
+ */
247
+ extractRichText?: boolean;
248
+
249
+ /**
250
+ * Whether to extract text from JSON fields.
251
+ * @default true
252
+ */
253
+ extractJson?: boolean;
254
+
255
+ /**
256
+ * Whether to include reference context (titles of referenced entries).
257
+ * Requires passing resolved references.
258
+ * @default true
259
+ */
260
+ includeReferenceContext?: boolean;
261
+
262
+ /**
263
+ * Chunking algorithm options.
264
+ */
265
+ chunkOptions?: ChunkOptions;
266
+
267
+ /**
268
+ * Custom prefix for each chunk (e.g., for entry context).
269
+ * Supports placeholders: {contentType}, {title}, {slug}
270
+ */
271
+ chunkPrefix?: string;
272
+
273
+ /**
274
+ * Custom suffix for each chunk.
275
+ */
276
+ chunkSuffix?: string;
277
+
278
+ /**
279
+ * Whether to create a separate "summary" chunk with key fields.
280
+ * @default false
281
+ */
282
+ createSummaryChunk?: boolean;
283
+
284
+ /**
285
+ * Fields to include in the summary chunk.
286
+ * @default ["title", first searchable field]
+ * NOTE(review): `DEFAULT_EXTRACTION_OPTIONS.summaryFields` is `[]`, so the
+ * default documented above is presumably computed where the option is
+ * consumed — confirm.
287
+ */
288
+ summaryFields?: string[];
289
+ }
290
+
291
+ /**
292
+ * Result of content extraction before chunking.
+ *
+ * Holds the text both combined (`fullText`) and per field (`fieldTexts`),
+ * the IDs of referenced entries and media, and one `sourceInfo` record per
+ * contributing field. Only `title` is optional; the arrays are always
+ * present.
293
+ */
294
+ export interface ExtractedContent {
295
+ /** Combined text content from all fields */
296
+ fullText: string;
297
+ /** Text content organized by field name */
298
+ fieldTexts: Record<string, string>;
299
+ /** Entry title (if available) */
300
+ title?: string;
301
+ /** Referenced entry IDs found in content */
302
+ referencedEntryIds: string[];
303
+ /** Referenced media IDs found in content */
304
+ referencedMediaIds: string[];
305
+ /** Source field information for tracking */
306
+ sourceInfo: Array<{
307
+ fieldName: string;
308
+ fieldLabel: string;
309
+ fieldType: string;
310
+ charCount: number; // presumably the length of the text extracted from this field — confirm
311
+ }>;
312
+ }
313
+
314
+ // =============================================================================
315
+ // Default Configuration
316
+ // =============================================================================
317
+
318
+ const DEFAULT_CHUNK_OPTIONS: Required<ChunkOptions> = { // fallback chunking settings; Required<> forces every ChunkOptions key to be present
319
+ minLines: 1,
320
+ minCharsSoftLimit: 100,
321
+ maxCharsSoftLimit: 1000,
322
+ maxCharsHardLimit: 4000,
323
+ delimiter: "\n\n", // blank line, i.e. a paragraph break
324
+ fallbackDelimiters: ["\n", ". ", ", "], // listed from coarsest (line) to finest (clause)
325
+ preserveHeadingContext: true,
326
+ overlapChars: 50,
327
+ };
328
+
329
+ const DEFAULT_EXTRACTION_OPTIONS: Required<RagExtractionOptions> = { // fallback extraction settings; Required<> forces every key to be present
330
+ includeMetadata: true,
331
+ includeFields: [], // NOTE(review): presumably an empty list means "no include filter" — confirm at the use site
332
+ excludeFields: [],
333
+ extractRichText: true,
334
+ extractJson: true,
335
+ includeReferenceContext: true,
336
+ chunkOptions: DEFAULT_CHUNK_OPTIONS, // same object reference as the constant, not a copy
337
+ chunkPrefix: "",
338
+ chunkSuffix: "",
339
+ createSummaryChunk: false,
340
+ summaryFields: [], // the interface documents a default of ["title", first searchable field]; not encoded here
341
+ };
342
+
343
+ // =============================================================================
344
+ // Text Extraction Functions
345
+ // =============================================================================
346
+
347
/**
 * Extracts plain text from a rich text field value.
 *
 * Inputs are handled in this order:
 * - `null` / `undefined` yield an empty string.
 * - Strings (HTML, Markdown or plain text) are passed through `stripHtmlTags`.
 * - Arrays of blocks are extracted block by block; blocks that yield no text
 *   are skipped and the rest are joined with a blank line.
 * - ProseMirror/Tiptap documents (`{ type: "doc", content: [...] }`) are
 *   delegated to `extractTextFromProseMirrorDoc`.
 * - Objects with a string `text` or a string `content` return that string.
 * - Any other object with a `content` array (for example a bare paragraph
 *   node inside an array of blocks) has its children extracted recursively.
 *
 * Every other value (numbers, booleans, objects without text) yields "".
 *
 * @param value - The rich text field value
 * @returns Plain text content
 */
export function extractTextFromRichText(value: unknown): string {
  if (value === null || value === undefined) {
    return "";
  }

  // Handle string values (HTML, Markdown, or plain text)
  if (typeof value === "string") {
    return stripHtmlTags(value);
  }

  if (typeof value !== "object") {
    return "";
  }

  // Array of blocks: drop blocks without text so they leave no stray separators.
  if (Array.isArray(value)) {
    return value
      .map((block) => extractTextFromRichText(block))
      .filter((text) => text.length > 0)
      .join("\n\n");
  }

  const obj = value as Record<string, unknown>;

  // Full ProseMirror/Tiptap document
  if (obj.type === "doc" && Array.isArray(obj.content)) {
    return extractTextFromProseMirrorDoc(obj);
  }

  // Leaf-like objects that carry their text directly
  if (typeof obj.text === "string") {
    return obj.text;
  }

  if (typeof obj.content === "string") {
    return obj.content;
  }

  // Non-doc node with nested children (e.g. a paragraph inside an array of
  // blocks). ProseMirror nodes hold either inline or block children, so the
  // presence of a text leaf means the pieces belong to one run of text and
  // are concatenated directly; otherwise children are separate blocks.
  const children: unknown = obj.content;
  if (Array.isArray(children)) {
    const isTextLeaf = (child: unknown): boolean =>
      typeof child === "object" &&
      child !== null &&
      typeof (child as Record<string, unknown>).text === "string";
    const separator = children.some(isTextLeaf) ? "" : "\n\n";

    return children
      .map((child) => extractTextFromRichText(child))
      .filter((text) => text.length > 0)
      .join(separator);
  }

  return "";
}
395
+
396
/**
 * Collects the text of every block-level child in `doc.content`.
 *
 * Paragraphs and headings are read as inline content, lists go through the
 * list helper, blockquotes are wrapped in double quotes, code blocks
 * concatenate the `text` of their children, horizontal rules are ignored,
 * and any other node with a `content` property is walked recursively.
 * Empty results are dropped and the rest are joined with blank lines.
 *
 * @param doc - A node whose `content` is expected to be an array of nodes
 * @returns The joined text, or "" when `content` is not an array
 */
function extractTextFromProseMirrorDoc(doc: Record<string, unknown>): string {
  const children = doc.content as unknown[];
  if (!Array.isArray(children)) return "";

  const blocks: string[] = [];

  for (const child of children) {
    if (child === null || typeof child !== "object") continue;

    const block = child as Record<string, unknown>;
    const kind = block.type as string;

    if (kind === "paragraph" || kind === "heading") {
      blocks.push(extractTextFromProseMirrorNode(block));
    } else if (kind === "bulletList" || kind === "orderedList") {
      blocks.push(extractTextFromProseMirrorList(block));
    } else if (kind === "blockquote") {
      blocks.push(`"${extractTextFromProseMirrorDoc(block)}"`);
    } else if (kind === "codeBlock") {
      if (block.content && Array.isArray(block.content)) {
        const code = (block.content as Array<{ text?: string }>)
          .map((part) => part.text || "")
          .join("");
        blocks.push(code);
      }
    } else if (kind === "horizontalRule") {
      // Rules carry no text.
    } else if (block.content) {
      // Unknown container: fall back to a generic recursive walk.
      blocks.push(extractTextFromProseMirrorDoc(block));
    }
  }

  return blocks.filter((blockText) => blockText !== "").join("\n\n");
}
453
+
454
/**
 * Extracts text from a ProseMirror node with inline content.
 *
 * Text children contribute their `text`, hard-break children (`hardBreak`
 * in Tiptap, `hard_break` in the ProseMirror basic schema) contribute a
 * newline so the words on either side are not fused together, and any other
 * child that has `content` is walked recursively. Everything else is skipped.
 *
 * @param node - A node whose `content` is expected to be an array of inline nodes
 * @returns The concatenated inline text, or "" when `content` is not an array
 */
function extractTextFromProseMirrorNode(node: Record<string, unknown>): string {
  const children = node.content as unknown[];
  if (!Array.isArray(children)) {
    return "";
  }

  return children
    .map((child) => {
      if (typeof child !== "object" || child === null) return "";
      const inline = child as Record<string, unknown>;

      if (inline.type === "text") {
        return (inline.text as string) || "";
      }

      // A line break inside a paragraph must survive as whitespace.
      if (inline.type === "hardBreak" || inline.type === "hard_break") {
        return "\n";
      }

      // Inline containers (e.g. nodes wrapping further inline content)
      if (inline.content) {
        return extractTextFromProseMirrorNode(inline);
      }

      return "";
    })
    .join("");
}
481
+
482
/**
 * Renders a ProseMirror list node as one line per item.
 *
 * Each object item is walked as a document and prefixed with "- " (bullet
 * and ordered lists are rendered the same way). Non-object items produce
 * an empty line.
 *
 * @param list - A list node whose `content` is expected to be an array of items
 * @returns Item lines joined by "\n", or "" when `content` is not an array
 */
function extractTextFromProseMirrorList(list: Record<string, unknown>): string {
  const entries = list.content as unknown[];
  if (!Array.isArray(entries)) return "";

  const lines: string[] = [];
  for (const entry of entries) {
    if (entry === null || typeof entry !== "object") {
      lines.push("");
      continue;
    }
    const body = extractTextFromProseMirrorDoc(entry as Record<string, unknown>);
    lines.push(`- ${body}`);
  }

  return lines.join("\n");
}
501
+
502
/**
 * Strips HTML tags from a string, preserving structure where possible.
 *
 * Steps:
 * 1. Block-level tags become newlines so paragraphs and list items stay apart.
 * 2. All remaining tags are removed.
 * 3. Character references are decoded in a single pass, so already-escaped
 *    text such as `&amp;lt;` becomes `&lt;` and is not decoded a second time
 *    into `<`. Supported: the named entities nbsp, amp, lt, gt, quot, apos,
 *    mdash, ndash, plus decimal (`&#39;`) and hexadecimal (`&#x27;`) numeric
 *    references. Unknown or invalid references are left untouched.
 * 4. Lines are trimmed and runs of three or more newlines collapse to two.
 *
 * @param html - HTML, Markdown or plain text
 * @returns The text content with tags removed and entities decoded
 */
export function stripHtmlTags(html: string): string {
  if (!html) return "";

  const namedEntities = new Map<string, string>([
    ["nbsp", " "],
    ["amp", "&"],
    ["lt", "<"],
    ["gt", ">"],
    ["quot", '"'],
    ["apos", "'"],
    ["mdash", "—"],
    ["ndash", "–"],
  ]);

  // Block elements first, then whatever tags are left.
  const withoutTags = html
    .replace(/<\/?(p|div|br|h[1-6]|li|tr)[^>]*>/gi, "\n")
    .replace(/<\/?(ul|ol|table|blockquote)[^>]*>/gi, "\n\n")
    .replace(/<[^>]*>/g, "");

  // One pass over all references: the output of one decode is never fed
  // into another, which is what made chained replaces double-decode.
  const decoded = withoutTags.replace(
    /&(#[xX][0-9a-fA-F]+|#[0-9]+|[A-Za-z][A-Za-z0-9]*);/g,
    (reference: string, body: string): string => {
      if (body.charAt(0) !== "#") {
        return namedEntities.get(body) ?? reference;
      }
      const isHex = body.charAt(1) === "x" || body.charAt(1) === "X";
      const codePoint = isHex
        ? parseInt(body.slice(2), 16)
        : parseInt(body.slice(1), 10);
      const isValid =
        codePoint > 0 &&
        codePoint <= 0x10ffff &&
        !(codePoint >= 0xd800 && codePoint <= 0xdfff);
      return isValid ? String.fromCodePoint(codePoint) : reference;
    },
  );

  // Clean up whitespace
  return decoded
    .split("\n")
    .map((line) => line.trim())
    .join("\n")
    .replace(/\n{3,}/g, "\n\n")
    .trim();
}
537
+
538
/**
 * Flattens a JSON-like value into a single text string.
 *
 * Strings are returned as-is, numbers and booleans are stringified, arrays
 * are flattened with ", " and objects with " ". Within an object, the keys
 * text, content, value, label, title, name and description are emitted
 * first (in that order); the remaining keys follow in enumeration order,
 * skipping keys that start with "_" or "$". Empty results are omitted.
 *
 * Every level of nesting consumes one unit of `maxDepth`; once it reaches
 * zero the value contributes nothing, including primitive values.
 *
 * @param value - The JSON field value
 * @param maxDepth - Maximum recursion depth
 * @returns Extracted text content
 */
export function extractTextFromJson(
  value: unknown,
  maxDepth: number = 5,
): string {
  if (maxDepth <= 0 || value === null || value === undefined) return "";

  switch (typeof value) {
    case "string":
      return value;
    case "number":
    case "boolean":
      return String(value);
    case "object":
      break;
    default:
      return "";
  }

  const childDepth = maxDepth - 1;

  if (Array.isArray(value)) {
    const items: string[] = [];
    for (const item of value) {
      const itemText = extractTextFromJson(item, childDepth);
      if (itemText) items.push(itemText);
    }
    return items.join(", ");
  }

  const record = value as Record<string, unknown>;
  const preferredKeys = [
    "text",
    "content",
    "value",
    "label",
    "title",
    "name",
    "description",
  ];
  const emitted = new Set<string>();
  const parts: string[] = [];

  // Pass 1: well-known text-bearing keys, in priority order.
  for (const key of preferredKeys) {
    if (!(key in record)) continue;
    const keyText = extractTextFromJson(record[key], childDepth);
    if (!keyText) continue;
    parts.push(keyText);
    emitted.add(key);
  }

  // Pass 2: everything else except internal/system keys.
  for (const key of Object.keys(record)) {
    if (emitted.has(key) || key.startsWith("_") || key.startsWith("$")) {
      continue;
    }
    const keyText = extractTextFromJson(record[key], childDepth);
    if (keyText) parts.push(keyText);
  }

  return parts.join(" ");
}
616
+
617
/**
 * Turns a select or multiSelect field value into text.
 *
 * A string is returned unchanged; an array keeps only its string elements
 * and joins them with ", ". Any other value yields an empty string.
 *
 * @param value - The stored select value
 * @returns The selected option(s) as text
 */
export function extractTextFromSelect(value: unknown): string {
  if (typeof value === "string") return value;
  if (!Array.isArray(value)) return "";

  const selected: string[] = [];
  for (const option of value) {
    if (typeof option === "string") selected.push(option);
  }
  return selected.join(", ");
}
635
+
636
+ // =============================================================================
637
+ // Content Extraction
638
+ // =============================================================================
639
+
640
/**
 * Builds a plain-text view of a content entry, driven by its content type.
 *
 * Fields are taken from `contentType.fields`, narrowed by `includeFields`
 * (when non-empty) and `excludeFields`. For each remaining field with a
 * non-null value, text is produced according to the field type:
 * text, richText, json, select/multiSelect, reference, media,
 * number/boolean/date/datetime, or (for unknown types) the raw string value.
 * Reference and media fields also feed the returned ID lists; media never
 * contributes text, and references contribute text only when resolved
 * titles are supplied and `includeReferenceContext` is on.
 *
 * `fullText` starts with the title field's text (if any) followed by the
 * other fields in schema order, separated by blank lines.
 *
 * @param entry - The content entry to extract from
 * @param contentType - The content type definition
 * @param options - Extraction options, merged over the defaults
 * @param resolvedReferences - Optional map of reference ID to resolved info
 * @returns Extracted text, per-field text, title, reference IDs and source info
 *
 * @example
 * ```typescript
 * const extracted = extractContent(entry, contentType, {
 *   includeFields: ["title", "content", "excerpt"],
 *   extractRichText: true,
 * });
 *
 * console.log(extracted.fullText);
 * // "My Blog Post\n\nThis is the main content...\n\nA brief excerpt."
 * ```
 */
export function extractContent(
  entry: ContentEntryInfo,
  contentType: ContentTypeInfo,
  options: Partial<RagExtractionOptions> = {},
  resolvedReferences?: Map<string, ResolvedReferenceInfo>,
): ExtractedContent {
  const settings = { ...DEFAULT_EXTRACTION_OPTIONS, ...options };
  const entryData = entry.data || {};

  const fieldTexts: Record<string, string> = {};
  const sourceInfo: ExtractedContent["sourceInfo"] = [];
  const referencedEntryIds: string[] = [];
  const referencedMediaIds: string[] = [];
  let title: string | undefined;

  // Select the fields to process: allow-list first, then deny-list.
  const allowList = settings.includeFields;
  const denyList = settings.excludeFields;
  const selectedFields = contentType.fields.filter((field) => {
    const allowed =
      !allowList || allowList.length === 0 || allowList.includes(field.name);
    const denied =
      !!denyList && denyList.length > 0 && denyList.includes(field.name);
    return allowed && !denied;
  });

  for (const field of selectedFields) {
    const raw = entryData[field.name];
    if (raw === null || raw === undefined) continue;

    let fieldText = "";

    switch (field.type) {
      case "text":
        fieldText = typeof raw === "string" ? raw : String(raw);
        break;

      case "richText":
        if (settings.extractRichText) {
          fieldText = extractTextFromRichText(raw);
        }
        break;

      case "json":
        if (settings.extractJson) {
          fieldText = extractTextFromJson(raw);
        }
        break;

      case "select":
      case "multiSelect":
        fieldText = extractTextFromSelect(raw);
        break;

      case "reference": {
        const ids = extractReferenceIds(raw, field);
        referencedEntryIds.push(...ids);

        // Only resolved references that carry a title produce text.
        if (settings.includeReferenceContext && resolvedReferences) {
          const labels: string[] = [];
          for (const id of ids) {
            const resolved = resolvedReferences.get(id);
            if (resolved && resolved.title) {
              labels.push(`[${resolved.title}]`);
            }
          }
          if (labels.length > 0) {
            fieldText = `Referenced: ${labels.join(", ")}`;
          }
        }
        break;
      }

      case "media": {
        // Media is tracked by ID only; it adds nothing to the text.
        referencedMediaIds.push(...extractMediaIds(raw, field));
        break;
      }

      case "number":
      case "boolean":
      case "date":
      case "datetime":
        fieldText = formatFieldValue(raw, field.type);
        break;

      default:
        // Unknown field type: keep it only if it is already a string.
        if (typeof raw === "string") {
          fieldText = raw;
        }
    }

    if (!fieldText) continue;

    fieldTexts[field.name] = fieldText;
    sourceInfo.push({
      fieldName: field.name,
      fieldLabel: field.label,
      fieldType: field.type,
      charCount: fieldText.length,
    });

    if (field.name === contentType.titleField) {
      title = fieldText;
    }
  }

  // Assemble the combined text: title first, then the rest in schema order.
  const sections: string[] = title ? [title] : [];
  for (const field of selectedFields) {
    if (field.name === contentType.titleField) continue;
    const fieldText = fieldTexts[field.name];
    if (fieldText) sections.push(fieldText);
  }

  return {
    fullText: sections.join("\n\n"),
    fieldTexts,
    title,
    referencedEntryIds,
    referencedMediaIds,
    sourceInfo,
  };
}
815
+
816
/**
 * Reads the entry IDs stored in a reference field.
 *
 * An array is accepted only when the field is flagged
 * `options.multiple === true`, and only its string elements are kept.
 * A single string is returned as a one-element list. Anything else,
 * including an array on a non-multiple field, yields an empty list.
 *
 * @param value - The stored field value
 * @param field - The field definition (only `options.multiple` is read)
 * @returns The referenced entry IDs
 */
function extractReferenceIds(value: unknown, field: FieldDefinition): string[] {
  if (value == null) return [];

  const allowsMany = field.options?.multiple === true;

  if (allowsMany && Array.isArray(value)) {
    const ids: string[] = [];
    for (const candidate of value) {
      if (typeof candidate === "string") ids.push(candidate);
    }
    return ids;
  }

  return typeof value === "string" ? [value] : [];
}
836
+
837
/**
 * Reads the media IDs stored in a media field.
 *
 * An array is accepted only when the field is flagged
 * `options.multiple === true`, and only its string elements are kept.
 * A single string is returned as a one-element list. Anything else,
 * including an array on a non-multiple field, yields an empty list.
 *
 * @param value - The stored field value
 * @param field - The field definition (only `options.multiple` is read)
 * @returns The referenced media IDs
 */
function extractMediaIds(value: unknown, field: FieldDefinition): string[] {
  if (value == null) return [];

  const allowsMany = field.options?.multiple === true;

  if (allowsMany && Array.isArray(value)) {
    const ids: string[] = [];
    for (const candidate of value) {
      if (typeof candidate === "string") ids.push(candidate);
    }
    return ids;
  }

  return typeof value === "string" ? [value] : [];
}
857
+
858
/**
 * Formats a scalar field value for text representation.
 *
 * - number: the value stringified
 * - boolean: "Yes" for truthy values, "No" otherwise
 * - date / datetime: strings pass through unchanged; numbers are treated as
 *   epoch milliseconds and rendered as an ISO-8601 string. A number that is
 *   not a representable timestamp (NaN, Infinity, out of range) yields ""
 *   instead of throwing, so one bad value cannot abort extraction.
 * - any other type: the value stringified
 *
 * @param value - The stored field value
 * @param fieldType - The field's type name
 * @returns The text form, or "" for null/undefined and invalid timestamps
 */
function formatFieldValue(value: unknown, fieldType: string): string {
  if (value === null || value === undefined) {
    return "";
  }

  switch (fieldType) {
    case "number":
      return String(value);

    case "boolean":
      return value ? "Yes" : "No";

    case "date":
    case "datetime": {
      if (typeof value === "string") {
        return value;
      }
      if (typeof value === "number") {
        const parsed = new Date(value);
        // toISOString() throws RangeError on an invalid Date.
        return Number.isNaN(parsed.getTime()) ? "" : parsed.toISOString();
      }
      return String(value);
    }

    default:
      return String(value);
  }
}
887
+
888
+ // =============================================================================
889
+ // Text Chunking
890
+ // =============================================================================
891
+
892
/**
 * Splits text into chunks sized for embedding.
 *
 * The algorithm:
 * 1. Blank or empty text yields no chunks; text no longer than
 *    `maxCharsSoftLimit` is returned as a single trimmed chunk.
 * 2. The text is split on `delimiter`. Only when that produces two segments
 *    or fewer are the `fallbackDelimiters` tried in order; the first one
 *    that produces more segments is used.
 * 3. Segments are accumulated until adding the next one would exceed
 *    `maxCharsSoftLimit`, at which point the current chunk is emitted.
 * 4. With `preserveHeadingContext`, the most recent heading-like segment is
 *    prepended to each new chunk that does not itself start with a heading.
 * 5. A chunk that grows past `maxCharsHardLimit` is force-split.
 *
 * When a fallback delimiter contains non-whitespace characters (". " or
 * ", "), that punctuation is kept at the end of each segment and segments
 * are re-joined with the delimiter's trailing whitespace, so sentence
 * periods and commas are not lost from the chunk text. Segments from the
 * primary delimiter, and from whitespace-only fallbacks, are joined with a
 * blank line.
 *
 * @param text - The text to chunk
 * @param options - Chunking options, merged over the defaults
 * @returns Array of text chunks
 *
 * @example
 * ```typescript
 * const chunks = chunkText(longArticle, {
 *   maxCharsSoftLimit: 1000,
 *   preserveHeadingContext: true,
 * });
 * ```
 */
export function chunkText(
  text: string,
  options: Partial<ChunkOptions> = {},
): string[] {
  const opts = { ...DEFAULT_CHUNK_OPTIONS, ...options };

  if (!text || text.trim().length === 0) {
    return [];
  }

  // Small enough to stand alone.
  if (text.length <= opts.maxCharsSoftLimit) {
    return [text.trim()];
  }

  // Primary split. `joiner` is what goes back between accumulated segments.
  let segments = text.split(opts.delimiter);
  let joiner = "\n\n";

  // Too few segments to work with: try the finer-grained delimiters.
  if (segments.length <= 2) {
    for (const fallback of opts.fallbackDelimiters) {
      const parts = text.split(fallback);
      if (parts.length > segments.length) {
        // split() discards the delimiter; put its punctuation back on every
        // segment it terminated so ". " does not strip sentence periods.
        const punctuation = fallback.trimEnd();
        const lastIndex = parts.length - 1;
        segments = parts.map((part, index) =>
          index < lastIndex ? part + punctuation : part,
        );
        if (punctuation) {
          joiner = fallback.slice(punctuation.length);
        }
        break;
      }
    }
  }

  const chunks: string[] = [];
  let currentHeading: string | null = null;
  let currentChunk = "";

  for (const segment of segments) {
    const trimmedSegment = segment.trim();
    if (!trimmedSegment) continue;

    // Remember the latest heading so later chunks can carry it as context.
    const isHeading = detectHeading(trimmedSegment);
    if (isHeading && opts.preserveHeadingContext) {
      currentHeading = trimmedSegment;
    }

    const potentialChunk = currentChunk
      ? `${currentChunk}${joiner}${trimmedSegment}`
      : trimmedSegment;

    if (potentialChunk.length > opts.maxCharsSoftLimit && currentChunk) {
      // Soft limit reached: emit what we have and start over.
      chunks.push(finalizeChunk(currentChunk, currentHeading, opts));

      currentChunk =
        opts.preserveHeadingContext && currentHeading && !isHeading
          ? `${currentHeading}\n\n${trimmedSegment}`
          : trimmedSegment;
    } else {
      currentChunk = potentialChunk;
    }

    // A single oversized segment can still blow past the hard limit.
    if (currentChunk.length > opts.maxCharsHardLimit) {
      const subChunks = forceSplitText(currentChunk, opts);
      chunks.push(...subChunks.slice(0, -1));
      currentChunk = subChunks[subChunks.length - 1] || "";
    }
  }

  // Flush the trailing chunk.
  if (currentChunk.trim()) {
    chunks.push(finalizeChunk(currentChunk, null, opts));
  }

  return chunks;
}
992
+
993
/**
 * Heuristically decides whether a text segment looks like a heading.
 *
 * A segment qualifies when, after trimming, it is shorter than 100
 * characters, does not end in ".", "!" or "?", and matches one of:
 * a Markdown heading ("# " to "###### "), a line of word characters and
 * whitespace starting with an uppercase letter (optionally ending in ":"),
 * or a numbered section ("1. Title").
 *
 * @param text - The segment to inspect
 * @returns `true` when the segment looks like a heading
 */
function detectHeading(text: string): boolean {
  const candidate = text.trim();

  if (candidate.length >= 100) return false;
  if (/[.!?]$/.test(candidate)) return false;

  const isMarkdownHeading = /^#{1,6}\s/.test(candidate);
  const isTitleLine = /^[A-Z][\w\s]+:?$/.test(candidate);
  const isNumberedSection = /^\d+\.\s+[A-Z]/.test(candidate);

  return isMarkdownHeading || isTitleLine || isNumberedSection;
}
1014
+
1015
/**
 * Produces the final form of a chunk before it is emitted.
 *
 * At present this only trims surrounding whitespace. The heading and
 * options parameters are accepted but not used, so no overlap or heading
 * text is added here.
 *
 * @param chunk - The accumulated chunk text
 * @param _heading - Unused
 * @param _opts - Unused
 * @returns The trimmed chunk
 */
function finalizeChunk(
  chunk: string,
  _heading: string | null,
  _opts: Required<ChunkOptions>,
): string {
  const cleaned = chunk.trim();
  return cleaned;
}
1025
+
1026
/**
 * Force-splits text that exceeds the hard limit.
 *
 * While the remaining text is longer than `maxCharsHardLimit`, a piece of
 * roughly `maxCharsSoftLimit` characters is cut off the front. The cut
 * prefers the last sentence boundary (". ") at or before the soft limit,
 * then the last space, provided that boundary lies beyond
 * `minCharsSoftLimit`; otherwise the text is cut at the soft limit itself.
 * Pieces are trimmed, and whatever is left at the end becomes the final piece.
 *
 * Limits are sanitized so the loop always makes progress: a hard limit
 * below 1 is treated as 1, a soft limit below 1 (or NaN) falls back to the
 * hard limit, a soft limit above the hard limit is capped, and a cut is
 * never placed at position 0. For valid settings (1 <= soft < hard,
 * min >= 0) the result is the same as cutting at the soft limit directly.
 *
 * @param text - The oversized text
 * @param opts - Chunk options supplying the three character limits
 * @returns The pieces in order; empty pieces are omitted
 */
function forceSplitText(text: string, opts: Required<ChunkOptions>): string[] {
  const flooredHard = Math.floor(opts.maxCharsHardLimit);
  const hardLimit = flooredHard >= 1 ? flooredHard : 1;

  const flooredSoft = Math.floor(opts.maxCharsSoftLimit);
  const targetSize = flooredSoft >= 1 ? Math.min(flooredSoft, hardLimit) : hardLimit;

  // A boundary must sit past this index; never below 0 so a "not found"
  // result (-1) or index 0 can never be chosen as the cut.
  const minBoundary = Math.max(0, opts.minCharsSoftLimit);

  const pieces: string[] = [];
  let remaining = text;

  while (remaining.length > hardLimit) {
    let splitPoint = targetSize;

    const sentenceEnd = remaining.lastIndexOf(". ", targetSize);
    if (sentenceEnd > minBoundary) {
      // Keep the period with the sentence it ends.
      splitPoint = sentenceEnd + 1;
    } else {
      const spacePoint = remaining.lastIndexOf(" ", targetSize);
      if (spacePoint > minBoundary) {
        splitPoint = spacePoint;
      }
    }

    // Guarantees both progress (>= 1) and the hard limit (<= hardLimit).
    splitPoint = Math.min(Math.max(splitPoint, 1), hardLimit);

    const piece = remaining.slice(0, splitPoint).trim();
    if (piece) {
      pieces.push(piece);
    }
    remaining = remaining.slice(splitPoint).trim();
  }

  if (remaining) {
    pieces.push(remaining);
  }

  return pieces;
}
1059
+
1060
+ // =============================================================================
1061
+ // Main API Functions
1062
+ // =============================================================================
1063
+
1064
/**
 * Processes a content entry into chunks ready for RAG indexing.
 *
 * This is the main function to use for preparing CMS content for @convex-dev/rag.
 * It extracts the entry's text, optionally wraps it with `chunkPrefix` /
 * `chunkSuffix`, splits it into chunks and attaches metadata to each one.
 *
 * `chunkPrefix` may contain the placeholders `{contentType}`, `{title}` and
 * `{slug}`. Every occurrence is substituted literally: `$` sequences in a
 * title or slug are not interpreted as replacement patterns, and substituted
 * text is not scanned again for placeholders. `{title}` falls back to the
 * slug when no title was extracted. The prefix and suffix are applied once
 * to the full text before chunking, not to each chunk.
 *
 * Metadata is attached to every chunk regardless of `includeMetadata`.
 * When `createSummaryChunk` is set, a summary chunk is inserted at the front
 * and `chunkIndex` / `totalChunks` are renumbered across all chunks.
 *
 * @param entry - The content entry to process
 * @param contentType - The content type definition
 * @param options - Extraction and chunking options, merged over the defaults
 * @param resolvedReferences - Optional map of resolved references for context
 * @returns Array of content chunks with metadata; empty when no text was extracted
 *
 * @example
 * ```typescript
 * // In a Convex action
 * export const indexEntry = action({
 *   args: { entryId: v.id("contentEntries") },
 *   handler: async (ctx, { entryId }) => {
 *     const entry = await ctx.runQuery(api.contentEntries.get, { id: entryId });
 *     const contentType = await ctx.runQuery(api.contentTypes.get, {
 *       id: entry.contentTypeId
 *     });
 *
 *     const chunks = chunkContentEntry(entry, contentType, {
 *       chunkOptions: { maxCharsSoftLimit: 800 },
 *       includeMetadata: true,
 *     });
 *
 *     // Add to RAG index
 *     await rag.add(ctx, {
 *       namespace: `cms:${contentType.name}`,
 *       key: entryId,
 *       chunks: chunks.map(c => c.text),
 *       title: entry.data.title,
 *     });
 *
 *     return { indexed: chunks.length };
 *   },
 * });
 * ```
 */
export function chunkContentEntry(
  entry: ContentEntryInfo,
  contentType: ContentTypeInfo,
  options: Partial<RagExtractionOptions> = {},
  resolvedReferences?: Map<string, ResolvedReferenceInfo>,
): ContentChunk[] {
  const opts = { ...DEFAULT_EXTRACTION_OPTIONS, ...options };

  // Extract content from the entry
  const extracted = extractContent(
    entry,
    contentType,
    opts,
    resolvedReferences,
  );

  if (!extracted.fullText) {
    return [];
  }

  // Apply prefix/suffix to full text before chunking
  let textToChunk = extracted.fullText;
  if (opts.chunkPrefix) {
    const titleOrSlug = extracted.title || entry.slug;
    // A replacer function inserts the value verbatim; a plain replacement
    // string would expand "$&", "$1", "$$" found in user-authored titles.
    const prefix = opts.chunkPrefix.replace(
      /\{(contentType|title|slug)\}/g,
      (_placeholder: string, key: string): string => {
        if (key === "contentType") return contentType.displayName;
        if (key === "title") return titleOrSlug;
        return entry.slug;
      },
    );
    textToChunk = `${prefix}\n\n${textToChunk}`;
  }
  if (opts.chunkSuffix) {
    textToChunk = `${textToChunk}\n\n${opts.chunkSuffix}`;
  }

  // Chunk the text
  const textChunks = chunkText(textToChunk, opts.chunkOptions);

  // Build content chunks with metadata
  const chunks: ContentChunk[] = textChunks.map((text, index) => {
    const metadata: ChunkMetadata = {
      entryId: entry._id,
      contentType: contentType.name,
      contentTypeDisplayName: contentType.displayName,
      slug: entry.slug,
      status: entry.status,
      locale: entry.locale,
      sourceFields: extracted.sourceInfo.map((s) => s.fieldName),
      chunkIndex: index,
      totalChunks: textChunks.length,
      title: extracted.title,
      createdAt: new Date(entry._creationTime).toISOString(),
      firstPublishedAt: entry.firstPublishedAt
        ? new Date(entry.firstPublishedAt).toISOString()
        : undefined,
      lastPublishedAt: entry.lastPublishedAt
        ? new Date(entry.lastPublishedAt).toISOString()
        : undefined,
      version: entry.version,
      referencedEntryIds:
        extracted.referencedEntryIds.length > 0
          ? extracted.referencedEntryIds
          : undefined,
      referencedMediaIds:
        extracted.referencedMediaIds.length > 0
          ? extracted.referencedMediaIds
          : undefined,
      semanticType: detectSemanticType(text),
    };

    return { text, metadata };
  });

  // Optionally create a summary chunk
  if (opts.createSummaryChunk && chunks.length > 0) {
    const summaryChunk = createSummaryChunk(
      entry,
      contentType,
      extracted,
      chunks.length,
    );
    chunks.unshift(summaryChunk);

    // The summary took index 0, so renumber everything.
    chunks.forEach((chunk, index) => {
      chunk.metadata.chunkIndex = index;
      chunk.metadata.totalChunks = chunks.length;
    });
  }

  return chunks;
}
1195
+
1196
/**
 * Classifies a chunk by the shape of its text.
 *
 * Checks run in this order, first match wins:
 * 1. A single line without closing ".", "!" or "?" that is either a
 *    Markdown heading or shorter than 100 characters: "title" when under
 *    20 characters, otherwise "heading".
 * 2. Any line starting with "- ", "* " or "N. ": "list".
 * 3. Text starting with a double quote or ">": "quote".
 * 4. Text starting with a code fence, or any line starting with four
 *    whitespace characters: "code".
 * 5. Otherwise "mixed" when it contains a blank line, else "paragraph".
 *
 * @param text - The chunk text
 * @returns The detected semantic type
 */
function detectSemanticType(text: string): ChunkSemanticType {
  const body = text.trim();

  // Heading-like: one line, no sentence-ending punctuation.
  const isSingleLine = !body.includes("\n");
  if (
    isSingleLine &&
    !/[.!?]$/.test(body) &&
    (/^#{1,6}\s/.test(body) || body.length < 100)
  ) {
    return body.length < 20 ? "title" : "heading";
  }

  if (/^[-*]\s/m.test(body) || /^\d+\.\s/m.test(body)) {
    return "list";
  }

  if (body.startsWith('"') || body.startsWith(">")) {
    return "quote";
  }

  if (body.startsWith("```") || /^\s{4}/m.test(body)) {
    return "code";
  }

  if (body.includes("\n\n")) {
    return "mixed";
  }
  return "paragraph";
}
1231
+
1232
/**
 * Creates a summary chunk from key fields.
 *
 * The text has one line each for: title (when extracted), content type
 * display name, status, last published date (when set) and an excerpt of up
 * to 200 characters taken from the first extracted field.
 *
 * The published date is written as a UTC `YYYY-MM-DD` string so the chunk
 * text is identical on every host; a locale-formatted date would vary with
 * the runtime's locale and time zone. The excerpt gets a trailing "..."
 * only when the field text was actually cut.
 *
 * @param entry - The content entry being summarized
 * @param contentType - The entry's content type
 * @param extracted - Result of content extraction for the entry
 * @param totalChunks - Number of regular chunks; the summary adds one to it
 * @returns A chunk at index 0 with `sourceFields` set to `["_summary"]`
 */
function createSummaryChunk(
  entry: ContentEntryInfo,
  contentType: ContentTypeInfo,
  extracted: ExtractedContent,
  totalChunks: number,
): ContentChunk {
  const excerptLimit = 200;
  const summaryParts: string[] = [];

  // Add title
  if (extracted.title) {
    summaryParts.push(`Title: ${extracted.title}`);
  }

  // Add content type
  summaryParts.push(`Type: ${contentType.displayName}`);

  // Add status and dates
  summaryParts.push(`Status: ${entry.status}`);
  if (entry.lastPublishedAt) {
    // First ten characters of the ISO string are the UTC calendar date.
    const publishedDate = new Date(entry.lastPublishedAt)
      .toISOString()
      .slice(0, 10);
    summaryParts.push(`Published: ${publishedDate}`);
  }

  // Add brief excerpt from first field
  const firstField = Object.keys(extracted.fieldTexts)[0];
  const firstText = firstField ? extracted.fieldTexts[firstField] : undefined;
  if (firstText) {
    const wasTruncated = firstText.length > excerptLimit;
    const excerpt = firstText.slice(0, excerptLimit);
    summaryParts.push(`Summary: ${excerpt}${wasTruncated ? "..." : ""}`);
  }

  return {
    text: summaryParts.join("\n"),
    metadata: {
      entryId: entry._id,
      contentType: contentType.name,
      contentTypeDisplayName: contentType.displayName,
      slug: entry.slug,
      status: entry.status,
      locale: entry.locale,
      sourceFields: ["_summary"],
      chunkIndex: 0,
      totalChunks: totalChunks + 1,
      title: extracted.title,
      createdAt: new Date(entry._creationTime).toISOString(),
      firstPublishedAt: entry.firstPublishedAt
        ? new Date(entry.firstPublishedAt).toISOString()
        : undefined,
      lastPublishedAt: entry.lastPublishedAt
        ? new Date(entry.lastPublishedAt).toISOString()
        : undefined,
      version: entry.version,
      semanticType: "field_value",
    },
  };
}
1293
+
1294
+ // =============================================================================
1295
+ // Batch Processing Utilities
1296
+ // =============================================================================
1297
+
1298
/**
 * Chunks a batch of content entries.
 *
 * Each entry's content type is looked up by `entry.contentTypeId`. Entries
 * whose content type is missing from the map are reported with
 * `console.warn` and left out of the result.
 *
 * @param entries - Array of content entries
 * @param contentTypes - Map of content type ID to content type
 * @param options - Extraction options applied to every entry
 * @returns Map of entry ID to that entry's chunks
 */
export function chunkMultipleEntries(
  entries: ContentEntryInfo[],
  contentTypes: Map<string, ContentTypeInfo>,
  options: Partial<RagExtractionOptions> = {},
): Map<string, ContentChunk[]> {
  const chunksByEntry = new Map<string, ContentChunk[]>();

  entries.forEach((entry) => {
    const schema = contentTypes.get(entry.contentTypeId);
    if (schema) {
      chunksByEntry.set(entry._id, chunkContentEntry(entry, schema, options));
    } else {
      console.warn(`Content type not found for entry ${entry._id}`);
    }
  });

  return chunksByEntry;
}
1328
+
1329
/**
 * Computes chunk and character totals for a batch of entries.
 *
 * Useful for estimating indexing costs. Entries whose content type is
 * missing from the map contribute no chunks but are still counted in
 * `totalEntries`, and therefore in the per-entry average. Both averages
 * are 0 when their denominator is 0.
 *
 * @param entries - Array of content entries
 * @param contentTypes - Map of content type ID to content type
 * @param options - Extraction options applied to every entry
 * @returns Totals and averages across the batch
 */
export function estimateChunkingStats(
  entries: ContentEntryInfo[],
  contentTypes: Map<string, ContentTypeInfo>,
  options: Partial<RagExtractionOptions> = {},
): {
  totalEntries: number;
  totalChunks: number;
  totalCharacters: number;
  averageChunksPerEntry: number;
  averageCharsPerChunk: number;
} {
  const entryCount = entries.length;
  let chunkCount = 0;
  let charCount = 0;

  for (const entry of entries) {
    const schema = contentTypes.get(entry.contentTypeId);
    if (!schema) continue;

    const entryChunks = chunkContentEntry(entry, schema, options);
    chunkCount += entryChunks.length;
    for (const chunk of entryChunks) {
      charCount += chunk.text.length;
    }
  }

  const averageChunksPerEntry = entryCount > 0 ? chunkCount / entryCount : 0;
  const averageCharsPerChunk = chunkCount > 0 ? charCount / chunkCount : 0;

  return {
    totalEntries: entryCount,
    totalChunks: chunkCount,
    totalCharacters: charCount,
    averageChunksPerEntry,
    averageCharsPerChunk,
  };
}
1366
+
1367
+ // =============================================================================
1368
+ // Exports
1369
+ // =============================================================================
1370
+
1371
+ export { DEFAULT_CHUNK_OPTIONS, DEFAULT_EXTRACTION_OPTIONS };