strapi-content-embeddings 0.1.7 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +69 -2
- package/dist/_chunks/{App-CA5bQnKQ.js → App-BfvnOBS9.js} +430 -7
- package/dist/_chunks/{App-C5NFY1UT.mjs → App-sRU0Nh3x.mjs} +432 -9
- package/dist/_chunks/{index-CIpGvEcJ.mjs → index-C58A29qR.mjs} +1 -1
- package/dist/_chunks/{index-CVCA8dDp.js → index-DkNKkHgk.js} +1 -1
- package/dist/admin/index.js +1 -1
- package/dist/admin/index.mjs +1 -1
- package/dist/admin/src/components/custom/SyncModal.d.ts +7 -0
- package/dist/admin/src/utils/api.d.ts +48 -0
- package/dist/server/index.js +141 -5
- package/dist/server/index.mjs +139 -5
- package/dist/server/src/config/index.d.ts +3 -0
- package/dist/server/src/index.d.ts +1 -0
- package/dist/server/src/services/sync.d.ts +1 -0
- package/dist/server/src/utils/preprocessing.d.ts +26 -0
- package/package.json +3 -1
package/dist/server/index.js
CHANGED
|
@@ -10,6 +10,10 @@ const types_js = require("@modelcontextprotocol/sdk/types.js");
|
|
|
10
10
|
const zod = require("zod");
|
|
11
11
|
const node_crypto = require("node:crypto");
|
|
12
12
|
const streamableHttp_js = require("@modelcontextprotocol/sdk/server/streamableHttp.js");
|
|
13
|
+
const htmlToText = require("html-to-text");
|
|
14
|
+
const removeMarkdown = require("remove-markdown");
|
|
15
|
+
const _interopDefault = (e) => e && e.__esModule ? e : { default: e };
|
|
16
|
+
const removeMarkdown__default = /* @__PURE__ */ _interopDefault(removeMarkdown);
|
|
13
17
|
const EMBEDDING_MODELS = {
|
|
14
18
|
"text-embedding-3-small": { dimensions: 1536 },
|
|
15
19
|
"text-embedding-3-large": { dimensions: 3072 },
|
|
@@ -22,7 +26,8 @@ const config = {
|
|
|
22
26
|
embeddingModel: "text-embedding-3-small",
|
|
23
27
|
chunkSize: 4e3,
|
|
24
28
|
chunkOverlap: 200,
|
|
25
|
-
autoChunk: false
|
|
29
|
+
autoChunk: false,
|
|
30
|
+
preprocessContent: true
|
|
26
31
|
},
|
|
27
32
|
validator(config2) {
|
|
28
33
|
if (!config2.openAIApiKey) {
|
|
@@ -1512,6 +1517,32 @@ const admin = [
|
|
|
1512
1517
|
}
|
|
1513
1518
|
]
|
|
1514
1519
|
}
|
|
1520
|
+
},
|
|
1521
|
+
{
|
|
1522
|
+
method: "GET",
|
|
1523
|
+
path: "/sync/status",
|
|
1524
|
+
handler: "controller.getSyncStatus",
|
|
1525
|
+
config: {
|
|
1526
|
+
policies: [
|
|
1527
|
+
{
|
|
1528
|
+
name: "admin::hasPermissions",
|
|
1529
|
+
config: { actions: ["plugin::strapi-content-embeddings.read"] }
|
|
1530
|
+
}
|
|
1531
|
+
]
|
|
1532
|
+
}
|
|
1533
|
+
},
|
|
1534
|
+
{
|
|
1535
|
+
method: "POST",
|
|
1536
|
+
path: "/sync",
|
|
1537
|
+
handler: "controller.syncFromNeon",
|
|
1538
|
+
config: {
|
|
1539
|
+
policies: [
|
|
1540
|
+
{
|
|
1541
|
+
name: "admin::hasPermissions",
|
|
1542
|
+
config: { actions: ["plugin::strapi-content-embeddings.update"] }
|
|
1543
|
+
}
|
|
1544
|
+
]
|
|
1545
|
+
}
|
|
1515
1546
|
}
|
|
1516
1547
|
];
|
|
1517
1548
|
const routes = {
|
|
@@ -1671,6 +1702,103 @@ function formatChunkTitle(baseTitle, chunkIndex, totalChunks) {
|
|
|
1671
1702
|
}
|
|
1672
1703
|
return `${baseTitle} [Part ${chunkIndex + 1}/${totalChunks}]`;
|
|
1673
1704
|
}
|
|
1705
|
+
const DEFAULT_OPTIONS = {
|
|
1706
|
+
stripHtml: true,
|
|
1707
|
+
stripMarkdown: true,
|
|
1708
|
+
normalizeWhitespace: true
|
|
1709
|
+
};
|
|
1710
|
+
function containsHtml(content) {
|
|
1711
|
+
return /<[a-z][\s\S]*>/i.test(content);
|
|
1712
|
+
}
|
|
1713
|
+
function containsMarkdown(content) {
|
|
1714
|
+
const markdownPatterns = [
|
|
1715
|
+
/^#{1,6}\s/m,
|
|
1716
|
+
// Headers: # ## ### etc
|
|
1717
|
+
/\*\*[^*]+\*\*/,
|
|
1718
|
+
// Bold: **text**
|
|
1719
|
+
/\*[^*]+\*/,
|
|
1720
|
+
// Italic: *text*
|
|
1721
|
+
/__[^_]+__/,
|
|
1722
|
+
// Bold: __text__
|
|
1723
|
+
/_[^_]+_/,
|
|
1724
|
+
// Italic: _text_
|
|
1725
|
+
/\[.+\]\(.+\)/,
|
|
1726
|
+
// Links: [text](url)
|
|
1727
|
+
/^[-*+]\s/m,
|
|
1728
|
+
// Unordered lists: - * +
|
|
1729
|
+
/^\d+\.\s/m,
|
|
1730
|
+
// Ordered lists: 1. 2. etc
|
|
1731
|
+
/^>\s/m,
|
|
1732
|
+
// Blockquotes: >
|
|
1733
|
+
/`[^`]+`/,
|
|
1734
|
+
// Inline code: `code`
|
|
1735
|
+
/```[\s\S]*?```/,
|
|
1736
|
+
// Code blocks: ```code```
|
|
1737
|
+
/^\|.+\|$/m
|
|
1738
|
+
// Tables: |col|col|
|
|
1739
|
+
];
|
|
1740
|
+
return markdownPatterns.some((pattern) => pattern.test(content));
|
|
1741
|
+
}
|
|
1742
|
+
function stripHtml(content) {
|
|
1743
|
+
if (!containsHtml(content)) {
|
|
1744
|
+
return content;
|
|
1745
|
+
}
|
|
1746
|
+
return htmlToText.convert(content, {
|
|
1747
|
+
wordwrap: false,
|
|
1748
|
+
preserveNewlines: true,
|
|
1749
|
+
selectors: [
|
|
1750
|
+
// Convert headings to plain text with colon
|
|
1751
|
+
{ selector: "h1", options: { uppercase: false, trailingLineBreaks: 1 } },
|
|
1752
|
+
{ selector: "h2", options: { uppercase: false, trailingLineBreaks: 1 } },
|
|
1753
|
+
{ selector: "h3", options: { uppercase: false, trailingLineBreaks: 1 } },
|
|
1754
|
+
{ selector: "h4", options: { uppercase: false, trailingLineBreaks: 1 } },
|
|
1755
|
+
{ selector: "h5", options: { uppercase: false, trailingLineBreaks: 1 } },
|
|
1756
|
+
{ selector: "h6", options: { uppercase: false, trailingLineBreaks: 1 } },
|
|
1757
|
+
// Remove images but keep alt text
|
|
1758
|
+
{ selector: "img", format: "skip" },
|
|
1759
|
+
// Remove scripts and styles completely
|
|
1760
|
+
{ selector: "script", format: "skip" },
|
|
1761
|
+
{ selector: "style", format: "skip" },
|
|
1762
|
+
// Keep link text, remove URLs
|
|
1763
|
+
{ selector: "a", options: { ignoreHref: true } }
|
|
1764
|
+
]
|
|
1765
|
+
});
|
|
1766
|
+
}
|
|
1767
|
+
function stripMarkdownSyntax(content) {
|
|
1768
|
+
if (!containsMarkdown(content)) {
|
|
1769
|
+
return content;
|
|
1770
|
+
}
|
|
1771
|
+
let result = removeMarkdown__default.default(content, {
|
|
1772
|
+
stripListLeaders: true,
|
|
1773
|
+
listUnicodeChar: "",
|
|
1774
|
+
gfm: true,
|
|
1775
|
+
useImgAltText: true
|
|
1776
|
+
});
|
|
1777
|
+
result = result.replace(/```[\s\S]*?```/g, (match) => {
|
|
1778
|
+
return match.replace(/```\w*\n?/g, "").replace(/```/g, "");
|
|
1779
|
+
}).replace(/`([^`]+)`/g, "$1").replace(/^[-*_]{3,}$/gm, "");
|
|
1780
|
+
return result;
|
|
1781
|
+
}
|
|
1782
|
+
function normalizeWhitespace(content) {
|
|
1783
|
+
return content.replace(/\n{3,}/g, "\n\n").replace(/[ \t]+/g, " ").split("\n").map((line) => line.trim()).join("\n").trim();
|
|
1784
|
+
}
|
|
1785
|
+
function preprocessContent(content, options2 = {}) {
|
|
1786
|
+
const opts = { ...DEFAULT_OPTIONS, ...options2 };
|
|
1787
|
+
if (!content || typeof content !== "string") {
|
|
1788
|
+
return "";
|
|
1789
|
+
}
|
|
1790
|
+
let result = content;
|
|
1791
|
+
if (opts.stripHtml) {
|
|
1792
|
+
result = stripHtml(result);
|
|
1793
|
+
}
|
|
1794
|
+
if (opts.stripMarkdown) {
|
|
1795
|
+
result = stripMarkdownSyntax(result);
|
|
1796
|
+
}
|
|
1797
|
+
if (opts.normalizeWhitespace) {
|
|
1798
|
+
result = normalizeWhitespace(result);
|
|
1799
|
+
}
|
|
1800
|
+
return result;
|
|
1801
|
+
}
|
|
1674
1802
|
const PLUGIN_ID$1 = "strapi-content-embeddings";
|
|
1675
1803
|
const CONTENT_TYPE_UID$1 = `plugin::${PLUGIN_ID$1}.embedding`;
|
|
1676
1804
|
const embeddings = ({ strapi }) => ({
|
|
@@ -1683,6 +1811,8 @@ const embeddings = ({ strapi }) => ({
|
|
|
1683
1811
|
chunkSize: config2.chunkSize || 4e3,
|
|
1684
1812
|
chunkOverlap: config2.chunkOverlap || 200,
|
|
1685
1813
|
autoChunk: config2.autoChunk || false,
|
|
1814
|
+
preprocessContent: config2.preprocessContent !== false,
|
|
1815
|
+
// Default true
|
|
1686
1816
|
...config2
|
|
1687
1817
|
};
|
|
1688
1818
|
},
|
|
@@ -1690,8 +1820,9 @@ const embeddings = ({ strapi }) => ({
|
|
|
1690
1820
|
* Create a single embedding (no chunking)
|
|
1691
1821
|
*/
|
|
1692
1822
|
async createEmbedding(data) {
|
|
1693
|
-
const { title, content, collectionType, fieldName, metadata, related, autoChunk } = data.data;
|
|
1823
|
+
const { title, content: rawContent, collectionType, fieldName, metadata, related, autoChunk } = data.data;
|
|
1694
1824
|
const config2 = this.getConfig();
|
|
1825
|
+
const content = config2.preprocessContent ? preprocessContent(rawContent) : rawContent;
|
|
1695
1826
|
const shouldChunk = autoChunk ?? config2.autoChunk;
|
|
1696
1827
|
const chunkSize = config2.chunkSize || 4e3;
|
|
1697
1828
|
if (shouldChunk && needsChunking(content, chunkSize)) {
|
|
@@ -1741,8 +1872,9 @@ const embeddings = ({ strapi }) => ({
|
|
|
1741
1872
|
* Creates multiple embedding entities, one per chunk
|
|
1742
1873
|
*/
|
|
1743
1874
|
async createChunkedEmbedding(data) {
|
|
1744
|
-
const { title, content, collectionType, fieldName, metadata, related } = data.data;
|
|
1875
|
+
const { title, content: rawContent, collectionType, fieldName, metadata, related } = data.data;
|
|
1745
1876
|
const config2 = this.getConfig();
|
|
1877
|
+
const content = config2.preprocessContent ? preprocessContent(rawContent) : rawContent;
|
|
1746
1878
|
const chunkSize = config2.chunkSize || 4e3;
|
|
1747
1879
|
const chunkOverlap = config2.chunkOverlap || 200;
|
|
1748
1880
|
const chunks = chunkContent(content, { chunkSize, chunkOverlap });
|
|
@@ -2011,8 +2143,9 @@ const embeddings = ({ strapi }) => ({
|
|
|
2011
2143
|
}
|
|
2012
2144
|
},
|
|
2013
2145
|
async updateEmbedding(id, data) {
|
|
2014
|
-
const { title, content, metadata, autoChunk } = data.data;
|
|
2146
|
+
const { title, content: rawContent, metadata, autoChunk } = data.data;
|
|
2015
2147
|
const config2 = this.getConfig();
|
|
2148
|
+
const content = rawContent !== void 0 && config2.preprocessContent ? preprocessContent(rawContent) : rawContent;
|
|
2016
2149
|
const currentEntry = await strapi.documents(CONTENT_TYPE_UID$1).findOne({
|
|
2017
2150
|
documentId: id
|
|
2018
2151
|
});
|
|
@@ -2028,7 +2161,9 @@ const embeddings = ({ strapi }) => ({
|
|
|
2028
2161
|
const contentNeedsChunking = shouldChunk && needsChunking(newContent, chunkSize);
|
|
2029
2162
|
const contentChanged = content !== void 0 && content !== currentEntry.content;
|
|
2030
2163
|
if (hasRelatedChunks || contentNeedsChunking) {
|
|
2031
|
-
const result = await this.updateChunkedEmbedding(id,
|
|
2164
|
+
const result = await this.updateChunkedEmbedding(id, {
|
|
2165
|
+
data: { ...data.data, content }
|
|
2166
|
+
});
|
|
2032
2167
|
return result.entity;
|
|
2033
2168
|
}
|
|
2034
2169
|
const updateData = {};
|
|
@@ -2121,6 +2256,7 @@ const sync = ({ strapi }) => ({
|
|
|
2121
2256
|
const result = {
|
|
2122
2257
|
success: false,
|
|
2123
2258
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
2259
|
+
dryRun,
|
|
2124
2260
|
neonCount: 0,
|
|
2125
2261
|
strapiCount: 0,
|
|
2126
2262
|
actions: {
|
package/dist/server/index.mjs
CHANGED
|
@@ -9,6 +9,8 @@ import { ListToolsRequestSchema, CallToolRequestSchema } from "@modelcontextprot
|
|
|
9
9
|
import { z } from "zod";
|
|
10
10
|
import { randomUUID } from "node:crypto";
|
|
11
11
|
import { StreamableHTTPServerTransport } from "@modelcontextprotocol/sdk/server/streamableHttp.js";
|
|
12
|
+
import { convert } from "html-to-text";
|
|
13
|
+
import removeMarkdown from "remove-markdown";
|
|
12
14
|
const EMBEDDING_MODELS = {
|
|
13
15
|
"text-embedding-3-small": { dimensions: 1536 },
|
|
14
16
|
"text-embedding-3-large": { dimensions: 3072 },
|
|
@@ -21,7 +23,8 @@ const config = {
|
|
|
21
23
|
embeddingModel: "text-embedding-3-small",
|
|
22
24
|
chunkSize: 4e3,
|
|
23
25
|
chunkOverlap: 200,
|
|
24
|
-
autoChunk: false
|
|
26
|
+
autoChunk: false,
|
|
27
|
+
preprocessContent: true
|
|
25
28
|
},
|
|
26
29
|
validator(config2) {
|
|
27
30
|
if (!config2.openAIApiKey) {
|
|
@@ -1511,6 +1514,32 @@ const admin = [
|
|
|
1511
1514
|
}
|
|
1512
1515
|
]
|
|
1513
1516
|
}
|
|
1517
|
+
},
|
|
1518
|
+
{
|
|
1519
|
+
method: "GET",
|
|
1520
|
+
path: "/sync/status",
|
|
1521
|
+
handler: "controller.getSyncStatus",
|
|
1522
|
+
config: {
|
|
1523
|
+
policies: [
|
|
1524
|
+
{
|
|
1525
|
+
name: "admin::hasPermissions",
|
|
1526
|
+
config: { actions: ["plugin::strapi-content-embeddings.read"] }
|
|
1527
|
+
}
|
|
1528
|
+
]
|
|
1529
|
+
}
|
|
1530
|
+
},
|
|
1531
|
+
{
|
|
1532
|
+
method: "POST",
|
|
1533
|
+
path: "/sync",
|
|
1534
|
+
handler: "controller.syncFromNeon",
|
|
1535
|
+
config: {
|
|
1536
|
+
policies: [
|
|
1537
|
+
{
|
|
1538
|
+
name: "admin::hasPermissions",
|
|
1539
|
+
config: { actions: ["plugin::strapi-content-embeddings.update"] }
|
|
1540
|
+
}
|
|
1541
|
+
]
|
|
1542
|
+
}
|
|
1514
1543
|
}
|
|
1515
1544
|
];
|
|
1516
1545
|
const routes = {
|
|
@@ -1670,6 +1699,103 @@ function formatChunkTitle(baseTitle, chunkIndex, totalChunks) {
|
|
|
1670
1699
|
}
|
|
1671
1700
|
return `${baseTitle} [Part ${chunkIndex + 1}/${totalChunks}]`;
|
|
1672
1701
|
}
|
|
1702
|
+
const DEFAULT_OPTIONS = {
|
|
1703
|
+
stripHtml: true,
|
|
1704
|
+
stripMarkdown: true,
|
|
1705
|
+
normalizeWhitespace: true
|
|
1706
|
+
};
|
|
1707
|
+
function containsHtml(content) {
|
|
1708
|
+
return /<[a-z][\s\S]*>/i.test(content);
|
|
1709
|
+
}
|
|
1710
|
+
function containsMarkdown(content) {
|
|
1711
|
+
const markdownPatterns = [
|
|
1712
|
+
/^#{1,6}\s/m,
|
|
1713
|
+
// Headers: # ## ### etc
|
|
1714
|
+
/\*\*[^*]+\*\*/,
|
|
1715
|
+
// Bold: **text**
|
|
1716
|
+
/\*[^*]+\*/,
|
|
1717
|
+
// Italic: *text*
|
|
1718
|
+
/__[^_]+__/,
|
|
1719
|
+
// Bold: __text__
|
|
1720
|
+
/_[^_]+_/,
|
|
1721
|
+
// Italic: _text_
|
|
1722
|
+
/\[.+\]\(.+\)/,
|
|
1723
|
+
// Links: [text](url)
|
|
1724
|
+
/^[-*+]\s/m,
|
|
1725
|
+
// Unordered lists: - * +
|
|
1726
|
+
/^\d+\.\s/m,
|
|
1727
|
+
// Ordered lists: 1. 2. etc
|
|
1728
|
+
/^>\s/m,
|
|
1729
|
+
// Blockquotes: >
|
|
1730
|
+
/`[^`]+`/,
|
|
1731
|
+
// Inline code: `code`
|
|
1732
|
+
/```[\s\S]*?```/,
|
|
1733
|
+
// Code blocks: ```code```
|
|
1734
|
+
/^\|.+\|$/m
|
|
1735
|
+
// Tables: |col|col|
|
|
1736
|
+
];
|
|
1737
|
+
return markdownPatterns.some((pattern) => pattern.test(content));
|
|
1738
|
+
}
|
|
1739
|
+
function stripHtml(content) {
|
|
1740
|
+
if (!containsHtml(content)) {
|
|
1741
|
+
return content;
|
|
1742
|
+
}
|
|
1743
|
+
return convert(content, {
|
|
1744
|
+
wordwrap: false,
|
|
1745
|
+
preserveNewlines: true,
|
|
1746
|
+
selectors: [
|
|
1747
|
+
// Convert headings to plain text with colon
|
|
1748
|
+
{ selector: "h1", options: { uppercase: false, trailingLineBreaks: 1 } },
|
|
1749
|
+
{ selector: "h2", options: { uppercase: false, trailingLineBreaks: 1 } },
|
|
1750
|
+
{ selector: "h3", options: { uppercase: false, trailingLineBreaks: 1 } },
|
|
1751
|
+
{ selector: "h4", options: { uppercase: false, trailingLineBreaks: 1 } },
|
|
1752
|
+
{ selector: "h5", options: { uppercase: false, trailingLineBreaks: 1 } },
|
|
1753
|
+
{ selector: "h6", options: { uppercase: false, trailingLineBreaks: 1 } },
|
|
1754
|
+
// Remove images but keep alt text
|
|
1755
|
+
{ selector: "img", format: "skip" },
|
|
1756
|
+
// Remove scripts and styles completely
|
|
1757
|
+
{ selector: "script", format: "skip" },
|
|
1758
|
+
{ selector: "style", format: "skip" },
|
|
1759
|
+
// Keep link text, remove URLs
|
|
1760
|
+
{ selector: "a", options: { ignoreHref: true } }
|
|
1761
|
+
]
|
|
1762
|
+
});
|
|
1763
|
+
}
|
|
1764
|
+
function stripMarkdownSyntax(content) {
|
|
1765
|
+
if (!containsMarkdown(content)) {
|
|
1766
|
+
return content;
|
|
1767
|
+
}
|
|
1768
|
+
let result = removeMarkdown(content, {
|
|
1769
|
+
stripListLeaders: true,
|
|
1770
|
+
listUnicodeChar: "",
|
|
1771
|
+
gfm: true,
|
|
1772
|
+
useImgAltText: true
|
|
1773
|
+
});
|
|
1774
|
+
result = result.replace(/```[\s\S]*?```/g, (match) => {
|
|
1775
|
+
return match.replace(/```\w*\n?/g, "").replace(/```/g, "");
|
|
1776
|
+
}).replace(/`([^`]+)`/g, "$1").replace(/^[-*_]{3,}$/gm, "");
|
|
1777
|
+
return result;
|
|
1778
|
+
}
|
|
1779
|
+
function normalizeWhitespace(content) {
|
|
1780
|
+
return content.replace(/\n{3,}/g, "\n\n").replace(/[ \t]+/g, " ").split("\n").map((line) => line.trim()).join("\n").trim();
|
|
1781
|
+
}
|
|
1782
|
+
function preprocessContent(content, options2 = {}) {
|
|
1783
|
+
const opts = { ...DEFAULT_OPTIONS, ...options2 };
|
|
1784
|
+
if (!content || typeof content !== "string") {
|
|
1785
|
+
return "";
|
|
1786
|
+
}
|
|
1787
|
+
let result = content;
|
|
1788
|
+
if (opts.stripHtml) {
|
|
1789
|
+
result = stripHtml(result);
|
|
1790
|
+
}
|
|
1791
|
+
if (opts.stripMarkdown) {
|
|
1792
|
+
result = stripMarkdownSyntax(result);
|
|
1793
|
+
}
|
|
1794
|
+
if (opts.normalizeWhitespace) {
|
|
1795
|
+
result = normalizeWhitespace(result);
|
|
1796
|
+
}
|
|
1797
|
+
return result;
|
|
1798
|
+
}
|
|
1673
1799
|
const PLUGIN_ID$1 = "strapi-content-embeddings";
|
|
1674
1800
|
const CONTENT_TYPE_UID$1 = `plugin::${PLUGIN_ID$1}.embedding`;
|
|
1675
1801
|
const embeddings = ({ strapi }) => ({
|
|
@@ -1682,6 +1808,8 @@ const embeddings = ({ strapi }) => ({
|
|
|
1682
1808
|
chunkSize: config2.chunkSize || 4e3,
|
|
1683
1809
|
chunkOverlap: config2.chunkOverlap || 200,
|
|
1684
1810
|
autoChunk: config2.autoChunk || false,
|
|
1811
|
+
preprocessContent: config2.preprocessContent !== false,
|
|
1812
|
+
// Default true
|
|
1685
1813
|
...config2
|
|
1686
1814
|
};
|
|
1687
1815
|
},
|
|
@@ -1689,8 +1817,9 @@ const embeddings = ({ strapi }) => ({
|
|
|
1689
1817
|
* Create a single embedding (no chunking)
|
|
1690
1818
|
*/
|
|
1691
1819
|
async createEmbedding(data) {
|
|
1692
|
-
const { title, content, collectionType, fieldName, metadata, related, autoChunk } = data.data;
|
|
1820
|
+
const { title, content: rawContent, collectionType, fieldName, metadata, related, autoChunk } = data.data;
|
|
1693
1821
|
const config2 = this.getConfig();
|
|
1822
|
+
const content = config2.preprocessContent ? preprocessContent(rawContent) : rawContent;
|
|
1694
1823
|
const shouldChunk = autoChunk ?? config2.autoChunk;
|
|
1695
1824
|
const chunkSize = config2.chunkSize || 4e3;
|
|
1696
1825
|
if (shouldChunk && needsChunking(content, chunkSize)) {
|
|
@@ -1740,8 +1869,9 @@ const embeddings = ({ strapi }) => ({
|
|
|
1740
1869
|
* Creates multiple embedding entities, one per chunk
|
|
1741
1870
|
*/
|
|
1742
1871
|
async createChunkedEmbedding(data) {
|
|
1743
|
-
const { title, content, collectionType, fieldName, metadata, related } = data.data;
|
|
1872
|
+
const { title, content: rawContent, collectionType, fieldName, metadata, related } = data.data;
|
|
1744
1873
|
const config2 = this.getConfig();
|
|
1874
|
+
const content = config2.preprocessContent ? preprocessContent(rawContent) : rawContent;
|
|
1745
1875
|
const chunkSize = config2.chunkSize || 4e3;
|
|
1746
1876
|
const chunkOverlap = config2.chunkOverlap || 200;
|
|
1747
1877
|
const chunks = chunkContent(content, { chunkSize, chunkOverlap });
|
|
@@ -2010,8 +2140,9 @@ const embeddings = ({ strapi }) => ({
|
|
|
2010
2140
|
}
|
|
2011
2141
|
},
|
|
2012
2142
|
async updateEmbedding(id, data) {
|
|
2013
|
-
const { title, content, metadata, autoChunk } = data.data;
|
|
2143
|
+
const { title, content: rawContent, metadata, autoChunk } = data.data;
|
|
2014
2144
|
const config2 = this.getConfig();
|
|
2145
|
+
const content = rawContent !== void 0 && config2.preprocessContent ? preprocessContent(rawContent) : rawContent;
|
|
2015
2146
|
const currentEntry = await strapi.documents(CONTENT_TYPE_UID$1).findOne({
|
|
2016
2147
|
documentId: id
|
|
2017
2148
|
});
|
|
@@ -2027,7 +2158,9 @@ const embeddings = ({ strapi }) => ({
|
|
|
2027
2158
|
const contentNeedsChunking = shouldChunk && needsChunking(newContent, chunkSize);
|
|
2028
2159
|
const contentChanged = content !== void 0 && content !== currentEntry.content;
|
|
2029
2160
|
if (hasRelatedChunks || contentNeedsChunking) {
|
|
2030
|
-
const result = await this.updateChunkedEmbedding(id,
|
|
2161
|
+
const result = await this.updateChunkedEmbedding(id, {
|
|
2162
|
+
data: { ...data.data, content }
|
|
2163
|
+
});
|
|
2031
2164
|
return result.entity;
|
|
2032
2165
|
}
|
|
2033
2166
|
const updateData = {};
|
|
@@ -2120,6 +2253,7 @@ const sync = ({ strapi }) => ({
|
|
|
2120
2253
|
const result = {
|
|
2121
2254
|
success: false,
|
|
2122
2255
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
2256
|
+
dryRun,
|
|
2123
2257
|
neonCount: 0,
|
|
2124
2258
|
strapiCount: 0,
|
|
2125
2259
|
actions: {
|
|
package/dist/server/src/config/index.d.ts
CHANGED
|
@@ -20,6 +20,8 @@ export interface PluginConfigSchema {
|
|
|
20
20
|
chunkOverlap?: number;
|
|
21
21
|
/** Automatically chunk content that exceeds chunkSize (default: false) */
|
|
22
22
|
autoChunk?: boolean;
|
|
23
|
+
/** Preprocess content before embedding - strips HTML/Markdown (default: true) */
|
|
24
|
+
preprocessContent?: boolean;
|
|
23
25
|
}
|
|
24
26
|
declare const _default: {
|
|
25
27
|
default: {
|
|
@@ -29,6 +31,7 @@ declare const _default: {
|
|
|
29
31
|
chunkSize: number;
|
|
30
32
|
chunkOverlap: number;
|
|
31
33
|
autoChunk: boolean;
|
|
34
|
+
preprocessContent: boolean;
|
|
32
35
|
};
|
|
33
36
|
validator(config: PluginConfigSchema): void;
|
|
34
37
|
};
|
|
package/dist/server/src/utils/preprocessing.d.ts
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Content preprocessing utilities for cleaning text before embedding
|
|
3
|
+
* Handles HTML stripping, markdown conversion, and text normalization
|
|
4
|
+
*/
|
|
5
|
+
export interface PreprocessOptions {
|
|
6
|
+
/** Strip HTML tags from content (default: true) */
|
|
7
|
+
stripHtml?: boolean;
|
|
8
|
+
/** Strip markdown syntax from content (default: true) */
|
|
9
|
+
stripMarkdown?: boolean;
|
|
10
|
+
/** Normalize whitespace (collapse multiple spaces/newlines) (default: true) */
|
|
11
|
+
normalizeWhitespace?: boolean;
|
|
12
|
+
}
|
|
13
|
+
/**
|
|
14
|
+
* Preprocess content for embedding
|
|
15
|
+
* Strips HTML, markdown, and normalizes whitespace
|
|
16
|
+
*
|
|
17
|
+
* @param content - The raw content to preprocess
|
|
18
|
+
* @param options - Preprocessing options
|
|
19
|
+
* @returns Cleaned plain text ready for embedding
|
|
20
|
+
*/
|
|
21
|
+
export declare function preprocessContent(content: string, options?: PreprocessOptions): string;
|
|
22
|
+
/**
|
|
23
|
+
* Check if content needs preprocessing
|
|
24
|
+
* Returns true if content contains HTML or Markdown
|
|
25
|
+
*/
|
|
26
|
+
export declare function needsPreprocessing(content: string): boolean;
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "strapi-content-embeddings",
|
|
3
|
-
"version": "0.1.7",
|
|
3
|
+
"version": "0.1.8",
|
|
4
4
|
"description": "Strapi v5 plugin for vector embeddings with OpenAI and Neon PostgreSQL. Enables semantic search, RAG chat, and MCP (Model Context Protocol) integration.",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"strapi",
|
|
@@ -65,8 +65,10 @@
|
|
|
65
65
|
"@modelcontextprotocol/sdk": "^1.12.0",
|
|
66
66
|
"@strapi/design-system": "^2.0.0-rc.12",
|
|
67
67
|
"@strapi/icons": "^2.0.0-rc.12",
|
|
68
|
+
"html-to-text": "^9.0.5",
|
|
68
69
|
"langchain": "^1.2.4",
|
|
69
70
|
"pg": "^8.13.1",
|
|
71
|
+
"remove-markdown": "^0.5.5",
|
|
70
72
|
"qs": "^6.13.1",
|
|
71
73
|
"react-intl": "^6.8.4",
|
|
72
74
|
"react-markdown": "^10.1.0",
|