strapi-content-embeddings 0.1.7 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,6 +10,10 @@ const types_js = require("@modelcontextprotocol/sdk/types.js");
10
10
  const zod = require("zod");
11
11
  const node_crypto = require("node:crypto");
12
12
  const streamableHttp_js = require("@modelcontextprotocol/sdk/server/streamableHttp.js");
13
+ const htmlToText = require("html-to-text");
14
+ const removeMarkdown = require("remove-markdown");
15
+ const _interopDefault = (e) => e && e.__esModule ? e : { default: e };
16
+ const removeMarkdown__default = /* @__PURE__ */ _interopDefault(removeMarkdown);
13
17
  const EMBEDDING_MODELS = {
14
18
  "text-embedding-3-small": { dimensions: 1536 },
15
19
  "text-embedding-3-large": { dimensions: 3072 },
@@ -22,7 +26,8 @@ const config = {
22
26
  embeddingModel: "text-embedding-3-small",
23
27
  chunkSize: 4e3,
24
28
  chunkOverlap: 200,
25
- autoChunk: false
29
+ autoChunk: false,
30
+ preprocessContent: true
26
31
  },
27
32
  validator(config2) {
28
33
  if (!config2.openAIApiKey) {
@@ -1512,6 +1517,32 @@ const admin = [
1512
1517
  }
1513
1518
  ]
1514
1519
  }
1520
+ },
1521
+ {
1522
+ method: "GET",
1523
+ path: "/sync/status",
1524
+ handler: "controller.getSyncStatus",
1525
+ config: {
1526
+ policies: [
1527
+ {
1528
+ name: "admin::hasPermissions",
1529
+ config: { actions: ["plugin::strapi-content-embeddings.read"] }
1530
+ }
1531
+ ]
1532
+ }
1533
+ },
1534
+ {
1535
+ method: "POST",
1536
+ path: "/sync",
1537
+ handler: "controller.syncFromNeon",
1538
+ config: {
1539
+ policies: [
1540
+ {
1541
+ name: "admin::hasPermissions",
1542
+ config: { actions: ["plugin::strapi-content-embeddings.update"] }
1543
+ }
1544
+ ]
1545
+ }
1515
1546
  }
1516
1547
  ];
1517
1548
  const routes = {
@@ -1671,6 +1702,103 @@ function formatChunkTitle(baseTitle, chunkIndex, totalChunks) {
1671
1702
  }
1672
1703
  return `${baseTitle} [Part ${chunkIndex + 1}/${totalChunks}]`;
1673
1704
  }
1705
/**
 * Baseline preprocessing switches. Every cleanup stage is enabled here;
 * preprocessContent() spreads caller-supplied options over this object,
 * so a caller only needs to name the stages it wants to turn off.
 */
const DEFAULT_OPTIONS = {
  stripHtml: true,
  stripMarkdown: true,
  normalizeWhitespace: true
};
1710
/**
 * Heuristically decide whether `content` contains an HTML tag.
 *
 * The pattern requires one complete tag-shaped token: `<`, an optional `/`,
 * a tag name, optional attributes introduced by whitespace, an optional
 * self-closing `/`, then `>` with no other angle bracket in between.
 * The previous pattern (`<[a-z][\s\S]*>`) accepted any `<letter` followed by
 * a `>` anywhere later in the text, so plain text such as
 * `Map<string, number>` was treated as HTML.
 *
 * @param {string} content - Text to inspect.
 * @returns {boolean} True when at least one tag-like token is present.
 */
function containsHtml(content) {
  const tagLikeToken = /<\/?[a-z][a-z0-9:-]*(?:\s[^<>]*)?\/?>/i;
  return tagLikeToken.test(content);
}
1713
/**
 * Heuristically decide whether `content` uses Markdown syntax.
 *
 * The text is probed with one regular expression per Markdown construct;
 * the first probe that matches settles the answer.
 *
 * @param {string} content - Text to inspect.
 * @returns {boolean} True when any probe matches, false otherwise.
 */
function containsMarkdown(content) {
  const syntaxProbes = [
    /^#{1,6}\s/m, // ATX headers: "# ", "## ", ...
    /\*\*[^*]+\*\*/, // bold with asterisks: **text**
    /\*[^*]+\*/, // italic with asterisks: *text*
    /__[^_]+__/, // bold with underscores: __text__
    /_[^_]+_/, // italic with underscores: _text_
    /\[.+\]\(.+\)/, // inline links: [text](url)
    /^[-*+]\s/m, // unordered list items: "- ", "* ", "+ "
    /^\d+\.\s/m, // ordered list items: "1. ", "2. ", ...
    /^>\s/m, // blockquotes: "> "
    /`[^`]+`/, // inline code: `code`
    /```[\s\S]*?```/, // fenced code blocks
    /^\|.+\|$/m // table rows: |col|col|
  ];
  for (const probe of syntaxProbes) {
    if (probe.test(content)) {
      return true;
    }
  }
  return false;
}
1742
/**
 * Convert HTML content to plain text using html-to-text.
 *
 * Content that containsHtml() does not recognise as HTML is returned
 * unchanged, without invoking the converter.
 *
 * @param {string} content - Possibly-HTML text.
 * @returns {string} The converter output, or `content` itself when no HTML was detected.
 */
function stripHtml(content) {
  if (containsHtml(content)) {
    // Headings: keep the author's casing and emit a single trailing line break.
    const headingSelectors = ["h1", "h2", "h3", "h4", "h5", "h6"].map((tag) => ({
      selector: tag,
      options: { uppercase: false, trailingLineBreaks: 1 }
    }));
    const conversionOptions = {
      wordwrap: false,
      preserveNewlines: true,
      selectors: [
        ...headingSelectors,
        // Images use the "skip" formatter.
        // NOTE(review): the earlier comment claimed alt text is kept, but
        // "skip" is documented as emitting nothing for the node — confirm intent.
        { selector: "img", format: "skip" },
        // Scripts and styles are skipped as well.
        { selector: "script", format: "skip" },
        { selector: "style", format: "skip" },
        // Anchors: ignoreHref keeps the link text and leaves the URL out.
        { selector: "a", options: { ignoreHref: true } }
      ]
    };
    return htmlToText.convert(content, conversionOptions);
  }
  return content;
}
1767
/**
 * Remove Markdown syntax from `content`, keeping the readable text.
 *
 * Content that containsMarkdown() does not flag is returned unchanged.
 * Otherwise remove-markdown runs first, followed by three local clean-up
 * passes for leftovers: code fences, inline back-ticks and horizontal rules.
 *
 * @param {string} content - Possibly-Markdown text.
 * @returns {string} Text with Markdown syntax stripped.
 */
function stripMarkdownSyntax(content) {
  if (containsMarkdown(content)) {
    const withoutMarkup = removeMarkdown__default.default(content, {
      stripListLeaders: true,
      listUnicodeChar: "",
      gfm: true,
      useImgAltText: true
    });
    // Drop the ``` delimiters (and any language tag) but keep the fenced code itself.
    const unwrapFence = (fencedBlock) => fencedBlock.replace(/```\w*\n?/g, "").replace(/```/g, "");
    const withoutFences = withoutMarkup.replace(/```[\s\S]*?```/g, unwrapFence);
    // Unwrap remaining inline code spans.
    const withoutInlineCode = withoutFences.replace(/`([^`]+)`/g, "$1");
    // Blank out lines that consist only of a horizontal rule (---, ***, ___).
    return withoutInlineCode.replace(/^[-*_]{3,}$/gm, "");
  }
  return content;
}
1782
/**
 * Collapse redundant whitespace in `content`.
 *
 * Steps, in order:
 *  1. runs of spaces/tabs become a single space;
 *  2. every line is trimmed;
 *  3. runs of three or more newlines become exactly two (one blank line);
 *  4. the whole result is trimmed.
 *
 * Newline runs are collapsed only after the lines are trimmed. Collapsing
 * first (as before) missed lines holding nothing but whitespace or a stray
 * "\r" from CRLF input: they became empty only later, leaving three or
 * more consecutive newlines in the output.
 *
 * @param {string} content - Text to normalise.
 * @returns {string} Text with at most one blank line between paragraphs.
 */
function normalizeWhitespace(content) {
  return content
    .replace(/[ \t]+/g, " ")
    .split("\n")
    .map((line) => line.trim())
    .join("\n")
    .replace(/\n{3,}/g, "\n\n")
    .trim();
}
1785
/**
 * Clean raw content before it is embedded.
 *
 * Caller options are merged over DEFAULT_OPTIONS; each enabled stage is then
 * applied in a fixed order: HTML stripping, Markdown stripping, whitespace
 * normalisation.
 *
 * @param {string} content - Raw content.
 * @param {object} [options2] - Per-stage switches (stripHtml, stripMarkdown, normalizeWhitespace).
 * @returns {string} The cleaned text, or "" when `content` is empty or not a string.
 */
function preprocessContent(content, options2 = {}) {
  const settings = { ...DEFAULT_OPTIONS, ...options2 };
  if (typeof content !== "string" || content.length === 0) {
    return "";
  }
  // [enabled?, stage] pairs, listed in the order they must run.
  const stages = [
    [settings.stripHtml, stripHtml],
    [settings.stripMarkdown, stripMarkdownSyntax],
    [settings.normalizeWhitespace, normalizeWhitespace]
  ];
  return stages.reduce((text, [enabled, stage]) => enabled ? stage(text) : text, content);
}
1674
1802
  const PLUGIN_ID$1 = "strapi-content-embeddings";
1675
1803
  const CONTENT_TYPE_UID$1 = `plugin::${PLUGIN_ID$1}.embedding`;
1676
1804
  const embeddings = ({ strapi }) => ({
@@ -1683,6 +1811,8 @@ const embeddings = ({ strapi }) => ({
1683
1811
  chunkSize: config2.chunkSize || 4e3,
1684
1812
  chunkOverlap: config2.chunkOverlap || 200,
1685
1813
  autoChunk: config2.autoChunk || false,
1814
+ preprocessContent: config2.preprocessContent !== false,
1815
+ // Default true
1686
1816
  ...config2
1687
1817
  };
1688
1818
  },
@@ -1690,8 +1820,9 @@ const embeddings = ({ strapi }) => ({
1690
1820
  * Create a single embedding (no chunking)
1691
1821
  */
1692
1822
  async createEmbedding(data) {
1693
- const { title, content, collectionType, fieldName, metadata, related, autoChunk } = data.data;
1823
+ const { title, content: rawContent, collectionType, fieldName, metadata, related, autoChunk } = data.data;
1694
1824
  const config2 = this.getConfig();
1825
+ const content = config2.preprocessContent ? preprocessContent(rawContent) : rawContent;
1695
1826
  const shouldChunk = autoChunk ?? config2.autoChunk;
1696
1827
  const chunkSize = config2.chunkSize || 4e3;
1697
1828
  if (shouldChunk && needsChunking(content, chunkSize)) {
@@ -1741,8 +1872,9 @@ const embeddings = ({ strapi }) => ({
1741
1872
  * Creates multiple embedding entities, one per chunk
1742
1873
  */
1743
1874
  async createChunkedEmbedding(data) {
1744
- const { title, content, collectionType, fieldName, metadata, related } = data.data;
1875
+ const { title, content: rawContent, collectionType, fieldName, metadata, related } = data.data;
1745
1876
  const config2 = this.getConfig();
1877
+ const content = config2.preprocessContent ? preprocessContent(rawContent) : rawContent;
1746
1878
  const chunkSize = config2.chunkSize || 4e3;
1747
1879
  const chunkOverlap = config2.chunkOverlap || 200;
1748
1880
  const chunks = chunkContent(content, { chunkSize, chunkOverlap });
@@ -2011,8 +2143,9 @@ const embeddings = ({ strapi }) => ({
2011
2143
  }
2012
2144
  },
2013
2145
  async updateEmbedding(id, data) {
2014
- const { title, content, metadata, autoChunk } = data.data;
2146
+ const { title, content: rawContent, metadata, autoChunk } = data.data;
2015
2147
  const config2 = this.getConfig();
2148
+ const content = rawContent !== void 0 && config2.preprocessContent ? preprocessContent(rawContent) : rawContent;
2016
2149
  const currentEntry = await strapi.documents(CONTENT_TYPE_UID$1).findOne({
2017
2150
  documentId: id
2018
2151
  });
@@ -2028,7 +2161,9 @@ const embeddings = ({ strapi }) => ({
2028
2161
  const contentNeedsChunking = shouldChunk && needsChunking(newContent, chunkSize);
2029
2162
  const contentChanged = content !== void 0 && content !== currentEntry.content;
2030
2163
  if (hasRelatedChunks || contentNeedsChunking) {
2031
- const result = await this.updateChunkedEmbedding(id, data);
2164
+ const result = await this.updateChunkedEmbedding(id, {
2165
+ data: { ...data.data, content }
2166
+ });
2032
2167
  return result.entity;
2033
2168
  }
2034
2169
  const updateData = {};
@@ -2121,6 +2256,7 @@ const sync = ({ strapi }) => ({
2121
2256
  const result = {
2122
2257
  success: false,
2123
2258
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
2259
+ dryRun,
2124
2260
  neonCount: 0,
2125
2261
  strapiCount: 0,
2126
2262
  actions: {
@@ -9,6 +9,8 @@ import { ListToolsRequestSchema, CallToolRequestSchema } from "@modelcontextprot
9
9
  import { z } from "zod";
10
10
  import { randomUUID } from "node:crypto";
11
11
  import { StreamableHTTPServerTransport } from "@modelcontextprotocol/sdk/server/streamableHttp.js";
12
+ import { convert } from "html-to-text";
13
+ import removeMarkdown from "remove-markdown";
12
14
  const EMBEDDING_MODELS = {
13
15
  "text-embedding-3-small": { dimensions: 1536 },
14
16
  "text-embedding-3-large": { dimensions: 3072 },
@@ -21,7 +23,8 @@ const config = {
21
23
  embeddingModel: "text-embedding-3-small",
22
24
  chunkSize: 4e3,
23
25
  chunkOverlap: 200,
24
- autoChunk: false
26
+ autoChunk: false,
27
+ preprocessContent: true
25
28
  },
26
29
  validator(config2) {
27
30
  if (!config2.openAIApiKey) {
@@ -1511,6 +1514,32 @@ const admin = [
1511
1514
  }
1512
1515
  ]
1513
1516
  }
1517
+ },
1518
+ {
1519
+ method: "GET",
1520
+ path: "/sync/status",
1521
+ handler: "controller.getSyncStatus",
1522
+ config: {
1523
+ policies: [
1524
+ {
1525
+ name: "admin::hasPermissions",
1526
+ config: { actions: ["plugin::strapi-content-embeddings.read"] }
1527
+ }
1528
+ ]
1529
+ }
1530
+ },
1531
+ {
1532
+ method: "POST",
1533
+ path: "/sync",
1534
+ handler: "controller.syncFromNeon",
1535
+ config: {
1536
+ policies: [
1537
+ {
1538
+ name: "admin::hasPermissions",
1539
+ config: { actions: ["plugin::strapi-content-embeddings.update"] }
1540
+ }
1541
+ ]
1542
+ }
1514
1543
  }
1515
1544
  ];
1516
1545
  const routes = {
@@ -1670,6 +1699,103 @@ function formatChunkTitle(baseTitle, chunkIndex, totalChunks) {
1670
1699
  }
1671
1700
  return `${baseTitle} [Part ${chunkIndex + 1}/${totalChunks}]`;
1672
1701
  }
1702
/**
 * Baseline preprocessing switches. Every cleanup stage is enabled here;
 * preprocessContent() spreads caller-supplied options over this object,
 * so a caller only needs to name the stages it wants to turn off.
 */
const DEFAULT_OPTIONS = {
  stripHtml: true,
  stripMarkdown: true,
  normalizeWhitespace: true
};
1707
/**
 * Heuristically decide whether `content` contains an HTML tag.
 *
 * The pattern requires one complete tag-shaped token: `<`, an optional `/`,
 * a tag name, optional attributes introduced by whitespace, an optional
 * self-closing `/`, then `>` with no other angle bracket in between.
 * The previous pattern (`<[a-z][\s\S]*>`) accepted any `<letter` followed by
 * a `>` anywhere later in the text, so plain text such as
 * `Map<string, number>` was treated as HTML.
 *
 * @param {string} content - Text to inspect.
 * @returns {boolean} True when at least one tag-like token is present.
 */
function containsHtml(content) {
  const tagLikeToken = /<\/?[a-z][a-z0-9:-]*(?:\s[^<>]*)?\/?>/i;
  return tagLikeToken.test(content);
}
1710
/**
 * Heuristically decide whether `content` uses Markdown syntax.
 *
 * The text is probed with one regular expression per Markdown construct;
 * the first probe that matches settles the answer.
 *
 * @param {string} content - Text to inspect.
 * @returns {boolean} True when any probe matches, false otherwise.
 */
function containsMarkdown(content) {
  const syntaxProbes = [
    /^#{1,6}\s/m, // ATX headers: "# ", "## ", ...
    /\*\*[^*]+\*\*/, // bold with asterisks: **text**
    /\*[^*]+\*/, // italic with asterisks: *text*
    /__[^_]+__/, // bold with underscores: __text__
    /_[^_]+_/, // italic with underscores: _text_
    /\[.+\]\(.+\)/, // inline links: [text](url)
    /^[-*+]\s/m, // unordered list items: "- ", "* ", "+ "
    /^\d+\.\s/m, // ordered list items: "1. ", "2. ", ...
    /^>\s/m, // blockquotes: "> "
    /`[^`]+`/, // inline code: `code`
    /```[\s\S]*?```/, // fenced code blocks
    /^\|.+\|$/m // table rows: |col|col|
  ];
  for (const probe of syntaxProbes) {
    if (probe.test(content)) {
      return true;
    }
  }
  return false;
}
1739
/**
 * Convert HTML content to plain text using html-to-text's convert().
 *
 * Content that containsHtml() does not recognise as HTML is returned
 * unchanged, without invoking the converter.
 *
 * @param {string} content - Possibly-HTML text.
 * @returns {string} The converter output, or `content` itself when no HTML was detected.
 */
function stripHtml(content) {
  if (containsHtml(content)) {
    // Headings: keep the author's casing and emit a single trailing line break.
    const headingSelectors = ["h1", "h2", "h3", "h4", "h5", "h6"].map((tag) => ({
      selector: tag,
      options: { uppercase: false, trailingLineBreaks: 1 }
    }));
    const conversionOptions = {
      wordwrap: false,
      preserveNewlines: true,
      selectors: [
        ...headingSelectors,
        // Images use the "skip" formatter.
        // NOTE(review): the earlier comment claimed alt text is kept, but
        // "skip" is documented as emitting nothing for the node — confirm intent.
        { selector: "img", format: "skip" },
        // Scripts and styles are skipped as well.
        { selector: "script", format: "skip" },
        { selector: "style", format: "skip" },
        // Anchors: ignoreHref keeps the link text and leaves the URL out.
        { selector: "a", options: { ignoreHref: true } }
      ]
    };
    return convert(content, conversionOptions);
  }
  return content;
}
1764
/**
 * Remove Markdown syntax from `content`, keeping the readable text.
 *
 * Content that containsMarkdown() does not flag is returned unchanged.
 * Otherwise remove-markdown runs first, followed by three local clean-up
 * passes for leftovers: code fences, inline back-ticks and horizontal rules.
 *
 * @param {string} content - Possibly-Markdown text.
 * @returns {string} Text with Markdown syntax stripped.
 */
function stripMarkdownSyntax(content) {
  if (containsMarkdown(content)) {
    const withoutMarkup = removeMarkdown(content, {
      stripListLeaders: true,
      listUnicodeChar: "",
      gfm: true,
      useImgAltText: true
    });
    // Drop the ``` delimiters (and any language tag) but keep the fenced code itself.
    const unwrapFence = (fencedBlock) => fencedBlock.replace(/```\w*\n?/g, "").replace(/```/g, "");
    const withoutFences = withoutMarkup.replace(/```[\s\S]*?```/g, unwrapFence);
    // Unwrap remaining inline code spans.
    const withoutInlineCode = withoutFences.replace(/`([^`]+)`/g, "$1");
    // Blank out lines that consist only of a horizontal rule (---, ***, ___).
    return withoutInlineCode.replace(/^[-*_]{3,}$/gm, "");
  }
  return content;
}
1779
/**
 * Collapse redundant whitespace in `content`.
 *
 * Steps, in order:
 *  1. runs of spaces/tabs become a single space;
 *  2. every line is trimmed;
 *  3. runs of three or more newlines become exactly two (one blank line);
 *  4. the whole result is trimmed.
 *
 * Newline runs are collapsed only after the lines are trimmed. Collapsing
 * first (as before) missed lines holding nothing but whitespace or a stray
 * "\r" from CRLF input: they became empty only later, leaving three or
 * more consecutive newlines in the output.
 *
 * @param {string} content - Text to normalise.
 * @returns {string} Text with at most one blank line between paragraphs.
 */
function normalizeWhitespace(content) {
  return content
    .replace(/[ \t]+/g, " ")
    .split("\n")
    .map((line) => line.trim())
    .join("\n")
    .replace(/\n{3,}/g, "\n\n")
    .trim();
}
1782
/**
 * Clean raw content before it is embedded.
 *
 * Caller options are merged over DEFAULT_OPTIONS; each enabled stage is then
 * applied in a fixed order: HTML stripping, Markdown stripping, whitespace
 * normalisation.
 *
 * @param {string} content - Raw content.
 * @param {object} [options2] - Per-stage switches (stripHtml, stripMarkdown, normalizeWhitespace).
 * @returns {string} The cleaned text, or "" when `content` is empty or not a string.
 */
function preprocessContent(content, options2 = {}) {
  const settings = { ...DEFAULT_OPTIONS, ...options2 };
  if (typeof content !== "string" || content.length === 0) {
    return "";
  }
  // [enabled?, stage] pairs, listed in the order they must run.
  const stages = [
    [settings.stripHtml, stripHtml],
    [settings.stripMarkdown, stripMarkdownSyntax],
    [settings.normalizeWhitespace, normalizeWhitespace]
  ];
  return stages.reduce((text, [enabled, stage]) => enabled ? stage(text) : text, content);
}
1673
1799
  const PLUGIN_ID$1 = "strapi-content-embeddings";
1674
1800
  const CONTENT_TYPE_UID$1 = `plugin::${PLUGIN_ID$1}.embedding`;
1675
1801
  const embeddings = ({ strapi }) => ({
@@ -1682,6 +1808,8 @@ const embeddings = ({ strapi }) => ({
1682
1808
  chunkSize: config2.chunkSize || 4e3,
1683
1809
  chunkOverlap: config2.chunkOverlap || 200,
1684
1810
  autoChunk: config2.autoChunk || false,
1811
+ preprocessContent: config2.preprocessContent !== false,
1812
+ // Default true
1685
1813
  ...config2
1686
1814
  };
1687
1815
  },
@@ -1689,8 +1817,9 @@ const embeddings = ({ strapi }) => ({
1689
1817
  * Create a single embedding (no chunking)
1690
1818
  */
1691
1819
  async createEmbedding(data) {
1692
- const { title, content, collectionType, fieldName, metadata, related, autoChunk } = data.data;
1820
+ const { title, content: rawContent, collectionType, fieldName, metadata, related, autoChunk } = data.data;
1693
1821
  const config2 = this.getConfig();
1822
+ const content = config2.preprocessContent ? preprocessContent(rawContent) : rawContent;
1694
1823
  const shouldChunk = autoChunk ?? config2.autoChunk;
1695
1824
  const chunkSize = config2.chunkSize || 4e3;
1696
1825
  if (shouldChunk && needsChunking(content, chunkSize)) {
@@ -1740,8 +1869,9 @@ const embeddings = ({ strapi }) => ({
1740
1869
  * Creates multiple embedding entities, one per chunk
1741
1870
  */
1742
1871
  async createChunkedEmbedding(data) {
1743
- const { title, content, collectionType, fieldName, metadata, related } = data.data;
1872
+ const { title, content: rawContent, collectionType, fieldName, metadata, related } = data.data;
1744
1873
  const config2 = this.getConfig();
1874
+ const content = config2.preprocessContent ? preprocessContent(rawContent) : rawContent;
1745
1875
  const chunkSize = config2.chunkSize || 4e3;
1746
1876
  const chunkOverlap = config2.chunkOverlap || 200;
1747
1877
  const chunks = chunkContent(content, { chunkSize, chunkOverlap });
@@ -2010,8 +2140,9 @@ const embeddings = ({ strapi }) => ({
2010
2140
  }
2011
2141
  },
2012
2142
  async updateEmbedding(id, data) {
2013
- const { title, content, metadata, autoChunk } = data.data;
2143
+ const { title, content: rawContent, metadata, autoChunk } = data.data;
2014
2144
  const config2 = this.getConfig();
2145
+ const content = rawContent !== void 0 && config2.preprocessContent ? preprocessContent(rawContent) : rawContent;
2015
2146
  const currentEntry = await strapi.documents(CONTENT_TYPE_UID$1).findOne({
2016
2147
  documentId: id
2017
2148
  });
@@ -2027,7 +2158,9 @@ const embeddings = ({ strapi }) => ({
2027
2158
  const contentNeedsChunking = shouldChunk && needsChunking(newContent, chunkSize);
2028
2159
  const contentChanged = content !== void 0 && content !== currentEntry.content;
2029
2160
  if (hasRelatedChunks || contentNeedsChunking) {
2030
- const result = await this.updateChunkedEmbedding(id, data);
2161
+ const result = await this.updateChunkedEmbedding(id, {
2162
+ data: { ...data.data, content }
2163
+ });
2031
2164
  return result.entity;
2032
2165
  }
2033
2166
  const updateData = {};
@@ -2120,6 +2253,7 @@ const sync = ({ strapi }) => ({
2120
2253
  const result = {
2121
2254
  success: false,
2122
2255
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
2256
+ dryRun,
2123
2257
  neonCount: 0,
2124
2258
  strapiCount: 0,
2125
2259
  actions: {
@@ -20,6 +20,8 @@ export interface PluginConfigSchema {
20
20
  chunkOverlap?: number;
21
21
  /** Automatically chunk content that exceeds chunkSize (default: false) */
22
22
  autoChunk?: boolean;
23
+ /** Preprocess content before embedding - strips HTML/Markdown (default: true) */
24
+ preprocessContent?: boolean;
23
25
  }
24
26
  declare const _default: {
25
27
  default: {
@@ -29,6 +31,7 @@ declare const _default: {
29
31
  chunkSize: number;
30
32
  chunkOverlap: number;
31
33
  autoChunk: boolean;
34
+ preprocessContent: boolean;
32
35
  };
33
36
  validator(config: PluginConfigSchema): void;
34
37
  };
@@ -16,6 +16,7 @@ declare const _default: {
16
16
  chunkSize: number;
17
17
  chunkOverlap: number;
18
18
  autoChunk: boolean;
19
+ preprocessContent: boolean;
19
20
  };
20
21
  validator(config: import("./config").PluginConfigSchema): void;
21
22
  };
@@ -2,6 +2,7 @@ import type { Core } from "@strapi/strapi";
2
2
  export interface SyncResult {
3
3
  success: boolean;
4
4
  timestamp: string;
5
+ dryRun: boolean;
5
6
  neonCount: number;
6
7
  strapiCount: number;
7
8
  actions: {
@@ -0,0 +1,26 @@
1
+ /**
2
+ * Content preprocessing utilities for cleaning text before embedding
3
+ * Handles HTML stripping, markdown conversion, and text normalization
4
+ */
5
/**
 * Switches selecting which clean-up stages run on content before it is embedded.
 * Each flag is optional; an omitted flag falls back to its default of `true`.
 */
export interface PreprocessOptions {
    /** Remove HTML tags from the content. Default: `true`. */
    stripHtml?: boolean;
    /** Remove Markdown syntax from the content. Default: `true`. */
    stripMarkdown?: boolean;
    /** Collapse repeated spaces and newlines. Default: `true`. */
    normalizeWhitespace?: boolean;
}
13
/**
 * Turn raw content into plain text suitable for embedding.
 *
 * Enabled stages strip HTML, strip Markdown syntax and normalise whitespace.
 *
 * @param content - Raw content to clean.
 * @param options - Per-stage switches; omitted flags default to enabled.
 * @returns The cleaned plain text.
 */
export declare function preprocessContent(content: string, options?: PreprocessOptions): string;
22
/**
 * Report whether content would be altered by preprocessing.
 *
 * @param content - Text to inspect.
 * @returns `true` when the content contains HTML or Markdown.
 */
export declare function needsPreprocessing(content: string): boolean;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "strapi-content-embeddings",
3
- "version": "0.1.7",
3
+ "version": "0.1.8",
4
4
  "description": "Strapi v5 plugin for vector embeddings with OpenAI and Neon PostgreSQL. Enables semantic search, RAG chat, and MCP (Model Context Protocol) integration.",
5
5
  "keywords": [
6
6
  "strapi",
@@ -65,8 +65,10 @@
65
65
  "@modelcontextprotocol/sdk": "^1.12.0",
66
66
  "@strapi/design-system": "^2.0.0-rc.12",
67
67
  "@strapi/icons": "^2.0.0-rc.12",
68
+ "html-to-text": "^9.0.5",
68
69
  "langchain": "^1.2.4",
69
70
  "pg": "^8.13.1",
71
+ "remove-markdown": "^0.5.5",
70
72
  "qs": "^6.13.1",
71
73
  "react-intl": "^6.8.4",
72
74
  "react-markdown": "^10.1.0",