yt-transcript-strapi-plugin 0.0.22 → 0.0.26

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. package/dist/server/index.js +437 -36
  2. package/dist/server/index.mjs +437 -36
  3. package/dist/server/src/config/index.d.ts +8 -8
  4. package/dist/server/src/index.d.ts +10 -19
  5. package/dist/server/src/mcp/schemas/index.d.ts +63 -0
  6. package/dist/server/src/mcp/tools/get-transcript.d.ts +26 -0
  7. package/dist/server/src/mcp/tools/search-transcript.d.ts +30 -0
  8. package/dist/server/src/routes/content-api.d.ts +2 -11
  9. package/dist/server/src/routes/index.d.ts +2 -11
  10. package/node_modules/which/CHANGELOG.md +166 -0
  11. package/package.json +7 -2
  12. package/node_modules/express/node_modules/media-typer/HISTORY.md +0 -50
  13. package/node_modules/express/node_modules/media-typer/LICENSE +0 -22
  14. package/node_modules/express/node_modules/media-typer/README.md +0 -93
  15. package/node_modules/express/node_modules/media-typer/index.js +0 -143
  16. package/node_modules/express/node_modules/media-typer/package.json +0 -33
  17. package/node_modules/express/node_modules/type-is/HISTORY.md +0 -292
  18. package/node_modules/express/node_modules/type-is/LICENSE +0 -23
  19. package/node_modules/express/node_modules/type-is/README.md +0 -198
  20. package/node_modules/express/node_modules/type-is/index.js +0 -250
  21. package/node_modules/express/node_modules/type-is/package.json +0 -47
  22. /package/node_modules/{body-parser/node_modules/media-typer → media-typer}/HISTORY.md +0 -0
  23. /package/node_modules/{body-parser/node_modules/media-typer → media-typer}/LICENSE +0 -0
  24. /package/node_modules/{body-parser/node_modules/media-typer → media-typer}/README.md +0 -0
  25. /package/node_modules/{body-parser/node_modules/media-typer → media-typer}/index.js +0 -0
  26. /package/node_modules/{body-parser/node_modules/media-typer → media-typer}/package.json +0 -0
  27. /package/node_modules/{body-parser/node_modules/type-is → type-is}/HISTORY.md +0 -0
  28. /package/node_modules/{body-parser/node_modules/type-is → type-is}/LICENSE +0 -0
  29. /package/node_modules/{body-parser/node_modules/type-is → type-is}/README.md +0 -0
  30. /package/node_modules/{body-parser/node_modules/type-is → type-is}/index.js +0 -0
  31. /package/node_modules/{body-parser → type-is}/node_modules/mime-types/HISTORY.md +0 -0
  32. /package/node_modules/{body-parser → type-is}/node_modules/mime-types/LICENSE +0 -0
  33. /package/node_modules/{body-parser → type-is}/node_modules/mime-types/README.md +0 -0
  34. /package/node_modules/{body-parser → type-is}/node_modules/mime-types/index.js +0 -0
  35. /package/node_modules/{body-parser → type-is}/node_modules/mime-types/mimeScore.js +0 -0
  36. /package/node_modules/{body-parser → type-is}/node_modules/mime-types/package.json +0 -0
  37. /package/node_modules/{body-parser/node_modules/type-is → type-is}/package.json +0 -0
@@ -15,7 +15,18 @@ const ListTranscriptsSchema = zod.z.object({
  sort: zod.z.string().optional().default("createdAt:desc")
  });
  const GetTranscriptSchema = zod.z.object({
- videoId: zod.z.string().min(1, "Video ID is required")
+ videoId: zod.z.string().min(1, "Video ID is required"),
+ includeFullTranscript: zod.z.boolean().optional().default(false),
+ includeTimecodes: zod.z.boolean().optional().default(false),
+ startTime: zod.z.number().min(0).optional(),
+ endTime: zod.z.number().min(0).optional(),
+ chunkIndex: zod.z.number().int().min(0).optional(),
+ chunkSize: zod.z.number().int().min(30).optional()
+ });
+ const SearchTranscriptSchema = zod.z.object({
+ videoId: zod.z.string().min(1, "Video ID is required"),
+ query: zod.z.string().min(1, "Search query is required"),
+ maxResults: zod.z.number().int().min(1).max(20).optional().default(5)
  });
  const FindTranscriptsSchema = zod.z.object({
  query: zod.z.string().optional(),
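For orientation, the added schemas constrain the new tool arguments as shown in the sketch below. This is a minimal standalone example using zod directly; it mirrors the GetTranscriptSchema added above, and the sample call values are illustrative only.

// Minimal sketch: validating get_transcript arguments with the schema added above
const { z } = require("zod");

const GetTranscriptSchema = z.object({
  videoId: z.string().min(1, "Video ID is required"),
  includeFullTranscript: z.boolean().optional().default(false),
  includeTimecodes: z.boolean().optional().default(false),
  startTime: z.number().min(0).optional(),
  endTime: z.number().min(0).optional(),
  chunkIndex: z.number().int().min(0).optional(),
  chunkSize: z.number().int().min(30).optional()
});

// e.g. request the second five-minute chunk of a long video (illustrative videoId)
const args = GetTranscriptSchema.parse({
  videoId: "dQw4w9WgXcQ",
  chunkIndex: 1,
  chunkSize: 300
});
console.log(args.includeFullTranscript); // false (defaulted by the schema)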
@@ -30,6 +41,7 @@ const ToolSchemas = {
  fetch_transcript: FetchTranscriptSchema,
  list_transcripts: ListTranscriptsSchema,
  get_transcript: GetTranscriptSchema,
+ search_transcript: SearchTranscriptSchema,
  find_transcripts: FindTranscriptsSchema
  };
  function validateToolInput(toolName, input) {
@@ -64,7 +76,7 @@ function extractYouTubeID(urlOrID) {
  }
  const fetchTranscriptTool = {
  name: "fetch_transcript",
- description: "Fetch a transcript from YouTube for a given video ID or URL. The transcript is saved to the database for future retrieval.",
+ description: "Fetch a transcript from YouTube for a given video ID or URL. The transcript is saved to the database. Returns metadata and preview only to avoid context overflow. Use get_transcript to retrieve content.",
  inputSchema: {
  type: "object",
  properties: {
@@ -76,9 +88,47 @@ const fetchTranscriptTool = {
  required: ["videoId"]
  }
  };
+ function getVideoDurationMs$1(timecodes) {
+ if (!timecodes || timecodes.length === 0) return 0;
+ const lastEntry = timecodes[timecodes.length - 1];
+ return lastEntry.end || lastEntry.start + (lastEntry.duration || 0);
+ }
+ function formatTime$2(ms) {
+ const totalSeconds = Math.floor(ms / 1e3);
+ const hours = Math.floor(totalSeconds / 3600);
+ const minutes = Math.floor(totalSeconds % 3600 / 60);
+ const seconds = totalSeconds % 60;
+ if (hours > 0) {
+ return `${hours}:${minutes.toString().padStart(2, "0")}:${seconds.toString().padStart(2, "0")}`;
+ }
+ return `${minutes}:${seconds.toString().padStart(2, "0")}`;
+ }
+ function buildMetadataResponse(transcript2, previewLength, cached) {
+ const fullText = transcript2.fullTranscript || "";
+ const timecodes = transcript2.transcriptWithTimeCodes || [];
+ const durationMs = getVideoDurationMs$1(timecodes);
+ const wordCount = fullText.split(/\s+/).length;
+ const preview = fullText.length > previewLength ? fullText.substring(0, previewLength) + "..." : fullText;
+ return {
+ message: cached ? "Transcript already exists in database" : "Transcript fetched and saved successfully",
+ cached,
+ videoId: transcript2.videoId,
+ title: transcript2.title,
+ metadata: {
+ wordCount,
+ characterCount: fullText.length,
+ duration: formatTime$2(durationMs),
+ durationSeconds: Math.floor(durationMs / 1e3)
+ },
+ preview,
+ usage: "Use get_transcript with videoId to retrieve full content, specific time ranges, or paginated chunks."
+ };
+ }
  async function handleFetchTranscript(strapi, args) {
  const validatedArgs = validateToolInput("fetch_transcript", args);
  const { videoId: videoIdOrUrl } = validatedArgs;
+ const pluginConfig = await strapi.config.get("plugin::yt-transcript-strapi-plugin");
+ const previewLength = pluginConfig?.previewLength || 500;
  const videoId = extractYouTubeID(videoIdOrUrl);
  if (!videoId) {
  throw new Error(`Invalid YouTube video ID or URL: "${videoIdOrUrl}". Please provide a valid 11-character video ID or YouTube URL.`);
@@ -91,11 +141,7 @@ async function handleFetchTranscript(strapi, args) {
  {
  type: "text",
  text: JSON.stringify(
- {
- message: "Transcript already exists in database",
- data: existingTranscript,
- cached: true
- },
+ buildMetadataResponse(existingTranscript, previewLength, true),
  null,
  2
  )
@@ -119,11 +165,7 @@ async function handleFetchTranscript(strapi, args) {
  {
  type: "text",
  text: JSON.stringify(
- {
- message: "Transcript fetched and saved successfully",
- data: savedTranscript,
- cached: false
- },
+ buildMetadataResponse(savedTranscript, previewLength, false),
  null,
  2
  )
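Both JSON.stringify calls above now serialize the compact shape produced by buildMetadataResponse rather than the full record. For reference, an illustrative result object follows; the field names come from the function added earlier, the values are made up.

// Illustrative fetch_transcript result (shape from buildMetadataResponse; values invented)
const exampleFetchResult = {
  message: "Transcript fetched and saved successfully",
  cached: false,
  videoId: "dQw4w9WgXcQ",
  title: "Example video title",
  metadata: {
    wordCount: 8421,        // fullText.split(/\s+/).length
    characterCount: 46210,  // fullText.length
    duration: "51:32",      // formatTime$2(durationMs)
    durationSeconds: 3092
  },
  preview: "First previewLength characters of the transcript...",
  usage: "Use get_transcript with videoId to retrieve full content, specific time ranges, or paginated chunks."
};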
@@ -191,21 +233,82 @@ async function handleListTranscripts(strapi, args) {
  }
  const getTranscriptTool = {
  name: "get_transcript",
- description: "Get a specific saved transcript by YouTube video ID. Returns the full transcript data including any readable version if available.",
+ description: "Get a saved transcript by YouTube video ID. Returns metadata and preview by default. Use parameters to get full content or specific time ranges to avoid context overflow.",
  inputSchema: {
  type: "object",
  properties: {
  videoId: {
  type: "string",
  description: 'YouTube video ID (e.g., "dQw4w9WgXcQ") or full YouTube URL'
+ },
+ includeFullTranscript: {
+ type: "boolean",
+ description: "Include the complete transcript text. Warning: may cause context overflow for long videos. Default: false",
+ default: false
+ },
+ includeTimecodes: {
+ type: "boolean",
+ description: "Include the transcript with timecodes array. Warning: significantly increases response size. Default: false",
+ default: false
+ },
+ startTime: {
+ type: "number",
+ description: "Start time in seconds for fetching a specific portion of the transcript"
+ },
+ endTime: {
+ type: "number",
+ description: "End time in seconds for fetching a specific portion of the transcript"
+ },
+ chunkIndex: {
+ type: "number",
+ description: "Chunk index (0-based) when paginating through transcript. Use with chunkSize to paginate through long videos."
+ },
+ chunkSize: {
+ type: "number",
+ description: "Chunk size in seconds. Overrides config default. Use with chunkIndex for pagination."
  }
  },
  required: ["videoId"]
  }
  };
+ function getTranscriptForTimeRange(timecodes, startTimeMs, endTimeMs) {
+ const entries = timecodes.filter(
+ (entry) => entry.start >= startTimeMs && entry.start < endTimeMs
+ );
+ const text = entries.map((e) => e.text).join(" ");
+ return { text, entries };
+ }
+ function getVideoDurationMs(timecodes) {
+ if (!timecodes || timecodes.length === 0) return 0;
+ const lastEntry = timecodes[timecodes.length - 1];
+ return lastEntry.end || lastEntry.start + (lastEntry.duration || 0);
+ }
+ function formatTime$1(ms) {
+ const totalSeconds = Math.floor(ms / 1e3);
+ const hours = Math.floor(totalSeconds / 3600);
+ const minutes = Math.floor(totalSeconds % 3600 / 60);
+ const seconds = totalSeconds % 60;
+ if (hours > 0) {
+ return `${hours}:${minutes.toString().padStart(2, "0")}:${seconds.toString().padStart(2, "0")}`;
+ }
+ return `${minutes}:${seconds.toString().padStart(2, "0")}`;
+ }
  async function handleGetTranscript(strapi, args) {
  const validatedArgs = validateToolInput("get_transcript", args);
- const { videoId: videoIdOrUrl } = validatedArgs;
+ const {
+ videoId: videoIdOrUrl,
+ includeFullTranscript,
+ includeTimecodes,
+ startTime,
+ endTime,
+ chunkIndex,
+ chunkSize: chunkSizeOverride
+ } = validatedArgs;
+ const pluginConfig = await strapi.config.get("plugin::yt-transcript-strapi-plugin");
+ const defaultChunkSize = pluginConfig?.chunkSizeSeconds || 300;
+ const previewLength = pluginConfig?.previewLength || 500;
+ const maxFullTranscriptLength = pluginConfig?.maxFullTranscriptLength || 5e4;
+ const chunkSizeSeconds = chunkSizeOverride || defaultChunkSize;
  const videoId = extractYouTubeID(videoIdOrUrl);
  if (!videoId) {
  throw new Error(`Invalid YouTube video ID or URL: "${videoIdOrUrl}". Please provide a valid 11-character video ID or YouTube URL.`);
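The arguments destructured above select one of three retrieval modes (time range, chunk pagination, or full text), which the next hunk implements. The sketch below shows illustrative argument objects for each mode; the videoId and values are made up.

// Illustrative get_transcript argument objects, one retrieval mode per call
const byTimeRange = { videoId: "dQw4w9WgXcQ", startTime: 120, endTime: 180 };   // seconds into the video
const byChunk = { videoId: "dQw4w9WgXcQ", chunkIndex: 0, chunkSize: 300 };      // paginate in 5-minute chunks
const fullText = { videoId: "dQw4w9WgXcQ", includeFullTranscript: true };       // may be large for long videos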
@@ -230,13 +333,308 @@ async function handleGetTranscript(strapi, args) {
  ]
  };
  }
+ const timecodes = transcript2.transcriptWithTimeCodes || [];
+ const fullText = transcript2.fullTranscript || "";
+ const durationMs = getVideoDurationMs(timecodes);
+ const totalChunks = Math.ceil(durationMs / (chunkSizeSeconds * 1e3));
+ const wordCount = fullText.split(/\s+/).length;
+ const response = {
+ videoId: transcript2.videoId,
+ title: transcript2.title,
+ metadata: {
+ wordCount,
+ characterCount: fullText.length,
+ duration: formatTime$1(durationMs),
+ durationSeconds: Math.floor(durationMs / 1e3),
+ totalChunks,
+ chunkSizeSeconds
+ }
+ };
+ if (startTime !== void 0 || endTime !== void 0) {
+ const startMs = (startTime || 0) * 1e3;
+ const endMs = endTime !== void 0 ? endTime * 1e3 : durationMs;
+ const { text, entries } = getTranscriptForTimeRange(timecodes, startMs, endMs);
+ response.timeRange = {
+ startTime: startTime || 0,
+ endTime: endTime || Math.floor(durationMs / 1e3),
+ startFormatted: formatTime$1(startMs),
+ endFormatted: formatTime$1(endMs)
+ };
+ response.transcript = text;
+ if (includeTimecodes) {
+ response.transcriptWithTimeCodes = entries;
+ }
+ } else if (chunkIndex !== void 0) {
+ const chunkStartMs = chunkIndex * chunkSizeSeconds * 1e3;
+ const chunkEndMs = Math.min((chunkIndex + 1) * chunkSizeSeconds * 1e3, durationMs);
+ if (chunkStartMs >= durationMs) {
+ response.error = `Chunk index ${chunkIndex} is out of range. Total chunks: ${totalChunks} (0-${totalChunks - 1})`;
+ } else {
+ const { text, entries } = getTranscriptForTimeRange(timecodes, chunkStartMs, chunkEndMs);
+ response.chunk = {
+ index: chunkIndex,
+ totalChunks,
+ startTime: Math.floor(chunkStartMs / 1e3),
+ endTime: Math.floor(chunkEndMs / 1e3),
+ startFormatted: formatTime$1(chunkStartMs),
+ endFormatted: formatTime$1(chunkEndMs)
+ };
+ response.transcript = text;
+ if (includeTimecodes) {
+ response.transcriptWithTimeCodes = entries;
+ }
+ if (chunkIndex < totalChunks - 1) {
+ response.nextChunk = `Use chunkIndex: ${chunkIndex + 1} to get the next portion`;
+ }
+ if (chunkIndex > 0) {
+ response.previousChunk = `Use chunkIndex: ${chunkIndex - 1} to get the previous portion`;
+ }
+ }
+ } else if (includeFullTranscript || fullText.length <= maxFullTranscriptLength) {
+ response.transcript = fullText;
+ if (includeTimecodes) {
+ response.transcriptWithTimeCodes = timecodes;
+ }
+ if (includeFullTranscript && fullText.length > maxFullTranscriptLength) {
+ response.warning = "Full transcript included. For long videos, consider using chunkIndex, startTime/endTime, or search_transcript to reduce response size.";
+ } else if (fullText.length <= maxFullTranscriptLength) {
+ response.note = "Full transcript auto-loaded (fits within context limit).";
+ }
+ } else {
+ const preview = fullText.length > previewLength ? fullText.substring(0, previewLength) + "..." : fullText;
+ response.preview = preview;
+ response.isLargeTranscript = true;
+ response.usage = {
+ fullTranscript: "Set includeFullTranscript: true to get complete text (warning: may exceed context)",
+ search: "Use search_transcript to find relevant portions by keyword (recommended for large transcripts)",
+ timeRange: "Use startTime and endTime (in seconds) to get a specific portion",
+ pagination: `Use chunkIndex (0-${totalChunks - 1}) to paginate through ${chunkSizeSeconds}s chunks`
+ };
+ }
+ return {
+ content: [
+ {
+ type: "text",
+ text: JSON.stringify(response, null, 2)
+ }
+ ]
+ };
+ }
+ const searchTranscriptTool = {
+ name: "search_transcript",
+ description: "Search within a saved transcript using BM25 scoring. Returns the most relevant segments matching your query with timestamps. Use this to find specific content in long videos without loading the entire transcript.",
+ inputSchema: {
+ type: "object",
+ properties: {
+ videoId: {
+ type: "string",
+ description: 'YouTube video ID (e.g., "dQw4w9WgXcQ") or full YouTube URL'
+ },
+ query: {
+ type: "string",
+ description: "Search query - keywords or phrases to find in the transcript"
+ },
+ maxResults: {
+ type: "number",
+ description: "Maximum number of results to return (default: 5, max: 20)",
+ default: 5
+ }
+ },
+ required: ["videoId", "query"]
+ }
+ };
+ function tokenize(text) {
+ return text.toLowerCase().replace(/[^\w\s]/g, " ").split(/\s+/).filter((word) => word.length > 1);
+ }
+ function calculateIDF(segments, vocabulary) {
+ const idf = /* @__PURE__ */ new Map();
+ const N = segments.length;
+ for (const term of vocabulary) {
+ const docsWithTerm = segments.filter(
+ (seg) => tokenize(seg.text).includes(term)
+ ).length;
+ idf.set(term, Math.log((N - docsWithTerm + 0.5) / (docsWithTerm + 0.5) + 1));
+ }
+ return idf;
+ }
+ function bm25Score(segmentTokens, queryTokens, idf, avgDocLength, k1 = 1.5, b = 0.75) {
+ const docLength = segmentTokens.length;
+ let score = 0;
+ const tf = /* @__PURE__ */ new Map();
+ for (const token of segmentTokens) {
+ tf.set(token, (tf.get(token) || 0) + 1);
+ }
+ for (const term of queryTokens) {
+ const termFreq = tf.get(term) || 0;
+ const termIdf = idf.get(term) || 0;
+ if (termFreq > 0) {
+ const numerator = termFreq * (k1 + 1);
+ const denominator = termFreq + k1 * (1 - b + b * (docLength / avgDocLength));
+ score += termIdf * (numerator / denominator);
+ }
+ }
+ return score;
+ }
+ function formatTime(ms) {
+ const totalSeconds = Math.floor(ms / 1e3);
+ const hours = Math.floor(totalSeconds / 3600);
+ const minutes = Math.floor(totalSeconds % 3600 / 60);
+ const seconds = totalSeconds % 60;
+ if (hours > 0) {
+ return `${hours}:${minutes.toString().padStart(2, "0")}:${seconds.toString().padStart(2, "0")}`;
+ }
+ return `${minutes}:${seconds.toString().padStart(2, "0")}`;
+ }
+ function createSegments(timecodes, segmentDurationMs) {
+ if (!timecodes || timecodes.length === 0) return [];
+ const segments = [];
+ let currentSegment = [];
+ let segmentStartTime = timecodes[0].start;
+ for (const entry of timecodes) {
+ const segmentEndTime = segmentStartTime + segmentDurationMs;
+ if (entry.start < segmentEndTime) {
+ currentSegment.push(entry);
+ } else {
+ if (currentSegment.length > 0) {
+ const endTime = currentSegment[currentSegment.length - 1].end || currentSegment[currentSegment.length - 1].start + (currentSegment[currentSegment.length - 1].duration || 0);
+ segments.push({
+ text: currentSegment.map((e) => e.text).join(" "),
+ startTime: Math.floor(segmentStartTime / 1e3),
+ endTime: Math.floor(endTime / 1e3),
+ startFormatted: formatTime(segmentStartTime),
+ endFormatted: formatTime(endTime)
+ });
+ }
+ segmentStartTime = entry.start;
+ currentSegment = [entry];
+ }
+ }
+ if (currentSegment.length > 0) {
+ const endTime = currentSegment[currentSegment.length - 1].end || currentSegment[currentSegment.length - 1].start + (currentSegment[currentSegment.length - 1].duration || 0);
+ segments.push({
+ text: currentSegment.map((e) => e.text).join(" "),
+ startTime: Math.floor(segmentStartTime / 1e3),
+ endTime: Math.floor(endTime / 1e3),
+ startFormatted: formatTime(segmentStartTime),
+ endFormatted: formatTime(endTime)
+ });
+ }
+ return segments;
+ }
+ async function handleSearchTranscript(strapi, args) {
+ const validatedArgs = validateToolInput("search_transcript", args);
+ const { videoId: videoIdOrUrl, query, maxResults: maxResultsInput } = validatedArgs;
+ const pluginConfig = await strapi.config.get("plugin::yt-transcript-strapi-plugin");
+ const segmentSeconds = pluginConfig?.searchSegmentSeconds || 30;
+ const maxResults = Math.min(Math.max(maxResultsInput || 5, 1), 20);
+ const videoId = extractYouTubeID(videoIdOrUrl);
+ if (!videoId) {
+ throw new Error(`Invalid YouTube video ID or URL: "${videoIdOrUrl}". Please provide a valid 11-character video ID or YouTube URL.`);
+ }
+ const service2 = strapi.plugin("yt-transcript-strapi-plugin").service("service");
+ const transcript2 = await service2.findTranscript(videoId);
+ if (!transcript2) {
+ return {
+ content: [
+ {
+ type: "text",
+ text: JSON.stringify(
+ {
+ error: true,
+ message: `No transcript found for video ID: ${videoId}. Use fetch_transcript to fetch it from YouTube first.`,
+ videoId
+ },
+ null,
+ 2
+ )
+ }
+ ]
+ };
+ }
+ const timecodes = transcript2.transcriptWithTimeCodes || [];
+ if (timecodes.length === 0) {
+ return {
+ content: [
+ {
+ type: "text",
+ text: JSON.stringify(
+ {
+ error: true,
+ message: "Transcript has no timecode data for searching.",
+ videoId
+ },
+ null,
+ 2
+ )
+ }
+ ]
+ };
+ }
+ const segments = createSegments(timecodes, segmentSeconds * 1e3);
+ if (segments.length === 0) {
+ return {
+ content: [
+ {
+ type: "text",
+ text: JSON.stringify(
+ {
+ error: true,
+ message: "Could not create searchable segments from transcript.",
+ videoId
+ },
+ null,
+ 2
+ )
+ }
+ ]
+ };
+ }
+ const queryTokens = tokenize(query);
+ if (queryTokens.length === 0) {
+ return {
+ content: [
+ {
+ type: "text",
+ text: JSON.stringify(
+ {
+ error: true,
+ message: "Query is empty or contains only stop words.",
+ query
+ },
+ null,
+ 2
+ )
+ }
+ ]
+ };
+ }
+ const vocabulary = new Set(queryTokens);
+ const idf = calculateIDF(segments, vocabulary);
+ const avgDocLength = segments.reduce((sum, seg) => sum + tokenize(seg.text).length, 0) / segments.length;
+ const scoredSegments = segments.map((segment) => ({
+ ...segment,
+ score: bm25Score(tokenize(segment.text), queryTokens, idf, avgDocLength)
+ }));
+ const results = scoredSegments.filter((seg) => seg.score > 0).sort((a, b) => b.score - a.score).slice(0, maxResults);
  return {
  content: [
  {
  type: "text",
  text: JSON.stringify(
  {
- data: transcript2
+ videoId: transcript2.videoId,
+ title: transcript2.title,
+ query,
+ totalSegments: segments.length,
+ matchingResults: results.length,
+ results: results.map((r) => ({
+ text: r.text,
+ startTime: r.startTime,
+ endTime: r.endTime,
+ timeRange: `${r.startFormatted} - ${r.endFormatted}`,
+ score: Math.round(r.score * 100) / 100
+ })),
+ usage: results.length > 0 ? `Use get_transcript with startTime: ${results[0].startTime} and endTime: ${results[0].endTime} to get full context for the top result.` : "No matches found. Try different keywords."
  },
  null,
@@ -358,12 +756,14 @@ const tools = [
  fetchTranscriptTool,
  listTranscriptsTool,
  getTranscriptTool,
+ searchTranscriptTool,
  findTranscriptsTool
  ];
  const toolHandlers = {
  fetch_transcript: handleFetchTranscript,
  list_transcripts: handleListTranscripts,
  get_transcript: handleGetTranscript,
+ search_transcript: handleSearchTranscript,
  find_transcripts: handleFindTranscripts
  };
  async function handleToolCall(strapi, request) {
@@ -439,28 +839,32 @@ const register = ({ strapi }) => {
  };
  const config = {
  default: {
- openAIApiKey: "",
- model: "gpt-4o-mini",
- temp: 0.7,
- maxTokens: 4096,
- proxyUrl: ""
+ proxyUrl: "",
  // Optional: HTTP/HTTPS proxy for YouTube requests (e.g., 'http://user:pass@proxy.example.com:8080')
+ chunkSizeSeconds: 300,
+ // Default chunk size for transcript pagination (5 minutes)
+ previewLength: 500,
+ // Default preview length in characters
+ maxFullTranscriptLength: 5e4,
+ // Auto-load full transcript if under this character count (~12K tokens)
+ searchSegmentSeconds: 30
+ // Segment size for BM25 search scoring
  },
  validator(config2) {
- if (config2.openAIApiKey && typeof config2.openAIApiKey !== "string") {
- throw new Error("openAIApiKey must be a string");
+ if (config2.proxyUrl && typeof config2.proxyUrl !== "string") {
+ throw new Error("proxyUrl must be a string");
  }
- if (config2.model && typeof config2.model !== "string") {
- throw new Error("model must be a string");
+ if (config2.chunkSizeSeconds !== void 0 && (typeof config2.chunkSizeSeconds !== "number" || config2.chunkSizeSeconds < 30)) {
+ throw new Error("chunkSizeSeconds must be a number >= 30");
  }
- if (config2.temp !== void 0 && (typeof config2.temp !== "number" || config2.temp < 0 || config2.temp > 2)) {
- throw new Error("temp must be a number between 0 and 2");
+ if (config2.previewLength !== void 0 && (typeof config2.previewLength !== "number" || config2.previewLength < 100)) {
+ throw new Error("previewLength must be a number >= 100");
  }
- if (config2.maxTokens !== void 0 && (typeof config2.maxTokens !== "number" || config2.maxTokens < 1)) {
- throw new Error("maxTokens must be a positive number");
+ if (config2.maxFullTranscriptLength !== void 0 && (typeof config2.maxFullTranscriptLength !== "number" || config2.maxFullTranscriptLength < 1e3)) {
+ throw new Error("maxFullTranscriptLength must be a number >= 1000");
  }
- if (config2.proxyUrl && typeof config2.proxyUrl !== "string") {
- throw new Error("proxyUrl must be a string");
+ if (config2.searchSegmentSeconds !== void 0 && (typeof config2.searchSegmentSeconds !== "number" || config2.searchSegmentSeconds < 10)) {
+ throw new Error("searchSegmentSeconds must be a number >= 10");
  }
  }
  };
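The new defaults and validator bounds translate into host-app configuration roughly as sketched below. This is a minimal example assuming the conventional Strapi config/plugins.js layout and the package name as the config key; all values are illustrative and every key is optional, with the defaults above applying when omitted.

// config/plugins.js in the Strapi app (illustrative; assumed layout)
module.exports = {
  "yt-transcript-strapi-plugin": {
    enabled: true,
    config: {
      proxyUrl: "",                   // optional HTTP/HTTPS proxy for YouTube requests
      chunkSizeSeconds: 300,          // pagination chunk size; validator requires >= 30
      previewLength: 500,             // preview length in characters; >= 100
      maxFullTranscriptLength: 50000, // auto-load threshold in characters; >= 1000
      searchSegmentSeconds: 30        // BM25 segment size in seconds; >= 10
    }
  }
};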
@@ -590,8 +994,7 @@ const contentApi = [
  path: "/mcp",
  handler: "mcp.handle",
  config: {
- policies: [],
- auth: false
+ policies: []
  }
  },
  {
@@ -599,8 +1002,7 @@ const contentApi = [
  path: "/mcp",
  handler: "mcp.handle",
  config: {
- policies: [],
- auth: false
+ policies: []
  }
  },
  {
@@ -608,8 +1010,7 @@ const contentApi = [
  path: "/mcp",
  handler: "mcp.handle",
  config: {
- policies: [],
- auth: false
+ policies: []
  }
  },
  // Other routes