yt-transcript-strapi-plugin 0.0.16 → 0.0.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/server/index.js
CHANGED
|
@@ -7,6 +7,8 @@ const streamableHttp_js = require("@modelcontextprotocol/sdk/server/streamableHt
|
|
|
7
7
|
const textsplitters = require("@langchain/textsplitters");
|
|
8
8
|
const prompts = require("@langchain/core/prompts");
|
|
9
9
|
const openai = require("@langchain/openai");
|
|
10
|
+
const youtubei_js = require("youtubei.js");
|
|
11
|
+
const undici = require("undici");
|
|
10
12
|
const FetchTranscriptSchema = zod.z.object({
|
|
11
13
|
videoId: zod.z.string().min(1, "Video ID or URL is required"),
|
|
12
14
|
generateReadable: zod.z.boolean().optional().default(false)
|
|
@@ -439,7 +441,9 @@ const config = {
|
|
|
439
441
|
openAIApiKey: "",
|
|
440
442
|
model: "gpt-4o-mini",
|
|
441
443
|
temp: 0.7,
|
|
442
|
-
maxTokens: 4096
|
|
444
|
+
maxTokens: 4096,
|
|
445
|
+
proxyUrl: ""
|
|
446
|
+
// Optional: HTTP/HTTPS proxy for YouTube requests (e.g., 'http://user:pass@proxy.example.com:8080')
|
|
443
447
|
},
|
|
444
448
|
validator(config2) {
|
|
445
449
|
if (config2.openAIApiKey && typeof config2.openAIApiKey !== "string") {
|
|
@@ -454,6 +458,9 @@ const config = {
|
|
|
454
458
|
if (config2.maxTokens !== void 0 && (typeof config2.maxTokens !== "number" || config2.maxTokens < 1)) {
|
|
455
459
|
throw new Error("maxTokens must be a positive number");
|
|
456
460
|
}
|
|
461
|
+
if (config2.proxyUrl && typeof config2.proxyUrl !== "string") {
|
|
462
|
+
throw new Error("proxyUrl must be a string");
|
|
463
|
+
}
|
|
457
464
|
}
|
|
458
465
|
};
|
|
459
466
|
const kind = "collectionType";
|
|
@@ -656,127 +663,127 @@ async function initializeModel({
|
|
|
656
663
|
maxTokens: 1e3
|
|
657
664
|
});
|
|
658
665
|
}
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
|
|
666
|
+
function createProxyFetch(proxyUrl) {
|
|
667
|
+
if (!proxyUrl) {
|
|
668
|
+
return void 0;
|
|
669
|
+
}
|
|
670
|
+
const proxyAgent = new undici.ProxyAgent(proxyUrl);
|
|
671
|
+
return async (input, init) => {
|
|
672
|
+
if (input instanceof Request) {
|
|
673
|
+
const url = input.url;
|
|
674
|
+
return undici.fetch(url, {
|
|
675
|
+
method: input.method,
|
|
676
|
+
headers: input.headers,
|
|
677
|
+
body: input.body,
|
|
678
|
+
...init,
|
|
679
|
+
dispatcher: proxyAgent
|
|
680
|
+
});
|
|
681
|
+
}
|
|
682
|
+
return undici.fetch(input, { ...init, dispatcher: proxyAgent });
|
|
683
|
+
};
|
|
669
684
|
}
|
|
670
|
-
function
|
|
671
|
-
|
|
672
|
-
if (match && match[1]) {
|
|
673
|
-
return match[1];
|
|
674
|
-
}
|
|
675
|
-
throw new Error("Could not extract INNERTUBE_API_KEY from page");
|
|
685
|
+
function decodeHtmlEntities(text) {
|
|
686
|
+
return text.replace(/&#39;/g, "'").replace(/&quot;/g, '"').replace(/&amp;/g, "&").replace(/&lt;/g, "<").replace(/&gt;/g, ">").replace(/&nbsp;/g, " ").replace(/&apos;/g, "'").replace(/&#(\d+);/g, (_, num) => String.fromCharCode(parseInt(num, 10))).replace(/<[^>]+>/g, "").trim();
|
|
676
687
|
}
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
|
|
688
|
+
function parsePTagFormat(xml) {
|
|
689
|
+
const segments = [];
|
|
690
|
+
const pTagRegex = /<p\s+t="(\d+)"\s+d="(\d+)"[^>]*>([\s\S]*?)<\/p>/g;
|
|
691
|
+
let match = pTagRegex.exec(xml);
|
|
692
|
+
while (match !== null) {
|
|
693
|
+
const [, startMsStr, durationMsStr, rawText] = match;
|
|
694
|
+
if (startMsStr && durationMsStr && rawText) {
|
|
695
|
+
const text = decodeHtmlEntities(rawText);
|
|
696
|
+
if (text) {
|
|
697
|
+
const start = parseInt(startMsStr, 10);
|
|
698
|
+
const duration = parseInt(durationMsStr, 10);
|
|
699
|
+
segments.push({
|
|
700
|
+
text,
|
|
701
|
+
start,
|
|
702
|
+
end: start + duration,
|
|
703
|
+
duration
|
|
704
|
+
});
|
|
705
|
+
}
|
|
683
706
|
}
|
|
707
|
+
match = pTagRegex.exec(xml);
|
|
684
708
|
}
|
|
685
|
-
return
|
|
709
|
+
return segments;
|
|
686
710
|
}
|
|
687
|
-
|
|
688
|
-
const
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
711
|
+
function parseTextTagFormat(xml) {
|
|
712
|
+
const segments = [];
|
|
713
|
+
const textTagRegex = /<text\s+start="([\d.]+)"(?:\s+dur="([\d.]+)")?[^>]*>([\s\S]*?)<\/text>/g;
|
|
714
|
+
let match = textTagRegex.exec(xml);
|
|
715
|
+
while (match !== null) {
|
|
716
|
+
const [, startStr, durStr, rawText] = match;
|
|
717
|
+
if (startStr && rawText) {
|
|
718
|
+
const text = decodeHtmlEntities(rawText);
|
|
719
|
+
if (text) {
|
|
720
|
+
const start = Math.round(parseFloat(startStr) * 1e3);
|
|
721
|
+
const duration = Math.round(parseFloat(durStr || "0") * 1e3);
|
|
722
|
+
segments.push({
|
|
723
|
+
text,
|
|
724
|
+
start,
|
|
725
|
+
end: start + duration,
|
|
726
|
+
duration
|
|
727
|
+
});
|
|
728
|
+
}
|
|
729
|
+
}
|
|
730
|
+
match = textTagRegex.exec(xml);
|
|
698
731
|
}
|
|
699
|
-
|
|
700
|
-
|
|
732
|
+
return segments;
|
|
733
|
+
}
|
|
734
|
+
function parseTimedTextXml(xml) {
|
|
735
|
+
const pSegments = parsePTagFormat(xml);
|
|
736
|
+
if (pSegments.length > 0) {
|
|
737
|
+
return pSegments;
|
|
701
738
|
}
|
|
702
|
-
return
|
|
739
|
+
return parseTextTagFormat(xml);
|
|
703
740
|
}
|
|
704
|
-
async function
|
|
705
|
-
const
|
|
706
|
-
|
|
741
|
+
async function fetchTimedTextXml(captionUrl, proxyFetch) {
|
|
742
|
+
const fetchFn = proxyFetch || fetch;
|
|
743
|
+
const response = await fetchFn(captionUrl, {
|
|
707
744
|
headers: {
|
|
708
|
-
"
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
context: INNERTUBE_CONTEXT,
|
|
712
|
-
videoId
|
|
713
|
-
})
|
|
745
|
+
"Accept-Language": "en-US,en;q=0.9",
|
|
746
|
+
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
|
747
|
+
}
|
|
714
748
|
});
|
|
715
|
-
if (response.status === 429) {
|
|
716
|
-
throw new Error("IP blocked by YouTube (rate limited)");
|
|
717
|
-
}
|
|
718
749
|
if (!response.ok) {
|
|
719
|
-
throw new Error(`
|
|
750
|
+
throw new Error(`Failed to fetch timedtext: ${response.status}`);
|
|
720
751
|
}
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
const segments = [];
|
|
725
|
-
const regex = /<text\s+start="([\d.]+)"(?:\s+dur="([\d.]+)")?[^>]*>([\s\S]*?)<\/text>/g;
|
|
726
|
-
let match;
|
|
727
|
-
while ((match = regex.exec(xml)) !== null) {
|
|
728
|
-
const start = parseFloat(match[1]);
|
|
729
|
-
const duration = parseFloat(match[2] || "0");
|
|
730
|
-
const text = decodeHtml(match[3]);
|
|
731
|
-
if (text) {
|
|
732
|
-
segments.push({
|
|
733
|
-
text,
|
|
734
|
-
start: Math.round(start * 1e3),
|
|
735
|
-
end: Math.round((start + duration) * 1e3),
|
|
736
|
-
duration: Math.round(duration * 1e3)
|
|
737
|
-
});
|
|
738
|
-
}
|
|
752
|
+
const xml = await response.text();
|
|
753
|
+
if (!xml || xml.length === 0) {
|
|
754
|
+
throw new Error("Empty timedtext response");
|
|
739
755
|
}
|
|
740
|
-
return
|
|
756
|
+
return xml;
|
|
741
757
|
}
|
|
742
|
-
async function fetchTranscriptFromYouTube(videoId) {
|
|
743
|
-
const
|
|
744
|
-
const
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
|
|
752
|
-
|
|
753
|
-
const
|
|
754
|
-
if (!
|
|
758
|
+
async function fetchTranscriptFromYouTube(videoId, options2) {
|
|
759
|
+
const proxyFetch = createProxyFetch(options2?.proxyUrl);
|
|
760
|
+
const client = await youtubei_js.Innertube.create({
|
|
761
|
+
generate_session_locally: true,
|
|
762
|
+
lang: "en",
|
|
763
|
+
location: "US",
|
|
764
|
+
retrieve_player: false,
|
|
765
|
+
fetch: proxyFetch
|
|
766
|
+
});
|
|
767
|
+
const info2 = await client.getBasicInfo(videoId);
|
|
768
|
+
const title = info2.basic_info?.title;
|
|
769
|
+
const captionTracks = info2.captions?.caption_tracks;
|
|
770
|
+
if (!captionTracks || captionTracks.length === 0) {
|
|
771
|
+
const reason = info2.playability_status?.reason;
|
|
772
|
+
if (reason && reason.includes("Sign in")) {
|
|
773
|
+
throw new Error(
|
|
774
|
+
"YouTube requires sign-in. This usually means the IP is blocked. Configure a residential proxy in the plugin settings."
|
|
775
|
+
);
|
|
776
|
+
}
|
|
755
777
|
throw new Error("No captions available for this video");
|
|
756
778
|
}
|
|
757
|
-
const
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
track = captionTracks.find((t) => t.languageCode === "en");
|
|
761
|
-
}
|
|
762
|
-
if (!track) {
|
|
763
|
-
track = captionTracks[0];
|
|
764
|
-
}
|
|
765
|
-
if (track.baseUrl.includes("&exp=xpe")) {
|
|
766
|
-
throw new Error("This video requires PoToken authentication (not supported)");
|
|
779
|
+
const englishTrack = captionTracks.find((t) => t.language_code === "en" && t.kind !== "asr") || captionTracks.find((t) => t.language_code?.startsWith("en")) || captionTracks[0];
|
|
780
|
+
if (!englishTrack?.base_url) {
|
|
781
|
+
throw new Error("No valid caption track URL found");
|
|
767
782
|
}
|
|
768
|
-
const
|
|
769
|
-
const
|
|
770
|
-
if (!captionResponse.ok) {
|
|
771
|
-
throw new Error(`Failed to fetch transcript: ${captionResponse.status}`);
|
|
772
|
-
}
|
|
773
|
-
const transcriptXml = await captionResponse.text();
|
|
774
|
-
if (!transcriptXml || transcriptXml.length === 0) {
|
|
775
|
-
throw new Error("Transcript response was empty");
|
|
776
|
-
}
|
|
777
|
-
const segments = parseTranscriptXml(transcriptXml);
|
|
783
|
+
const xml = await fetchTimedTextXml(englishTrack.base_url, proxyFetch);
|
|
784
|
+
const segments = parseTimedTextXml(xml);
|
|
778
785
|
if (segments.length === 0) {
|
|
779
|
-
throw new Error("Failed to parse any transcript segments");
|
|
786
|
+
throw new Error("Failed to parse any transcript segments from XML");
|
|
780
787
|
}
|
|
781
788
|
return {
|
|
782
789
|
videoId,
|
|
@@ -785,9 +792,9 @@ async function fetchTranscriptFromYouTube(videoId) {
|
|
|
785
792
|
transcriptWithTimeCodes: segments
|
|
786
793
|
};
|
|
787
794
|
}
|
|
788
|
-
const fetchTranscript = async (videoId) => {
|
|
795
|
+
const fetchTranscript = async (videoId, options2) => {
|
|
789
796
|
try {
|
|
790
|
-
return await fetchTranscriptFromYouTube(videoId);
|
|
797
|
+
return await fetchTranscriptFromYouTube(videoId, options2);
|
|
791
798
|
} catch (error) {
|
|
792
799
|
throw new Error(
|
|
793
800
|
`Failed to fetch transcript for video ${videoId}. The video may not have captions enabled, or may be unavailable. Error: ${error instanceof Error ? error.message : String(error)}`
|
|
@@ -836,7 +843,12 @@ const service = ({ strapi: strapi2 }) => ({
|
|
|
836
843
|
if (!isValid) {
|
|
837
844
|
return { error: "Invalid video ID", data: null };
|
|
838
845
|
}
|
|
839
|
-
const
|
|
846
|
+
const pluginSettings = await strapi2.config.get(
|
|
847
|
+
"plugin::yt-transcript-strapi-plugin"
|
|
848
|
+
);
|
|
849
|
+
const transcriptData = await fetchTranscript(identifier, {
|
|
850
|
+
proxyUrl: pluginSettings?.proxyUrl
|
|
851
|
+
});
|
|
840
852
|
return {
|
|
841
853
|
title: transcriptData.title,
|
|
842
854
|
fullTranscript: transcriptData.fullTranscript,
|
package/dist/server/index.mjs
CHANGED
|
@@ -6,6 +6,8 @@ import { StreamableHTTPServerTransport } from "@modelcontextprotocol/sdk/server/
|
|
|
6
6
|
import { TokenTextSplitter } from "@langchain/textsplitters";
|
|
7
7
|
import { PromptTemplate } from "@langchain/core/prompts";
|
|
8
8
|
import { ChatOpenAI } from "@langchain/openai";
|
|
9
|
+
import { Innertube } from "youtubei.js";
|
|
10
|
+
import { ProxyAgent, fetch as fetch$1 } from "undici";
|
|
9
11
|
const FetchTranscriptSchema = z.object({
|
|
10
12
|
videoId: z.string().min(1, "Video ID or URL is required"),
|
|
11
13
|
generateReadable: z.boolean().optional().default(false)
|
|
@@ -438,7 +440,9 @@ const config = {
|
|
|
438
440
|
openAIApiKey: "",
|
|
439
441
|
model: "gpt-4o-mini",
|
|
440
442
|
temp: 0.7,
|
|
441
|
-
maxTokens: 4096
|
|
443
|
+
maxTokens: 4096,
|
|
444
|
+
proxyUrl: ""
|
|
445
|
+
// Optional: HTTP/HTTPS proxy for YouTube requests (e.g., 'http://user:pass@proxy.example.com:8080')
|
|
442
446
|
},
|
|
443
447
|
validator(config2) {
|
|
444
448
|
if (config2.openAIApiKey && typeof config2.openAIApiKey !== "string") {
|
|
@@ -453,6 +457,9 @@ const config = {
|
|
|
453
457
|
if (config2.maxTokens !== void 0 && (typeof config2.maxTokens !== "number" || config2.maxTokens < 1)) {
|
|
454
458
|
throw new Error("maxTokens must be a positive number");
|
|
455
459
|
}
|
|
460
|
+
if (config2.proxyUrl && typeof config2.proxyUrl !== "string") {
|
|
461
|
+
throw new Error("proxyUrl must be a string");
|
|
462
|
+
}
|
|
456
463
|
}
|
|
457
464
|
};
|
|
458
465
|
const kind = "collectionType";
|
|
@@ -655,127 +662,127 @@ async function initializeModel({
|
|
|
655
662
|
maxTokens: 1e3
|
|
656
663
|
});
|
|
657
664
|
}
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
|
|
665
|
+
function createProxyFetch(proxyUrl) {
|
|
666
|
+
if (!proxyUrl) {
|
|
667
|
+
return void 0;
|
|
668
|
+
}
|
|
669
|
+
const proxyAgent = new ProxyAgent(proxyUrl);
|
|
670
|
+
return async (input, init) => {
|
|
671
|
+
if (input instanceof Request) {
|
|
672
|
+
const url = input.url;
|
|
673
|
+
return fetch$1(url, {
|
|
674
|
+
method: input.method,
|
|
675
|
+
headers: input.headers,
|
|
676
|
+
body: input.body,
|
|
677
|
+
...init,
|
|
678
|
+
dispatcher: proxyAgent
|
|
679
|
+
});
|
|
680
|
+
}
|
|
681
|
+
return fetch$1(input, { ...init, dispatcher: proxyAgent });
|
|
682
|
+
};
|
|
668
683
|
}
|
|
669
|
-
function
|
|
670
|
-
|
|
671
|
-
if (match && match[1]) {
|
|
672
|
-
return match[1];
|
|
673
|
-
}
|
|
674
|
-
throw new Error("Could not extract INNERTUBE_API_KEY from page");
|
|
684
|
+
function decodeHtmlEntities(text) {
|
|
685
|
+
return text.replace(/&#39;/g, "'").replace(/&quot;/g, '"').replace(/&amp;/g, "&").replace(/&lt;/g, "<").replace(/&gt;/g, ">").replace(/&nbsp;/g, " ").replace(/&apos;/g, "'").replace(/&#(\d+);/g, (_, num) => String.fromCharCode(parseInt(num, 10))).replace(/<[^>]+>/g, "").trim();
|
|
675
686
|
}
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
687
|
+
function parsePTagFormat(xml) {
|
|
688
|
+
const segments = [];
|
|
689
|
+
const pTagRegex = /<p\s+t="(\d+)"\s+d="(\d+)"[^>]*>([\s\S]*?)<\/p>/g;
|
|
690
|
+
let match = pTagRegex.exec(xml);
|
|
691
|
+
while (match !== null) {
|
|
692
|
+
const [, startMsStr, durationMsStr, rawText] = match;
|
|
693
|
+
if (startMsStr && durationMsStr && rawText) {
|
|
694
|
+
const text = decodeHtmlEntities(rawText);
|
|
695
|
+
if (text) {
|
|
696
|
+
const start = parseInt(startMsStr, 10);
|
|
697
|
+
const duration = parseInt(durationMsStr, 10);
|
|
698
|
+
segments.push({
|
|
699
|
+
text,
|
|
700
|
+
start,
|
|
701
|
+
end: start + duration,
|
|
702
|
+
duration
|
|
703
|
+
});
|
|
704
|
+
}
|
|
682
705
|
}
|
|
706
|
+
match = pTagRegex.exec(xml);
|
|
683
707
|
}
|
|
684
|
-
return
|
|
708
|
+
return segments;
|
|
685
709
|
}
|
|
686
|
-
|
|
687
|
-
const
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
710
|
+
function parseTextTagFormat(xml) {
|
|
711
|
+
const segments = [];
|
|
712
|
+
const textTagRegex = /<text\s+start="([\d.]+)"(?:\s+dur="([\d.]+)")?[^>]*>([\s\S]*?)<\/text>/g;
|
|
713
|
+
let match = textTagRegex.exec(xml);
|
|
714
|
+
while (match !== null) {
|
|
715
|
+
const [, startStr, durStr, rawText] = match;
|
|
716
|
+
if (startStr && rawText) {
|
|
717
|
+
const text = decodeHtmlEntities(rawText);
|
|
718
|
+
if (text) {
|
|
719
|
+
const start = Math.round(parseFloat(startStr) * 1e3);
|
|
720
|
+
const duration = Math.round(parseFloat(durStr || "0") * 1e3);
|
|
721
|
+
segments.push({
|
|
722
|
+
text,
|
|
723
|
+
start,
|
|
724
|
+
end: start + duration,
|
|
725
|
+
duration
|
|
726
|
+
});
|
|
727
|
+
}
|
|
728
|
+
}
|
|
729
|
+
match = textTagRegex.exec(xml);
|
|
697
730
|
}
|
|
698
|
-
|
|
699
|
-
|
|
731
|
+
return segments;
|
|
732
|
+
}
|
|
733
|
+
function parseTimedTextXml(xml) {
|
|
734
|
+
const pSegments = parsePTagFormat(xml);
|
|
735
|
+
if (pSegments.length > 0) {
|
|
736
|
+
return pSegments;
|
|
700
737
|
}
|
|
701
|
-
return
|
|
738
|
+
return parseTextTagFormat(xml);
|
|
702
739
|
}
|
|
703
|
-
async function
|
|
704
|
-
const
|
|
705
|
-
|
|
740
|
+
async function fetchTimedTextXml(captionUrl, proxyFetch) {
|
|
741
|
+
const fetchFn = proxyFetch || fetch;
|
|
742
|
+
const response = await fetchFn(captionUrl, {
|
|
706
743
|
headers: {
|
|
707
|
-
"
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
context: INNERTUBE_CONTEXT,
|
|
711
|
-
videoId
|
|
712
|
-
})
|
|
744
|
+
"Accept-Language": "en-US,en;q=0.9",
|
|
745
|
+
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
|
746
|
+
}
|
|
713
747
|
});
|
|
714
|
-
if (response.status === 429) {
|
|
715
|
-
throw new Error("IP blocked by YouTube (rate limited)");
|
|
716
|
-
}
|
|
717
748
|
if (!response.ok) {
|
|
718
|
-
throw new Error(`
|
|
749
|
+
throw new Error(`Failed to fetch timedtext: ${response.status}`);
|
|
719
750
|
}
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
const segments = [];
|
|
724
|
-
const regex = /<text\s+start="([\d.]+)"(?:\s+dur="([\d.]+)")?[^>]*>([\s\S]*?)<\/text>/g;
|
|
725
|
-
let match;
|
|
726
|
-
while ((match = regex.exec(xml)) !== null) {
|
|
727
|
-
const start = parseFloat(match[1]);
|
|
728
|
-
const duration = parseFloat(match[2] || "0");
|
|
729
|
-
const text = decodeHtml(match[3]);
|
|
730
|
-
if (text) {
|
|
731
|
-
segments.push({
|
|
732
|
-
text,
|
|
733
|
-
start: Math.round(start * 1e3),
|
|
734
|
-
end: Math.round((start + duration) * 1e3),
|
|
735
|
-
duration: Math.round(duration * 1e3)
|
|
736
|
-
});
|
|
737
|
-
}
|
|
751
|
+
const xml = await response.text();
|
|
752
|
+
if (!xml || xml.length === 0) {
|
|
753
|
+
throw new Error("Empty timedtext response");
|
|
738
754
|
}
|
|
739
|
-
return
|
|
755
|
+
return xml;
|
|
740
756
|
}
|
|
741
|
-
async function fetchTranscriptFromYouTube(videoId) {
|
|
742
|
-
const
|
|
743
|
-
const
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
|
|
752
|
-
const
|
|
753
|
-
if (!
|
|
757
|
+
async function fetchTranscriptFromYouTube(videoId, options2) {
|
|
758
|
+
const proxyFetch = createProxyFetch(options2?.proxyUrl);
|
|
759
|
+
const client = await Innertube.create({
|
|
760
|
+
generate_session_locally: true,
|
|
761
|
+
lang: "en",
|
|
762
|
+
location: "US",
|
|
763
|
+
retrieve_player: false,
|
|
764
|
+
fetch: proxyFetch
|
|
765
|
+
});
|
|
766
|
+
const info2 = await client.getBasicInfo(videoId);
|
|
767
|
+
const title = info2.basic_info?.title;
|
|
768
|
+
const captionTracks = info2.captions?.caption_tracks;
|
|
769
|
+
if (!captionTracks || captionTracks.length === 0) {
|
|
770
|
+
const reason = info2.playability_status?.reason;
|
|
771
|
+
if (reason && reason.includes("Sign in")) {
|
|
772
|
+
throw new Error(
|
|
773
|
+
"YouTube requires sign-in. This usually means the IP is blocked. Configure a residential proxy in the plugin settings."
|
|
774
|
+
);
|
|
775
|
+
}
|
|
754
776
|
throw new Error("No captions available for this video");
|
|
755
777
|
}
|
|
756
|
-
const
|
|
757
|
-
|
|
758
|
-
|
|
759
|
-
track = captionTracks.find((t) => t.languageCode === "en");
|
|
760
|
-
}
|
|
761
|
-
if (!track) {
|
|
762
|
-
track = captionTracks[0];
|
|
763
|
-
}
|
|
764
|
-
if (track.baseUrl.includes("&exp=xpe")) {
|
|
765
|
-
throw new Error("This video requires PoToken authentication (not supported)");
|
|
778
|
+
const englishTrack = captionTracks.find((t) => t.language_code === "en" && t.kind !== "asr") || captionTracks.find((t) => t.language_code?.startsWith("en")) || captionTracks[0];
|
|
779
|
+
if (!englishTrack?.base_url) {
|
|
780
|
+
throw new Error("No valid caption track URL found");
|
|
766
781
|
}
|
|
767
|
-
const
|
|
768
|
-
const
|
|
769
|
-
if (!captionResponse.ok) {
|
|
770
|
-
throw new Error(`Failed to fetch transcript: ${captionResponse.status}`);
|
|
771
|
-
}
|
|
772
|
-
const transcriptXml = await captionResponse.text();
|
|
773
|
-
if (!transcriptXml || transcriptXml.length === 0) {
|
|
774
|
-
throw new Error("Transcript response was empty");
|
|
775
|
-
}
|
|
776
|
-
const segments = parseTranscriptXml(transcriptXml);
|
|
782
|
+
const xml = await fetchTimedTextXml(englishTrack.base_url, proxyFetch);
|
|
783
|
+
const segments = parseTimedTextXml(xml);
|
|
777
784
|
if (segments.length === 0) {
|
|
778
|
-
throw new Error("Failed to parse any transcript segments");
|
|
785
|
+
throw new Error("Failed to parse any transcript segments from XML");
|
|
779
786
|
}
|
|
780
787
|
return {
|
|
781
788
|
videoId,
|
|
@@ -784,9 +791,9 @@ async function fetchTranscriptFromYouTube(videoId) {
|
|
|
784
791
|
transcriptWithTimeCodes: segments
|
|
785
792
|
};
|
|
786
793
|
}
|
|
787
|
-
const fetchTranscript = async (videoId) => {
|
|
794
|
+
const fetchTranscript = async (videoId, options2) => {
|
|
788
795
|
try {
|
|
789
|
-
return await fetchTranscriptFromYouTube(videoId);
|
|
796
|
+
return await fetchTranscriptFromYouTube(videoId, options2);
|
|
790
797
|
} catch (error) {
|
|
791
798
|
throw new Error(
|
|
792
799
|
`Failed to fetch transcript for video ${videoId}. The video may not have captions enabled, or may be unavailable. Error: ${error instanceof Error ? error.message : String(error)}`
|
|
@@ -835,7 +842,12 @@ const service = ({ strapi: strapi2 }) => ({
|
|
|
835
842
|
if (!isValid) {
|
|
836
843
|
return { error: "Invalid video ID", data: null };
|
|
837
844
|
}
|
|
838
|
-
const
|
|
845
|
+
const pluginSettings = await strapi2.config.get(
|
|
846
|
+
"plugin::yt-transcript-strapi-plugin"
|
|
847
|
+
);
|
|
848
|
+
const transcriptData = await fetchTranscript(identifier, {
|
|
849
|
+
proxyUrl: pluginSettings?.proxyUrl
|
|
850
|
+
});
|
|
839
851
|
return {
|
|
840
852
|
title: transcriptData.title,
|
|
841
853
|
fullTranscript: transcriptData.fullTranscript,
|
|
@@ -4,12 +4,14 @@ declare const _default: {
|
|
|
4
4
|
model: string;
|
|
5
5
|
temp: number;
|
|
6
6
|
maxTokens: number;
|
|
7
|
+
proxyUrl: string;
|
|
7
8
|
};
|
|
8
9
|
validator(config: {
|
|
9
10
|
openAIApiKey?: string;
|
|
10
11
|
model?: string;
|
|
11
12
|
temp?: number;
|
|
12
13
|
maxTokens?: number;
|
|
14
|
+
proxyUrl?: string;
|
|
13
15
|
}): void;
|
|
14
16
|
};
|
|
15
17
|
export default _default;
|
|
@@ -14,12 +14,14 @@ declare const _default: {
|
|
|
14
14
|
model: string;
|
|
15
15
|
temp: number;
|
|
16
16
|
maxTokens: number;
|
|
17
|
+
proxyUrl: string;
|
|
17
18
|
};
|
|
18
19
|
validator(config: {
|
|
19
20
|
openAIApiKey?: string;
|
|
20
21
|
model?: string;
|
|
21
22
|
temp?: number;
|
|
22
23
|
maxTokens?: number;
|
|
24
|
+
proxyUrl?: string;
|
|
23
25
|
}): void;
|
|
24
26
|
};
|
|
25
27
|
controllers: {
|
|
@@ -10,8 +10,13 @@ export interface TranscriptData {
|
|
|
10
10
|
fullTranscript: string;
|
|
11
11
|
transcriptWithTimeCodes: TranscriptSegment[];
|
|
12
12
|
}
|
|
13
|
+
export interface FetchOptions {
|
|
14
|
+
proxyUrl?: string;
|
|
15
|
+
}
|
|
13
16
|
/**
|
|
14
17
|
* Main entry point for fetching YouTube transcripts
|
|
18
|
+
* @param videoId - The YouTube video ID
|
|
19
|
+
* @param options - Optional configuration including proxy settings
|
|
15
20
|
*/
|
|
16
|
-
declare const fetchTranscript: (videoId: string) => Promise<TranscriptData>;
|
|
21
|
+
declare const fetchTranscript: (videoId: string, options?: FetchOptions) => Promise<TranscriptData>;
|
|
17
22
|
export default fetchTranscript;
|
package/package.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
{
|
|
2
|
-
"version": "0.0.16",
|
|
2
|
+
"version": "0.0.18",
|
|
3
3
|
"keywords": [
|
|
4
4
|
"yt-transcript-strapi-plugin",
|
|
5
5
|
"youtube",
|
|
@@ -50,6 +50,8 @@
|
|
|
50
50
|
"@strapi/icons": "^2.0.0-rc.12",
|
|
51
51
|
"langchain": "^0.3.5",
|
|
52
52
|
"react-intl": "^6.8.7",
|
|
53
|
+
"undici": "^6.21.0",
|
|
54
|
+
"youtubei.js": "^16.0.1",
|
|
53
55
|
"zod": "^3.23.0"
|
|
54
56
|
},
|
|
55
57
|
"bundledDependencies": [
|