summd 0.1.9 → 0.1.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/url-to-md.js +70 -67
- package/package.json +1 -1
package/dist/url-to-md.js
CHANGED
|
@@ -78,90 +78,93 @@ async function fetchYouTubeTitle(url, videoId) {
|
|
|
78
78
|
catch { }
|
|
79
79
|
return `YouTube: ${videoId}`;
|
|
80
80
|
}
|
|
81
|
-
// Browser cookie sources — tried in order, first one
|
|
82
|
-
// Cookies bypass bot-detection and geo-blocks; required on most IPs.
|
|
81
|
+
// Browser cookie sources — tried in order, first working one is reused for all runs.
|
|
83
82
|
const BROWSERS = ['chrome', 'chromium', 'firefox', 'safari', 'edge'];
|
|
84
|
-
//
|
|
85
|
-
//
|
|
83
|
+
// Subtitle strategies — tried in order, stop at first that produces output.
|
|
84
|
+
// Each downloads only the requested languages, not everything.
|
|
85
|
+
const SUB_STRATEGIES = [
|
|
86
|
+
// 1. Original-language ASR: yt-dlp 'orig' matches the video's primary language
|
|
87
|
+
['--write-auto-subs', '--sub-langs', 'orig'],
|
|
88
|
+
// 2. English manual + ASR
|
|
89
|
+
['--write-subs', '--write-auto-subs', '--sub-langs', 'en'],
|
|
90
|
+
// 3. Most-spoken languages by global user count (covers remaining cases)
|
|
91
|
+
['--write-subs', '--write-auto-subs', '--sub-langs', 'zh-Hans,zh,ja,ko,fr,de,es,pt,hi,ar,ru'],
|
|
92
|
+
];
|
|
86
93
|
async function ytDlpTranscript(url, videoId) {
|
|
87
94
|
const dir = tmpdir();
|
|
88
95
|
const outTemplate = join(dir, videoId);
|
|
89
|
-
const run = (cookieArgs) =>
|
|
90
|
-
'--write-subs',
|
|
91
|
-
'--write-auto-subs',
|
|
92
|
-
'--sub-langs', 'all', // accept any language — no hardcoded preference
|
|
93
|
-
'--sub-format', 'json3',
|
|
94
|
-
'--skip-download',
|
|
95
|
-
'--quiet',
|
|
96
|
-
'--no-progress',
|
|
97
|
-
...cookieArgs,
|
|
98
|
-
'-o', outTemplate,
|
|
99
|
-
url,
|
|
100
|
-
], { timeout: 30_000 });
|
|
101
|
-
// Try with browser cookies first (handles bot-detection, auth, geo-block).
|
|
102
|
-
// Fall back to no-cookies only if no browser is available.
|
|
103
|
-
let ran = false;
|
|
104
|
-
for (const browser of BROWSERS) {
|
|
96
|
+
const run = async (subArgs, cookieArgs) => {
|
|
105
97
|
try {
|
|
106
|
-
await
|
|
107
|
-
|
|
108
|
-
|
|
98
|
+
await execFileAsync('yt-dlp', [
|
|
99
|
+
'--sub-format', 'json3',
|
|
100
|
+
'--skip-download', '--quiet', '--no-progress',
|
|
101
|
+
...subArgs, ...cookieArgs,
|
|
102
|
+
'-o', outTemplate, url,
|
|
103
|
+
], { timeout: 30_000 });
|
|
104
|
+
return 'ok';
|
|
109
105
|
}
|
|
110
106
|
catch (e) {
|
|
111
107
|
const err = e;
|
|
112
108
|
if (err.code === 'ENOENT')
|
|
113
|
-
return
|
|
109
|
+
return 'not-installed';
|
|
114
110
|
const stderr = err.stderr ?? '';
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
// yt-dlp ran but failed for another reason (video unavailable, no captions…)
|
|
119
|
-
ran = true;
|
|
120
|
-
break;
|
|
111
|
+
if (stderr.includes('Could not find') || stderr.includes('cookies from browser'))
|
|
112
|
+
return 'cookie-error';
|
|
113
|
+
return 'other-error';
|
|
121
114
|
}
|
|
122
|
-
}
|
|
123
|
-
|
|
124
|
-
if (!ran) {
|
|
115
|
+
};
|
|
116
|
+
const readAndClean = () => {
|
|
125
117
|
try {
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
if (err.code === 'ENOENT')
|
|
131
|
-
return { transcript: null, reason: 'not-installed' };
|
|
132
|
-
return { transcript: null, reason: 'no-transcript' };
|
|
133
|
-
}
|
|
134
|
-
}
|
|
135
|
-
// Pick the best subtitle file and clean up all temp files.
|
|
136
|
-
// Priority: original-language ASR (-orig suffix) > English > first available.
|
|
137
|
-
// yt-dlp names the original-language auto-generated track with an "-orig" infix,
|
|
138
|
-
// e.g. zh-Hans-orig.json3 — this is the most reliable source for non-English videos.
|
|
139
|
-
try {
|
|
140
|
-
const files = readdirSync(dir).filter(f => f.startsWith(videoId) && f.endsWith('.json3'));
|
|
141
|
-
const pick = files.find(f => f.includes('-orig.')) ?? // original language ASR
|
|
142
|
-
files.find(f => /\.en(-\w+)?\.json3$/.test(f)) ?? // English (manual or ASR)
|
|
143
|
-
files[0]; // first available
|
|
144
|
-
let transcript = null;
|
|
145
|
-
for (const file of files) {
|
|
146
|
-
const filePath = join(dir, file);
|
|
147
|
-
try {
|
|
148
|
-
if (file === pick) {
|
|
149
|
-
const text = parseTimedText(JSON.parse(readFileSync(filePath, 'utf8')));
|
|
150
|
-
if (text)
|
|
151
|
-
transcript = text;
|
|
152
|
-
}
|
|
153
|
-
}
|
|
154
|
-
finally {
|
|
118
|
+
const files = readdirSync(dir).filter(f => f.startsWith(videoId) && f.endsWith('.json3'));
|
|
119
|
+
let transcript = null;
|
|
120
|
+
for (const file of files) {
|
|
121
|
+
const filePath = join(dir, file);
|
|
155
122
|
try {
|
|
156
|
-
|
|
123
|
+
if (!transcript) {
|
|
124
|
+
const text = parseTimedText(JSON.parse(readFileSync(filePath, 'utf8')));
|
|
125
|
+
if (text)
|
|
126
|
+
transcript = text;
|
|
127
|
+
}
|
|
128
|
+
}
|
|
129
|
+
finally {
|
|
130
|
+
try {
|
|
131
|
+
unlinkSync(filePath);
|
|
132
|
+
}
|
|
133
|
+
catch { }
|
|
157
134
|
}
|
|
158
|
-
catch { } // clean up every file, not just the picked one
|
|
159
135
|
}
|
|
136
|
+
return transcript;
|
|
137
|
+
}
|
|
138
|
+
catch {
|
|
139
|
+
return null;
|
|
160
140
|
}
|
|
161
|
-
|
|
162
|
-
|
|
141
|
+
};
|
|
142
|
+
// Phase 1: find a working browser cookie source using the first subtitle strategy.
|
|
143
|
+
// The result tells us both which browser works AND whether strategy 1 produced output.
|
|
144
|
+
let cookieArgs = [];
|
|
145
|
+
for (const browser of BROWSERS) {
|
|
146
|
+
const outcome = await run(SUB_STRATEGIES[0], ['--cookies-from-browser', browser]);
|
|
147
|
+
if (outcome === 'not-installed')
|
|
148
|
+
return { transcript: null, reason: 'not-installed' };
|
|
149
|
+
if (outcome === 'cookie-error')
|
|
150
|
+
continue; // browser not available, try next
|
|
151
|
+
cookieArgs = ['--cookies-from-browser', browser];
|
|
152
|
+
break;
|
|
153
|
+
}
|
|
154
|
+
// cookieArgs is empty if no browser found — fall back to cookie-less requests
|
|
155
|
+
// Check if strategy 1 produced output
|
|
156
|
+
const t1 = readAndClean();
|
|
157
|
+
if (t1)
|
|
158
|
+
return { transcript: t1 };
|
|
159
|
+
// Phase 2: try remaining subtitle strategies with the confirmed cookie source
|
|
160
|
+
for (const subArgs of SUB_STRATEGIES.slice(1)) {
|
|
161
|
+
const outcome = await run(subArgs, cookieArgs);
|
|
162
|
+
if (outcome === 'not-installed')
|
|
163
|
+
return { transcript: null, reason: 'not-installed' };
|
|
164
|
+
const t = readAndClean();
|
|
165
|
+
if (t)
|
|
166
|
+
return { transcript: t };
|
|
163
167
|
}
|
|
164
|
-
catch { }
|
|
165
168
|
return { transcript: null, reason: 'no-transcript' };
|
|
166
169
|
}
|
|
167
170
|
// json3 parser — shared algorithm with browser extension
|