summd 0.1.9 → 0.1.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/url-to-md.js +98 -64
- package/package.json +1 -1
package/dist/url-to-md.js
CHANGED
|
@@ -78,90 +78,124 @@ async function fetchYouTubeTitle(url, videoId) {
|
|
|
78
78
|
catch { }
|
|
79
79
|
return `YouTube: ${videoId}`;
|
|
80
80
|
}
|
|
81
|
-
// Browser cookie sources — tried in order, first one
|
|
82
|
-
// Cookies bypass bot-detection and geo-blocks; required on most IPs.
|
|
81
|
+
// Browser cookie sources, listed in the order they are tried.
// The first one that works is reused for all runs.
const BROWSERS = [
    'chrome',
    'chromium',
    'firefox',
    'safari',
    'edge',
];
|
|
84
|
-
//
|
|
85
|
-
//
|
|
83
|
+
// Subtitle fallback strategies, tried in order after the native-language
// attempt when native language detection fails or yields no transcript.
// Each entry is a ready-to-spread list of yt-dlp arguments requesting both
// manual and auto-generated (ASR) subtitles for the given languages.
const SUB_FALLBACKS = [
    // 1. English manual + ASR
    'en',
    // 2. Most-spoken languages by global user count (covers remaining cases)
    'zh-Hans,zh,ja,ko,fr,de,es,pt,hi,ar,ru',
].map((langs) => ['--write-subs', '--write-auto-subs', '--sub-langs', langs]);
|
|
91
|
+
/**
 * Detect the video's primary language via `yt-dlp --dump-json`.
 *
 * @param {string} url - Video URL handed to yt-dlp.
 * @param {string[]} cookieArgs - Extra yt-dlp arguments selecting a cookie
 *   source (e.g. `['--cookies-from-browser', 'chrome']`); may be empty.
 * @returns {Promise<string|null>} The metadata's `language` value (a code like
 *   'zh-Hans', 'en', 'ja'), or null when yt-dlp fails, times out, prints
 *   something that is not JSON, or reports no usable language.
 */
async function getVideoLanguage(url, cookieArgs) {
    try {
        const { stdout } = await execFileAsync('yt-dlp', [
            '--dump-json', '--no-playlist', '--quiet',
            ...cookieArgs, url,
        ], {
            timeout: 15_000,
            // The metadata dump can be large; Node's default stdout cap for
            // execFile is 1 MiB and exceeding it rejects the call, which would be
            // silently reported here as "no language". Raise the cap.
            // NOTE(review): assumes execFileAsync is promisified child_process.execFile.
            maxBuffer: 64 * 1024 * 1024,
        });
        const language = JSON.parse(stdout)?.language;
        // Callers pass this value on as a command-line argument, so only a
        // non-empty string is a usable answer.
        return typeof language === 'string' && language !== '' ? language : null;
    }
    catch {
        // Best-effort detection: any failure simply means "unknown language".
        return null;
    }
}
|
|
86
106
|
/**
 * Fetch a transcript for a video by asking yt-dlp to write json3 subtitle
 * files into the temp directory, then parsing and deleting them.
 *
 * Phases:
 *   1. Probe each entry of BROWSERS for a usable cookie source.
 *   2. Determine the video's language (reusing the probe's metadata if possible).
 *   3. Try subtitles in that language.
 *   4. Try each SUB_FALLBACKS strategy in order.
 *
 * @param {string} url - Video URL handed to yt-dlp.
 * @param {string} videoId - Used as the output file name prefix in the temp dir.
 * @returns {Promise<{transcript: string|null, reason?: string}>}
 *   `{ transcript }` on success; otherwise `{ transcript: null, reason }` where
 *   reason is 'not-installed' (yt-dlp binary missing) or 'no-transcript'.
 */
async function ytDlpTranscript(url, videoId) {
    const dir = tmpdir();
    const outTemplate = join(dir, videoId);
    // Metadata dumps can be large; Node's default execFile stdout cap is 1 MiB
    // and exceeding it rejects the call.
    const metaMaxBuffer = 64 * 1024 * 1024;
    const isCookieError = (stderr) => stderr.includes('Could not find') || stderr.includes('cookies from browser');

    // Run one subtitle download attempt and classify the outcome.
    const run = async (subArgs, cookieArgs) => {
        try {
            await execFileAsync('yt-dlp', [
                '--sub-format', 'json3',
                '--skip-download', '--quiet', '--no-progress',
                ...subArgs, ...cookieArgs,
                '-o', outTemplate, url,
            ], { timeout: 30_000 });
            return 'ok';
        }
        catch (e) {
            const err = e;
            if (err.code === 'ENOENT')
                return 'not-installed';
            const stderr = err.stderr ?? '';
            if (isCookieError(stderr))
                return 'cookie-error';
            return 'other-error';
        }
    };

    // Parse the first usable subtitle file for this video and delete every
    // matching file, whether or not it was the one picked.
    const readAndClean = () => {
        let files;
        try {
            files = readdirSync(dir).filter(f => f.startsWith(videoId) && f.endsWith('.json3'));
        }
        catch {
            return null;
        }
        let transcript = null;
        for (const file of files) {
            const filePath = join(dir, file);
            try {
                if (!transcript) {
                    const text = parseTimedText(JSON.parse(readFileSync(filePath, 'utf8')));
                    if (text)
                        transcript = text;
                }
            }
            catch {
                // A malformed or unreadable file must not abort the loop: later
                // files may still hold a transcript and still need deleting.
            }
            finally {
                try {
                    unlinkSync(filePath);
                }
                catch { }
            }
        }
        return transcript;
    };

    // Phase 1: find a working browser cookie source.
    // The probe uses --dump-json, whose output also carries the language field.
    let cookieArgs = [];
    let nativeLang = null;
    let languageKnown = false;
    for (const browser of BROWSERS) {
        const browserArgs = ['--cookies-from-browser', browser];
        try {
            const { stdout } = await execFileAsync('yt-dlp', [
                '--dump-json', '--no-playlist', '--quiet',
                ...browserArgs, url,
            ], { timeout: 15_000, maxBuffer: metaMaxBuffer });
            cookieArgs = browserArgs;
            try {
                // Same command getVideoLanguage would issue with these cookies,
                // so reuse its answer instead of fetching the metadata twice.
                const language = JSON.parse(stdout)?.language;
                nativeLang = typeof language === 'string' && language !== '' ? language : null;
                languageKnown = true;
            }
            catch {
                // Unparseable probe output: fall through to getVideoLanguage below.
            }
            break;
        }
        catch (e) {
            const err = e;
            if (err.code === 'ENOENT')
                return { transcript: null, reason: 'not-installed' };
            const stderr = err.stderr ?? '';
            if (isCookieError(stderr))
                continue;
            // Other errors (bot-detection without cookies, etc.) — stop trying browsers
            break;
        }
    }
    // cookieArgs is empty if no browser found — fall back to cookie-less requests

    // Phase 2: detect the video's primary language from metadata.
    if (!languageKnown)
        nativeLang = await getVideoLanguage(url, cookieArgs);

    // Phase 3: try native language subtitles first (most accurate).
    // Phase 4: fallback strategies with the confirmed cookie source.
    const attempts = nativeLang
        ? [['--write-subs', '--write-auto-subs', '--sub-langs', nativeLang], ...SUB_FALLBACKS]
        : SUB_FALLBACKS;
    for (const subArgs of attempts) {
        const outcome = await run(subArgs, cookieArgs);
        if (outcome === 'not-installed')
            return { transcript: null, reason: 'not-installed' };
        // Read even after a failed run: yt-dlp may have written some files
        // before erroring, and they need to be cleaned up either way.
        const t = readAndClean();
        if (t)
            return { transcript: t };
    }
    return { transcript: null, reason: 'no-transcript' };
}
|
|
167
201
|
// json3 parser — shared algorithm with browser extension
|