npm - headless-youtube-captions - Versions diffs - 1.0.0 - Mend

headless-youtube-captions 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

package/.claude/settings.local.json +24 -0
package/CHANGELOG.md +24 -0
package/README.md +134 -0
package/package.json +41 -0
package/src/index.js +290 -0
package/test/index.test.js +23 -0

package/.claude/settings.local.json ADDED Viewed

@@ -0,0 +1,24 @@
+{
+  "permissions": {
+    "allow": [
+      "Bash(find:*)",
+      "Bash(npm test)",
+      "Bash(node:*)",
+      "Bash(npm run build:*)",
+      "WebFetch(domain:github.com)",
+      "Bash(npm install:*)",
+      "Bash(npm run test:*)",
+      "mcp__server-sequential-thinking__sequentialthinking",
+      "Bash(ls:*)",
+      "mcp__puppeteer__puppeteer_navigate",
+      "mcp__puppeteer__puppeteer_screenshot",
+      "mcp__puppeteer__puppeteer_evaluate",
+      "mcp__puppeteer__puppeteer_click",
+      "Bash(rm:*)",
+      "Bash(npm audit:*)",
+      "Bash(npm whoami:*)",
+      "Bash(npm publish:*)"
+    ],
+    "deny": []
+  }
+}

package/CHANGELOG.md ADDED Viewed

@@ -0,0 +1,24 @@
+# Changelog
+## [1.0.0] - 2025-01-06
+### Added
+- Complete rewrite using Puppeteer for headless browser automation
+- UI-based transcript extraction by clicking "Show transcript" button
+- ES modules support with Node.js 18+ requirement
+- Built-in Node.js test runner for testing
+- Zero build dependencies - runs directly from source
+### Changed
+- Project renamed from `youtube-captions-scraper` to `headless-youtube-captions`
+- Switched from API-based extraction to UI automation approach
+- Removed all build tools (Babel, Flow, ESLint configurations)
+- Simplified to 4 core dependencies only
+- Updated to modern ES6+ syntax without transpilation
+### Removed
+- Legacy API-based caption extraction method
+- Babel build pipeline and configuration
+- Flow type checking
+- Complex dev dependency chain
+- Vulnerability-prone legacy dependencies

package/README.md ADDED Viewed

@@ -0,0 +1,134 @@
+# Headless YouTube Captions
+> Extract YouTube video transcripts by interacting with YouTube's UI using Puppeteer
+## Installation
+```bash
+npm install -S headless-youtube-captions
+# OR
+yarn add headless-youtube-captions
+```
+## Usage
+### ES6 / TypeScript
+```js
+import { getSubtitles } from 'headless-youtube-captions';
+const captions = await getSubtitles({
+  videoID: 'JueUvj6X3DA', // YouTube video ID
+  lang: 'en' // Optional, default: 'en'
+});
+console.log(captions);
+```
+### ES5 / CommonJS
+```js
+// This package is ESM-only ("type": "module"), so from CommonJS load it with a dynamic import()
+import('headless-youtube-captions')
+  .then(({ getSubtitles }) => getSubtitles({
+    videoID: 'JueUvj6X3DA', // YouTube video ID
+    lang: 'en' // Optional, default: 'en'
+  }))
+  .then(captions => {
+    console.log(captions);
+  });
+```
+## API
+### `getSubtitles(options)`
+Extracts captions/transcripts from a YouTube video by automating browser interactions.
+#### Parameters
+- `options` (Object):
+  - `videoID` (String, required): The YouTube video ID
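+  - `lang` (String, optional): Language code for captions. Default: `'en'`. Documented values: `'en'`, `'de'`, `'fr'`. Note: in version 1.0.0 the option is accepted but the implementation does not read it, so the transcript is returned in whatever language YouTube's transcript panel shows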
+#### Returns
+A Promise that resolves to an array of caption objects.
+#### Caption Object Format
+Each caption object contains:
+```js
+{
+  "start": "0",     // Start time in seconds (as string)
+  "dur": "3.0",     // Duration in seconds (as string)
+  "text": "Caption text here"  // The actual caption text
+}
+```
+#### Example Response
+```js
+[
+  {
+    "start": "0",
+    "dur": "3.0",
+    "text": "- Creating passive income takes work,"
+  },
+  {
+    "start": "3",
+    "dur": "2.0",
+    "text": "but once you implement those processes,"
+  },
+  {
+    "start": "5",
+    "dur": "3.0",
+    "text": "it's one of the most fruitful income sources"
+  }
+  // ... more captions
+]
+```
+## How It Works
+This library uses Puppeteer to:
+1. Navigate to the YouTube video page
+2. Handle cookie consent and ads if present
+3. Click the "Show transcript" button in the video description
+4. Extract transcript segments from the opened transcript panel
+5. Parse timestamps and text content
+6. Calculate proper durations for each caption segment
+## Requirements
+- Node.js 18 or higher (matches the `engines` field in `package.json`)
+- Puppeteer (installed as a dependency)
+## Error Handling
+The function will throw an error if:
+- The video ID is invalid or the video doesn't exist
+- The video has no available captions/transcripts
+- The "Show transcript" button cannot be found
+- Network issues prevent loading the page
+Example error handling:
+```js
+try {
+  const captions = await getSubtitles({ videoID: 'XXXXX' });
+  console.log(captions);
+} catch (error) {
+  console.error('Failed to extract captions:', error.message);
+}
+```
+## Notes
+- The library runs Puppeteer in headless mode by default
+- Extraction time depends on video page load time and transcript length
+- The library respects YouTube's UI structure as of the last update
+- Some videos may not have transcripts available
+## License
+MIT

package/package.json ADDED Viewed

@@ -0,0 +1,41 @@
+{
+  "name": "headless-youtube-captions",
+  "version": "1.0.0",
+  "description": "Extract YouTube video transcripts using headless browser automation",
+  "main": "src/index.js",
+  "type": "module",
+  "engines": {
+    "node": ">=18.0.0"
+  },
+  "author": {
+    "name": "andrewlwn77"
+  },
+  "repository": {
+    "type": "git",
+    "url": "https://github.com/andrewlwn77/headless-youtube-captions.git"
+  },
+  "homepage": "https://github.com/andrewlwn77/headless-youtube-captions",
+  "bugs": {
+    "url": "https://github.com/andrewlwn77/headless-youtube-captions/issues"
+  },
+  "license": "MIT",
+  "scripts": {
+    "test": "node --test test/*.test.js"
+  },
+  "dependencies": {
+    "he": "^1.2.0",
+    "lodash": "^4.17.21",
+    "puppeteer": "^24.10.0",
+    "striptags": "^3.2.0"
+  },
+  "devDependencies": {},
+  "keywords": [
+    "youtube",
+    "captions",
+    "subtitles",
+    "transcript",
+    "puppeteer",
+    "headless",
+    "scraper"
+  ]
+}

package/src/index.js ADDED Viewed

@@ -0,0 +1,290 @@
+import he from 'he';
+import lodash from 'lodash';
+import striptags from 'striptags';
+import puppeteer from 'puppeteer';
+const { find } = lodash;
+export async function getSubtitles({ videoID, lang = 'en' }) {
+  const browser = await puppeteer.launch({
+    headless: true,
+    args: ['--no-sandbox', '--disable-setuid-sandbox', '--window-size=1920,1080', '--disable-dev-shm-usage']
+  });
+  try {
+    const page = await browser.newPage();
+    // Set viewport to a standard desktop size
+    await page.setViewport({ width: 1920, height: 1080 });
+    // Set a realistic user agent
+    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
+    // Navigate to the YouTube video page
+    console.log(`Navigating to https://youtube.com/watch?v=${videoID}`);
+    await page.goto(`https://youtube.com/watch?v=${videoID}`, {
+      waitUntil: 'networkidle2',
+      timeout: 60000
+    });
+    // Wait for initial page load
+    await page.waitForSelector('#movie_player, video', { timeout: 30000 });
+    console.log('Video player loaded');
+    // Wait a bit more for dynamic content to load
+    await new Promise(resolve => setTimeout(resolve, 5000));
+    // Handle cookie consent if present
+    try {
+      const consentButton = await page.$('[aria-label*="Accept all"], [aria-label*="Accept cookies"], button:has-text("Accept all")');
+      if (consentButton) {
+        await consentButton.click();
+        console.log('Accepted cookies');
+        await new Promise(resolve => setTimeout(resolve, 1000));
+      }
+    } catch (e) {
+      // Cookie consent not present or already accepted
+    }
+    // Skip ads if present
+    try {
+      const skipButton = await page.$('.ytp-ad-skip-button, .ytp-skip-ad-button');
+      if (skipButton) {
+        await skipButton.click();
+        console.log('Skipped ad');
+        await new Promise(resolve => setTimeout(resolve, 2000));
+      }
+    } catch (e) {
+      // No skip button
+    }
+    // Scroll down to load more content
+    await page.evaluate(() => window.scrollBy(0, 800));
+    await new Promise(resolve => setTimeout(resolve, 2000));
+    // Click on "more" to expand description if needed
+    try {
+      // Multiple selectors for the more button
+      const moreSelectors = [
+        'tp-yt-paper-button#expand',
+        'tp-yt-paper-button[id="expand"]',
+        '#expand',
+        '#more',
+        'yt-formatted-string:has-text("...more")',
+        '[aria-label*="more"]'
+      ];
+      for (const selector of moreSelectors) {
+        try {
+          const moreButton = await page.$(selector);
+          if (moreButton) {
+            const isVisible = await moreButton.evaluate(el => {
+              const rect = el.getBoundingClientRect();
+              return rect.width > 0 && rect.height > 0;
+            });
+            if (isVisible) {
+              await moreButton.click();
+              console.log('Clicked "more" button');
+              await new Promise(resolve => setTimeout(resolve, 1000));
+              break;
+            }
+          }
+        } catch (e) {
+          // Try next selector
+        }
+      }
+    } catch (e) {
+      console.log('No "more" button found or error clicking it');
+    }
+    // Look for and click the "Show transcript" button
+    console.log('Looking for "Show transcript" button...');
+    // Multiple strategies to find the transcript button
+    const transcriptButtonSelectors = [
+      'button[aria-label="Show transcript"]',
+      'yt-button-shape button[aria-label="Show transcript"]',
+      'button[title*="transcript" i]',
+      'button[aria-label*="transcript" i]',
+      'yt-button-shape[aria-label*="transcript" i]',
+      '#button[aria-label*="transcript" i]',
+      'ytd-button-renderer[aria-label*="transcript" i]'
+    ];
+    let transcriptClicked = false;
+    for (const selector of transcriptButtonSelectors) {
+      try {
+        await page.waitForSelector(selector, { timeout: 3000, visible: true });
+        await page.click(selector);
+        console.log(`Clicked transcript button with selector: ${selector}`);
+        transcriptClicked = true;
+        break;
+      } catch (e) {
+        // Try next selector
+      }
+    }
+    if (!transcriptClicked) {
+      // Try finding by text content
+      console.log('Trying to find transcript button by text...');
+      const clicked = await page.evaluate(() => {
+        const buttons = Array.from(document.querySelectorAll('button, yt-button-shape'));
+        for (const button of buttons) {
+          const text = button.textContent || '';
+          const ariaLabel = button.getAttribute('aria-label') || '';
+          if (text.toLowerCase().includes('transcript') || ariaLabel.toLowerCase().includes('transcript')) {
+            button.click();
+            return true;
+          }
+        }
+        return false;
+      });
+      if (clicked) {
+        console.log('Clicked transcript button by text search');
+        transcriptClicked = true;
+      }
+    }
+    if (!transcriptClicked) {
+      throw new Error('Could not find or click "Show transcript" button');
+    }
+    // Wait for the transcript panel to load
+    console.log('Waiting for transcript panel...');
+    await new Promise(resolve => setTimeout(resolve, 3000));
+    // Wait for transcript segments
+    await page.waitForSelector('ytd-transcript-segment-renderer, ytd-transcript-body-renderer', {
+      timeout: 10000,
+      visible: true
+    });
+    // Extract transcript data
+    console.log('Extracting transcript content...');
+    const transcriptData = await page.evaluate(() => {
+      // Multiple selectors for transcript segments
+      const segmentSelectors = [
+        'ytd-transcript-segment-renderer',
+        'ytd-transcript-body-renderer ytd-transcript-segment-renderer',
+        'ytd-engagement-panel-section-list-renderer ytd-transcript-segment-renderer',
+        '#segments-container ytd-transcript-segment-renderer',
+        'ytd-transcript-segment-list-renderer ytd-transcript-segment-renderer'
+      ];
+      let segments = [];
+      for (const selector of segmentSelectors) {
+        segments = document.querySelectorAll(selector);
+        if (segments.length > 0) {
+          console.log(`Found ${segments.length} segments with selector: ${selector}`);
+          break;
+        }
+      }
+      if (segments.length === 0) {
+        // Try a more general approach
+        segments = document.querySelectorAll('[class*="transcript"][class*="segment"]');
+        console.log(`Found ${segments.length} segments with general selector`);
+      }
+      if (segments.length === 0) {
+        return [];
+      }
+      // Extract data from each segment
+      return Array.from(segments).map((segment) => {
+        // Extract timestamp - multiple strategies
+        let timestampText = '';
+        const timestampSelectors = [
+          '.segment-timestamp',
+          '[class*="timestamp"]',
+          '.ytd-transcript-segment-renderer:first-child',
+          'div:first-child'
+        ];
+        for (const selector of timestampSelectors) {
+          const elem = segment.querySelector(selector);
+          if (elem && elem.textContent && /\d+:\d+/.test(elem.textContent)) {
+            timestampText = elem.textContent.trim();
+            break;
+          }
+        }
+        // Extract text content - multiple strategies
+        let text = '';
+        const textSelectors = [
+          '.segment-text',
+          'yt-formatted-string.segment-text',
+          '[class*="segment-text"]',
+          'yt-formatted-string:last-child',
+          '.ytd-transcript-segment-renderer:last-child'
+        ];
+        for (const selector of textSelectors) {
+          const elem = segment.querySelector(selector);
+          if (elem && elem.textContent) {
+            const content = elem.textContent.trim();
+            // Make sure it's not the timestamp
+            if (content && !(/^\d+:\d+$/.test(content))) {
+              text = content;
+              break;
+            }
+          }
+        }
+        // If still no text, get all text and remove timestamp
+        if (!text) {
+          const fullText = segment.textContent || '';
+          text = fullText.replace(timestampText, '').trim();
+        }
+        // Convert timestamp to seconds
+        let startSeconds = 0;
+        if (timestampText && timestampText.includes(':')) {
+          const parts = timestampText.split(':').reverse();
+          startSeconds = parts.reduce((acc, part, idx) => {
+            return acc + (parseInt(part) || 0) * Math.pow(60, idx);
+          }, 0);
+        }
+        return {
+          start: startSeconds.toString(),
+          dur: "3",
+          text: text,
+          timestamp: timestampText
+        };
+      }).filter(item => item.text && item.text.trim() && item.text.length > 0);
+    });
+    if (!transcriptData || transcriptData.length === 0) {
+      throw new Error('No transcript data extracted');
+    }
+    console.log(`Successfully extracted ${transcriptData.length} transcript segments`);
+    console.log('First segment:', transcriptData[0]);
+    // Calculate proper durations
+    const processedCaptions = transcriptData.map((item, index) => {
+      const nextItem = transcriptData[index + 1];
+      const duration = nextItem
+        ? (parseFloat(nextItem.start) - parseFloat(item.start)).toFixed(1)
+        : "3.0";
+      return {
+        start: item.start,
+        dur: duration,
+        text: item.text
+      };
+    });
+    return processedCaptions;
+  } catch (error) {
+    console.error('Error extracting subtitles:', error);
+    throw error;
+  } finally {
+    await browser.close();
+  }
+}

package/test/index.test.js ADDED Viewed

@@ -0,0 +1,23 @@
+import { test } from 'node:test';
+import assert from 'node:assert';
+import { getSubtitles } from '../src/index.js';
+test('Extract passive income video captions', async () => {
+  const captions = await getSubtitles({ videoID: 'JueUvj6X3DA' });
+  // Check that captions were extracted
+  assert(Array.isArray(captions), 'Captions should be an array');
+  assert(captions.length > 0, 'Should extract at least one caption');
+  // Check structure of first caption
+  const firstCaption = captions[0];
+  assert(typeof firstCaption.start === 'string', 'Start time should be a string');
+  assert(typeof firstCaption.dur === 'string', 'Duration should be a string');
+  assert(typeof firstCaption.text === 'string', 'Text should be a string');
+  // Check that the first caption contains expected content
+  assert(
+    firstCaption.text.toLowerCase().includes('creating passive income'),
+    `First caption should contain "creating passive income", got: "${firstCaption.text}"`
+  );
+});