@steipete/summarize-core 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (182) hide show
  1. package/README.md +7 -0
  2. package/dist/esm/content/index.js +5 -0
  3. package/dist/esm/content/index.js.map +1 -0
  4. package/dist/esm/content/link-preview/client.js +28 -0
  5. package/dist/esm/content/link-preview/client.js.map +1 -0
  6. package/dist/esm/content/link-preview/content/article.js +155 -0
  7. package/dist/esm/content/link-preview/content/article.js.map +1 -0
  8. package/dist/esm/content/link-preview/content/cleaner.js +55 -0
  9. package/dist/esm/content/link-preview/content/cleaner.js.map +1 -0
  10. package/dist/esm/content/link-preview/content/constants.js +7 -0
  11. package/dist/esm/content/link-preview/content/constants.js.map +1 -0
  12. package/dist/esm/content/link-preview/content/fetcher.js +124 -0
  13. package/dist/esm/content/link-preview/content/fetcher.js.map +1 -0
  14. package/dist/esm/content/link-preview/content/firecrawl.js +86 -0
  15. package/dist/esm/content/link-preview/content/firecrawl.js.map +1 -0
  16. package/dist/esm/content/link-preview/content/html.js +162 -0
  17. package/dist/esm/content/link-preview/content/html.js.map +1 -0
  18. package/dist/esm/content/link-preview/content/index.js +345 -0
  19. package/dist/esm/content/link-preview/content/index.js.map +1 -0
  20. package/dist/esm/content/link-preview/content/jsonld.js +77 -0
  21. package/dist/esm/content/link-preview/content/jsonld.js.map +1 -0
  22. package/dist/esm/content/link-preview/content/parsers.js +77 -0
  23. package/dist/esm/content/link-preview/content/parsers.js.map +1 -0
  24. package/dist/esm/content/link-preview/content/podcast-utils.js +79 -0
  25. package/dist/esm/content/link-preview/content/podcast-utils.js.map +1 -0
  26. package/dist/esm/content/link-preview/content/readability.js +53 -0
  27. package/dist/esm/content/link-preview/content/readability.js.map +1 -0
  28. package/dist/esm/content/link-preview/content/twitter-utils.js +68 -0
  29. package/dist/esm/content/link-preview/content/twitter-utils.js.map +1 -0
  30. package/dist/esm/content/link-preview/content/types.js +4 -0
  31. package/dist/esm/content/link-preview/content/types.js.map +1 -0
  32. package/dist/esm/content/link-preview/content/utils.js +164 -0
  33. package/dist/esm/content/link-preview/content/utils.js.map +1 -0
  34. package/dist/esm/content/link-preview/content/video.js +96 -0
  35. package/dist/esm/content/link-preview/content/video.js.map +1 -0
  36. package/dist/esm/content/link-preview/content/youtube.js +82 -0
  37. package/dist/esm/content/link-preview/content/youtube.js.map +1 -0
  38. package/dist/esm/content/link-preview/deps.js +20 -0
  39. package/dist/esm/content/link-preview/deps.js.map +1 -0
  40. package/dist/esm/content/link-preview/fetch-with-timeout.js +35 -0
  41. package/dist/esm/content/link-preview/fetch-with-timeout.js.map +1 -0
  42. package/dist/esm/content/link-preview/types.js +2 -0
  43. package/dist/esm/content/link-preview/types.js.map +1 -0
  44. package/dist/esm/content/transcript/cache.js +79 -0
  45. package/dist/esm/content/transcript/cache.js.map +1 -0
  46. package/dist/esm/content/transcript/index.js +130 -0
  47. package/dist/esm/content/transcript/index.js.map +1 -0
  48. package/dist/esm/content/transcript/normalize.js +43 -0
  49. package/dist/esm/content/transcript/normalize.js.map +1 -0
  50. package/dist/esm/content/transcript/providers/generic.js +11 -0
  51. package/dist/esm/content/transcript/providers/generic.js.map +1 -0
  52. package/dist/esm/content/transcript/providers/podcast/apple-flow.js +222 -0
  53. package/dist/esm/content/transcript/providers/podcast/apple-flow.js.map +1 -0
  54. package/dist/esm/content/transcript/providers/podcast/apple.js +38 -0
  55. package/dist/esm/content/transcript/providers/podcast/apple.js.map +1 -0
  56. package/dist/esm/content/transcript/providers/podcast/constants.js +8 -0
  57. package/dist/esm/content/transcript/providers/podcast/constants.js.map +1 -0
  58. package/dist/esm/content/transcript/providers/podcast/flow-context.js +2 -0
  59. package/dist/esm/content/transcript/providers/podcast/flow-context.js.map +1 -0
  60. package/dist/esm/content/transcript/providers/podcast/itunes.js +134 -0
  61. package/dist/esm/content/transcript/providers/podcast/itunes.js.map +1 -0
  62. package/dist/esm/content/transcript/providers/podcast/json.js +34 -0
  63. package/dist/esm/content/transcript/providers/podcast/json.js.map +1 -0
  64. package/dist/esm/content/transcript/providers/podcast/media.js +345 -0
  65. package/dist/esm/content/transcript/providers/podcast/media.js.map +1 -0
  66. package/dist/esm/content/transcript/providers/podcast/results.js +28 -0
  67. package/dist/esm/content/transcript/providers/podcast/results.js.map +1 -0
  68. package/dist/esm/content/transcript/providers/podcast/rss.js +253 -0
  69. package/dist/esm/content/transcript/providers/podcast/rss.js.map +1 -0
  70. package/dist/esm/content/transcript/providers/podcast/spotify-flow.js +218 -0
  71. package/dist/esm/content/transcript/providers/podcast/spotify-flow.js.map +1 -0
  72. package/dist/esm/content/transcript/providers/podcast/spotify.js +113 -0
  73. package/dist/esm/content/transcript/providers/podcast/spotify.js.map +1 -0
  74. package/dist/esm/content/transcript/providers/podcast.js +222 -0
  75. package/dist/esm/content/transcript/providers/podcast.js.map +1 -0
  76. package/dist/esm/content/transcript/providers/youtube/api.js +257 -0
  77. package/dist/esm/content/transcript/providers/youtube/api.js.map +1 -0
  78. package/dist/esm/content/transcript/providers/youtube/apify.js +55 -0
  79. package/dist/esm/content/transcript/providers/youtube/apify.js.map +1 -0
  80. package/dist/esm/content/transcript/providers/youtube/captions.js +409 -0
  81. package/dist/esm/content/transcript/providers/youtube/captions.js.map +1 -0
  82. package/dist/esm/content/transcript/providers/youtube/yt-dlp.js +166 -0
  83. package/dist/esm/content/transcript/providers/youtube/yt-dlp.js.map +1 -0
  84. package/dist/esm/content/transcript/providers/youtube.js +173 -0
  85. package/dist/esm/content/transcript/providers/youtube.js.map +1 -0
  86. package/dist/esm/content/transcript/types.js +2 -0
  87. package/dist/esm/content/transcript/types.js.map +1 -0
  88. package/dist/esm/content/transcript/utils.js +259 -0
  89. package/dist/esm/content/transcript/utils.js.map +1 -0
  90. package/dist/esm/index.js +4 -0
  91. package/dist/esm/index.js.map +1 -0
  92. package/dist/esm/language.js +126 -0
  93. package/dist/esm/language.js.map +1 -0
  94. package/dist/esm/prompts/cli.js +20 -0
  95. package/dist/esm/prompts/cli.js.map +1 -0
  96. package/dist/esm/prompts/file.js +48 -0
  97. package/dist/esm/prompts/file.js.map +1 -0
  98. package/dist/esm/prompts/index.js +4 -0
  99. package/dist/esm/prompts/index.js.map +1 -0
  100. package/dist/esm/prompts/link-summary.js +116 -0
  101. package/dist/esm/prompts/link-summary.js.map +1 -0
  102. package/dist/esm/shared/contracts.js +2 -0
  103. package/dist/esm/shared/contracts.js.map +1 -0
  104. package/dist/esm/transcription/whisper/constants.js +8 -0
  105. package/dist/esm/transcription/whisper/constants.js.map +1 -0
  106. package/dist/esm/transcription/whisper/core.js +303 -0
  107. package/dist/esm/transcription/whisper/core.js.map +1 -0
  108. package/dist/esm/transcription/whisper/fal.js +41 -0
  109. package/dist/esm/transcription/whisper/fal.js.map +1 -0
  110. package/dist/esm/transcription/whisper/ffmpeg.js +179 -0
  111. package/dist/esm/transcription/whisper/ffmpeg.js.map +1 -0
  112. package/dist/esm/transcription/whisper/openai.js +47 -0
  113. package/dist/esm/transcription/whisper/openai.js.map +1 -0
  114. package/dist/esm/transcription/whisper/types.js +2 -0
  115. package/dist/esm/transcription/whisper/types.js.map +1 -0
  116. package/dist/esm/transcription/whisper/utils.js +63 -0
  117. package/dist/esm/transcription/whisper/utils.js.map +1 -0
  118. package/dist/esm/transcription/whisper/whisper-cpp.js +227 -0
  119. package/dist/esm/transcription/whisper/whisper-cpp.js.map +1 -0
  120. package/dist/esm/transcription/whisper.js +5 -0
  121. package/dist/esm/transcription/whisper.js.map +1 -0
  122. package/dist/types/content/index.d.ts +5 -0
  123. package/dist/types/content/link-preview/client.d.ts +18 -0
  124. package/dist/types/content/link-preview/content/article.d.ts +4 -0
  125. package/dist/types/content/link-preview/content/cleaner.d.ts +12 -0
  126. package/dist/types/content/link-preview/content/constants.d.ts +6 -0
  127. package/dist/types/content/link-preview/content/fetcher.d.ts +16 -0
  128. package/dist/types/content/link-preview/content/firecrawl.d.ts +14 -0
  129. package/dist/types/content/link-preview/content/html.d.ts +17 -0
  130. package/dist/types/content/link-preview/content/index.d.ts +4 -0
  131. package/dist/types/content/link-preview/content/jsonld.d.ts +6 -0
  132. package/dist/types/content/link-preview/content/parsers.d.ts +7 -0
  133. package/dist/types/content/link-preview/content/podcast-utils.d.ts +7 -0
  134. package/dist/types/content/link-preview/content/readability.d.ts +8 -0
  135. package/dist/types/content/link-preview/content/twitter-utils.d.ts +4 -0
  136. package/dist/types/content/link-preview/content/types.d.ts +61 -0
  137. package/dist/types/content/link-preview/content/utils.d.ts +17 -0
  138. package/dist/types/content/link-preview/content/video.d.ts +5 -0
  139. package/dist/types/content/link-preview/content/youtube.d.ts +1 -0
  140. package/dist/types/content/link-preview/deps.d.ts +167 -0
  141. package/dist/types/content/link-preview/fetch-with-timeout.d.ts +4 -0
  142. package/dist/types/content/link-preview/types.d.ts +37 -0
  143. package/dist/types/content/transcript/cache.d.ts +29 -0
  144. package/dist/types/content/transcript/index.d.ts +9 -0
  145. package/dist/types/content/transcript/normalize.d.ts +3 -0
  146. package/dist/types/content/transcript/providers/generic.d.ts +3 -0
  147. package/dist/types/content/transcript/providers/podcast/apple-flow.d.ts +4 -0
  148. package/dist/types/content/transcript/providers/podcast/apple.d.ts +6 -0
  149. package/dist/types/content/transcript/providers/podcast/constants.d.ts +7 -0
  150. package/dist/types/content/transcript/providers/podcast/flow-context.d.ts +11 -0
  151. package/dist/types/content/transcript/providers/podcast/itunes.d.ts +17 -0
  152. package/dist/types/content/transcript/providers/podcast/json.d.ts +8 -0
  153. package/dist/types/content/transcript/providers/podcast/media.d.ts +42 -0
  154. package/dist/types/content/transcript/providers/podcast/results.d.ts +10 -0
  155. package/dist/types/content/transcript/providers/podcast/rss.d.ts +22 -0
  156. package/dist/types/content/transcript/providers/podcast/spotify-flow.d.ts +3 -0
  157. package/dist/types/content/transcript/providers/podcast/spotify.d.ts +24 -0
  158. package/dist/types/content/transcript/providers/podcast.d.ts +20 -0
  159. package/dist/types/content/transcript/providers/youtube/api.d.ts +26 -0
  160. package/dist/types/content/transcript/providers/youtube/apify.d.ts +1 -0
  161. package/dist/types/content/transcript/providers/youtube/captions.d.ts +7 -0
  162. package/dist/types/content/transcript/providers/youtube/yt-dlp.d.ts +17 -0
  163. package/dist/types/content/transcript/providers/youtube.d.ts +3 -0
  164. package/dist/types/content/transcript/types.d.ts +30 -0
  165. package/dist/types/content/transcript/utils.d.ts +8 -0
  166. package/dist/types/index.d.ts +4 -0
  167. package/dist/types/language.d.ts +25 -0
  168. package/dist/types/prompts/cli.d.ts +10 -0
  169. package/dist/types/prompts/file.d.ts +17 -0
  170. package/dist/types/prompts/index.d.ts +4 -0
  171. package/dist/types/prompts/link-summary.d.ts +29 -0
  172. package/dist/types/shared/contracts.d.ts +2 -0
  173. package/dist/types/transcription/whisper/constants.d.ts +7 -0
  174. package/dist/types/transcription/whisper/core.d.ts +20 -0
  175. package/dist/types/transcription/whisper/fal.d.ts +1 -0
  176. package/dist/types/transcription/whisper/ffmpeg.d.ts +16 -0
  177. package/dist/types/transcription/whisper/openai.d.ts +2 -0
  178. package/dist/types/transcription/whisper/types.d.ts +17 -0
  179. package/dist/types/transcription/whisper/utils.d.ts +5 -0
  180. package/dist/types/transcription/whisper/whisper-cpp.d.ts +9 -0
  181. package/dist/types/transcription/whisper.d.ts +5 -0
  182. package/package.json +54 -0
@@ -0,0 +1,14 @@
1
+ import type { FirecrawlScrapeResult, LinkPreviewDeps } from '../deps.js';
2
+ import type { FirecrawlDiagnostics } from '../types.js';
3
+ import type { ExtractedLinkContent, FetchLinkContentOptions } from './types.js';
4
+ export declare function shouldFallbackToFirecrawl(html: string): boolean;
5
+ export declare function buildResultFromFirecrawl({ url, payload, cacheMode, maxCharacters, youtubeTranscriptMode, firecrawlDiagnostics, markdownRequested, deps, }: {
6
+ url: string;
7
+ payload: FirecrawlScrapeResult;
8
+ cacheMode: FetchLinkContentOptions['cacheMode'];
9
+ maxCharacters: number | null;
10
+ youtubeTranscriptMode: FetchLinkContentOptions['youtubeTranscript'];
11
+ firecrawlDiagnostics: FirecrawlDiagnostics;
12
+ markdownRequested: boolean;
13
+ deps: LinkPreviewDeps;
14
+ }): Promise<ExtractedLinkContent | null>;
@@ -0,0 +1,17 @@
1
+ import type { LinkPreviewDeps } from '../deps.js';
2
+ import type { FirecrawlDiagnostics } from '../types.js';
3
+ import { extractReadabilityFromHtml } from './readability.js';
4
+ import type { ExtractedLinkContent, FetchLinkContentOptions, MarkdownMode } from './types.js';
5
+ export declare function buildResultFromHtmlDocument({ url, html, cacheMode, maxCharacters, youtubeTranscriptMode, firecrawlDiagnostics, markdownRequested, markdownMode, timeoutMs, deps, readabilityCandidate, }: {
6
+ url: string;
7
+ html: string;
8
+ cacheMode: FetchLinkContentOptions['cacheMode'];
9
+ maxCharacters: number | null;
10
+ youtubeTranscriptMode: FetchLinkContentOptions['youtubeTranscript'];
11
+ firecrawlDiagnostics: FirecrawlDiagnostics;
12
+ markdownRequested: boolean;
13
+ markdownMode: MarkdownMode;
14
+ timeoutMs: number;
15
+ deps: LinkPreviewDeps;
16
+ readabilityCandidate: Awaited<ReturnType<typeof extractReadabilityFromHtml>> | null;
17
+ }): Promise<ExtractedLinkContent>;
@@ -0,0 +1,4 @@
1
+ import type { LinkPreviewDeps } from '../deps.js';
2
+ import type { ExtractedLinkContent, FetchLinkContentOptions } from './types.js';
3
+ export declare function fetchLinkContent(url: string, options: FetchLinkContentOptions | undefined, deps: LinkPreviewDeps): Promise<ExtractedLinkContent>;
4
+ export type { ExtractedLinkContent, FetchLinkContentOptions } from './types.js';
@@ -0,0 +1,6 @@
1
+ export type JsonLdContent = {
2
+ title: string | null;
3
+ description: string | null;
4
+ type: string | null;
5
+ };
6
+ export declare function extractJsonLdContent(html: string): JsonLdContent | null;
@@ -0,0 +1,7 @@
1
+ export interface ParsedMetadata {
2
+ title: string | null;
3
+ description: string | null;
4
+ siteName: string | null;
5
+ }
6
+ export declare function extractMetadataFromHtml(html: string, url: string): ParsedMetadata;
7
+ export declare function extractMetadataFromFirecrawl(metadata: Record<string, unknown> | null | undefined): ParsedMetadata;
@@ -0,0 +1,7 @@
1
+ export declare function extractSpotifyEpisodeId(url: string): string | null;
2
+ export declare function extractApplePodcastIds(url: string): {
3
+ showId: string;
4
+ episodeId: string | null;
5
+ } | null;
6
+ export declare function isPodcastLikeJsonLdType(type: string | null | undefined): boolean;
7
+ export declare function isPodcastHost(url: string): boolean;
@@ -0,0 +1,8 @@
1
+ export type ReadabilityResult = {
2
+ text: string;
3
+ html: string | null;
4
+ title: string | null;
5
+ excerpt: string | null;
6
+ };
7
+ export declare function extractReadabilityFromHtml(html: string, url?: string): Promise<ReadabilityResult | null>;
8
+ export declare function toReadabilityHtml(result: ReadabilityResult | null): string | null;
@@ -0,0 +1,4 @@
1
+ export declare function isTwitterStatusUrl(url: string): boolean;
2
+ export declare function toNitterUrls(url: string): string[];
3
+ export declare function isBlockedTwitterContent(content: string): boolean;
4
+ export declare function isAnubisHtml(html: string): boolean;
@@ -0,0 +1,61 @@
1
+ import type { CacheMode, ContentFetchDiagnostics, TranscriptDiagnostics, TranscriptSource } from '../types.js';
2
+ export declare const DEFAULT_TIMEOUT_MS = 120000;
3
+ export declare const DEFAULT_MAX_CONTENT_CHARACTERS = 8000;
4
+ export declare const DEFAULT_CACHE_MODE: CacheMode;
5
+ export type YoutubeTranscriptMode = 'auto' | 'web' | 'apify' | 'yt-dlp';
6
+ export type FirecrawlMode = 'off' | 'auto' | 'always';
7
+ export type ContentFormat = 'text' | 'markdown';
8
+ export type MarkdownMode = 'off' | 'auto' | 'llm' | 'readability';
9
+ export interface FetchLinkContentOptions {
10
+ timeoutMs?: number;
11
+ maxCharacters?: number;
12
+ cacheMode?: CacheMode;
13
+ youtubeTranscript?: YoutubeTranscriptMode;
14
+ firecrawl?: FirecrawlMode;
15
+ format?: ContentFormat;
16
+ markdownMode?: MarkdownMode;
17
+ }
18
+ export interface TranscriptResolution {
19
+ diagnostics?: TranscriptDiagnostics;
20
+ source: TranscriptSource | null;
21
+ text: string | null;
22
+ metadata?: Record<string, unknown> | null;
23
+ }
24
+ export interface ExtractedLinkContent {
25
+ url: string;
26
+ title: string | null;
27
+ description: string | null;
28
+ siteName: string | null;
29
+ content: string;
30
+ truncated: boolean;
31
+ totalCharacters: number;
32
+ wordCount: number;
33
+ transcriptCharacters: number | null;
34
+ transcriptLines: number | null;
35
+ transcriptWordCount: number | null;
36
+ transcriptSource: TranscriptSource | null;
37
+ transcriptionProvider: string | null;
38
+ transcriptMetadata: Record<string, unknown> | null;
39
+ mediaDurationSeconds: number | null;
40
+ video: {
41
+ kind: 'youtube' | 'direct';
42
+ url: string;
43
+ } | null;
44
+ isVideoOnly: boolean;
45
+ diagnostics: ContentFetchDiagnostics;
46
+ }
47
+ export interface FinalizationArguments {
48
+ url: string;
49
+ baseContent: string;
50
+ maxCharacters: number | null;
51
+ title: string | null;
52
+ description: string | null;
53
+ siteName: string | null;
54
+ transcriptResolution: TranscriptResolution;
55
+ video: {
56
+ kind: 'youtube' | 'direct';
57
+ url: string;
58
+ } | null;
59
+ isVideoOnly: boolean;
60
+ diagnostics: ContentFetchDiagnostics;
61
+ }
@@ -0,0 +1,17 @@
1
+ import type { CacheMode, TranscriptDiagnostics } from '../types.js';
2
+ import { type ExtractedLinkContent, type FetchLinkContentOptions, type FinalizationArguments, type FirecrawlMode, type TranscriptResolution } from './types.js';
3
+ export declare function resolveCacheMode(options?: FetchLinkContentOptions): "default" | "bypass";
4
+ export declare function resolveMaxCharacters(options?: FetchLinkContentOptions): number | null;
5
+ export declare function resolveTimeoutMs(options?: FetchLinkContentOptions): number;
6
+ export declare function resolveFirecrawlMode(options?: FetchLinkContentOptions): FirecrawlMode;
7
+ export declare function appendNote(existing: string | null | undefined, next: string): string;
8
+ export declare function safeHostname(rawUrl: string): string | null;
9
+ export declare function pickFirstText(candidates: Array<string | null | undefined>): string | null;
10
+ export declare function selectBaseContent(sourceContent: string, transcriptText: string | null): string;
11
+ export declare function summarizeTranscript(transcriptText: string | null): {
12
+ transcriptCharacters: number | null;
13
+ transcriptLines: number | null;
14
+ transcriptWordCount: number | null;
15
+ };
16
+ export declare function ensureTranscriptDiagnostics(resolution: TranscriptResolution, cacheMode: CacheMode): TranscriptDiagnostics;
17
+ export declare function finalizeExtractedLinkContent({ url, baseContent, maxCharacters, title, description, siteName, transcriptResolution, video, isVideoOnly, diagnostics, }: FinalizationArguments): ExtractedLinkContent;
@@ -0,0 +1,5 @@
1
+ export type DetectedVideo = {
2
+ kind: 'youtube' | 'direct';
3
+ url: string;
4
+ };
5
+ export declare function detectPrimaryVideoFromHtml(html: string, url: string): DetectedVideo | null;
@@ -0,0 +1 @@
1
+ export declare function extractYouTubeShortDescription(html: string): string | null;
@@ -0,0 +1,167 @@
1
+ import type { CacheMode, TranscriptSource } from './types.js';
2
+ export declare const ProgressKind: {
3
+ readonly FetchHtmlStart: "fetch-html-start";
4
+ readonly FetchHtmlProgress: "fetch-html-progress";
5
+ readonly FetchHtmlDone: "fetch-html-done";
6
+ readonly TranscriptMediaDownloadStart: "transcript-media-download-start";
7
+ readonly TranscriptMediaDownloadProgress: "transcript-media-download-progress";
8
+ readonly TranscriptMediaDownloadDone: "transcript-media-download-done";
9
+ readonly TranscriptWhisperStart: "transcript-whisper-start";
10
+ readonly TranscriptWhisperProgress: "transcript-whisper-progress";
11
+ readonly TranscriptStart: "transcript-start";
12
+ readonly TranscriptDone: "transcript-done";
13
+ readonly FirecrawlStart: "firecrawl-start";
14
+ readonly FirecrawlDone: "firecrawl-done";
15
+ readonly NitterStart: "nitter-start";
16
+ readonly NitterDone: "nitter-done";
17
+ readonly BirdStart: "bird-start";
18
+ readonly BirdDone: "bird-done";
19
+ };
20
+ export type LinkPreviewProgressEvent = {
21
+ kind: 'fetch-html-start';
22
+ url: string;
23
+ } | {
24
+ kind: 'fetch-html-progress';
25
+ url: string;
26
+ downloadedBytes: number;
27
+ totalBytes: number | null;
28
+ } | {
29
+ kind: 'fetch-html-done';
30
+ url: string;
31
+ downloadedBytes: number;
32
+ totalBytes: number | null;
33
+ } | {
34
+ kind: 'transcript-media-download-start';
35
+ url: string;
36
+ service: 'youtube' | 'podcast' | 'generic';
37
+ mediaUrl: string | null;
38
+ totalBytes: number | null;
39
+ } | {
40
+ kind: 'transcript-media-download-progress';
41
+ url: string;
42
+ service: 'youtube' | 'podcast' | 'generic';
43
+ downloadedBytes: number;
44
+ totalBytes: number | null;
45
+ } | {
46
+ kind: 'transcript-media-download-done';
47
+ url: string;
48
+ service: 'youtube' | 'podcast' | 'generic';
49
+ downloadedBytes: number;
50
+ totalBytes: number | null;
51
+ } | {
52
+ kind: 'transcript-whisper-start';
53
+ url: string;
54
+ service: 'youtube' | 'podcast' | 'generic';
55
+ providerHint: 'cpp' | 'openai' | 'fal' | 'openai->fal' | 'unknown';
56
+ modelId: string | null;
57
+ totalDurationSeconds: number | null;
58
+ parts: number | null;
59
+ } | {
60
+ kind: 'transcript-whisper-progress';
61
+ url: string;
62
+ service: 'youtube' | 'podcast' | 'generic';
63
+ processedDurationSeconds: number | null;
64
+ totalDurationSeconds: number | null;
65
+ partIndex: number | null;
66
+ parts: number | null;
67
+ } | {
68
+ kind: 'transcript-start';
69
+ url: string;
70
+ service: 'youtube' | 'podcast' | 'generic';
71
+ hint: string | null;
72
+ } | {
73
+ kind: 'transcript-done';
74
+ url: string;
75
+ ok: boolean;
76
+ service: 'youtube' | 'podcast' | 'generic';
77
+ source: TranscriptSource | null;
78
+ hint: string | null;
79
+ } | {
80
+ kind: 'firecrawl-start';
81
+ url: string;
82
+ reason: string;
83
+ } | {
84
+ kind: 'firecrawl-done';
85
+ url: string;
86
+ ok: boolean;
87
+ markdownBytes: number | null;
88
+ htmlBytes: number | null;
89
+ } | {
90
+ kind: 'nitter-start';
91
+ url: string;
92
+ } | {
93
+ kind: 'nitter-done';
94
+ url: string;
95
+ ok: boolean;
96
+ textBytes: number | null;
97
+ } | {
98
+ kind: 'bird-start';
99
+ url: string;
100
+ } | {
101
+ kind: 'bird-done';
102
+ url: string;
103
+ ok: boolean;
104
+ textBytes: number | null;
105
+ };
106
+ export interface FirecrawlScrapeResult {
107
+ markdown: string;
108
+ html?: string | null;
109
+ metadata?: Record<string, unknown> | null;
110
+ }
111
+ export type ScrapeWithFirecrawl = (url: string, options?: {
112
+ cacheMode?: CacheMode;
113
+ timeoutMs?: number;
114
+ }) => Promise<FirecrawlScrapeResult | null>;
115
+ export type ConvertHtmlToMarkdown = (args: {
116
+ url: string;
117
+ html: string;
118
+ title: string | null;
119
+ siteName: string | null;
120
+ timeoutMs: number;
121
+ }) => Promise<string>;
122
+ export type BirdTweetPayload = {
123
+ id?: string;
124
+ text: string;
125
+ author?: {
126
+ username?: string;
127
+ name?: string;
128
+ };
129
+ createdAt?: string;
130
+ };
131
+ export type ReadTweetWithBird = (args: {
132
+ url: string;
133
+ timeoutMs: number;
134
+ }) => Promise<BirdTweetPayload | null>;
135
+ export interface TranscriptCacheGetResult {
136
+ content: string | null;
137
+ source: TranscriptSource | null;
138
+ expired: boolean;
139
+ metadata?: Record<string, unknown> | null;
140
+ }
141
+ export interface TranscriptCacheSetArgs {
142
+ url: string;
143
+ service: string;
144
+ resourceKey: string | null;
145
+ content: string | null;
146
+ source: TranscriptSource | null;
147
+ ttlMs: number;
148
+ metadata?: Record<string, unknown> | null;
149
+ }
150
+ export interface TranscriptCache {
151
+ get(args: {
152
+ url: string;
153
+ }): Promise<TranscriptCacheGetResult | null>;
154
+ set(args: TranscriptCacheSetArgs): Promise<void>;
155
+ }
156
+ export interface LinkPreviewDeps {
157
+ fetch: typeof fetch;
158
+ scrapeWithFirecrawl: ScrapeWithFirecrawl | null;
159
+ apifyApiToken: string | null;
160
+ ytDlpPath: string | null;
161
+ falApiKey: string | null;
162
+ openaiApiKey: string | null;
163
+ convertHtmlToMarkdown: ConvertHtmlToMarkdown | null;
164
+ transcriptCache: TranscriptCache | null;
165
+ readTweetWithBird?: ReadTweetWithBird | null;
166
+ onProgress?: ((event: LinkPreviewProgressEvent) => void) | null;
167
+ }
@@ -0,0 +1,4 @@
1
+ type FetchLike = typeof fetch;
2
+ type FetchArguments = Parameters<typeof fetch>;
3
+ export declare function fetchWithTimeout(fetchImpl: FetchLike, input: FetchArguments[0], init?: FetchArguments[1], timeoutMs?: number): Promise<Response>;
4
+ export {};
@@ -0,0 +1,37 @@
1
+ export type TranscriptSource = 'youtubei' | 'captionTracks' | 'yt-dlp' | 'podcastTranscript' | 'whisper' | 'apify' | 'html' | 'unavailable' | 'unknown';
2
+ export declare const CACHE_MODES: readonly ["default", "bypass"];
3
+ export type CacheMode = (typeof CACHE_MODES)[number];
4
+ export type CacheStatus = 'hit' | 'miss' | 'expired' | 'bypassed' | 'fallback' | 'unknown';
5
+ export interface TranscriptDiagnostics {
6
+ cacheMode: CacheMode;
7
+ cacheStatus: CacheStatus;
8
+ textProvided: boolean;
9
+ provider: TranscriptSource | null;
10
+ attemptedProviders: TranscriptSource[];
11
+ notes?: string | null;
12
+ }
13
+ export interface FirecrawlDiagnostics {
14
+ attempted: boolean;
15
+ used: boolean;
16
+ cacheMode: CacheMode;
17
+ cacheStatus: CacheStatus;
18
+ notes?: string | null;
19
+ }
20
+ export interface MarkdownDiagnostics {
21
+ requested: boolean;
22
+ used: boolean;
23
+ provider: 'firecrawl' | 'llm' | null;
24
+ notes?: string | null;
25
+ }
26
+ export interface ContentFetchDiagnostics {
27
+ strategy: 'bird' | 'firecrawl' | 'html' | 'nitter';
28
+ firecrawl: FirecrawlDiagnostics;
29
+ markdown: MarkdownDiagnostics;
30
+ transcript: TranscriptDiagnostics;
31
+ }
32
+ export interface TranscriptResolution {
33
+ text: string | null;
34
+ source: TranscriptSource | null;
35
+ metadata?: Record<string, unknown> | null;
36
+ diagnostics?: TranscriptDiagnostics;
37
+ }
@@ -0,0 +1,29 @@
1
+ import type { TranscriptCache } from '../link-preview/deps.js';
2
+ import type { CacheMode, TranscriptDiagnostics, TranscriptResolution, TranscriptSource } from '../link-preview/types.js';
3
+ export declare const DEFAULT_TTL_MS: number;
4
+ export declare const NEGATIVE_TTL_MS: number;
5
+ type CacheDiagnostics = Pick<TranscriptDiagnostics, 'cacheStatus' | 'notes' | 'provider' | 'textProvided' | 'cacheMode' | 'attemptedProviders'>;
6
+ export interface CacheReadArguments {
7
+ url: string;
8
+ cacheMode: CacheMode;
9
+ transcriptCache: TranscriptCache | null;
10
+ }
11
+ export interface TranscriptCacheLookup {
12
+ cached: Awaited<ReturnType<TranscriptCache['get']>> | null;
13
+ resolution: TranscriptResolution | null;
14
+ diagnostics: CacheDiagnostics;
15
+ }
16
+ export declare const readTranscriptCache: ({ url, cacheMode, transcriptCache, }: CacheReadArguments) => Promise<TranscriptCacheLookup>;
17
+ export declare const mapCachedSource: (source: string | null) => TranscriptSource | null;
18
+ export declare const writeTranscriptCache: ({ url, service, resourceKey, result, transcriptCache, }: {
19
+ url: string;
20
+ service: string;
21
+ resourceKey: string | null;
22
+ result: {
23
+ text: string | null;
24
+ source: TranscriptSource | null;
25
+ metadata?: Record<string, unknown> | undefined;
26
+ };
27
+ transcriptCache: TranscriptCache | null;
28
+ }) => Promise<void>;
29
+ export {};
@@ -0,0 +1,9 @@
1
+ import type { LinkPreviewDeps } from '../link-preview/deps.js';
2
+ import type { CacheMode, TranscriptResolution } from '../link-preview/types.js';
3
+ import type { ProviderFetchOptions } from './types.js';
4
+ interface ResolveTranscriptOptions {
5
+ youtubeTranscriptMode?: ProviderFetchOptions['youtubeTranscriptMode'];
6
+ cacheMode?: CacheMode;
7
+ }
8
+ export declare const resolveTranscriptForLink: (url: string, html: string | null, deps: LinkPreviewDeps, { youtubeTranscriptMode, cacheMode: providedCacheMode }?: ResolveTranscriptOptions) => Promise<TranscriptResolution>;
9
+ export {};
@@ -0,0 +1,3 @@
1
+ export declare const normalizeTranscriptText: (input: string) => string;
2
+ export declare const normalizeTranscriptLines: (lines: readonly string[]) => string | null;
3
+ export declare const normalizeApifyTranscript: (raw: unknown) => string | null;
@@ -0,0 +1,3 @@
1
+ import type { ProviderContext, ProviderFetchOptions, ProviderResult } from '../types.js';
2
+ export declare const canHandle: () => boolean;
3
+ export declare const fetchTranscript: (_context: ProviderContext, _options: ProviderFetchOptions) => Promise<ProviderResult>;
@@ -0,0 +1,4 @@
1
+ import type { ProviderResult } from '../../types.js';
2
+ import type { PodcastFlowContext } from './flow-context.js';
3
+ export declare function fetchAppleTranscriptFromItunesLookup(flow: PodcastFlowContext): Promise<ProviderResult | null>;
4
+ export declare function fetchAppleTranscriptFromEmbeddedHtml(flow: PodcastFlowContext): Promise<ProviderResult | null>;
@@ -0,0 +1,6 @@
1
+ export declare function extractAppleEpisodeTitleFromHtml(html: string): string | null;
2
+ export declare function extractEmbeddedJsonUrl(html: string, field: string): string | null;
3
+ export declare function extractApplePodcastIds(url: string): {
4
+ showId: string;
5
+ episodeId: string | null;
6
+ } | null;
@@ -0,0 +1,7 @@
1
+ export declare const FEED_HINT_URL_PATTERN: RegExp;
2
+ export declare const PODCAST_PLATFORM_HOST_PATTERN: RegExp;
3
+ export declare const TRANSCRIPTION_TIMEOUT_MS = 600000;
4
+ export declare const MAX_REMOTE_MEDIA_BYTES: number;
5
+ export declare const BLOCKED_HTML_HINT_PATTERN: RegExp;
6
+ export declare const ITUNES_SEARCH_URL = "https://itunes.apple.com/search";
7
+ export declare const ITUNES_LOOKUP_URL = "https://itunes.apple.com/lookup";
@@ -0,0 +1,11 @@
1
+ import type { ProviderContext, ProviderFetchOptions, ProviderResult } from '../../types.js';
2
+ import type { TranscribeRequest, TranscriptionResult } from './media.js';
3
+ export type PodcastFlowContext = {
4
+ context: ProviderContext;
5
+ options: ProviderFetchOptions;
6
+ attemptedProviders: ProviderResult['attemptedProviders'];
7
+ notes: string[];
8
+ pushOnce: (provider: ProviderResult['attemptedProviders'][number]) => void;
9
+ ensureTranscriptionProvider: () => ProviderResult | null;
10
+ transcribe: (request: TranscribeRequest) => Promise<TranscriptionResult>;
11
+ };
@@ -0,0 +1,17 @@
1
+ export declare function resolveApplePodcastEpisodeFromItunesLookup({ fetchImpl, showId, episodeId, }: {
2
+ fetchImpl: typeof fetch;
3
+ showId: string;
4
+ episodeId: string | null;
5
+ }): Promise<{
6
+ episodeUrl: string;
7
+ feedUrl: string | null;
8
+ fileExtension: string | null;
9
+ durationSeconds: number | null;
10
+ episodeTitle: string | null;
11
+ } | null>;
12
+ export declare function resolvePodcastFeedUrlFromItunesSearch(fetchImpl: typeof fetch, showTitle: string): Promise<string | null>;
13
+ export declare function resolvePodcastEpisodeFromItunesSearch(fetchImpl: typeof fetch, showTitle: string, episodeTitle: string): Promise<{
14
+ episodeUrl: string;
15
+ durationSeconds: number | null;
16
+ episodeTitle: string;
17
+ } | null>;
@@ -0,0 +1,8 @@
1
+ export type JsonRecord = Record<string, unknown>;
2
+ export declare function isJsonRecord(value: unknown): value is JsonRecord;
3
+ export declare function getJsonPath(value: unknown, path: readonly string[]): unknown;
4
+ export declare function getJsonString(value: unknown, path: readonly string[]): string | null;
5
+ export declare function getJsonNumber(value: unknown, path: readonly string[]): number | null;
6
+ export declare function getJsonArray(value: unknown, path: readonly string[]): unknown[];
7
+ export declare function asRecordArray(value: unknown): JsonRecord[];
8
+ export declare function getRecordString(record: JsonRecord, key: string): string | null;
@@ -0,0 +1,42 @@
1
+ import type { ProviderFetchOptions } from '../../types.js';
2
+ export type TranscribeRequest = {
3
+ url: string;
4
+ filenameHint: string;
5
+ durationSecondsHint: number | null;
6
+ };
7
+ export type TranscriptionResult = {
8
+ text: string | null;
9
+ provider: string | null;
10
+ error: Error | null;
11
+ };
12
+ export declare function transcribeMediaUrl({ fetchImpl, url, filenameHint, durationSecondsHint, openaiApiKey, falApiKey, notes, progress, }: {
13
+ fetchImpl: typeof fetch;
14
+ url: string;
15
+ filenameHint: string;
16
+ durationSecondsHint: number | null;
17
+ openaiApiKey: string | null;
18
+ falApiKey: string | null;
19
+ notes: string[];
20
+ progress: {
21
+ url: string;
22
+ service: 'podcast';
23
+ onProgress: ProviderFetchOptions['onProgress'] | null;
24
+ } | null;
25
+ }): Promise<TranscriptionResult>;
26
+ export declare function probeRemoteMedia(fetchImpl: typeof fetch, url: string): Promise<{
27
+ contentLength: number | null;
28
+ mediaType: string | null;
29
+ filename: string | null;
30
+ }>;
31
+ export declare function downloadCappedBytes(fetchImpl: typeof fetch, url: string, maxBytes: number, options?: {
32
+ totalBytes: number | null;
33
+ onProgress?: ((downloadedBytes: number) => void) | null;
34
+ }): Promise<Uint8Array>;
35
+ export declare function downloadToFile(fetchImpl: typeof fetch, url: string, filePath: string, options?: {
36
+ totalBytes: number | null;
37
+ onProgress?: ((downloadedBytes: number) => void) | null;
38
+ }): Promise<number>;
39
+ export declare function normalizeHeaderType(value: string | null): string | null;
40
+ export declare function parseContentLength(value: string | null): number | null;
41
+ export declare function filenameFromUrl(url: string): string | null;
42
+ export declare function formatBytes(bytes: number): string;
@@ -0,0 +1,10 @@
1
+ import type { ProviderResult } from '../../types.js';
2
+ import type { TranscriptionResult } from './media.js';
3
+ export declare function joinNotes(notes: string[]): string | null;
4
+ export declare function buildWhisperResult({ attemptedProviders, notes, outcome, metadata, includeProviderOnFailure, }: {
5
+ attemptedProviders: ProviderResult['attemptedProviders'];
6
+ notes: string[];
7
+ outcome: TranscriptionResult;
8
+ metadata: Record<string, unknown>;
9
+ includeProviderOnFailure?: boolean;
10
+ }): ProviderResult;
@@ -0,0 +1,22 @@
1
+ export declare function looksLikeRssOrAtomFeed(xml: string): boolean;
2
+ export declare function extractEnclosureFromFeed(xml: string): {
3
+ enclosureUrl: string;
4
+ durationSeconds: number | null;
5
+ } | null;
6
+ export declare function extractEnclosureForEpisode(feedXml: string, episodeTitle: string): {
7
+ enclosureUrl: string;
8
+ durationSeconds: number | null;
9
+ } | null;
10
+ export declare function extractItemDurationSeconds(itemXml: string): number | null;
11
+ export declare function decodeXmlEntities(value: string): string;
12
+ export declare function normalizeLooseTitle(value: string): string;
13
+ export declare function tryFetchTranscriptFromFeedXml({ fetchImpl, feedXml, episodeTitle, notes, }: {
14
+ fetchImpl: typeof fetch;
15
+ feedXml: string;
16
+ episodeTitle: string | null;
17
+ notes: string[];
18
+ }): Promise<{
19
+ text: string;
20
+ transcriptUrl: string;
21
+ transcriptType: string | null;
22
+ } | null>;
@@ -0,0 +1,3 @@
1
+ import type { ProviderResult } from '../../types.js';
2
+ import type { PodcastFlowContext } from './flow-context.js';
3
+ export declare function fetchSpotifyTranscript(flow: PodcastFlowContext): Promise<ProviderResult | null>;
@@ -0,0 +1,24 @@
1
+ export declare function extractSpotifyEpisodeId(url: string): string | null;
2
+ export declare function extractSpotifyEmbedData(html: string): {
3
+ showTitle: string;
4
+ episodeTitle: string;
5
+ durationSeconds: number | null;
6
+ drmFormat: string | null;
7
+ audioUrl: string | null;
8
+ } | null;
9
+ export declare function fetchSpotifyEmbedHtml({ embedUrl, episodeId, fetchImpl, scrapeWithFirecrawl, }: {
10
+ embedUrl: string;
11
+ episodeId: string;
12
+ fetchImpl: typeof fetch;
13
+ scrapeWithFirecrawl: ((url: string, options?: {
14
+ cacheMode?: 'default' | 'bypass';
15
+ timeoutMs?: number;
16
+ }) => Promise<{
17
+ html?: string | null;
18
+ markdown: string;
19
+ } | null>) | null;
20
+ }): Promise<{
21
+ html: string;
22
+ via: 'fetch' | 'firecrawl';
23
+ }>;
24
+ export declare function looksLikeBlockedHtml(html: string): boolean;
@@ -0,0 +1,20 @@
1
+ import type { ProviderContext, ProviderFetchOptions, ProviderResult } from '../types.js';
2
+ import { resolvePodcastFeedUrlFromItunesSearch } from './podcast/itunes.js';
3
+ import { downloadCappedBytes, downloadToFile, filenameFromUrl, formatBytes, normalizeHeaderType, parseContentLength, probeRemoteMedia } from './podcast/media.js';
4
+ import { extractEnclosureForEpisode, extractItemDurationSeconds } from './podcast/rss.js';
5
+ import { looksLikeBlockedHtml } from './podcast/spotify.js';
6
+ export declare const canHandle: ({ url, html }: ProviderContext) => boolean;
7
+ export declare const fetchTranscript: (context: ProviderContext, options: ProviderFetchOptions) => Promise<ProviderResult>;
8
+ export declare const __test__: {
9
+ probeRemoteMedia: typeof probeRemoteMedia;
10
+ downloadCappedBytes: typeof downloadCappedBytes;
11
+ downloadToFile: typeof downloadToFile;
12
+ normalizeHeaderType: typeof normalizeHeaderType;
13
+ parseContentLength: typeof parseContentLength;
14
+ filenameFromUrl: typeof filenameFromUrl;
15
+ looksLikeBlockedHtml: typeof looksLikeBlockedHtml;
16
+ extractItemDurationSeconds: typeof extractItemDurationSeconds;
17
+ extractEnclosureForEpisode: typeof extractEnclosureForEpisode;
18
+ resolvePodcastFeedUrlFromItunesSearch: typeof resolvePodcastFeedUrlFromItunesSearch;
19
+ formatBytes: typeof formatBytes;
20
+ };