@nshipster/sosumi 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,179 @@
1
+ import { getRandomUserAgent } from "../fetch"
2
+
3
/**
 * Error thrown when a video page responds with HTTP 404 or when a fetched
 * page yields no transcript lines.
 *
 * `name` is assigned explicitly so that logs, stack traces, and serialized
 * errors identify this class instead of the generic "Error".
 */
export class TranscriptNotFoundError extends Error {
  constructor(...args: ConstructorParameters<typeof Error>) {
    super(...args)
    this.name = "TranscriptNotFoundError"
  }
}
4
+
5
/**
 * Trailing text that is cut from a page `<title>` when present, so only the
 * video's own title remains.
 */
const APPLE_VIDEO_SUFFIX = " - Videos - Apple Developer"
6
+
7
/** One timed caption pulled out of a video page's transcript section. */
interface TranscriptLine {
  /** Start offset parsed from the span's `data-start` attribute; rendered as `MM:SS`. */
  startSeconds: number
  /** Caption text after tags are stripped, entities decoded, and whitespace collapsed. */
  text: string
}
11
+
12
/**
 * Downloads a video page and turns its transcript into a Markdown document.
 *
 * @param sourceUrl - URL of the video page to fetch; also written into the output.
 * @param collection - Collection label written into the document body.
 * @param videoId - Video identifier written into the document body; also used
 *   for the fallback title `Video <videoId>` when no page title can be extracted.
 * @returns The rendered Markdown document.
 * @throws TranscriptNotFoundError when the page yields no transcript lines, or
 *   when the page request answers with HTTP 404.
 * @throws Error for any other non-OK HTTP response.
 */
export async function fetchVideoTranscriptMarkdown(
  sourceUrl: string,
  collection: string,
  videoId: string,
): Promise<string> {
  const pageHtml = await fetchVideoTranscriptHtml(sourceUrl)

  // Title extraction runs before transcript extraction, matching the order in
  // which any parsing failure would surface.
  const pageTitle = extractVideoTitleFromHtml(pageHtml)
  const transcriptLines = extractTranscriptLinesFromHtml(pageHtml)

  if (!transcriptLines.length) {
    throw new TranscriptNotFoundError("Transcript not found for this video.")
  }

  const title = pageTitle ?? `Video ${videoId}`
  return renderVideoTranscriptMarkdown({ title, sourceUrl, collection, videoId, transcriptLines })
}
33
+
34
/**
 * Fetches the raw HTML of a video page.
 *
 * The request carries a randomly chosen User-Agent, an HTML `Accept` header,
 * and `Cache-Control: no-cache`.
 *
 * @param sourceUrl - URL of the video page.
 * @returns The response body as text.
 * @throws TranscriptNotFoundError when the server answers with HTTP 404.
 * @throws Error for any other non-OK status; the message includes the status
 *   code and status text.
 */
export async function fetchVideoTranscriptHtml(sourceUrl: string): Promise<string> {
  const requestHeaders = {
    "User-Agent": getRandomUserAgent(),
    Accept: "text/html,application/xhtml+xml",
    "Cache-Control": "no-cache",
  }
  const response = await fetch(sourceUrl, { headers: requestHeaders })

  if (response.ok) {
    return response.text()
  }

  if (response.status === 404) {
    throw new TranscriptNotFoundError("Video not found.")
  }
  throw new Error(`Failed to fetch video page: ${response.status} ${response.statusText}`)
}
52
+
53
/**
 * Extracts a clean, single-line video title from a page's `<title>` element.
 *
 * The opening tag may carry attributes (for example `<title data-rh="true">`).
 * Inner markup is stripped, entities are decoded, and runs of whitespace are
 * collapsed to one space, the same normalization applied to transcript text,
 * so a title spread over several source lines cannot leak newlines into the
 * output. A trailing " - Videos - Apple Developer" suffix is removed.
 *
 * @param html - Full HTML of the video page.
 * @returns The cleaned title, or `null` when there is no `<title>` element or
 *   it is empty after cleaning.
 */
function extractVideoTitleFromHtml(html: string): string | null {
  // `\b[^>]*` tolerates attributes on the opening tag without matching other
  // elements whose names merely start with "title".
  const rawTitle = html.match(/<title\b[^>]*>([\s\S]*?)<\/title>/i)?.[1]
  if (rawTitle === undefined) {
    return null
  }

  const title = decodeHtmlEntities(stripHtml(rawTitle)).replace(/\s+/g, " ").trim()
  if (!title) {
    return null
  }

  return title.endsWith(APPLE_VIDEO_SUFFIX)
    ? title.slice(0, -APPLE_VIDEO_SUFFIX.length).trim()
    : title
}
70
+
71
/**
 * Collects the timed transcript lines found in a video page.
 *
 * Only the first `<section id="transcript-content">` is inspected. Inside it,
 * every `<span>` carrying a `data-start` attribute contributes one line: the
 * attribute is parsed as a number and the span content is stripped of tags,
 * entity-decoded, and whitespace-collapsed.
 *
 * @param html - Full HTML of the video page.
 * @returns The lines in document order; an empty array when the section is
 *   missing. Spans whose start is not a finite number or whose text is empty
 *   are skipped.
 */
export function extractTranscriptLinesFromHtml(html: string): TranscriptLine[] {
  const sectionPattern = /<section[^>]*id=["']transcript-content["'][^>]*>([\s\S]*?)<\/section>/i
  const sectionHit = sectionPattern.exec(html)
  if (sectionHit === null) {
    return []
  }
  const sectionBody = sectionHit[1]

  const timedSpanPattern = /<span[^>]*data-start=["']([\d.]+)["'][^>]*>([\s\S]*?)<\/span>/gi
  const collected: TranscriptLine[] = []

  for (
    let hit = timedSpanPattern.exec(sectionBody);
    hit !== null;
    hit = timedSpanPattern.exec(sectionBody)
  ) {
    const startSeconds = Number.parseFloat(hit[1])
    const text = decodeHtmlEntities(stripHtml(hit[2])).replace(/\s+/g, " ").trim()

    if (!Number.isFinite(startSeconds) || text === "") {
      continue
    }
    collected.push({ startSeconds, text })
  }

  return collected
}
97
+
98
/**
 * Formats a string for use as a YAML front-matter value.
 *
 * A value that is already an unambiguous plain scalar is returned unchanged,
 * so simple titles and URLs render exactly as before. Anything that would
 * break the front matter is emitted as a double-quoted scalar using JSON
 * string escaping: an empty value, a leading indicator character, `: `
 * inside the value, a trailing colon, ` #`, a line break, or trailing
 * whitespace.
 */
function toYamlScalar(value: string): string {
  const startsSafely = /^[^\s\-?:,[\]{}#&*!|>'"%@`]/.test(value)
  const hasUnsafeSequence = /[\r\n]|:\s|:$|\s#|\s$/.test(value)
  return startsSafely && !hasUnsafeSequence ? value : JSON.stringify(value)
}

/**
 * Renders the Markdown document for one video transcript.
 *
 * The output has YAML front matter (`title`, `source`, and a `timestamp` taken
 * from the current time), a level-1 heading, the collection and video id, a
 * "Transcript" section with one `- [MM:SS] text` bullet per line, and a fixed
 * attribution footer.
 *
 * @param title - Video title; whitespace runs are collapsed so the heading and
 *   front matter stay on a single line.
 * @param sourceUrl - Page URL written to the `source` front-matter key.
 * @param collection - Collection label shown in the body.
 * @param videoId - Video identifier shown in the body.
 * @param transcriptLines - Lines to list, in the order given.
 * @returns The document as a single string ending in a newline.
 */
function renderVideoTranscriptMarkdown({
  title,
  sourceUrl,
  collection,
  videoId,
  transcriptLines,
}: {
  title: string
  sourceUrl: string
  collection: string
  videoId: string
  transcriptLines: TranscriptLine[]
}): string {
  // The title originates from fetched HTML; a line break inside it would
  // split both the front-matter entry and the Markdown heading.
  const displayTitle = title.replace(/\s+/g, " ").trim()

  const transcriptBody = transcriptLines
    .map((line) => `- [${formatTimestamp(line.startSeconds)}] ${line.text}`)
    .join("\n")

  return [
    "---",
    // Quoted only when required, e.g. titles such as "Code-along: ..." where
    // the ": " would otherwise be read as a nested mapping.
    `title: ${toYamlScalar(displayTitle)}`,
    `source: ${toYamlScalar(sourceUrl)}`,
    `timestamp: ${new Date().toISOString()}`,
    "---",
    "",
    `# ${displayTitle}`,
    "",
    `**Collection:** ${collection}`,
    "",
    `**Video:** ${videoId}`,
    "",
    "## Transcript",
    "",
    transcriptBody,
    "",
    "---",
    "",
    "*Extracted by [sosumi.ai](https://sosumi.ai) - Making Apple docs AI-readable.*",
    "*This is unofficial content. All transcripts belong to Apple Inc.*",
    "",
  ].join("\n")
}
139
+
140
/**
 * Formats a second offset as a zero-padded `MM:SS` timestamp.
 *
 * Fractions are truncated and negative values clamp to zero. Minutes are not
 * wrapped into hours, so offsets of an hour or more render as e.g. `62:05`.
 * Non-finite input (`NaN`, `Infinity`) is treated as zero rather than
 * rendering "NaN:NaN".
 *
 * @param seconds - Offset from the start of the video, in seconds.
 * @returns The timestamp text.
 */
function formatTimestamp(seconds: number): string {
  const wholeSeconds = Number.isFinite(seconds) ? Math.max(0, Math.floor(seconds)) : 0
  const minutes = Math.floor(wholeSeconds / 60)
  const remainingSeconds = wholeSeconds % 60
  const pad = (value: number): string => String(value).padStart(2, "0")
  return `${pad(minutes)}:${pad(remainingSeconds)}`
}
146
+
147
/**
 * Removes every `<...>` run from `input` and returns the text that was
 * between them. A `<` with no characters before the next `>` is left alone.
 */
function stripHtml(input: string): string {
  const tagPattern = /<[^>]+>/g
  // Splitting on each tag and rejoining the pieces drops the tags themselves.
  const textPieces = input.split(tagPattern)
  return textPieces.join("")
}
150
+
151
/**
 * Decodes the HTML character references this module understands.
 *
 * - Named: `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;`, and `&nbsp;` (decoded
 *   to a plain space). Any other named reference is left exactly as written.
 * - Numeric: decimal (`&#65;`) and hexadecimal with either marker case
 *   (`&#x41;`, `&#X41;`). A reference that is zero, a surrogate, or above
 *   U+10FFFF decodes to U+FFFD instead of throwing from
 *   `String.fromCodePoint`.
 *
 * Text that merely resembles a reference (for example `&#12ab;`) is not
 * matched and passes through unchanged.
 *
 * @param input - Text that may contain character references.
 * @returns The text with supported references replaced.
 */
function decodeHtmlEntities(input: string): string {
  return input.replace(
    /&(?:#[xX]([0-9a-fA-F]+)|#([0-9]+)|([a-zA-Z]+));/g,
    (match: string, hexDigits?: string, decimalDigits?: string, name?: string): string => {
      if (name !== undefined) {
        switch (name) {
          case "amp":
            return "&"
          case "lt":
            return "<"
          case "gt":
            return ">"
          case "quot":
            return '"'
          case "apos":
            return "'"
          case "nbsp":
            return " "
          default:
            // Unknown named reference: keep the original text.
            return match
        }
      }

      const codePoint =
        hexDigits !== undefined
          ? Number.parseInt(hexDigits, 16)
          : Number.parseInt(decimalDigits ?? "", 10)

      // Only Unicode scalar values other than NUL are decoded; very long digit
      // runs parse to huge or infinite numbers and fail the upper bound here.
      const isDecodable =
        codePoint > 0 &&
        codePoint <= 0x10ffff &&
        !(codePoint >= 0xd800 && codePoint <= 0xdfff)

      return isDecodable ? String.fromCodePoint(codePoint) : "\uFFFD"
    },
  )
}
package/wrangler.jsonc ADDED
@@ -0,0 +1,27 @@
1
+ /**
2
+ * For more details on how to configure Wrangler, refer to:
3
+ * https://developers.cloudflare.com/workers/wrangler/configuration/
4
+ */
5
+ {
6
+ "$schema": "node_modules/wrangler/config-schema.json",
7
+ "name": "sosumi-ai",
8
+ "routes": [
9
+ {
10
+ "pattern": "sosumi.ai",
11
+ "custom_domain": true
12
+ }
13
+ ],
14
+ "main": "src/index.ts",
15
+ "compatibility_date": "2025-08-23",
16
+ "compatibility_flags": ["nodejs_compat"],
17
+ "assets": {
18
+ "binding": "ASSETS",
19
+ "directory": "./public"
20
+ },
21
+ "observability": {
22
+ "enabled": true
23
+ },
24
+ "vars": {
25
+ "NODE_ENV": "production"
26
+ }
27
+ }