@chilfish/gallery-dl-instagram 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +340 -0
- package/README.md +134 -0
- package/dist/adapter-CFsiiEpM.cjs +83 -0
- package/dist/adapter-tSleX8Cr.mjs +59 -0
- package/dist/dl-ins.mjs +5129 -0
- package/dist/index.cjs +40 -0
- package/dist/{sdk-B9fRyc1e.d.mts → index.d.cts} +139 -270
- package/dist/index.d.mts +470 -51
- package/dist/index.mjs +2 -40
- package/dist/node.cjs +43 -0
- package/dist/node.d.cts +47 -0
- package/dist/node.d.mts +47 -0
- package/dist/node.mjs +42 -0
- package/dist/{extractors-Byw-2lPL.mjs → sdk-Bn0VCUIT.mjs} +291 -215
- package/dist/sdk-CK9x5wFL.d.cts +259 -0
- package/dist/sdk-CK9x5wFL.d.mts +259 -0
- package/dist/sdk-nzhAxf1O.cjs +2246 -0
- package/dist/storage-77hqz5Fi.mjs +24 -0
- package/dist/storage-BwGaT6XO.cjs +24 -0
- package/package.json +32 -25
- package/cli/adapter.ts +0 -284
- package/cli/cookies.ts +0 -59
- package/cli/index.ts +0 -337
- package/config.ts +0 -80
- package/core/extractor.ts +0 -217
- package/core/job.ts +0 -581
- package/dist/adapter-Bt86eL1R.mjs +0 -189
- package/dist/cli/index.d.mts +0 -1
- package/dist/cli/index.mjs +0 -3160
- package/dist/sdk.d.mts +0 -2
- package/dist/sdk.mjs +0 -93
- package/index.ts +0 -159
- package/instagram/api.ts +0 -531
- package/instagram/base.ts +0 -275
- package/instagram/extractors.ts +0 -521
- package/instagram/index.ts +0 -43
- package/instagram/parsers.ts +0 -583
- package/instagram/types.ts +0 -244
- package/message.ts +0 -31
- package/types.ts +0 -115
- package/utils/id-codec.ts +0 -39
- package/utils/text.ts +0 -178
package/cli/index.ts
DELETED
|
@@ -1,337 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
|
2
|
-
/**
|
|
3
|
-
* gdl-instagram — CLI entry point.
|
|
4
|
-
*
|
|
5
|
-
* Usage:
|
|
6
|
-
* gdl-instagram <url> [options] ← auto-detect from URL
|
|
7
|
-
* gdl-instagram tag <hashtag> [options]
|
|
8
|
-
* gdl-instagram saved [options]
|
|
9
|
-
*
|
|
10
|
-
* Every option is self-documented via ``--help``.
|
|
11
|
-
*/
|
|
12
|
-
|
|
13
|
-
import { Command } from 'commander'
|
|
14
|
-
import { ConfigManager } from '../config'
|
|
15
|
-
import { DownloadJob, PrintJob } from '../core/job'
|
|
16
|
-
import {
|
|
17
|
-
InstagramHighlightsExtractor,
|
|
18
|
-
InstagramPostExtractor,
|
|
19
|
-
InstagramSavedExtractor,
|
|
20
|
-
InstagramStoriesExtractor,
|
|
21
|
-
InstagramTagExtractor,
|
|
22
|
-
InstagramUserExtractor,
|
|
23
|
-
} from '../instagram/extractors'
|
|
24
|
-
import { createHttpClient, createLogger, createStorage, createWebClient, extractCsrfFromCookies } from './adapter'
|
|
25
|
-
|
|
26
|
-
/** Shared options — applied to all subcommands */
|
|
27
|
-
|
|
28
|
-
/**
 * Parsed command-line options shared by every subcommand.
 *
 * Property names are the camel-cased forms of the flags registered by
 * `addSharedOptions`; all of them are optional because a flag may be absent.
 */
interface GlobalOptions {
  // --- authentication ---------------------------------------------------
  /** `--sessionid`: Instagram sessionid cookie value. */
  sessionid?: string
  /** `--cookies`: full Cookie header string copied from the browser. */
  cookies?: string

  // --- output -------------------------------------------------------------
  /** `-o, --output`: output directory. */
  output?: string
  /** `-i, --info`: print structured post info instead of downloading. */
  info?: boolean
  /** `-v, --verbose`: enable debug output. */
  verbose?: boolean

  // --- media selection ------------------------------------------------------
  /** `--videos`: video mode, passed through as the raw string. */
  videos?: string
  /** `--previews`: comma-separated preview types. */
  previews?: string
  /** `--audio`: download standalone audio tracks. */
  audio?: boolean
  /** `--static-videos` / `--no-static-videos`. */
  staticVideos?: boolean

  // --- paging and ordering --------------------------------------------------
  /** `--max-posts`: maximum number of posts. */
  maxPosts?: number
  /** `--cursor`: pagination cursor to resume from. */
  cursor?: string
  /** `--order-posts`: post ordering. */
  orderPosts?: string
  /** `--order-files`: file ordering. */
  orderFiles?: string

  // --- extractor behaviour --------------------------------------------------
  /** `--api`: API backend name. */
  api?: string
  /** `--include`: comma-separated sub-extractors for user URLs. */
  include?: string
  /** `--split`: split each story frame into a separate post. */
  split?: boolean
}
|
|
46
|
-
|
|
47
|
-
/**
 * Register the options shared by every subcommand on `cmd`.
 *
 * `--sessionid` and `--cookies` default to the `INSTAGRAM_SESSIONID` and
 * `INSTAGRAM_COOKIES` environment variables respectively.
 *
 * @param cmd - The command to decorate.
 * @returns The value returned by the final `.option()` call in the chain.
 */
function addSharedOptions(cmd: Command): Command {
  // Commander invokes a custom option parser as `(value, previous)`. Handing
  // it `Number.parseInt` directly would use the previously parsed value as
  // the radix when the flag is repeated (`--max-posts 5 --max-posts 20`
  // would yield 10), so the radix is pinned to 10 here.
  const parseDecimalInt = (value: string): number => Number.parseInt(value, 10)

  return cmd
    .option(
      '--sessionid <cookie>',
      'Instagram sessionid cookie value (from browser)',
      process.env.INSTAGRAM_SESSIONID,
    )
    .option(
      '--cookies <string>',
      'Full Cookie header string from browser (DevTools → Network → Request Headers → Cookie)',
      process.env.INSTAGRAM_COOKIES,
    )
    .option('-o, --output <dir>', 'Output directory', './data')
    .option(
      '--videos <mode>',
      'Download videos: true, false, or merged (yt-dlp)',
      'true',
    )
    .option(
      '--previews <types>',
      'Download only previews: video,audio (comma-separated)',
    )
    .option('--audio', 'Download standalone audio tracks', false)
    .option('--max-posts <n>', 'Maximum number of posts to download', parseDecimalInt)
    .option(
      '--cursor <cursor>',
      'Resume from pagination cursor (see output of previous run)',
    )
    .option(
      '--order-posts <order>',
      'Post ordering: asc, desc, id, id_asc, id_desc',
    )
    .option(
      '--order-files <order>',
      'File ordering: r, d (reverse), or empty for natural order',
    )
    .option(
      '--static-videos',
      'Download static video cover images instead of actual videos',
      false,
    )
    .option(
      '--no-static-videos',
      'Do not force static video covers (download real videos)',
    )
    .option('--api <backend>', 'API backend: rest (default) or graphql', 'rest')
    .option('-v, --verbose', 'Verbose debug output', false)
    .option(
      '--include <list>',
      'For user: comma-separated sub-extractors (posts,reels,tagged,stories,highlights,info,avatar)',
      'posts',
    )
    .option(
      '--split',
      'For stories: split each frame into a separate post',
      false,
    )
    .option(
      '-i, --info',
      'Print structured post info to terminal (no download)',
      false,
    )
}
|
|
110
|
-
|
|
111
|
-
/** Build config from parsed options */
|
|
112
|
-
|
|
113
|
-
/**
 * Translate parsed CLI options into a `ConfigManager`.
 *
 * Each option that is set to a truthy value is copied under the
 * `extractor.instagram` config path; boolean flags are stored as `true`, and
 * `--previews` is split on commas. When no option applies, the returned
 * config is left untouched.
 *
 * @param opts - Parsed command-line options.
 * @returns A freshly created config manager.
 */
function buildConfig(opts: GlobalOptions): ConfigManager {
  // One row per config key, in the order the keys are written. A value of
  // `undefined` means "option not set" and the row is dropped below.
  const candidates: Array<[string, unknown]> = [
    ['videos', opts.videos || undefined],
    ['previews', opts.previews ? opts.previews.split(',') : undefined],
    ['audio', opts.audio ? true : undefined],
    ['max-posts', opts.maxPosts || undefined],
    ['cursor', opts.cursor || undefined],
    ['order-posts', opts.orderPosts || undefined],
    ['order-files', opts.orderFiles || undefined],
    ['static-videos', opts.staticVideos ? true : undefined],
    ['api', opts.api || undefined],
    ['include', opts.include || undefined],
    ['split', opts.split ? true : undefined],
  ]

  const ig: Record<string, unknown> = Object.fromEntries(
    candidates.filter(([, value]) => value !== undefined),
  )

  const config = new ConfigManager()
  if (Object.keys(ig).length > 0) {
    config.set('extractor.instagram', ig)
  }
  return config
}
|
|
147
|
-
|
|
148
|
-
/** Auto-detect the right extractor for a URL */
|
|
149
|
-
|
|
150
|
-
/**
 * Pick the extractor class whose static `pattern` matches `url`.
 *
 * Candidates are tried in the listed order and the first match wins, so the
 * order is significant when more than one pattern could match a URL.
 *
 * @param url - The URL given on the command line.
 * @returns The matching extractor class (not an instance).
 * @throws Error when no candidate pattern matches.
 */
function resolveExtractor(url: string): {
  new (opts: any): any
  readonly pattern: RegExp
} {
  const candidates = [
    InstagramPostExtractor,
    InstagramStoriesExtractor,
    InstagramHighlightsExtractor,
    InstagramTagExtractor,
    InstagramSavedExtractor,
    InstagramUserExtractor,
  ]

  const matched = candidates.find(candidate => candidate.pattern.test(url))
  if (matched === undefined) {
    throw new Error(
      `No extractor matched URL: ${url}. `
      + 'Supported: /p/, /reel/, /{user}/, /stories/, /highlights/, /explore/tags/, /saved/',
    )
  }
  return matched
}
|
|
172
|
-
|
|
173
|
-
/** Run an extractor */
|
|
174
|
-
|
|
175
|
-
/**
 * Instantiate `extrClass` for `url` and run it as a print or download job.
 *
 * The HTTP client is chosen by priority: `--cookies`, then `--sessionid`,
 * otherwise a web client created by `createWebClient`. With `--info` a
 * `PrintJob` is run; otherwise a `DownloadJob` writing below `--output`.
 * The process exits with code 1 when the URL does not match the extractor's
 * pattern or when the job throws.
 *
 * @param url - URL to extract from.
 * @param extrClass - Extractor class to instantiate.
 * @param opts - Parsed command-line options.
 */
async function runExtractor(
  url: string,
  extrClass: {
    new (opts: any): any
    readonly pattern: RegExp
  },
  opts: GlobalOptions,
): Promise<void> {
  const config = buildConfig(opts)
  const log = createLogger(opts.verbose ?? false)

  // Resolve the HTTP client together with the CSRF token that goes with it.
  let transport: {
    http: ReturnType<typeof createHttpClient>
    csrfToken: string | undefined
  }
  if (opts.cookies) {
    // Full cookie string — highest priority
    transport = {
      http: createHttpClient(undefined, opts.cookies, log),
      csrfToken: extractCsrfFromCookies(opts.cookies),
    }
  }
  else if (opts.sessionid) {
    transport = {
      http: createHttpClient(opts.sessionid, undefined, log),
      csrfToken: undefined,
    }
  }
  else {
    const webClient = await createWebClient(log)
    transport = { http: webClient.http, csrfToken: webClient.csrfToken }
  }
  const storage = createStorage()

  const match = extrClass.pattern.exec(url)
  if (!match) {
    console.error(`URL did not match expected pattern: ${url}`)
    process.exit(1)
  }

  // eslint-disable-next-line new-cap
  const extractor = new extrClass({
    url,
    match,
    config,
    http: transport.http,
    storage,
    log,
    sessionId: opts.sessionid,
    csrfToken: transport.csrfToken,
  })

  /**
   * Run `job`, report how it finished, and exit with code 1 if it throws.
   * A zero status is only announced when `announceSuccess` is set.
   */
  const execute = async (
    job: { run: () => Promise<unknown> },
    announceSuccess: boolean,
  ): Promise<void> => {
    const startedAt = Date.now()
    try {
      const status = await job.run()
      const elapsed = ((Date.now() - startedAt) / 1000).toFixed(1)
      if (status !== 0) {
        log.warn(`Finished with status ${status} in ${elapsed}s`)
      }
      else if (announceSuccess) {
        log.info(`Done in ${elapsed}s`)
      }
    }
    catch (err) {
      log.error(String(err))
      process.exit(1)
    }
  }

  if (opts.info) {
    const printJob = new PrintJob(
      extractor as import('../core/extractor').Extractor,
    )
    await execute(printJob, false)
    return
  }

  const downloadJob = new DownloadJob(
    extractor as import('../core/extractor').Extractor,
  )
  downloadJob.basePath = opts.output ?? './data'
  await execute(downloadJob, true)
}
|
|
262
|
-
|
|
263
|
-
/** Program */
|
|
264
|
-
|
|
265
|
-
/** Root command; the subcommands below are registered on it. */
const program = new Command()

program
  .name('gdl-instagram')
  .description(
    'Download images and videos from Instagram.\n\n'
    + 'Uses gallery-dl\'s extraction pipeline — supports posts, reels,\n'
    + 'stories, highlights, tagged posts, saved collections, and more.\n\n'
    + 'Requires a sessionid cookie exported from your browser.\n'
    + 'Set via --sessionid or INSTAGRAM_SESSIONID environment variable.',
  )
  .version('0.1.0')

/** Default command — auto-detect extractor from URL */

const dlCmd = program
  .command('dl', { isDefault: true })
  .argument('[url]', 'Instagram URL to download (auto-detects type)')
  .description(
    'Download media from an Instagram URL (auto-detects post/user/stories/…)\n\n'
    + 'Examples:\n'
    + ' gdl-instagram https://www.instagram.com/p/CxAbCdEfGh/\n'
    + ' gdl-instagram https://www.instagram.com/username/ --include=posts,reels\n'
    + ' gdl-instagram https://www.instagram.com/stories/username/',
  )
  .action(async (url: string | undefined, opts: GlobalOptions) => {
    // No URL given: show the top-level help instead of failing.
    if (!url) {
      program.help()
      return
    }
    const ExtrClass = resolveExtractor(url)
    await runExtractor(url, ExtrClass, opts)
  })
addSharedOptions(dlCmd)

/** tag subcommand */

const tag = program
  .command('tag <hashtag>')
  .description(
    'Download posts from an Instagram hashtag\n\n'
    + 'Examples:\n'
    + ' gdl-instagram tag cats\n'
    + ' gdl-instagram tag https://www.instagram.com/explore/tags/cats/',
  )
  .action(async (hashtag: string, opts: GlobalOptions) => {
    // Accept both raw tags and full URLs
    const url = hashtag.startsWith('http')
      ? hashtag
      : `https://www.instagram.com/explore/tags/${hashtag}/`
    await runExtractor(url, InstagramTagExtractor, opts)
  })
addSharedOptions(tag)

/** saved subcommand */

const saved = program
  .command('saved')
  .description(
    'Download your saved (bookmarked) posts\n\n'
    + 'Requires authentication via --sessionid.\n\n'
    + 'Examples:\n'
    + ' gdl-instagram saved --sessionid=abc123',
  )
  .action(async (opts: GlobalOptions) => {
    const url = 'https://www.instagram.com/me/saved/'
    await runExtractor(url, InstagramSavedExtractor, opts)
  })
addSharedOptions(saved)

/** parse */

// Every action handler above is async, so the program must be parsed with
// `parseAsync()`: plain `parse()` does not wait for the handlers, and an error
// thrown inside one (for example "No extractor matched URL") would surface as
// an unhandled promise rejection instead of a clean error message.
program.parseAsync().catch((err: unknown) => {
  console.error(err instanceof Error ? err.message : String(err))
  process.exit(1)
})
|
package/config.ts
DELETED
|
@@ -1,80 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Simple nested config reader.
|
|
3
|
-
*
|
|
4
|
-
* Mirrors gallery-dl's ``config.interpolate``:
|
|
5
|
-
*
|
|
6
|
-
* cfgPath like ``['extractor', 'instagram', 'post']``
|
|
7
|
-
* looks up: extractor.instagram.post.{key}, extractor.instagram.{key},
|
|
8
|
-
* extractor.{key}
|
|
9
|
-
*/
|
|
10
|
-
|
|
11
|
-
import type { Config, ConfigValue } from './types'
|
|
12
|
-
|
|
13
|
-
/**
 * In-memory nested configuration store addressed by dot-paths.
 */
export class ConfigManager {
  private readonly data: Config

  /**
   * @param data - Initial configuration tree. It is stored by reference and
   *   mutated in place by {@link ConfigManager.set}.
   */
  constructor(data: Config = {}) {
    this.data = data
  }

  /** True for non-null, non-array objects, i.e. nodes that can be descended into. */
  private static isRecord(node: unknown): node is Record<string, unknown> {
    return node != null && typeof node === 'object' && !Array.isArray(node)
  }

  /**
   * Read a value at a dot-path like ``'extractor.instagram.videos'``.
   *
   * @param path - Dot-separated key path.
   * @param defaultValue - Returned when the path does not exist.
   * @returns The stored value, or `defaultValue` (which itself defaults to
   *   ``undefined``) when any segment is missing or a non-object is met
   *   before the path is exhausted.
   */
  get(path: string, defaultValue?: ConfigValue): ConfigValue | undefined {
    let node: unknown = this.data
    for (const key of path.split('.')) {
      if (!ConfigManager.isRecord(node)) {
        return defaultValue
      }
      node = node[key]
    }
    return node === undefined ? defaultValue : (node as ConfigValue)
  }

  /**
   * Interpolate a config key through a hierarchy of paths.
   *
   * Follows gallery-dl's ``config.interpolate``: a `key` set at the root of
   * the tree wins outright; otherwise `cfgPath` is walked from the root and
   * the value found at the deepest (most specific) level wins. For
   * ``['extractor', 'instagram', 'post']`` the precedence is therefore
   * ``{key}``, ``extractor.instagram.post.{key}``,
   * ``extractor.instagram.{key}``, ``extractor.{key}``.
   *
   * @param cfgPath - Path segments from least to most specific.
   * @param key - Key to look up at each level.
   * @param defaultVal - Returned when no level defines `key`.
   */
  interpolate(
    cfgPath: readonly string[],
    key: string,
    defaultVal?: ConfigValue,
  ): ConfigValue | undefined {
    let node: unknown = this.data

    if (ConfigManager.isRecord(node)) {
      const rootValue = node[key]
      if (rootValue !== undefined) {
        return rootValue as ConfigValue
      }
    }

    let resolved = defaultVal
    for (const segment of cfgPath) {
      if (!ConfigManager.isRecord(node)) {
        break
      }
      node = node[segment]
      // Check the level just entered, so the last path segment is consulted
      // too and deeper levels overwrite shallower ones.
      if (ConfigManager.isRecord(node)) {
        const value = node[key]
        if (value !== undefined) {
          resolved = value as ConfigValue
        }
      }
    }

    return resolved
  }

  /**
   * Mutate the config at a given dot-path.
   *
   * Intermediate segments that are missing, or that hold something other
   * than a plain object, are replaced by a new empty object.
   *
   * @param path - Dot-separated key path.
   * @param value - Value to store at the final segment.
   */
  set(path: string, value: unknown): void {
    const keys = path.split('.')
    let node: Record<string, unknown> = this.data as Record<string, unknown>
    for (let i = 0; i < keys.length - 1; i++) {
      const key = keys[i]!
      let child = node[key]
      if (!ConfigManager.isRecord(child)) {
        child = {}
        node[key] = child
      }
      node = child as Record<string, unknown>
    }
    node[keys[keys.length - 1]!] = value
  }
}
|
package/core/extractor.ts
DELETED
|
@@ -1,217 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Base Extractor abstract class.
|
|
3
|
-
*
|
|
4
|
-
* Every extractor extends this. The class provides:
|
|
5
|
-
* - URL pattern matching via ``fromURL``
|
|
6
|
-
* - One-time initialization guarded by ``initialize()``
|
|
7
|
-
* - Async-iteration entrypoint ``[Symbol.asyncIterator]`` → ``items()``
|
|
8
|
-
* - Rate-limited HTTP requests
|
|
9
|
-
* - Timestamp parsing
|
|
10
|
-
*/
|
|
11
|
-
|
|
12
|
-
import type { ConfigManager } from '../config'
|
|
13
|
-
import type {
|
|
14
|
-
ConfigValue,
|
|
15
|
-
HttpClient,
|
|
16
|
-
HttpResponse,
|
|
17
|
-
MessageIter,
|
|
18
|
-
RequestConfig,
|
|
19
|
-
Storage,
|
|
20
|
-
} from '../types'
|
|
21
|
-
|
|
22
|
-
/**
 * Constructor arguments accepted by `Extractor`.
 */
export interface ExtractorOptions {
  /** The input URL being extracted. */
  url: string
  /** Result of matching `url` against the extractor's pattern. */
  match: RegExpMatchArray
  /** Configuration source consulted for extractor settings. */
  config: ConfigManager
  /** HTTP client used for requests. */
  http: HttpClient
  /** Storage backend. */
  storage: Storage
  /** The logger interface — at minimum a debug/info/warn/error contract */
  log: Logger
}
|
|
31
|
-
|
|
32
|
-
/**
 * Minimal logging contract: one function per severity level, each taking a
 * message followed by any number of extra arguments.
 */
export interface Logger {
  /** Diagnostic detail. */
  debug: (message: string, ...args: unknown[]) => void
  /** Normal progress reporting. */
  info: (message: string, ...args: unknown[]) => void
  /** Something unexpected that did not stop the run. */
  warn: (message: string, ...args: unknown[]) => void
  /** A failure. */
  error: (message: string, ...args: unknown[]) => void
}
|
|
38
|
-
|
|
39
|
-
/** A no-op logger */
|
|
40
|
-
/** A logger whose four methods accept anything and do nothing. */
export const noopLogger: Logger = {
  debug() {},
  info() {},
  warn() {},
  error() {},
}
|
|
46
|
-
|
|
47
|
-
/**
 * Abstract base class for extractors.
 *
 * Provides one-time initialization, an async-iteration entry point that
 * delegates to `items()`, throttled HTTP helpers, hierarchical config
 * lookup, and small timestamp / token utilities.
 */
export abstract class Extractor {
  /** Human-readable category (e.g. ``'instagram'``) */
  abstract readonly category: string

  /** Sub-category (e.g. ``'post'``, ``'posts'``, ``'reels'``) */
  abstract readonly subcategory: string

  /** Root URL (e.g. ``'https://www.instagram.com'``) */
  abstract readonly root: string

  /** Regex pattern to match against URLs; the base pattern only matches the empty string. */
  static readonly pattern: RegExp = /^$/

  /** The input URL */
  readonly url: string

  /** Capture groups of the URL match (the full match itself is dropped). */
  readonly groups: readonly string[]

  readonly config: ConfigManager
  /** HTTP client — public so Job can access for downloads */
  readonly http: HttpClient
  /** Storage backend — public so Job can access for writes */
  readonly storage: Storage
  /** Logger instance — public so Job can access for reporting */
  readonly log: Logger

  /** Delay range in seconds — random between [min, max] before each request */
  protected requestInterval: [number, number] = [6, 12]

  private _initialized = false

  /** Epoch milliseconds at which the most recent request finished (0 = none yet). */
  private _lastRequestTime = 0

  constructor(opts: ExtractorOptions) {
    this.url = opts.url
    this.groups = opts.match ? [...opts.match].slice(1) : []
    this.config = opts.config
    this.http = opts.http
    this.storage = opts.storage
    this.log = opts.log
  }

  /** Initialization */

  /**
   * One-time async setup (cookies, session, internal state).
   * Safe to call multiple times — after the first successful call it becomes
   * a no-op. If `_init()` throws, nothing is marked as initialized and a
   * later call will try again.
   */
  async initialize(): Promise<void> {
    if (this._initialized)
      return
    await this._init()
    this._initialized = true
    // Replace with no-op so subclasses can call super.initialize() freely
    this.initialize = async () => {}
  }

  /**
   * Subclass hook for one-time setup.
   */
  protected async _init(): Promise<void> {
    // no-op by default
  }

  /** Async iteration */

  /** Initialize the extractor, then yield everything produced by `items()`. */
  async* [Symbol.asyncIterator](): MessageIter {
    await this.initialize()
    yield* this.items()
  }

  /**
   * The main extraction pipeline. Subclasses *must* implement this.
   */
  abstract items(): MessageIter

  /** Config helpers */

  /**
   * Read a config value using the interpolated hierarchy
   * ``['extractor', category, subcategory]``.
   *
   * @param key - Config key to look up.
   * @param defaultVal - Returned when no level defines `key`.
   */
  protected _cfg(key: string, defaultVal?: ConfigValue): ConfigValue | undefined {
    const path: readonly string[] = [
      'extractor',
      this.category,
      this.subcategory,
    ]
    return this.config.interpolate(path, key, defaultVal)
  }

  /** HTTP */

  /**
   * Rate-limited HTTP request wrapper.
   *
   * @param url - Request URL.
   * @param cfg - Extra request settings, spread after `url` (so a `url` key
   *   in `cfg` takes precedence).
   * @returns The response from the underlying HTTP client.
   */
  async request(
    url: string,
    cfg: RequestConfig = {},
  ): Promise<HttpResponse<unknown>> {
    await this._throttle()
    try {
      return await this.http.request({ url, ...cfg })
    }
    finally {
      // Record the time even when the request fails, so a retry after an
      // error is still spaced out by the throttle.
      this._lastRequestTime = Date.now()
    }
  }

  /**
   * Convenience: request + parse JSON body.
   *
   * A body that is already an object is returned as-is (this includes
   * `null`, whose `typeof` is also `'object'`). Anything else is passed to
   * `JSON.parse`; if parsing fails an empty object is returned.
   */
  async requestJSON(
    url: string,
    cfg: RequestConfig = {},
  ): Promise<unknown> {
    const resp = await this.request(url, cfg)
    if (typeof resp.data === 'object')
      return resp.data
    try {
      return JSON.parse(resp.data as string)
    }
    catch {
      // Deliberate best-effort: an unparseable body yields an empty object.
      return {}
    }
  }

  /** Rate limiting */

  /**
   * Sleep long enough to keep the minimum interval between requests.
   *
   * A target delay is drawn at random from `requestInterval` (seconds) and
   * the time already elapsed since the last request is subtracted from it.
   */
  private async _throttle(): Promise<void> {
    const elapsed = Date.now() - this._lastRequestTime
    const [min, max] = this.requestInterval
    const targetSeconds = min + Math.random() * (max - min)
    const waitMs = Math.max(0, targetSeconds * 1000 - elapsed)
    if (waitMs > 0) {
      await new Promise(resolve => setTimeout(resolve, waitMs))
    }
  }

  /** Utility */

  /**
   * Convert a Unix timestamp (seconds or ms) to an ISO-8601 string.
   *
   * @param ts - Timestamp; values above 2.5e9 are taken to be milliseconds,
   *   anything else seconds.
   * @returns The ISO-8601 string, or ``''`` when `ts` is null/undefined or
   *   does not describe a representable date (NaN, ±Infinity, out of range).
   */
  parseTimestamp(ts: number | null | undefined): string {
    if (ts == null || !Number.isFinite(ts))
      return ''
    // 2.5e9 seconds is roughly the year 2049; larger values are assumed to
    // already be in milliseconds.
    const asMs = ts > 2_500_000_000 ? ts : ts * 1000
    const date = new Date(asMs)
    // toISOString() throws RangeError on an invalid date, so guard first.
    return Number.isNaN(date.getTime()) ? '' : date.toISOString()
  }

  /**
   * Generate a random hex token (used for CSRF).
   *
   * @param size - Number of random bytes; the result has `2 * size` hex digits.
   */
  static generateToken(size = 16): string {
    const bytes = new Uint8Array(size)
    if (typeof crypto !== 'undefined' && crypto.getRandomValues) {
      crypto.getRandomValues(bytes)
    }
    else {
      // Fallback for Node without global crypto (not cryptographically secure)
      for (let i = 0; i < size; i++) {
        bytes[i] = Math.floor(Math.random() * 256)
      }
    }
    return Array.from(bytes, b => b.toString(16).padStart(2, '0')).join('')
  }
}
|