@chilfish/gallery-dl-instagram 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/config.ts ADDED
@@ -0,0 +1,80 @@
1
+ /**
2
+ * Simple nested config reader.
3
+ *
4
+ * Mirrors gallery-dl's ``config.interpolate``:
5
+ *
6
+ * cfgPath like ``['extractor', 'instagram', 'post']``
7
+ * looks up: extractor.instagram.post.{key}, extractor.instagram.{key},
8
+ * extractor.{key}
9
+ */
10
+
11
+ import type { Config, ConfigValue } from './types'
12
+
13
/**
 * Nested configuration store with dot-path access and hierarchical lookup.
 *
 * Only *own* properties of plain (non-array) objects are ever traversed, so
 * names inherited from ``Object.prototype`` (``constructor``, ``toString``,
 * ...) are never mistaken for configuration values.
 */
export class ConfigManager {
  private readonly data: Config

  /**
   * @param data Initial configuration tree. It is stored by reference, so
   *   later ``set`` calls mutate the object that was passed in.
   */
  constructor(data: Config = {}) {
    this.data = data
  }

  /** True for plain, non-null, non-array objects: the only traversable nodes. */
  private static isRecord(value: unknown): value is Record<string, unknown> {
    return value != null && typeof value === 'object' && !Array.isArray(value)
  }

  /**
   * Read the own property ``key`` of ``node``.
   * Returns ``undefined`` when ``node`` is not traversable or has no such own key.
   */
  private static lookup(node: unknown, key: string): unknown {
    if (!ConfigManager.isRecord(node))
      return undefined
    if (!Object.prototype.hasOwnProperty.call(node, key))
      return undefined
    return node[key]
  }

  /**
   * Read a value at a dot-path like ``'extractor.instagram.videos'``.
   *
   * @param path Dot-separated list of keys.
   * @param defaultValue Returned when the path does not exist or resolves to ``undefined``.
   * @returns The stored value (``null`` included), or ``defaultValue``.
   */
  get(path: string, defaultValue?: ConfigValue): ConfigValue | undefined {
    let node: unknown = this.data
    for (const key of path.split('.')) {
      node = ConfigManager.lookup(node, key)
      if (node === undefined)
        return defaultValue
    }
    return node as ConfigValue
  }

  /**
   * Resolve ``key`` through a hierarchy of increasingly specific sections.
   *
   * For ``cfgPath = ['extractor', 'instagram', 'post']`` the candidates are:
   *
   *   1. ``{key}`` at the very top level: wins outright when present
   *      (this follows gallery-dl's ``config.interpolate``)
   *   2. ``extractor.instagram.post.{key}``
   *   3. ``extractor.instagram.{key}``
   *   4. ``extractor.{key}``
   *
   * so that, apart from the top-level override, the most specific section
   * that defines the key wins.
   *
   * @param cfgPath Section names from least to most specific.
   * @param key Option name to look up in each section.
   * @param defaultVal Returned when no section defines ``key``.
   */
  interpolate(
    cfgPath: readonly string[],
    key: string,
    defaultVal?: ConfigValue,
  ): ConfigValue | undefined {
    const topLevel = ConfigManager.lookup(this.data, key)
    if (topLevel !== undefined)
      return topLevel as ConfigValue

    let resolved = defaultVal
    let node: unknown = this.data
    for (const section of cfgPath) {
      node = ConfigManager.lookup(node, section)
      if (node === undefined)
        break
      // Deeper sections overwrite what shallower ones provided.
      const candidate = ConfigManager.lookup(node, key)
      if (candidate !== undefined)
        resolved = candidate as ConfigValue
    }
    return resolved
  }

  /**
   * Mutate the config at a given dot-path, creating intermediate objects as
   * needed. An intermediate value that is not a plain object is replaced by
   * a fresh ``{}``.
   *
   * @param path Dot-separated list of keys.
   * @param value Value to store at the final key.
   * @throws Error when any path segment is ``__proto__``; writing through it
   *   would modify ``Object.prototype`` rather than the config.
   */
  set(path: string, value: unknown): void {
    const keys = path.split('.')
    if (keys.includes('__proto__'))
      throw new Error(`Unsafe config path: ${path}`)

    let node = this.data as Record<string, unknown>
    for (const key of keys.slice(0, -1)) {
      const existing = ConfigManager.lookup(node, key)
      if (ConfigManager.isRecord(existing)) {
        node = existing
      }
      else {
        const created: Record<string, unknown> = {}
        node[key] = created
        node = created
      }
    }
    // ``split`` always yields at least one element, so the last key exists.
    node[keys[keys.length - 1]!] = value
  }
}
@@ -0,0 +1,217 @@
1
+ /**
2
+ * Base Extractor abstract class.
3
+ *
4
+ * Every extractor extends this. The class provides:
5
+ * - URL pattern matching via ``fromURL``
6
+ * - One-time initialization guarded by ``initialize()``
7
+ * - Async-iteration entrypoint ``[Symbol.asyncIterator]`` → ``items()``
8
+ * - Rate-limited HTTP requests
9
+ * - Timestamp parsing
10
+ */
11
+
12
+ import type { ConfigManager } from '../config'
13
+ import type {
14
+ ConfigValue,
15
+ HttpClient,
16
+ HttpResponse,
17
+ MessageIter,
18
+ RequestConfig,
19
+ Storage,
20
+ } from '../types'
21
+
22
/**
 * Constructor arguments for an ``Extractor``.
 */
export interface ExtractorOptions {
  /** The input URL; kept on the extractor as ``url``. */
  url: string
  /**
   * Regex match for the URL. Its capture groups (index 1 and up) become the
   * extractor's ``groups``.
   */
  match: RegExpMatchArray

  /** Configuration source used for option lookups. */
  config: ConfigManager
  /** HTTP client used to perform requests. */
  http: HttpClient
  /** Storage backend. */
  storage: Storage
  /** The logger interface — at minimum a debug/info/warn/error contract */
  log: Logger
}
31
+
32
/** Signature shared by every log level: a message plus arbitrary extra arguments. */
type LogMethod = (message: string, ...args: unknown[]) => void

/**
 * Minimal logging contract: one method per severity level.
 */
export interface Logger {
  debug: LogMethod
  info: LogMethod
  warn: LogMethod
  error: LogMethod
}
38
+
39
/**
 * A logger whose four methods accept any arguments and do nothing.
 */
export const noopLogger: Logger = (() => {
  const discard = (): void => {}
  return {
    debug: discard,
    info: discard,
    warn: discard,
    error: discard,
  }
})()
46
+
47
/**
 * Base class for extractors.
 *
 * Provides one-time initialization, an async-iteration entrypoint that
 * delegates to ``items()``, hierarchical config lookup, throttled HTTP
 * requests and a couple of small utilities.
 */
export abstract class Extractor {
  /** Human-readable category (e.g. ``'instagram'``) */
  abstract readonly category: string

  /** Sub-category (e.g. ``'post'``, ``'posts'``, ``'reels'``) */
  abstract readonly subcategory: string

  /** Root URL (e.g. ``'https://www.instagram.com'``) */
  abstract readonly root: string

  /** Regex pattern to match against URLs; the base pattern only matches the empty string. */
  static readonly pattern: RegExp = /^$/

  /** The input URL */
  readonly url: string

  /** Capture groups of the URL match (the full match at index 0 is dropped) */
  readonly groups: readonly string[]

  readonly config: ConfigManager
  /** HTTP client — public so Job can access for downloads */
  readonly http: HttpClient
  /** Storage backend — public so Job can access for writes */
  readonly storage: Storage
  /** Logger instance — public so Job can access for reporting */
  readonly log: Logger

  /** Delay range in seconds — random between [min, max] before each request */
  protected requestInterval: [number, number] = [6, 12]

  /** Set once ``_init()`` has completed successfully. */
  private _initialized = false

  /** In-flight initialization shared by concurrent ``initialize()`` callers. */
  private _initPromise: Promise<void> | undefined = undefined

  /** ``Date.now()`` of the most recent request attempt; 0 before the first one. */
  private _lastRequestTime = 0

  constructor(opts: ExtractorOptions) {
    this.url = opts.url
    this.groups = opts.match ? [...opts.match].slice(1) : []
    this.config = opts.config
    this.http = opts.http
    this.storage = opts.storage
    this.log = opts.log
  }

  /** Initialization */

  /**
   * One-time async setup (cookies, session, internal state).
   *
   * Safe to call multiple times and from concurrent callers: ``_init()`` runs
   * at most once per successful initialization, and every caller awaits the
   * same attempt. If ``_init()`` rejects, the error is propagated to all
   * waiting callers and a later call starts a fresh attempt.
   */
  async initialize(): Promise<void> {
    if (this._initialized)
      return

    if (this._initPromise === undefined) {
      const attempt = (async () => {
        await this._init()
        this._initialized = true
      })()
      // Forget a failed attempt so that the next call can retry.
      attempt.catch(() => {
        if (this._initPromise === attempt)
          this._initPromise = undefined
      })
      this._initPromise = attempt
    }

    await this._initPromise
  }

  /**
   * Subclass hook for one-time setup.
   */
  protected async _init(): Promise<void> {
    // no-op by default
  }

  /** Async iteration */

  /**
   * Makes the extractor usable with ``for await``: initializes it, then
   * yields everything produced by ``items()``.
   */
  async* [Symbol.asyncIterator](): MessageIter {
    await this.initialize()
    yield* this.items()
  }

  /**
   * The main extraction pipeline. Subclasses *must* implement this.
   */
  abstract items(): MessageIter

  /** Config helpers */

  /**
   * Read a config value through the section hierarchy
   * ``extractor`` → ``category`` → ``subcategory``.
   *
   * @param key Option name.
   * @param defaultVal Passed through to ``config.interpolate`` as the fallback.
   */
  protected _cfg(key: string, defaultVal?: ConfigValue): ConfigValue | undefined {
    const sections: readonly string[] = ['extractor', this.category, this.subcategory]
    return this.config.interpolate(sections, key, defaultVal)
  }

  /** HTTP */

  /**
   * Rate-limited HTTP request wrapper.
   *
   * Waits for the throttle, then forwards ``{ url, ...cfg }`` to the HTTP
   * client. The attempt is timestamped whether it succeeds or fails, so a
   * retry after an error is throttled like any other request.
   *
   * @param url Request URL.
   * @param cfg Extra request options spread into the client call.
   */
  async request(
    url: string,
    cfg: RequestConfig = {},
  ): Promise<HttpResponse<unknown>> {
    await this._throttle()
    try {
      return await this.http.request({ url, ...cfg })
    }
    finally {
      this._lastRequestTime = Date.now()
    }
  }

  /**
   * Convenience: request + parse JSON body.
   *
   * A body that is already an object is returned as-is; anything else is
   * passed to ``JSON.parse``. A body that cannot be parsed yields ``{}``
   * (best effort) and the failure is reported at debug level.
   */
  async requestJSON(
    url: string,
    cfg: RequestConfig = {},
  ): Promise<unknown> {
    const { data } = await this.request(url, cfg)
    if (typeof data === 'object')
      return data
    try {
      return JSON.parse(data as string)
    }
    catch (err) {
      this.log.debug('requestJSON: response body is not valid JSON, returning {}', url, err)
      return {}
    }
  }

  /** Rate limiting */

  /**
   * Sleep long enough to keep a randomized minimum interval between requests.
   */
  private async _throttle(): Promise<void> {
    const [min, max] = this.requestInterval
    // Random target interval in *seconds*, converted to milliseconds below.
    const targetSeconds = min + Math.random() * (max - min)
    const sinceLastMs = Date.now() - this._lastRequestTime
    const waitMs = Math.max(0, targetSeconds * 1000 - sinceLastMs)
    if (waitMs > 0)
      await new Promise(resolve => setTimeout(resolve, waitMs))
  }

  /** Utility */

  /**
   * Convert a Unix timestamp (seconds or ms) to an ISO-8601 string.
   *
   * @param ts Timestamp. Values above 2 500 000 000 are taken to be
   *   milliseconds already; as seconds that threshold falls in the year 2049.
   * @returns The ISO string, or ``''`` when ``ts`` is null/undefined, not a
   *   finite number, or outside the range a ``Date`` can represent.
   */
  parseTimestamp(ts: number | null | undefined): string {
    if (ts == null || !Number.isFinite(ts))
      return ''
    const asMs = ts > 2_500_000_000 ? ts : ts * 1000
    const date = new Date(asMs)
    // ``toISOString`` throws a RangeError on an invalid date, so check first.
    return Number.isNaN(date.getTime()) ? '' : date.toISOString()
  }

  /**
   * Generate a random hex token (used for CSRF).
   *
   * @param size Number of random bytes; the result has ``2 * size`` hex digits.
   */
  static generateToken(size = 16): string {
    const bytes = new Uint8Array(size)
    if (typeof crypto !== 'undefined' && crypto.getRandomValues) {
      crypto.getRandomValues(bytes)
    }
    else {
      // Fallback for Node without global crypto. Math.random is not
      // cryptographically secure.
      for (let i = 0; i < size; i++)
        bytes[i] = Math.floor(Math.random() * 256)
    }
    return Array.from(bytes, b => b.toString(16).padStart(2, '0')).join('')
  }
}