retold-facto 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. package/.claude/launch.json +11 -0
  2. package/.dockerignore +8 -0
  3. package/.quackage.json +19 -0
  4. package/Dockerfile +26 -0
  5. package/bin/retold-facto.js +909 -0
  6. package/examples/facto-government-data.sqlite +0 -0
  7. package/examples/government-data-catalog.json +137 -0
  8. package/examples/government-data-loader.js +1432 -0
  9. package/package.json +91 -0
  10. package/scripts/facto-download.js +425 -0
  11. package/source/Retold-Facto.js +1042 -0
  12. package/source/services/Retold-Facto-BeaconProvider.js +511 -0
  13. package/source/services/Retold-Facto-CatalogManager.js +1252 -0
  14. package/source/services/Retold-Facto-DataLakeService.js +1642 -0
  15. package/source/services/Retold-Facto-DatasetManager.js +417 -0
  16. package/source/services/Retold-Facto-IngestEngine.js +1315 -0
  17. package/source/services/Retold-Facto-ProjectionEngine.js +3960 -0
  18. package/source/services/Retold-Facto-RecordManager.js +360 -0
  19. package/source/services/Retold-Facto-SchemaManager.js +1110 -0
  20. package/source/services/Retold-Facto-SourceFolderScanner.js +2243 -0
  21. package/source/services/Retold-Facto-SourceManager.js +730 -0
  22. package/source/services/Retold-Facto-StoreConnectionManager.js +441 -0
  23. package/source/services/Retold-Facto-ThroughputMonitor.js +478 -0
  24. package/source/services/web-app/codemirror-entry.js +7 -0
  25. package/source/services/web-app/pict-app/Pict-Application-Facto-Configuration.json +9 -0
  26. package/source/services/web-app/pict-app/Pict-Application-Facto.js +70 -0
  27. package/source/services/web-app/pict-app/Pict-Facto-Bundle.js +11 -0
  28. package/source/services/web-app/pict-app/providers/Pict-Provider-Facto-UI.js +66 -0
  29. package/source/services/web-app/pict-app/providers/Pict-Provider-Facto.js +69 -0
  30. package/source/services/web-app/pict-app/providers/facto-api/Facto-API-Catalog.js +93 -0
  31. package/source/services/web-app/pict-app/providers/facto-api/Facto-API-Connections.js +42 -0
  32. package/source/services/web-app/pict-app/providers/facto-api/Facto-API-Datasets.js +605 -0
  33. package/source/services/web-app/pict-app/providers/facto-api/Facto-API-Projections.js +188 -0
  34. package/source/services/web-app/pict-app/providers/facto-api/Facto-API-Scanner.js +80 -0
  35. package/source/services/web-app/pict-app/providers/facto-api/Facto-API-Schema.js +116 -0
  36. package/source/services/web-app/pict-app/providers/facto-api/Facto-API-Sources.js +104 -0
  37. package/source/services/web-app/pict-app/views/PictView-Facto-Catalog.js +526 -0
  38. package/source/services/web-app/pict-app/views/PictView-Facto-Datasets.js +173 -0
  39. package/source/services/web-app/pict-app/views/PictView-Facto-Ingest.js +259 -0
  40. package/source/services/web-app/pict-app/views/PictView-Facto-Layout.js +191 -0
  41. package/source/services/web-app/pict-app/views/PictView-Facto-Projections.js +231 -0
  42. package/source/services/web-app/pict-app/views/PictView-Facto-Records.js +326 -0
  43. package/source/services/web-app/pict-app/views/PictView-Facto-Scanner.js +624 -0
  44. package/source/services/web-app/pict-app/views/PictView-Facto-Sources.js +201 -0
  45. package/source/services/web-app/pict-app/views/PictView-Facto-Throughput.js +456 -0
  46. package/source/services/web-app/pict-app-full/Pict-Application-Facto-Full-Configuration.json +14 -0
  47. package/source/services/web-app/pict-app-full/Pict-Application-Facto-Full.js +391 -0
  48. package/source/services/web-app/pict-app-full/providers/PictRouter-Facto-Configuration.json +56 -0
  49. package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-BottomBar.js +68 -0
  50. package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-Connections.js +340 -0
  51. package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-Dashboard.js +149 -0
  52. package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-Dashboards.js +819 -0
  53. package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-Datasets.js +178 -0
  54. package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-IngestJobs.js +99 -0
  55. package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-Layout.js +62 -0
  56. package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-MappingEditor.js +158 -0
  57. package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-ProjectionDetail.js +1120 -0
  58. package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-Projections.js +172 -0
  59. package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-QueryPanel.js +119 -0
  60. package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-RecordViewer.js +663 -0
  61. package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-Records.js +648 -0
  62. package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-Scanner.js +1017 -0
  63. package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-SchemaDetail.js +1404 -0
  64. package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-SchemaDocEditor.js +1036 -0
  65. package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-SchemaEditor.js +636 -0
  66. package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-SchemaResearch.js +357 -0
  67. package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-SourceDetail.js +822 -0
  68. package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-SourceEditor.js +1036 -0
  69. package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-SourceResearch.js +487 -0
  70. package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-Sources.js +165 -0
  71. package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-Throughput.js +439 -0
  72. package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-TopBar.js +335 -0
  73. package/source/services/web-app/pict-app-full/views/projections/Facto-Projections-Constants.js +71 -0
  74. package/source/services/web-app/web/chart.min.js +20 -0
  75. package/source/services/web-app/web/codemirror-bundle.js +30099 -0
  76. package/source/services/web-app/web/css/facto-themes.css +467 -0
  77. package/source/services/web-app/web/css/facto.css +502 -0
  78. package/source/services/web-app/web/index.html +28 -0
  79. package/source/services/web-app/web/retold-facto.js +12138 -0
  80. package/source/services/web-app/web/retold-facto.js.map +1 -0
  81. package/source/services/web-app/web/retold-facto.min.js +2 -0
  82. package/source/services/web-app/web/retold-facto.min.js.map +1 -0
  83. package/source/services/web-app/web/simple/index.html +17 -0
  84. package/test/Facto_Browser_Integration_tests.js +798 -0
  85. package/test/RetoldFacto_tests.js +4117 -0
  86. package/test/fixtures/weather-readings.csv +17 -0
  87. package/test/fixtures/weather-stations.csv +9 -0
  88. package/test/model/MeadowModel-Extended.json +8497 -0
  89. package/test/model/MeadowModel-PICT.json +1 -0
  90. package/test/model/MeadowModel.json +1355 -0
  91. package/test/model/ddl/Facto.ddl +225 -0
  92. package/test/model/fable-configuration.json +14 -0
@@ -0,0 +1,1642 @@
1
+ /**
2
+ * Retold Facto - Data Lake Service
3
+ *
4
+ * Manages the offline data lake: downloading public datasets into a
5
+ * well-organized folder structure for stable, repeatable ingestion.
6
+ * Supports http_file, http_archive, git_clone, and rest_api (via fetch_steps) methods.
7
+ *
8
+ * @author Steven Velozo <steven@velozo.com>
9
+ */
10
+ const libFableServiceProviderBase = require('fable-serviceproviderbase');
11
+ const libFs = require('fs');
12
+ const libPath = require('path');
13
+ const libHttps = require('https');
14
+ const libHttp = require('http');
15
+ const libUrl = require('url');
16
+ const libCrypto = require('crypto');
17
+ const libChildProcess = require('child_process');
18
+ const libZlib = require('zlib');
19
+
20
// Option defaults; the service constructor layers caller-supplied options on top of these.
const defaultDataLakeOptions = {
	CatalogPath: null,
	DataDir: null,
	UserAgent: 'RetoldFacto/1.0 (data-coagulation-platform; https://github.com/stevenvelozo/retold-facto)'
};

// Catalog category key -> folder name used under the data directory.
const CATEGORY_FOLDER_MAP = {
	'01_foundational_reference': '01-foundational-reference',
	'02_geographic_location': '02-geographic-location',
	'03_people_cultural_entities': '03-people-cultural-entities',
	'04_business_industry': '04-business-industry',
	'05_media_entertainment': '05-media-entertainment'
};

// Catalog `method` values this service knows how to fetch automatically.
const DOWNLOADABLE_METHODS = [
	'http_file',
	'http_archive',
	'git_clone',
	'rest_api'
];
39
+
40
+ class RetoldFactoDataLakeService extends libFableServiceProviderBase
41
+ {
42
+ constructor(pFable, pOptions, pServiceHash)
43
+ {
44
+ let tmpOptions = Object.assign({}, defaultDataLakeOptions, pOptions);
45
+ super(pFable, tmpOptions, pServiceHash);
46
+
47
+ this.serviceType = 'RetoldFactoDataLakeService';
48
+ }
49
+
50
+ // ================================================================
51
+ // Catalog Loading
52
+ // ================================================================
53
+
54
+ loadCatalog()
55
+ {
56
+ let tmpCatalogPath = this.options.CatalogPath;
57
+ if (!tmpCatalogPath)
58
+ {
59
+ this.log.error('No CatalogPath configured for DataLakeService.');
60
+ return null;
61
+ }
62
+
63
+ if (!libFs.existsSync(tmpCatalogPath))
64
+ {
65
+ this.log.error(`Catalog not found at: ${tmpCatalogPath}`);
66
+ return null;
67
+ }
68
+
69
+ let tmpRaw = libFs.readFileSync(tmpCatalogPath, 'utf8');
70
+ return JSON.parse(tmpRaw);
71
+ }
72
+
73
+ /**
74
+ * Flatten the catalog into a list of { categoryKey, categoryFolder, dataset } objects,
75
+ * applying optional filters.
76
+ *
77
+ * @param {object} pCatalog - The parsed catalog JSON
78
+ * @param {object} pFilters - Optional { tier, category, id }
79
+ * @returns {Array}
80
+ */
81
+ flattenCatalog(pCatalog, pFilters)
82
+ {
83
+ let tmpFilters = pFilters || {};
84
+ let tmpEntries = [];
85
+
86
+ let tmpCategoryKeys = Object.keys(pCatalog.categories);
87
+ for (let c = 0; c < tmpCategoryKeys.length; c++)
88
+ {
89
+ let tmpCategoryKey = tmpCategoryKeys[c];
90
+
91
+ if (tmpFilters.category && tmpCategoryKey !== tmpFilters.category)
92
+ {
93
+ continue;
94
+ }
95
+
96
+ let tmpCategoryFolder = CATEGORY_FOLDER_MAP[tmpCategoryKey] || tmpCategoryKey.replace(/_/g, '-');
97
+ let tmpDatasets = pCatalog.categories[tmpCategoryKey].datasets;
98
+
99
+ for (let d = 0; d < tmpDatasets.length; d++)
100
+ {
101
+ let tmpDataset = tmpDatasets[d];
102
+
103
+ if (tmpFilters.id && tmpDataset.id !== tmpFilters.id)
104
+ {
105
+ continue;
106
+ }
107
+
108
+ if (tmpFilters.tier !== null && tmpFilters.tier !== undefined && tmpDataset.tier > tmpFilters.tier)
109
+ {
110
+ continue;
111
+ }
112
+
113
+ tmpEntries.push(
114
+ {
115
+ categoryKey: tmpCategoryKey,
116
+ categoryFolder: tmpCategoryFolder,
117
+ dataset: tmpDataset
118
+ });
119
+ }
120
+ }
121
+
122
+ return tmpEntries;
123
+ }
124
+
125
+ // ================================================================
126
+ // Manifest Management
127
+ // ================================================================
128
+
129
+ getDatasetDir(pCategoryFolder, pDatasetId)
130
+ {
131
+ return libPath.join(this.options.DataDir, pCategoryFolder, pDatasetId);
132
+ }
133
+
134
+ getManifestPath(pDatasetDir)
135
+ {
136
+ return libPath.join(pDatasetDir, '_manifest.json');
137
+ }
138
+
139
+ readManifest(pDatasetDir)
140
+ {
141
+ let tmpManifestPath = this.getManifestPath(pDatasetDir);
142
+ if (libFs.existsSync(tmpManifestPath))
143
+ {
144
+ try
145
+ {
146
+ return JSON.parse(libFs.readFileSync(tmpManifestPath, 'utf8'));
147
+ }
148
+ catch (pError)
149
+ {
150
+ return null;
151
+ }
152
+ }
153
+ return null;
154
+ }
155
+
156
/**
 * Serialize a manifest as tab-indented JSON and write it into the dataset directory.
 *
 * @param {string} pDatasetDir - Dataset directory that receives `_manifest.json`
 * @param {object} pManifest - Manifest content to persist
 */
writeManifest(pDatasetDir, pManifest)
{
	const tmpTarget = this.getManifestPath(pDatasetDir);
	const tmpSerialized = JSON.stringify(pManifest, null, '\t');
	libFs.writeFileSync(tmpTarget, tmpSerialized, 'utf8');
}
161
+
162
+ /**
163
+ * Scan a dataset directory and build a file inventory (excluding _manifest.json).
164
+ */
165
+ inventoryFiles(pDatasetDir)
166
+ {
167
+ let tmpFiles = [];
168
+
169
+ if (!libFs.existsSync(pDatasetDir))
170
+ {
171
+ return tmpFiles;
172
+ }
173
+
174
+ let tmpEntries = libFs.readdirSync(pDatasetDir);
175
+ for (let i = 0; i < tmpEntries.length; i++)
176
+ {
177
+ if (tmpEntries[i] === '_manifest.json')
178
+ {
179
+ continue;
180
+ }
181
+
182
+ let tmpFullPath = libPath.join(pDatasetDir, tmpEntries[i]);
183
+ let tmpStat = libFs.statSync(tmpFullPath);
184
+
185
+ if (tmpStat.isFile())
186
+ {
187
+ tmpFiles.push(
188
+ {
189
+ name: tmpEntries[i],
190
+ size: tmpStat.size
191
+ });
192
+ }
193
+ else if (tmpStat.isDirectory())
194
+ {
195
+ let tmpDirSize = this.getDirectorySize(tmpFullPath);
196
+ tmpFiles.push(
197
+ {
198
+ name: tmpEntries[i] + '/',
199
+ size: tmpDirSize
200
+ });
201
+ }
202
+ }
203
+
204
+ return tmpFiles;
205
+ }
206
+
207
+ getDirectorySize(pDirPath)
208
+ {
209
+ let tmpTotal = 0;
210
+
211
+ try
212
+ {
213
+ let tmpEntries = libFs.readdirSync(pDirPath);
214
+ for (let i = 0; i < tmpEntries.length; i++)
215
+ {
216
+ let tmpFullPath = libPath.join(pDirPath, tmpEntries[i]);
217
+ let tmpStat = libFs.statSync(tmpFullPath);
218
+
219
+ if (tmpStat.isFile())
220
+ {
221
+ tmpTotal += tmpStat.size;
222
+ }
223
+ else if (tmpStat.isDirectory())
224
+ {
225
+ tmpTotal += this.getDirectorySize(tmpFullPath);
226
+ }
227
+ }
228
+ }
229
+ catch (pError)
230
+ {
231
+ // Skip unreadable dirs
232
+ }
233
+
234
+ return tmpTotal;
235
+ }
236
+
237
+ // ================================================================
238
+ // Dataset Status
239
+ // ================================================================
240
+
241
+ getDatasetStatus(pEntry)
242
+ {
243
+ let tmpDatasetDir = this.getDatasetDir(pEntry.categoryFolder, pEntry.dataset.id);
244
+ let tmpManifest = this.readManifest(tmpDatasetDir);
245
+
246
+ if (!tmpManifest)
247
+ {
248
+ return 'missing';
249
+ }
250
+ if (tmpManifest.status === 'error')
251
+ {
252
+ return 'error';
253
+ }
254
+ if (tmpManifest.status === 'complete')
255
+ {
256
+ return 'cached';
257
+ }
258
+ return 'partial';
259
+ }
260
+
261
+ /**
262
+ * Check if a dataset entry is downloadable by this service.
263
+ * rest_api entries are only downloadable if they have fetch_steps defined.
264
+ */
265
+ isDownloadable(pDataset)
266
+ {
267
+ if (pDataset.method === 'rest_api')
268
+ {
269
+ return Array.isArray(pDataset.fetch_steps) && pDataset.fetch_steps.length > 0;
270
+ }
271
+ return DOWNLOADABLE_METHODS.indexOf(pDataset.method) > -1;
272
+ }
273
+
274
+ // ================================================================
275
+ // Size Formatting
276
+ // ================================================================
277
+
278
+ parseSize(pSizeStr)
279
+ {
280
+ if (!pSizeStr || pSizeStr === 'N/A' || pSizeStr === 'varies' || pSizeStr === 'small' || pSizeStr === 'large')
281
+ {
282
+ return 0;
283
+ }
284
+
285
+ // Handle compound sizes like "companyfacts ~10GB, submissions ~8GB"
286
+ if (pSizeStr.indexOf(',') > -1)
287
+ {
288
+ let tmpParts = pSizeStr.split(',');
289
+ let tmpTotal = 0;
290
+ for (let i = 0; i < tmpParts.length; i++)
291
+ {
292
+ tmpTotal += this.parseSize(tmpParts[i].trim());
293
+ }
294
+ return tmpTotal;
295
+ }
296
+
297
+ // Handle "cities15000: 2MB, allCountries: 1.5GB" style
298
+ if (pSizeStr.indexOf(':') > -1)
299
+ {
300
+ let tmpColonParts = pSizeStr.split(':');
301
+ if (tmpColonParts.length >= 2)
302
+ {
303
+ return this.parseSize(tmpColonParts[tmpColonParts.length - 1].trim());
304
+ }
305
+ }
306
+
307
+ // Strip leading text like "~" or "companyfacts "
308
+ let tmpMatch = pSizeStr.match(/([\d.]+)\s*(KB|MB|GB|TB)/i);
309
+ if (!tmpMatch)
310
+ {
311
+ return 0;
312
+ }
313
+
314
+ let tmpValue = parseFloat(tmpMatch[1]);
315
+ let tmpUnit = tmpMatch[2].toUpperCase();
316
+ let tmpMultipliers = { 'KB': 1024, 'MB': 1024 * 1024, 'GB': 1024 * 1024 * 1024, 'TB': 1024 * 1024 * 1024 * 1024 };
317
+ return Math.round(tmpValue * (tmpMultipliers[tmpUnit] || 1));
318
+ }
319
+
320
+ formatSize(pBytes)
321
+ {
322
+ if (pBytes === 0)
323
+ {
324
+ return '???';
325
+ }
326
+ if (pBytes < 1024)
327
+ {
328
+ return pBytes + ' B';
329
+ }
330
+ if (pBytes < 1024 * 1024)
331
+ {
332
+ return (pBytes / 1024).toFixed(1) + ' KB';
333
+ }
334
+ if (pBytes < 1024 * 1024 * 1024)
335
+ {
336
+ return (pBytes / (1024 * 1024)).toFixed(1) + ' MB';
337
+ }
338
+ return (pBytes / (1024 * 1024 * 1024)).toFixed(2) + ' GB';
339
+ }
340
+
341
+ // ================================================================
342
+ // HTTP Download
343
+ // ================================================================
344
+
345
+ /**
346
+ * Download a single URL to a local file path.
347
+ * Returns a Promise that resolves with { path, size }.
348
+ * Follows redirects up to 5 times.
349
+ */
350
+ downloadFile(pUrl, pDestPath, pRedirects, pHeaders)
351
+ {
352
+ let tmpRedirects = pRedirects || 0;
353
+ if (tmpRedirects > 5)
354
+ {
355
+ return Promise.reject(new Error(`Too many redirects for ${pUrl}`));
356
+ }
357
+
358
+ let tmpSelf = this;
359
+ return new Promise(
360
+ (fResolve, fReject) =>
361
+ {
362
+ let tmpParsed = new URL(pUrl);
363
+ let tmpLib = tmpParsed.protocol === 'https:' ? libHttps : libHttp;
364
+
365
+ let tmpRequestOptions = (
366
+ {
367
+ headers: Object.assign(
368
+ {
369
+ 'User-Agent': tmpSelf.options.UserAgent
370
+ }, pHeaders || {})
371
+ });
372
+
373
+ let tmpRequest = tmpLib.get(pUrl, tmpRequestOptions,
374
+ (pResponse) =>
375
+ {
376
+ // Follow redirects
377
+ if (pResponse.statusCode >= 300 && pResponse.statusCode < 400 && pResponse.headers.location)
378
+ {
379
+ let tmpRedirectUrl = pResponse.headers.location;
380
+ if (!tmpRedirectUrl.startsWith('http'))
381
+ {
382
+ tmpRedirectUrl = tmpParsed.protocol + '//' + tmpParsed.host + tmpRedirectUrl;
383
+ }
384
+ pResponse.resume();
385
+ return tmpSelf.downloadFile(tmpRedirectUrl, pDestPath, tmpRedirects + 1, pHeaders)
386
+ .then(fResolve)
387
+ .catch(fReject);
388
+ }
389
+
390
+ if (pResponse.statusCode !== 200)
391
+ {
392
+ pResponse.resume();
393
+ return fReject(new Error(`HTTP ${pResponse.statusCode} for ${pUrl}`));
394
+ }
395
+
396
+ // Ensure parent directory exists
397
+ let tmpDir = libPath.dirname(pDestPath);
398
+ if (!libFs.existsSync(tmpDir))
399
+ {
400
+ libFs.mkdirSync(tmpDir, { recursive: true });
401
+ }
402
+
403
+ let tmpFile = libFs.createWriteStream(pDestPath);
404
+ let tmpDownloaded = 0;
405
+ let tmpContentLength = parseInt(pResponse.headers['content-length'] || '0', 10);
406
+ let tmpLastProgress = 0;
407
+
408
+ pResponse.on('data',
409
+ (pChunk) =>
410
+ {
411
+ tmpDownloaded += pChunk.length;
412
+
413
+ if (tmpContentLength > 0)
414
+ {
415
+ let tmpProgress = Math.floor((tmpDownloaded / tmpContentLength) * 100);
416
+ if (tmpProgress >= tmpLastProgress + 10)
417
+ {
418
+ tmpLastProgress = tmpProgress;
419
+ process.stdout.write(` ... ${tmpProgress}% (${tmpSelf.formatSize(tmpDownloaded)})\r`);
420
+ }
421
+ }
422
+ });
423
+
424
+ pResponse.pipe(tmpFile);
425
+
426
+ tmpFile.on('finish',
427
+ () =>
428
+ {
429
+ tmpFile.close();
430
+ if (tmpContentLength > 0)
431
+ {
432
+ process.stdout.write(' \r');
433
+ }
434
+ fResolve({ path: pDestPath, size: tmpDownloaded });
435
+ });
436
+
437
+ tmpFile.on('error',
438
+ (pError) =>
439
+ {
440
+ libFs.unlink(pDestPath, () => {});
441
+ fReject(pError);
442
+ });
443
+ });
444
+
445
+ tmpRequest.on('error', fReject);
446
+
447
+ tmpRequest.setTimeout(300000,
448
+ () =>
449
+ {
450
+ tmpRequest.destroy();
451
+ fReject(new Error(`Timeout downloading ${pUrl}`));
452
+ });
453
+ });
454
+ }
455
+
456
+ /**
457
+ * Fetch a URL and return the parsed JSON body.
458
+ * Returns a Promise that resolves with the parsed object.
459
+ */
460
+ fetchJson(pUrl, pHeaders)
461
+ {
462
+ let tmpSelf = this;
463
+ return new Promise(
464
+ (fResolve, fReject) =>
465
+ {
466
+ let tmpParsed = new URL(pUrl);
467
+ let tmpLib = tmpParsed.protocol === 'https:' ? libHttps : libHttp;
468
+
469
+ let tmpRequestHeaders = Object.assign(
470
+ {
471
+ 'User-Agent': tmpSelf.options.UserAgent,
472
+ 'Accept': 'application/json'
473
+ }, pHeaders || {});
474
+
475
+ let tmpRequestOptions = (
476
+ {
477
+ headers: tmpRequestHeaders
478
+ });
479
+
480
+ let tmpRequest = tmpLib.get(pUrl, tmpRequestOptions,
481
+ (pResponse) =>
482
+ {
483
+ // Follow redirects
484
+ if (pResponse.statusCode >= 300 && pResponse.statusCode < 400 && pResponse.headers.location)
485
+ {
486
+ let tmpRedirectUrl = pResponse.headers.location;
487
+ if (!tmpRedirectUrl.startsWith('http'))
488
+ {
489
+ tmpRedirectUrl = tmpParsed.protocol + '//' + tmpParsed.host + tmpRedirectUrl;
490
+ }
491
+ pResponse.resume();
492
+ return tmpSelf.fetchJson(tmpRedirectUrl, pHeaders)
493
+ .then(fResolve)
494
+ .catch(fReject);
495
+ }
496
+
497
+ if (pResponse.statusCode !== 200)
498
+ {
499
+ pResponse.resume();
500
+ return fReject(new Error(`HTTP ${pResponse.statusCode} for ${pUrl}`));
501
+ }
502
+
503
+ let tmpChunks = [];
504
+ pResponse.on('data', (pChunk) => { tmpChunks.push(pChunk); });
505
+ pResponse.on('end',
506
+ () =>
507
+ {
508
+ try
509
+ {
510
+ let tmpBody = Buffer.concat(tmpChunks).toString('utf8');
511
+ fResolve(JSON.parse(tmpBody));
512
+ }
513
+ catch (pError)
514
+ {
515
+ fReject(new Error(`Failed to parse JSON from ${pUrl}: ${pError.message}`));
516
+ }
517
+ });
518
+ pResponse.on('error', fReject);
519
+ });
520
+
521
+ tmpRequest.on('error', fReject);
522
+
523
+ tmpRequest.setTimeout(60000,
524
+ () =>
525
+ {
526
+ tmpRequest.destroy();
527
+ fReject(new Error(`Timeout fetching ${pUrl}`));
528
+ });
529
+ });
530
+ }
531
+
532
+ /**
533
+ * Fetch a URL and return the raw text body.
534
+ * Returns a Promise that resolves with a string.
535
+ */
536
+ fetchText(pUrl, pHeaders)
537
+ {
538
+ let tmpSelf = this;
539
+ return new Promise(
540
+ (fResolve, fReject) =>
541
+ {
542
+ let tmpParsed = new URL(pUrl);
543
+ let tmpLib = tmpParsed.protocol === 'https:' ? libHttps : libHttp;
544
+
545
+ let tmpRequestHeaders = Object.assign(
546
+ {
547
+ 'User-Agent': tmpSelf.options.UserAgent
548
+ }, pHeaders || {});
549
+
550
+ let tmpRequestOptions = (
551
+ {
552
+ headers: tmpRequestHeaders
553
+ });
554
+
555
+ let tmpRequest = tmpLib.get(pUrl, tmpRequestOptions,
556
+ (pResponse) =>
557
+ {
558
+ if (pResponse.statusCode >= 300 && pResponse.statusCode < 400 && pResponse.headers.location)
559
+ {
560
+ let tmpRedirectUrl = pResponse.headers.location;
561
+ if (!tmpRedirectUrl.startsWith('http'))
562
+ {
563
+ tmpRedirectUrl = tmpParsed.protocol + '//' + tmpParsed.host + tmpRedirectUrl;
564
+ }
565
+ pResponse.resume();
566
+ return tmpSelf.fetchText(tmpRedirectUrl, pHeaders)
567
+ .then(fResolve)
568
+ .catch(fReject);
569
+ }
570
+
571
+ if (pResponse.statusCode !== 200)
572
+ {
573
+ pResponse.resume();
574
+ return fReject(new Error(`HTTP ${pResponse.statusCode} for ${pUrl}`));
575
+ }
576
+
577
+ let tmpChunks = [];
578
+ pResponse.on('data', (pChunk) => { tmpChunks.push(pChunk); });
579
+ pResponse.on('end',
580
+ () =>
581
+ {
582
+ fResolve(Buffer.concat(tmpChunks).toString('utf8'));
583
+ });
584
+ pResponse.on('error', fReject);
585
+ });
586
+
587
+ tmpRequest.on('error', fReject);
588
+
589
+ tmpRequest.setTimeout(60000,
590
+ () =>
591
+ {
592
+ tmpRequest.destroy();
593
+ fReject(new Error(`Timeout fetching ${pUrl}`));
594
+ });
595
+ });
596
+ }
597
+
598
+ // ================================================================
599
+ // Archive Extraction
600
+ // ================================================================
601
+
602
+ extractArchive(pArchivePath, pDestDir)
603
+ {
604
+ return new Promise(
605
+ (fResolve, fReject) =>
606
+ {
607
+ let tmpFilename = libPath.basename(pArchivePath).toLowerCase();
608
+ let tmpCommand = null;
609
+
610
+ if (tmpFilename.endsWith('.zip'))
611
+ {
612
+ tmpCommand = `unzip -o -q "${pArchivePath}" -d "${pDestDir}"`;
613
+ }
614
+ else if (tmpFilename.endsWith('.tar.gz') || tmpFilename.endsWith('.tgz'))
615
+ {
616
+ tmpCommand = `tar -xzf "${pArchivePath}" -C "${pDestDir}"`;
617
+ }
618
+ else if (tmpFilename.endsWith('.tar.bz2') || tmpFilename.endsWith('.tbz2'))
619
+ {
620
+ tmpCommand = `tar -xjf "${pArchivePath}" -C "${pDestDir}"`;
621
+ }
622
+ else if (tmpFilename.endsWith('.tar.xz'))
623
+ {
624
+ tmpCommand = `tar -xJf "${pArchivePath}" -C "${pDestDir}"`;
625
+ }
626
+ else if (tmpFilename.endsWith('.tar.zst') || tmpFilename.endsWith('.tar.zstd'))
627
+ {
628
+ tmpCommand = `zstd -d "${pArchivePath}" --stdout | tar -xf - -C "${pDestDir}"`;
629
+ }
630
+ else if (tmpFilename.endsWith('.gz') && !tmpFilename.endsWith('.tar.gz'))
631
+ {
632
+ tmpCommand = `gunzip -k "${pArchivePath}"`;
633
+ }
634
+ else
635
+ {
636
+ return fResolve();
637
+ }
638
+
639
+ this.log.info(` Extracting: ${tmpFilename}`);
640
+ libChildProcess.exec(tmpCommand, { maxBuffer: 50 * 1024 * 1024 },
641
+ (pError, pStdout, pStderr) =>
642
+ {
643
+ if (pError)
644
+ {
645
+ this.log.warn(` Extract warning: ${pError.message}`);
646
+ }
647
+ fResolve();
648
+ });
649
+ });
650
+ }
651
+
652
+ // ================================================================
653
+ // Git Clone
654
+ // ================================================================
655
+
656
+ gitClone(pUrl, pDestDir)
657
+ {
658
+ let tmpSelf = this;
659
+ return new Promise(
660
+ (fResolve, fReject) =>
661
+ {
662
+ let tmpRepoName = libPath.basename(pUrl, '.git').replace(/\.git$/, '');
663
+ let tmpCloneTarget = libPath.join(pDestDir, tmpRepoName);
664
+
665
+ if (libFs.existsSync(tmpCloneTarget))
666
+ {
667
+ tmpSelf.log.info(` Updating existing clone: ${tmpRepoName}`);
668
+ libChildProcess.exec(`git -C "${tmpCloneTarget}" pull --ff-only`, { timeout: 120000 },
669
+ (pError) =>
670
+ {
671
+ if (pError)
672
+ {
673
+ tmpSelf.log.warn(` Git pull warning: ${pError.message}`);
674
+ }
675
+ fResolve({ path: tmpCloneTarget });
676
+ });
677
+ }
678
+ else
679
+ {
680
+ tmpSelf.log.info(` Cloning: ${pUrl}`);
681
+ libChildProcess.exec(`git clone --depth 1 "${pUrl}" "${tmpCloneTarget}"`, { timeout: 300000 },
682
+ (pError) =>
683
+ {
684
+ if (pError)
685
+ {
686
+ return fReject(new Error(`Git clone failed for ${pUrl}: ${pError.message}`));
687
+ }
688
+ fResolve({ path: tmpCloneTarget });
689
+ });
690
+ }
691
+ });
692
+ }
693
+
694
+ // ================================================================
695
+ // URL / Filename Helpers
696
+ // ================================================================
697
+
698
+ filenameFromUrl(pUrl)
699
+ {
700
+ try
701
+ {
702
+ let tmpParsed = new URL(pUrl);
703
+ let tmpPathname = tmpParsed.pathname;
704
+ let tmpFilename = libPath.basename(tmpPathname);
705
+
706
+ if (!tmpFilename || tmpFilename === '/')
707
+ {
708
+ tmpFilename = 'download_' + libCrypto.createHash('md5').update(pUrl).digest('hex').substring(0, 8);
709
+ }
710
+
711
+ if (!tmpFilename.match(/\.\w{2,5}$/))
712
+ {
713
+ if (pUrl.indexOf('.csv') > -1)
714
+ {
715
+ tmpFilename += '.csv';
716
+ }
717
+ else if (pUrl.indexOf('.json') > -1)
718
+ {
719
+ tmpFilename += '.json';
720
+ }
721
+ else if (pUrl.indexOf('.zip') > -1)
722
+ {
723
+ tmpFilename += '.zip';
724
+ }
725
+ }
726
+
727
+ return tmpFilename;
728
+ }
729
+ catch (pError)
730
+ {
731
+ return 'download_' + libCrypto.createHash('md5').update(pUrl).digest('hex').substring(0, 8);
732
+ }
733
+ }
734
+
735
+ isDirectDownloadUrl(pUrl)
736
+ {
737
+ let tmpLower = pUrl.toLowerCase();
738
+
739
+ if (tmpLower.match(/\.(zip|gz|tgz|tar|csv|tsv|json|xml|txt|xls|xlsx|dat|bz2|xz|zst|zstd|sqlite|sql)$/))
740
+ {
741
+ return true;
742
+ }
743
+
744
+ if (tmpLower.indexOf('/download') > -1 || tmpLower.indexOf('/data/') > -1)
745
+ {
746
+ return true;
747
+ }
748
+
749
+ if (tmpLower.indexOf('raw.githubusercontent.com') > -1 || tmpLower.indexOf('/releases/download/') > -1)
750
+ {
751
+ return true;
752
+ }
753
+
754
+ if (tmpLower.indexOf('download.geonames.org') > -1 || tmpLower.indexOf('datasets.imdbws.com') > -1)
755
+ {
756
+ return true;
757
+ }
758
+
759
+ if (tmpLower.indexOf('datahub.io') > -1 && tmpLower.indexOf('/r/') > -1)
760
+ {
761
+ return true;
762
+ }
763
+
764
+ if (tmpLower.indexOf('standards-oui.ieee.org') > -1)
765
+ {
766
+ return true;
767
+ }
768
+ if (tmpLower.indexOf('rfc-editor.org/rfc-index') > -1)
769
+ {
770
+ return true;
771
+ }
772
+ if (tmpLower.indexOf('data.iana.org') > -1)
773
+ {
774
+ return true;
775
+ }
776
+ if (tmpLower.indexOf('nasdaqtrader.com') > -1)
777
+ {
778
+ return true;
779
+ }
780
+ if (tmpLower.indexOf('files.usaspending.gov') > -1)
781
+ {
782
+ return true;
783
+ }
784
+ if (tmpLower.indexOf('sec.gov/files/') > -1)
785
+ {
786
+ return true;
787
+ }
788
+ if (tmpLower.indexOf('openlibrary.org/data/') > -1)
789
+ {
790
+ return true;
791
+ }
792
+ if (tmpLower.indexOf('gutenberg.org/cache/') > -1)
793
+ {
794
+ return true;
795
+ }
796
+ if (tmpLower.indexOf('ourairports.com/data/') > -1)
797
+ {
798
+ return true;
799
+ }
800
+ if (tmpLower.indexOf('iso639-3.sil.org') > -1 && tmpLower.indexOf('/downloads/') > -1)
801
+ {
802
+ return true;
803
+ }
804
+
805
+ if (tmpLower.match(/[?&]format=(csv|json|xml)/))
806
+ {
807
+ return true;
808
+ }
809
+ if (tmpLower.indexOf('goldencopy.gleif.org') > -1)
810
+ {
811
+ return true;
812
+ }
813
+ if (tmpLower.indexOf('enterpriseefiling.fcc.gov') > -1)
814
+ {
815
+ return true;
816
+ }
817
+ if (tmpLower.indexOf('irs.gov/pub/') > -1)
818
+ {
819
+ return true;
820
+ }
821
+ if (tmpLower.indexOf('onetcenter.org/dl_files/') > -1)
822
+ {
823
+ return true;
824
+ }
825
+ if (tmpLower.indexOf('downloads.dbpedia.org') > -1)
826
+ {
827
+ return true;
828
+ }
829
+ if (tmpLower.indexOf('data.metabrainz.org') > -1)
830
+ {
831
+ return true;
832
+ }
833
+ if (tmpLower.indexOf('archive.org/download/') > -1)
834
+ {
835
+ return true;
836
+ }
837
+ if (tmpLower.indexOf('planet.openstreetmap.org') > -1)
838
+ {
839
+ return true;
840
+ }
841
+ if (tmpLower.indexOf('static.openfoodfacts.org') > -1)
842
+ {
843
+ return true;
844
+ }
845
+ if (tmpLower.indexOf('static.openbeautyfacts.org') > -1)
846
+ {
847
+ return true;
848
+ }
849
+ if (tmpLower.indexOf('static.openpetfoodfacts.org') > -1)
850
+ {
851
+ return true;
852
+ }
853
+ if (tmpLower.indexOf('api.crossref.org/snapshots/') > -1)
854
+ {
855
+ return true;
856
+ }
857
+ if (tmpLower.indexOf('loc.gov/cds/downloads/') > -1)
858
+ {
859
+ return true;
860
+ }
861
+ if (tmpLower.indexOf('sec.gov/Archives/') > -1)
862
+ {
863
+ return true;
864
+ }
865
+ if (tmpLower.indexOf('discogs-data-dumps') > -1)
866
+ {
867
+ return true;
868
+ }
869
+ if (tmpLower.indexOf('databus.dbpedia.org') > -1)
870
+ {
871
+ return true;
872
+ }
873
+ if (tmpLower.indexOf('aqs.epa.gov/aqsweb/airdata/') > -1)
874
+ {
875
+ return true;
876
+ }
877
+ if (tmpLower.indexOf('download.open.fda.gov') > -1)
878
+ {
879
+ return true;
880
+ }
881
+ if (tmpLower.indexOf('fdc.nal.usda.gov/fdc-datasets/') > -1)
882
+ {
883
+ return true;
884
+ }
885
+ if (tmpLower.indexOf('ncei.noaa.gov/pub/data/') > -1)
886
+ {
887
+ return true;
888
+ }
889
+ if (tmpLower.indexOf('fenixservices.fao.org') > -1)
890
+ {
891
+ return true;
892
+ }
893
+ if (tmpLower.indexOf('ftp.cdc.gov/pub/') > -1)
894
+ {
895
+ return true;
896
+ }
897
+ if (tmpLower.indexOf('accessdata.fda.gov/cder/') > -1)
898
+ {
899
+ return true;
900
+ }
901
+ if (tmpLower.indexOf('cdstar.eva.mpg.de') > -1)
902
+ {
903
+ return true;
904
+ }
905
+ if (tmpLower.indexOf('nces.ed.gov/ipeds/datacenter/data/') > -1)
906
+ {
907
+ return true;
908
+ }
909
+ if (tmpLower.indexOf('storage.googleapis.com/pantheon-public-data') > -1)
910
+ {
911
+ return true;
912
+ }
913
+ if (tmpLower.indexOf('ndownloader.figshare.com') > -1)
914
+ {
915
+ return true;
916
+ }
917
+ if (tmpLower.indexOf('gist.githubusercontent.com') > -1)
918
+ {
919
+ return true;
920
+ }
921
+ if (tmpLower.indexOf('cms.gov/files/zip/') > -1)
922
+ {
923
+ return true;
924
+ }
925
+ if (tmpLower.indexOf('data.bls.gov/cew/') > -1)
926
+ {
927
+ return true;
928
+ }
929
+ if (tmpLower.indexOf('dumps.wikimedia.org') > -1)
930
+ {
931
+ return true;
932
+ }
933
+ if (tmpLower.indexOf('cbwinslow/baseballdatabank') > -1)
934
+ {
935
+ return true;
936
+ }
937
+ if (tmpLower.indexOf('onetcenter.org/taxonomy/') > -1)
938
+ {
939
+ return true;
940
+ }
941
+ if (tmpLower.indexOf('bulks-faostat.fao.org') > -1)
942
+ {
943
+ return true;
944
+ }
945
+ if (tmpLower.indexOf('nces.ed.gov/ccd/') > -1)
946
+ {
947
+ return true;
948
+ }
949
+ if (tmpLower.indexOf('nces.ed.gov/surveys/pss/') > -1)
950
+ {
951
+ return true;
952
+ }
953
+
954
+ return false;
955
+ }
956
+
957
+ isArchiveFilename(pFilename)
958
+ {
959
+ let tmpLower = pFilename.toLowerCase();
960
+ return tmpLower.endsWith('.zip') ||
961
+ tmpLower.endsWith('.tar.gz') ||
962
+ tmpLower.endsWith('.tgz') ||
963
+ tmpLower.endsWith('.tar.bz2') ||
964
+ tmpLower.endsWith('.tar.xz') ||
965
+ tmpLower.endsWith('.tar.zst') ||
966
+ tmpLower.endsWith('.tar.zstd') ||
967
+ tmpLower.endsWith('.gz');
968
+ }
969
+
970
+ // ================================================================
971
+ // Download Method Handlers
972
+ // ================================================================
973
+
974
+ async downloadHttpFiles(pDataset, pDatasetDir, pManifest)
975
+ {
976
+ let tmpUrls = pDataset.urls || [];
977
+ let tmpDatasetHeaders = pDataset.headers || null;
978
+
979
+ for (let i = 0; i < tmpUrls.length; i++)
980
+ {
981
+ let tmpUrl = tmpUrls[i];
982
+
983
+ if (tmpUrl.indexOf('{') > -1)
984
+ {
985
+ this.log.info(` Skipping template URL: ${tmpUrl}`);
986
+ continue;
987
+ }
988
+
989
+ let tmpFilename = this.filenameFromUrl(tmpUrl);
990
+ let tmpDestPath = libPath.join(pDatasetDir, tmpFilename);
991
+
992
+ this.log.info(` Downloading: ${tmpFilename}`);
993
+ try
994
+ {
995
+ let tmpResult = await this.downloadFile(tmpUrl, tmpDestPath, 0, tmpDatasetHeaders);
996
+ pManifest.urls_downloaded.push(tmpUrl);
997
+ this.log.info(` OK: ${this.formatSize(tmpResult.size)}`);
998
+ }
999
+ catch (pError)
1000
+ {
1001
+ this.log.error(` Failed: ${tmpUrl} — ${pError.message}`);
1002
+ }
1003
+ }
1004
+ }
1005
+
1006
+ async downloadHttpArchives(pDataset, pDatasetDir, pManifest)
1007
+ {
1008
+ let tmpUrls = pDataset.urls || [];
1009
+ let tmpDatasetHeaders = pDataset.headers || null;
1010
+
1011
+ for (let i = 0; i < tmpUrls.length; i++)
1012
+ {
1013
+ let tmpUrl = tmpUrls[i];
1014
+
1015
+ if (!this.isDirectDownloadUrl(tmpUrl))
1016
+ {
1017
+ this.log.info(` Skipping non-direct URL (browse manually): ${tmpUrl}`);
1018
+ continue;
1019
+ }
1020
+
1021
+ let tmpFilename = this.filenameFromUrl(tmpUrl);
1022
+ let tmpDestPath = libPath.join(pDatasetDir, tmpFilename);
1023
+
1024
+ this.log.info(` Downloading: ${tmpFilename}`);
1025
+ try
1026
+ {
1027
+ let tmpResult = await this.downloadFile(tmpUrl, tmpDestPath, 0, tmpDatasetHeaders);
1028
+ pManifest.urls_downloaded.push(tmpUrl);
1029
+ this.log.info(` OK: ${this.formatSize(tmpResult.size)}`);
1030
+
1031
+ if (this.isArchiveFilename(tmpFilename) && !pDataset.skip_extract)
1032
+ {
1033
+ await this.extractArchive(tmpDestPath, pDatasetDir);
1034
+ }
1035
+ }
1036
+ catch (pError)
1037
+ {
1038
+ this.log.error(` Failed: ${tmpUrl} — ${pError.message}`);
1039
+ }
1040
+ }
1041
+ }
1042
+
1043
+ async downloadGitClone(pDataset, pDatasetDir, pManifest)
1044
+ {
1045
+ let tmpUrls = pDataset.urls || [];
1046
+
1047
+ for (let i = 0; i < tmpUrls.length; i++)
1048
+ {
1049
+ let tmpUrl = tmpUrls[i];
1050
+
1051
+ if (!tmpUrl.match(/github\.com|gitlab\.com|bitbucket\.org/))
1052
+ {
1053
+ this.log.info(` Skipping non-git URL: ${tmpUrl}`);
1054
+ continue;
1055
+ }
1056
+
1057
+ try
1058
+ {
1059
+ await this.gitClone(tmpUrl, pDatasetDir);
1060
+ pManifest.urls_downloaded.push(tmpUrl);
1061
+ this.log.info(` OK: cloned`);
1062
+ }
1063
+ catch (pError)
1064
+ {
1065
+ this.log.error(` Failed: ${tmpUrl} — ${pError.message}`);
1066
+ }
1067
+ }
1068
+ }
1069
+
1070
+ // ================================================================
1071
+ // REST API Download (fetch_steps)
1072
+ // ================================================================
1073
+
1074
+ /**
1075
+ * Download a dataset via its fetch_steps DSL.
1076
+ */
1077
+ async downloadRestApi(pDataset, pDatasetDir, pManifest)
1078
+ {
1079
+ let tmpSteps = pDataset.fetch_steps;
1080
+ if (!Array.isArray(tmpSteps) || tmpSteps.length === 0)
1081
+ {
1082
+ this.log.warn(` No fetch_steps defined for ${pDataset.id} — skipping`);
1083
+ return;
1084
+ }
1085
+
1086
+ await this.executeFetchSteps(tmpSteps, pDatasetDir, pManifest);
1087
+ }
1088
+
1089
+ /**
1090
+ * Execute an array of fetch_steps sequentially.
1091
+ * Each step can reference files created by previous steps.
1092
+ */
1093
+ async executeFetchSteps(pSteps, pDatasetDir, pManifest)
1094
+ {
1095
+ for (let i = 0; i < pSteps.length; i++)
1096
+ {
1097
+ let tmpStep = pSteps[i];
1098
+ let tmpAction = tmpStep.action;
1099
+
1100
+ if (tmpAction === 'get_json')
1101
+ {
1102
+ await this.executeGetJson(tmpStep, pDatasetDir, pManifest);
1103
+ }
1104
+ else if (tmpAction === 'get_text')
1105
+ {
1106
+ await this.executeGetText(tmpStep, pDatasetDir, pManifest);
1107
+ }
1108
+ else if (tmpAction === 'for_each')
1109
+ {
1110
+ await this.executeForEach(tmpStep, pDatasetDir, pManifest);
1111
+ }
1112
+ else if (tmpAction === 'paginate')
1113
+ {
1114
+ await this.executePaginate(tmpStep, pDatasetDir, pManifest);
1115
+ }
1116
+ else if (tmpAction === 'merge_pages')
1117
+ {
1118
+ await this.executeMergePages(tmpStep, pDatasetDir, pManifest);
1119
+ }
1120
+ else if (tmpAction === 'for_each_in_pages')
1121
+ {
1122
+ await this.executeForEachInPages(tmpStep, pDatasetDir, pManifest);
1123
+ }
1124
+ else
1125
+ {
1126
+ this.log.warn(` Unknown fetch_step action: ${tmpAction}`);
1127
+ }
1128
+ }
1129
+ }
1130
+
1131
+ /**
1132
+ * get_json: Fetch a URL, save the JSON response to a file.
1133
+ */
1134
+ async executeGetJson(pStep, pDatasetDir, pManifest)
1135
+ {
1136
+ let tmpUrl = pStep.url;
1137
+ let tmpSaveAs = pStep.save_as;
1138
+ let tmpHeaders = pStep.headers || {};
1139
+
1140
+ this.log.info(` Fetching: ${tmpUrl}`);
1141
+
1142
+ let tmpData = await this.fetchJson(tmpUrl, tmpHeaders);
1143
+ let tmpDestPath = libPath.join(pDatasetDir, tmpSaveAs);
1144
+
1145
+ // Ensure subdirectory exists
1146
+ let tmpDir = libPath.dirname(tmpDestPath);
1147
+ if (!libFs.existsSync(tmpDir))
1148
+ {
1149
+ libFs.mkdirSync(tmpDir, { recursive: true });
1150
+ }
1151
+
1152
+ libFs.writeFileSync(tmpDestPath, JSON.stringify(tmpData, null, '\t'), 'utf8');
1153
+ pManifest.urls_downloaded.push(tmpUrl);
1154
+ this.log.info(` Saved: ${tmpSaveAs} (${this.formatSize(Buffer.byteLength(JSON.stringify(tmpData, null, '\t')))})`);
1155
+
1156
+ if (pStep.delay_ms)
1157
+ {
1158
+ await this.delay(pStep.delay_ms);
1159
+ }
1160
+ }
1161
+
1162
+ /**
1163
+ * get_text: Fetch a URL, save the raw text to a file.
1164
+ */
1165
+ async executeGetText(pStep, pDatasetDir, pManifest)
1166
+ {
1167
+ let tmpUrl = pStep.url;
1168
+ let tmpSaveAs = pStep.save_as;
1169
+ let tmpHeaders = pStep.headers || {};
1170
+
1171
+ this.log.info(` Fetching: ${tmpUrl}`);
1172
+
1173
+ let tmpData = await this.fetchText(tmpUrl, tmpHeaders);
1174
+ let tmpDestPath = libPath.join(pDatasetDir, tmpSaveAs);
1175
+
1176
+ let tmpDir = libPath.dirname(tmpDestPath);
1177
+ if (!libFs.existsSync(tmpDir))
1178
+ {
1179
+ libFs.mkdirSync(tmpDir, { recursive: true });
1180
+ }
1181
+
1182
+ libFs.writeFileSync(tmpDestPath, tmpData, 'utf8');
1183
+ pManifest.urls_downloaded.push(tmpUrl);
1184
+ this.log.info(` Saved: ${tmpSaveAs} (${this.formatSize(Buffer.byteLength(tmpData))})`);
1185
+
1186
+ if (pStep.delay_ms)
1187
+ {
1188
+ await this.delay(pStep.delay_ms);
1189
+ }
1190
+ }
1191
+
1192
+ /**
1193
+ * for_each: Read a previously-saved JSON array file, iterate items,
1194
+ * fetch a templated URL per item, and save results.
1195
+ */
1196
+ async executeForEach(pStep, pDatasetDir, pManifest)
1197
+ {
1198
+ let tmpSourcePath = libPath.join(pDatasetDir, pStep.source_file);
1199
+ if (!libFs.existsSync(tmpSourcePath))
1200
+ {
1201
+ this.log.error(` for_each: source file not found: ${pStep.source_file}`);
1202
+ return;
1203
+ }
1204
+
1205
+ let tmpSourceData = JSON.parse(libFs.readFileSync(tmpSourcePath, 'utf8'));
1206
+ if (!Array.isArray(tmpSourceData))
1207
+ {
1208
+ this.log.error(` for_each: source file is not a JSON array: ${pStep.source_file}`);
1209
+ return;
1210
+ }
1211
+
1212
+ let tmpField = pStep.field;
1213
+ let tmpUrlTemplate = pStep.url_template;
1214
+ let tmpSaveAsTemplate = pStep.save_as;
1215
+ let tmpDelayMs = pStep.delay_ms || 100;
1216
+ let tmpHeaders = pStep.headers || {};
1217
+ let tmpSuccessCount = 0;
1218
+ let tmpErrorCount = 0;
1219
+
1220
+ this.log.info(` Iterating ${tmpSourceData.length} items from ${pStep.source_file}...`);
1221
+
1222
+ for (let i = 0; i < tmpSourceData.length; i++)
1223
+ {
1224
+ let tmpItem = tmpSourceData[i];
1225
+ let tmpValue = tmpField ? tmpItem[tmpField] : (typeof tmpItem === 'string' ? tmpItem : JSON.stringify(tmpItem));
1226
+
1227
+ // Expand template variables
1228
+ let tmpUrl = this.expandTemplate(tmpUrlTemplate, tmpItem, tmpValue);
1229
+ let tmpSaveAs = this.expandTemplate(tmpSaveAsTemplate, tmpItem, tmpValue);
1230
+ let tmpDestPath = libPath.join(pDatasetDir, tmpSaveAs);
1231
+
1232
+ // Ensure subdirectory exists
1233
+ let tmpDir = libPath.dirname(tmpDestPath);
1234
+ if (!libFs.existsSync(tmpDir))
1235
+ {
1236
+ libFs.mkdirSync(tmpDir, { recursive: true });
1237
+ }
1238
+
1239
+ let tmpSkipExisting = pStep.skip_existing !== false;
1240
+ let tmpSkipCount = 0;
1241
+
1242
+ try
1243
+ {
1244
+ // Skip if file already exists (for resumability)
1245
+ if (tmpSkipExisting && libFs.existsSync(tmpDestPath))
1246
+ {
1247
+ tmpSkipCount++;
1248
+ if ((i + 1) % 100 === 0 || i === tmpSourceData.length - 1)
1249
+ {
1250
+ process.stdout.write(` ... ${i + 1}/${tmpSourceData.length} (${tmpSuccessCount} ok, ${tmpSkipCount} cached, ${tmpErrorCount} err)\r`);
1251
+ }
1252
+ continue;
1253
+ }
1254
+
1255
+ let tmpData = await this.fetchJson(tmpUrl, tmpHeaders);
1256
+ libFs.writeFileSync(tmpDestPath, JSON.stringify(tmpData, null, '\t'), 'utf8');
1257
+ pManifest.urls_downloaded.push(tmpUrl);
1258
+ tmpSuccessCount++;
1259
+
1260
+ // Progress every 10 items or on last item
1261
+ if ((i + 1) % 10 === 0 || i === tmpSourceData.length - 1)
1262
+ {
1263
+ process.stdout.write(` ... ${i + 1}/${tmpSourceData.length} (${tmpSuccessCount} ok, ${tmpSkipCount} cached, ${tmpErrorCount} err)\r`);
1264
+ }
1265
+ }
1266
+ catch (pError)
1267
+ {
1268
+ tmpErrorCount++;
1269
+ this.log.warn(` Failed: ${tmpValue} — ${pError.message}`);
1270
+ }
1271
+
1272
+ if (tmpDelayMs > 0 && i < tmpSourceData.length - 1)
1273
+ {
1274
+ await this.delay(tmpDelayMs);
1275
+ }
1276
+ }
1277
+
1278
+ process.stdout.write(' \r');
1279
+ this.log.info(` for_each complete: ${tmpSuccessCount} ok, ${tmpSkipCount} cached, ${tmpErrorCount} errors`);
1280
+ }
1281
+
1282
+ /**
1283
+ * paginate: Fetch pages from a URL using offset or page-based pagination.
1284
+ */
1285
+ async executePaginate(pStep, pDatasetDir, pManifest)
1286
+ {
1287
+ let tmpUrlTemplate = pStep.url_template;
1288
+ let tmpSaveAsTemplate = pStep.save_as;
1289
+ let tmpDelayMs = pStep.delay_ms || 100;
1290
+ let tmpHeaders = pStep.headers || {};
1291
+ let tmpMaxPages = pStep.max_pages || 1000;
1292
+ let tmpStartPage = pStep.start_page || 1;
1293
+ let tmpPageSize = pStep.page_size || 100;
1294
+ let tmpResultField = pStep.result_field;
1295
+ let tmpStopWhenEmpty = pStep.stop_when_empty !== false;
1296
+
1297
+ this.log.info(` Paginating: ${tmpUrlTemplate}`);
1298
+
1299
+ let tmpPage = tmpStartPage;
1300
+ let tmpOffset = 0;
1301
+ let tmpTotalSaved = 0;
1302
+
1303
+ while (tmpPage < tmpStartPage + tmpMaxPages)
1304
+ {
1305
+ let tmpUrl = tmpUrlTemplate
1306
+ .replace(/\{page\}/g, String(tmpPage))
1307
+ .replace(/\{offset\}/g, String(tmpOffset))
1308
+ .replace(/\{page_size\}/g, String(tmpPageSize));
1309
+
1310
+ let tmpSaveAs = tmpSaveAsTemplate
1311
+ .replace(/\{page\}/g, String(tmpPage))
1312
+ .replace(/\{offset\}/g, String(tmpOffset));
1313
+
1314
+ let tmpDestPath = libPath.join(pDatasetDir, tmpSaveAs);
1315
+
1316
+ let tmpDir = libPath.dirname(tmpDestPath);
1317
+ if (!libFs.existsSync(tmpDir))
1318
+ {
1319
+ libFs.mkdirSync(tmpDir, { recursive: true });
1320
+ }
1321
+
1322
+ // Skip if page file already exists (for resumability)
1323
+ if (libFs.existsSync(tmpDestPath))
1324
+ {
1325
+ tmpTotalSaved++;
1326
+ tmpPage++;
1327
+ tmpOffset += tmpPageSize;
1328
+ continue;
1329
+ }
1330
+
1331
+ try
1332
+ {
1333
+ let tmpData = await this.fetchJson(tmpUrl, tmpHeaders);
1334
+ libFs.writeFileSync(tmpDestPath, JSON.stringify(tmpData, null, '\t'), 'utf8');
1335
+ pManifest.urls_downloaded.push(tmpUrl);
1336
+ tmpTotalSaved++;
1337
+
1338
+ // Check if we should stop
1339
+ if (tmpStopWhenEmpty)
1340
+ {
1341
+ let tmpResults = tmpResultField ? tmpData[tmpResultField] : tmpData;
1342
+ if (Array.isArray(tmpResults) && tmpResults.length === 0)
1343
+ {
1344
+ this.log.info(` Pagination complete: empty page at page ${tmpPage}`);
1345
+ break;
1346
+ }
1347
+ }
1348
+
1349
+ process.stdout.write(` ... page ${tmpPage} (${tmpTotalSaved} saved)\r`);
1350
+ }
1351
+ catch (pError)
1352
+ {
1353
+ this.log.warn(` Pagination stopped at page ${tmpPage}: ${pError.message}`);
1354
+ break;
1355
+ }
1356
+
1357
+ tmpPage++;
1358
+ tmpOffset += tmpPageSize;
1359
+
1360
+ if (tmpDelayMs > 0)
1361
+ {
1362
+ await this.delay(tmpDelayMs);
1363
+ }
1364
+ }
1365
+
1366
+ process.stdout.write(' \r');
1367
+ this.log.info(` Pagination complete: ${tmpTotalSaved} pages saved`);
1368
+ }
1369
+
1370
+ /**
1371
+ * merge_pages: Read all paginated JSON files matching a glob pattern
1372
+ * and merge them into a single JSON array file.
1373
+ */
1374
+ async executeMergePages(pStep, pDatasetDir, pManifest)
1375
+ {
1376
+ let tmpPattern = pStep.source_pattern; // e.g. "pages/shows_page_{page}.json"
1377
+ let tmpSaveAs = pStep.save_as;
1378
+ let tmpItemField = pStep.item_field; // optional: extract field from each page item (e.g. for search results)
1379
+ let tmpDestPath = libPath.join(pDatasetDir, tmpSaveAs);
1380
+
1381
+ // Skip if already exists
1382
+ if (libFs.existsSync(tmpDestPath))
1383
+ {
1384
+ this.log.info(` merge_pages: ${tmpSaveAs} already exists, skipping`);
1385
+ return;
1386
+ }
1387
+
1388
+ let tmpMerged = [];
1389
+ let tmpPage = 0;
1390
+ // Check if pages start at 0 or 1
1391
+ let tmpTestFile0 = libPath.join(pDatasetDir, tmpPattern.replace(/\{page\}/g, '0'));
1392
+ if (!libFs.existsSync(tmpTestFile0))
1393
+ {
1394
+ tmpPage = 1;
1395
+ }
1396
+
1397
+ while (true)
1398
+ {
1399
+ let tmpFilename = tmpPattern.replace(/\{page\}/g, String(tmpPage));
1400
+ let tmpFilePath = libPath.join(pDatasetDir, tmpFilename);
1401
+
1402
+ if (!libFs.existsSync(tmpFilePath))
1403
+ {
1404
+ break;
1405
+ }
1406
+
1407
+ let tmpData = JSON.parse(libFs.readFileSync(tmpFilePath, 'utf8'));
1408
+ if (Array.isArray(tmpData))
1409
+ {
1410
+ if (tmpItemField)
1411
+ {
1412
+ for (let i = 0; i < tmpData.length; i++)
1413
+ {
1414
+ tmpMerged.push(tmpData[i][tmpItemField] || tmpData[i]);
1415
+ }
1416
+ }
1417
+ else
1418
+ {
1419
+ tmpMerged = tmpMerged.concat(tmpData);
1420
+ }
1421
+ }
1422
+
1423
+ tmpPage++;
1424
+ }
1425
+
1426
+ libFs.writeFileSync(tmpDestPath, JSON.stringify(tmpMerged), 'utf8');
1427
+ this.log.info(` merge_pages: merged ${tmpPage} pages into ${tmpSaveAs} (${tmpMerged.length} items)`);
1428
+ }
1429
+
1430
+ /**
1431
+ * for_each_in_pages: Iterate over items across multiple paginated JSON files
1432
+ * and fetch a URL per item. Skips items whose output file already exists.
1433
+ */
1434
+ async executeForEachInPages(pStep, pDatasetDir, pManifest)
1435
+ {
1436
+ let tmpPagePattern = pStep.source_pattern; // e.g. "pages/shows_page_{page}.json"
1437
+ let tmpField = pStep.field; // field to extract from each item for URL template
1438
+ let tmpItemField = pStep.item_field; // optional: unwrap item (e.g. search results have {show: {...}})
1439
+ let tmpUrlTemplate = pStep.url_template;
1440
+ let tmpSaveAsTemplate = pStep.save_as;
1441
+ let tmpDelayMs = pStep.delay_ms || 100;
1442
+ let tmpHeaders = pStep.headers || {};
1443
+ let tmpSuccessCount = 0;
1444
+ let tmpSkipCount = 0;
1445
+ let tmpErrorCount = 0;
1446
+ let tmpTotalItems = 0;
1447
+
1448
+ // Count total items first — try page 0 then page 1 to find the start
1449
+ let tmpPage = 0;
1450
+ let tmpAllItems = [];
1451
+ // Check if pages start at 0 or 1
1452
+ let tmpTestFile0 = libPath.join(pDatasetDir, tmpPagePattern.replace(/\{page\}/g, '0'));
1453
+ if (!libFs.existsSync(tmpTestFile0))
1454
+ {
1455
+ tmpPage = 1;
1456
+ }
1457
+ while (true)
1458
+ {
1459
+ let tmpFilename = tmpPagePattern.replace(/\{page\}/g, String(tmpPage));
1460
+ let tmpFilePath = libPath.join(pDatasetDir, tmpFilename);
1461
+
1462
+ if (!libFs.existsSync(tmpFilePath))
1463
+ {
1464
+ break;
1465
+ }
1466
+
1467
+ let tmpData = JSON.parse(libFs.readFileSync(tmpFilePath, 'utf8'));
1468
+ if (Array.isArray(tmpData))
1469
+ {
1470
+ for (let i = 0; i < tmpData.length; i++)
1471
+ {
1472
+ let tmpItem = tmpItemField ? (tmpData[i][tmpItemField] || tmpData[i]) : tmpData[i];
1473
+ tmpAllItems.push(tmpItem);
1474
+ }
1475
+ }
1476
+ tmpPage++;
1477
+ }
1478
+
1479
+ tmpTotalItems = tmpAllItems.length;
1480
+ this.log.info(` for_each_in_pages: ${tmpTotalItems} items across ${tmpPage} pages`);
1481
+
1482
+ for (let i = 0; i < tmpAllItems.length; i++)
1483
+ {
1484
+ let tmpItem = tmpAllItems[i];
1485
+ let tmpValue = tmpField ? tmpItem[tmpField] : (typeof tmpItem === 'string' ? tmpItem : JSON.stringify(tmpItem));
1486
+
1487
+ let tmpUrl = this.expandTemplate(tmpUrlTemplate, tmpItem, tmpValue);
1488
+ let tmpSaveAs = this.expandTemplate(tmpSaveAsTemplate, tmpItem, tmpValue);
1489
+ let tmpDestPath = libPath.join(pDatasetDir, tmpSaveAs);
1490
+
1491
+ // Ensure subdirectory exists
1492
+ let tmpDir = libPath.dirname(tmpDestPath);
1493
+ if (!libFs.existsSync(tmpDir))
1494
+ {
1495
+ libFs.mkdirSync(tmpDir, { recursive: true });
1496
+ }
1497
+
1498
+ // Skip if already exists
1499
+ if (libFs.existsSync(tmpDestPath))
1500
+ {
1501
+ tmpSkipCount++;
1502
+ if ((i + 1) % 500 === 0)
1503
+ {
1504
+ process.stdout.write(` ... ${i + 1}/${tmpTotalItems} (${tmpSuccessCount} ok, ${tmpSkipCount} cached, ${tmpErrorCount} err)\r`);
1505
+ }
1506
+ continue;
1507
+ }
1508
+
1509
+ try
1510
+ {
1511
+ let tmpData = await this.fetchJson(tmpUrl, tmpHeaders);
1512
+ libFs.writeFileSync(tmpDestPath, JSON.stringify(tmpData, null, '\t'), 'utf8');
1513
+ pManifest.urls_downloaded.push(tmpUrl);
1514
+ tmpSuccessCount++;
1515
+
1516
+ if ((i + 1) % 10 === 0 || i === tmpTotalItems - 1)
1517
+ {
1518
+ process.stdout.write(` ... ${i + 1}/${tmpTotalItems} (${tmpSuccessCount} ok, ${tmpSkipCount} cached, ${tmpErrorCount} err)\r`);
1519
+ }
1520
+ }
1521
+ catch (pError)
1522
+ {
1523
+ tmpErrorCount++;
1524
+ if (tmpErrorCount <= 10)
1525
+ {
1526
+ this.log.warn(` Failed: ${tmpValue} — ${pError.message}`);
1527
+ }
1528
+ }
1529
+
1530
+ if (tmpDelayMs > 0)
1531
+ {
1532
+ await this.delay(tmpDelayMs);
1533
+ }
1534
+ }
1535
+
1536
+ process.stdout.write(' \r');
1537
+ this.log.info(` for_each_in_pages complete: ${tmpSuccessCount} ok, ${tmpSkipCount} cached, ${tmpErrorCount} errors`);
1538
+ }
1539
+
1540
+ /**
1541
+ * Expand template variables in a string.
1542
+ * Supports {value}, {item.fieldName}
1543
+ */
1544
+ expandTemplate(pTemplate, pItem, pValue)
1545
+ {
1546
+ let tmpResult = pTemplate.replace(/\{value\}/g, String(pValue));
1547
+
1548
+ // Replace {item.fieldName} patterns
1549
+ tmpResult = tmpResult.replace(/\{item\.(\w+)\}/g,
1550
+ (pMatch, pFieldName) =>
1551
+ {
1552
+ return pItem && pItem[pFieldName] !== undefined ? String(pItem[pFieldName]) : pMatch;
1553
+ });
1554
+
1555
+ return tmpResult;
1556
+ }
1557
+
1558
+ /**
1559
+ * Simple delay helper.
1560
+ */
1561
+ delay(pMs)
1562
+ {
1563
+ return new Promise((fResolve) => { setTimeout(fResolve, pMs); });
1564
+ }
1565
+
1566
+ // ================================================================
1567
+ // Download Dispatcher
1568
+ // ================================================================
1569
+
1570
+ /**
1571
+ * Download a single dataset entry.
1572
+ * Returns a Promise that resolves with a manifest object.
1573
+ */
1574
+ async downloadDataset(pEntry)
1575
+ {
1576
+ let tmpDataset = pEntry.dataset;
1577
+ let tmpDatasetDir = this.getDatasetDir(pEntry.categoryFolder, tmpDataset.id);
1578
+ let tmpManifest = (
1579
+ {
1580
+ id: tmpDataset.id,
1581
+ name: tmpDataset.name,
1582
+ category: pEntry.categoryKey,
1583
+ tier: tmpDataset.tier,
1584
+ method: tmpDataset.method,
1585
+ license: tmpDataset.license,
1586
+ urls_downloaded: [],
1587
+ download_date: new Date().toISOString(),
1588
+ files: [],
1589
+ total_size: 0,
1590
+ status: 'in_progress'
1591
+ });
1592
+
1593
+ // Ensure dataset directory exists
1594
+ if (!libFs.existsSync(tmpDatasetDir))
1595
+ {
1596
+ libFs.mkdirSync(tmpDatasetDir, { recursive: true });
1597
+ }
1598
+
1599
+ try
1600
+ {
1601
+ if (tmpDataset.method === 'http_file')
1602
+ {
1603
+ await this.downloadHttpFiles(tmpDataset, tmpDatasetDir, tmpManifest);
1604
+ }
1605
+ else if (tmpDataset.method === 'http_archive')
1606
+ {
1607
+ await this.downloadHttpArchives(tmpDataset, tmpDatasetDir, tmpManifest);
1608
+ }
1609
+ else if (tmpDataset.method === 'git_clone')
1610
+ {
1611
+ await this.downloadGitClone(tmpDataset, tmpDatasetDir, tmpManifest);
1612
+ }
1613
+ else if (tmpDataset.method === 'rest_api')
1614
+ {
1615
+ await this.downloadRestApi(tmpDataset, tmpDatasetDir, tmpManifest);
1616
+ }
1617
+
1618
+ // Build file inventory
1619
+ tmpManifest.files = this.inventoryFiles(tmpDatasetDir);
1620
+ tmpManifest.total_size = 0;
1621
+ for (let i = 0; i < tmpManifest.files.length; i++)
1622
+ {
1623
+ tmpManifest.total_size += tmpManifest.files[i].size;
1624
+ }
1625
+
1626
+ tmpManifest.status = 'complete';
1627
+ }
1628
+ catch (pError)
1629
+ {
1630
+ tmpManifest.status = 'error';
1631
+ tmpManifest.error = pError.message;
1632
+ this.log.error(` ERROR: ${pError.message}`);
1633
+ }
1634
+
1635
+ // Write manifest
1636
+ this.writeManifest(tmpDatasetDir, tmpManifest);
1637
+ return tmpManifest;
1638
+ }
1639
+ }
1640
+
1641
// Export the service class itself and expose the default options as a
// property on it, so callers can read them without constructing an instance.
RetoldFactoDataLakeService.default_options = defaultDataLakeOptions;
module.exports = RetoldFactoDataLakeService;