retold-facto 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/launch.json +11 -0
- package/.dockerignore +8 -0
- package/.quackage.json +19 -0
- package/Dockerfile +26 -0
- package/bin/retold-facto.js +909 -0
- package/examples/facto-government-data.sqlite +0 -0
- package/examples/government-data-catalog.json +137 -0
- package/examples/government-data-loader.js +1432 -0
- package/package.json +91 -0
- package/scripts/facto-download.js +425 -0
- package/source/Retold-Facto.js +1042 -0
- package/source/services/Retold-Facto-BeaconProvider.js +511 -0
- package/source/services/Retold-Facto-CatalogManager.js +1252 -0
- package/source/services/Retold-Facto-DataLakeService.js +1642 -0
- package/source/services/Retold-Facto-DatasetManager.js +417 -0
- package/source/services/Retold-Facto-IngestEngine.js +1315 -0
- package/source/services/Retold-Facto-ProjectionEngine.js +3960 -0
- package/source/services/Retold-Facto-RecordManager.js +360 -0
- package/source/services/Retold-Facto-SchemaManager.js +1110 -0
- package/source/services/Retold-Facto-SourceFolderScanner.js +2243 -0
- package/source/services/Retold-Facto-SourceManager.js +730 -0
- package/source/services/Retold-Facto-StoreConnectionManager.js +441 -0
- package/source/services/Retold-Facto-ThroughputMonitor.js +478 -0
- package/source/services/web-app/codemirror-entry.js +7 -0
- package/source/services/web-app/pict-app/Pict-Application-Facto-Configuration.json +9 -0
- package/source/services/web-app/pict-app/Pict-Application-Facto.js +70 -0
- package/source/services/web-app/pict-app/Pict-Facto-Bundle.js +11 -0
- package/source/services/web-app/pict-app/providers/Pict-Provider-Facto-UI.js +66 -0
- package/source/services/web-app/pict-app/providers/Pict-Provider-Facto.js +69 -0
- package/source/services/web-app/pict-app/providers/facto-api/Facto-API-Catalog.js +93 -0
- package/source/services/web-app/pict-app/providers/facto-api/Facto-API-Connections.js +42 -0
- package/source/services/web-app/pict-app/providers/facto-api/Facto-API-Datasets.js +605 -0
- package/source/services/web-app/pict-app/providers/facto-api/Facto-API-Projections.js +188 -0
- package/source/services/web-app/pict-app/providers/facto-api/Facto-API-Scanner.js +80 -0
- package/source/services/web-app/pict-app/providers/facto-api/Facto-API-Schema.js +116 -0
- package/source/services/web-app/pict-app/providers/facto-api/Facto-API-Sources.js +104 -0
- package/source/services/web-app/pict-app/views/PictView-Facto-Catalog.js +526 -0
- package/source/services/web-app/pict-app/views/PictView-Facto-Datasets.js +173 -0
- package/source/services/web-app/pict-app/views/PictView-Facto-Ingest.js +259 -0
- package/source/services/web-app/pict-app/views/PictView-Facto-Layout.js +191 -0
- package/source/services/web-app/pict-app/views/PictView-Facto-Projections.js +231 -0
- package/source/services/web-app/pict-app/views/PictView-Facto-Records.js +326 -0
- package/source/services/web-app/pict-app/views/PictView-Facto-Scanner.js +624 -0
- package/source/services/web-app/pict-app/views/PictView-Facto-Sources.js +201 -0
- package/source/services/web-app/pict-app/views/PictView-Facto-Throughput.js +456 -0
- package/source/services/web-app/pict-app-full/Pict-Application-Facto-Full-Configuration.json +14 -0
- package/source/services/web-app/pict-app-full/Pict-Application-Facto-Full.js +391 -0
- package/source/services/web-app/pict-app-full/providers/PictRouter-Facto-Configuration.json +56 -0
- package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-BottomBar.js +68 -0
- package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-Connections.js +340 -0
- package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-Dashboard.js +149 -0
- package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-Dashboards.js +819 -0
- package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-Datasets.js +178 -0
- package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-IngestJobs.js +99 -0
- package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-Layout.js +62 -0
- package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-MappingEditor.js +158 -0
- package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-ProjectionDetail.js +1120 -0
- package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-Projections.js +172 -0
- package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-QueryPanel.js +119 -0
- package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-RecordViewer.js +663 -0
- package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-Records.js +648 -0
- package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-Scanner.js +1017 -0
- package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-SchemaDetail.js +1404 -0
- package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-SchemaDocEditor.js +1036 -0
- package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-SchemaEditor.js +636 -0
- package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-SchemaResearch.js +357 -0
- package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-SourceDetail.js +822 -0
- package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-SourceEditor.js +1036 -0
- package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-SourceResearch.js +487 -0
- package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-Sources.js +165 -0
- package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-Throughput.js +439 -0
- package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-TopBar.js +335 -0
- package/source/services/web-app/pict-app-full/views/projections/Facto-Projections-Constants.js +71 -0
- package/source/services/web-app/web/chart.min.js +20 -0
- package/source/services/web-app/web/codemirror-bundle.js +30099 -0
- package/source/services/web-app/web/css/facto-themes.css +467 -0
- package/source/services/web-app/web/css/facto.css +502 -0
- package/source/services/web-app/web/index.html +28 -0
- package/source/services/web-app/web/retold-facto.js +12138 -0
- package/source/services/web-app/web/retold-facto.js.map +1 -0
- package/source/services/web-app/web/retold-facto.min.js +2 -0
- package/source/services/web-app/web/retold-facto.min.js.map +1 -0
- package/source/services/web-app/web/simple/index.html +17 -0
- package/test/Facto_Browser_Integration_tests.js +798 -0
- package/test/RetoldFacto_tests.js +4117 -0
- package/test/fixtures/weather-readings.csv +17 -0
- package/test/fixtures/weather-stations.csv +9 -0
- package/test/model/MeadowModel-Extended.json +8497 -0
- package/test/model/MeadowModel-PICT.json +1 -0
- package/test/model/MeadowModel.json +1355 -0
- package/test/model/ddl/Facto.ddl +225 -0
- package/test/model/fable-configuration.json +14 -0
|
@@ -0,0 +1,1642 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Retold Facto - Data Lake Service
|
|
3
|
+
*
|
|
4
|
+
* Manages the offline data lake: downloading public datasets into a
|
|
5
|
+
* well-organized folder structure for stable, repeatable ingestion.
|
|
6
|
+
* Supports http_file, http_archive, git_clone, and rest_api (via fetch_steps) methods.
|
|
7
|
+
*
|
|
8
|
+
* @author Steven Velozo <steven@velozo.com>
|
|
9
|
+
*/
|
|
10
|
+
const libFableServiceProviderBase = require('fable-serviceproviderbase');
|
|
11
|
+
const libFs = require('fs');
|
|
12
|
+
const libPath = require('path');
|
|
13
|
+
const libHttps = require('https');
|
|
14
|
+
const libHttp = require('http');
|
|
15
|
+
const libUrl = require('url');
|
|
16
|
+
const libCrypto = require('crypto');
|
|
17
|
+
const libChildProcess = require('child_process');
|
|
18
|
+
const libZlib = require('zlib');
|
|
19
|
+
|
|
20
|
+
// Baseline service options; anything the caller passes to the constructor overrides these.
const defaultDataLakeOptions =
{
	// Location of the catalog JSON file read by loadCatalog()
	CatalogPath: null,
	// Root folder under which getDatasetDir() places every dataset
	DataDir: null,
	// Sent as the User-Agent header on outgoing HTTP requests
	UserAgent: 'RetoldFacto/1.0 (data-coagulation-platform; https://github.com/stevenvelozo/retold-facto)'
};

// Catalog category key -> folder name used on disk.
// Keys missing from this table fall back to underscore-to-dash conversion in flattenCatalog().
const CATEGORY_FOLDER_MAP =
{
	'01_foundational_reference': '01-foundational-reference',
	'02_geographic_location': '02-geographic-location',
	'03_people_cultural_entities': '03-people-cultural-entities',
	'04_business_industry': '04-business-industry',
	'05_media_entertainment': '05-media-entertainment'
};

// Acquisition methods the service can download without manual steps
const DOWNLOADABLE_METHODS =
[
	'http_file',
	'http_archive',
	'git_clone',
	'rest_api'
];
|
|
39
|
+
|
|
40
|
+
class RetoldFactoDataLakeService extends libFableServiceProviderBase
|
|
41
|
+
{
|
|
42
|
+
constructor(pFable, pOptions, pServiceHash)
|
|
43
|
+
{
|
|
44
|
+
let tmpOptions = Object.assign({}, defaultDataLakeOptions, pOptions);
|
|
45
|
+
super(pFable, tmpOptions, pServiceHash);
|
|
46
|
+
|
|
47
|
+
this.serviceType = 'RetoldFactoDataLakeService';
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
// ================================================================
|
|
51
|
+
// Catalog Loading
|
|
52
|
+
// ================================================================
|
|
53
|
+
|
|
54
|
+
loadCatalog()
|
|
55
|
+
{
|
|
56
|
+
let tmpCatalogPath = this.options.CatalogPath;
|
|
57
|
+
if (!tmpCatalogPath)
|
|
58
|
+
{
|
|
59
|
+
this.log.error('No CatalogPath configured for DataLakeService.');
|
|
60
|
+
return null;
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
if (!libFs.existsSync(tmpCatalogPath))
|
|
64
|
+
{
|
|
65
|
+
this.log.error(`Catalog not found at: ${tmpCatalogPath}`);
|
|
66
|
+
return null;
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
let tmpRaw = libFs.readFileSync(tmpCatalogPath, 'utf8');
|
|
70
|
+
return JSON.parse(tmpRaw);
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
/**
|
|
74
|
+
* Flatten the catalog into a list of { categoryKey, categoryFolder, dataset } objects,
|
|
75
|
+
* applying optional filters.
|
|
76
|
+
*
|
|
77
|
+
* @param {object} pCatalog - The parsed catalog JSON
|
|
78
|
+
* @param {object} pFilters - Optional { tier, category, id }
|
|
79
|
+
* @returns {Array}
|
|
80
|
+
*/
|
|
81
|
+
flattenCatalog(pCatalog, pFilters)
|
|
82
|
+
{
|
|
83
|
+
let tmpFilters = pFilters || {};
|
|
84
|
+
let tmpEntries = [];
|
|
85
|
+
|
|
86
|
+
let tmpCategoryKeys = Object.keys(pCatalog.categories);
|
|
87
|
+
for (let c = 0; c < tmpCategoryKeys.length; c++)
|
|
88
|
+
{
|
|
89
|
+
let tmpCategoryKey = tmpCategoryKeys[c];
|
|
90
|
+
|
|
91
|
+
if (tmpFilters.category && tmpCategoryKey !== tmpFilters.category)
|
|
92
|
+
{
|
|
93
|
+
continue;
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
let tmpCategoryFolder = CATEGORY_FOLDER_MAP[tmpCategoryKey] || tmpCategoryKey.replace(/_/g, '-');
|
|
97
|
+
let tmpDatasets = pCatalog.categories[tmpCategoryKey].datasets;
|
|
98
|
+
|
|
99
|
+
for (let d = 0; d < tmpDatasets.length; d++)
|
|
100
|
+
{
|
|
101
|
+
let tmpDataset = tmpDatasets[d];
|
|
102
|
+
|
|
103
|
+
if (tmpFilters.id && tmpDataset.id !== tmpFilters.id)
|
|
104
|
+
{
|
|
105
|
+
continue;
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
if (tmpFilters.tier !== null && tmpFilters.tier !== undefined && tmpDataset.tier > tmpFilters.tier)
|
|
109
|
+
{
|
|
110
|
+
continue;
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
tmpEntries.push(
|
|
114
|
+
{
|
|
115
|
+
categoryKey: tmpCategoryKey,
|
|
116
|
+
categoryFolder: tmpCategoryFolder,
|
|
117
|
+
dataset: tmpDataset
|
|
118
|
+
});
|
|
119
|
+
}
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
return tmpEntries;
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
// ================================================================
|
|
126
|
+
// Manifest Management
|
|
127
|
+
// ================================================================
|
|
128
|
+
|
|
129
|
+
getDatasetDir(pCategoryFolder, pDatasetId)
|
|
130
|
+
{
|
|
131
|
+
return libPath.join(this.options.DataDir, pCategoryFolder, pDatasetId);
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
getManifestPath(pDatasetDir)
|
|
135
|
+
{
|
|
136
|
+
return libPath.join(pDatasetDir, '_manifest.json');
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
readManifest(pDatasetDir)
|
|
140
|
+
{
|
|
141
|
+
let tmpManifestPath = this.getManifestPath(pDatasetDir);
|
|
142
|
+
if (libFs.existsSync(tmpManifestPath))
|
|
143
|
+
{
|
|
144
|
+
try
|
|
145
|
+
{
|
|
146
|
+
return JSON.parse(libFs.readFileSync(tmpManifestPath, 'utf8'));
|
|
147
|
+
}
|
|
148
|
+
catch (pError)
|
|
149
|
+
{
|
|
150
|
+
return null;
|
|
151
|
+
}
|
|
152
|
+
}
|
|
153
|
+
return null;
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
writeManifest(pDatasetDir, pManifest)
|
|
157
|
+
{
|
|
158
|
+
let tmpManifestPath = this.getManifestPath(pDatasetDir);
|
|
159
|
+
libFs.writeFileSync(tmpManifestPath, JSON.stringify(pManifest, null, '\t'), 'utf8');
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
/**
|
|
163
|
+
* Scan a dataset directory and build a file inventory (excluding _manifest.json).
|
|
164
|
+
*/
|
|
165
|
+
inventoryFiles(pDatasetDir)
|
|
166
|
+
{
|
|
167
|
+
let tmpFiles = [];
|
|
168
|
+
|
|
169
|
+
if (!libFs.existsSync(pDatasetDir))
|
|
170
|
+
{
|
|
171
|
+
return tmpFiles;
|
|
172
|
+
}
|
|
173
|
+
|
|
174
|
+
let tmpEntries = libFs.readdirSync(pDatasetDir);
|
|
175
|
+
for (let i = 0; i < tmpEntries.length; i++)
|
|
176
|
+
{
|
|
177
|
+
if (tmpEntries[i] === '_manifest.json')
|
|
178
|
+
{
|
|
179
|
+
continue;
|
|
180
|
+
}
|
|
181
|
+
|
|
182
|
+
let tmpFullPath = libPath.join(pDatasetDir, tmpEntries[i]);
|
|
183
|
+
let tmpStat = libFs.statSync(tmpFullPath);
|
|
184
|
+
|
|
185
|
+
if (tmpStat.isFile())
|
|
186
|
+
{
|
|
187
|
+
tmpFiles.push(
|
|
188
|
+
{
|
|
189
|
+
name: tmpEntries[i],
|
|
190
|
+
size: tmpStat.size
|
|
191
|
+
});
|
|
192
|
+
}
|
|
193
|
+
else if (tmpStat.isDirectory())
|
|
194
|
+
{
|
|
195
|
+
let tmpDirSize = this.getDirectorySize(tmpFullPath);
|
|
196
|
+
tmpFiles.push(
|
|
197
|
+
{
|
|
198
|
+
name: tmpEntries[i] + '/',
|
|
199
|
+
size: tmpDirSize
|
|
200
|
+
});
|
|
201
|
+
}
|
|
202
|
+
}
|
|
203
|
+
|
|
204
|
+
return tmpFiles;
|
|
205
|
+
}
|
|
206
|
+
|
|
207
|
+
getDirectorySize(pDirPath)
|
|
208
|
+
{
|
|
209
|
+
let tmpTotal = 0;
|
|
210
|
+
|
|
211
|
+
try
|
|
212
|
+
{
|
|
213
|
+
let tmpEntries = libFs.readdirSync(pDirPath);
|
|
214
|
+
for (let i = 0; i < tmpEntries.length; i++)
|
|
215
|
+
{
|
|
216
|
+
let tmpFullPath = libPath.join(pDirPath, tmpEntries[i]);
|
|
217
|
+
let tmpStat = libFs.statSync(tmpFullPath);
|
|
218
|
+
|
|
219
|
+
if (tmpStat.isFile())
|
|
220
|
+
{
|
|
221
|
+
tmpTotal += tmpStat.size;
|
|
222
|
+
}
|
|
223
|
+
else if (tmpStat.isDirectory())
|
|
224
|
+
{
|
|
225
|
+
tmpTotal += this.getDirectorySize(tmpFullPath);
|
|
226
|
+
}
|
|
227
|
+
}
|
|
228
|
+
}
|
|
229
|
+
catch (pError)
|
|
230
|
+
{
|
|
231
|
+
// Skip unreadable dirs
|
|
232
|
+
}
|
|
233
|
+
|
|
234
|
+
return tmpTotal;
|
|
235
|
+
}
|
|
236
|
+
|
|
237
|
+
// ================================================================
|
|
238
|
+
// Dataset Status
|
|
239
|
+
// ================================================================
|
|
240
|
+
|
|
241
|
+
getDatasetStatus(pEntry)
|
|
242
|
+
{
|
|
243
|
+
let tmpDatasetDir = this.getDatasetDir(pEntry.categoryFolder, pEntry.dataset.id);
|
|
244
|
+
let tmpManifest = this.readManifest(tmpDatasetDir);
|
|
245
|
+
|
|
246
|
+
if (!tmpManifest)
|
|
247
|
+
{
|
|
248
|
+
return 'missing';
|
|
249
|
+
}
|
|
250
|
+
if (tmpManifest.status === 'error')
|
|
251
|
+
{
|
|
252
|
+
return 'error';
|
|
253
|
+
}
|
|
254
|
+
if (tmpManifest.status === 'complete')
|
|
255
|
+
{
|
|
256
|
+
return 'cached';
|
|
257
|
+
}
|
|
258
|
+
return 'partial';
|
|
259
|
+
}
|
|
260
|
+
|
|
261
|
+
/**
|
|
262
|
+
* Check if a dataset entry is downloadable by this service.
|
|
263
|
+
* rest_api entries are only downloadable if they have fetch_steps defined.
|
|
264
|
+
*/
|
|
265
|
+
isDownloadable(pDataset)
|
|
266
|
+
{
|
|
267
|
+
if (pDataset.method === 'rest_api')
|
|
268
|
+
{
|
|
269
|
+
return Array.isArray(pDataset.fetch_steps) && pDataset.fetch_steps.length > 0;
|
|
270
|
+
}
|
|
271
|
+
return DOWNLOADABLE_METHODS.indexOf(pDataset.method) > -1;
|
|
272
|
+
}
|
|
273
|
+
|
|
274
|
+
// ================================================================
|
|
275
|
+
// Size Formatting
|
|
276
|
+
// ================================================================
|
|
277
|
+
|
|
278
|
+
parseSize(pSizeStr)
|
|
279
|
+
{
|
|
280
|
+
if (!pSizeStr || pSizeStr === 'N/A' || pSizeStr === 'varies' || pSizeStr === 'small' || pSizeStr === 'large')
|
|
281
|
+
{
|
|
282
|
+
return 0;
|
|
283
|
+
}
|
|
284
|
+
|
|
285
|
+
// Handle compound sizes like "companyfacts ~10GB, submissions ~8GB"
|
|
286
|
+
if (pSizeStr.indexOf(',') > -1)
|
|
287
|
+
{
|
|
288
|
+
let tmpParts = pSizeStr.split(',');
|
|
289
|
+
let tmpTotal = 0;
|
|
290
|
+
for (let i = 0; i < tmpParts.length; i++)
|
|
291
|
+
{
|
|
292
|
+
tmpTotal += this.parseSize(tmpParts[i].trim());
|
|
293
|
+
}
|
|
294
|
+
return tmpTotal;
|
|
295
|
+
}
|
|
296
|
+
|
|
297
|
+
// Handle "cities15000: 2MB, allCountries: 1.5GB" style
|
|
298
|
+
if (pSizeStr.indexOf(':') > -1)
|
|
299
|
+
{
|
|
300
|
+
let tmpColonParts = pSizeStr.split(':');
|
|
301
|
+
if (tmpColonParts.length >= 2)
|
|
302
|
+
{
|
|
303
|
+
return this.parseSize(tmpColonParts[tmpColonParts.length - 1].trim());
|
|
304
|
+
}
|
|
305
|
+
}
|
|
306
|
+
|
|
307
|
+
// Strip leading text like "~" or "companyfacts "
|
|
308
|
+
let tmpMatch = pSizeStr.match(/([\d.]+)\s*(KB|MB|GB|TB)/i);
|
|
309
|
+
if (!tmpMatch)
|
|
310
|
+
{
|
|
311
|
+
return 0;
|
|
312
|
+
}
|
|
313
|
+
|
|
314
|
+
let tmpValue = parseFloat(tmpMatch[1]);
|
|
315
|
+
let tmpUnit = tmpMatch[2].toUpperCase();
|
|
316
|
+
let tmpMultipliers = { 'KB': 1024, 'MB': 1024 * 1024, 'GB': 1024 * 1024 * 1024, 'TB': 1024 * 1024 * 1024 * 1024 };
|
|
317
|
+
return Math.round(tmpValue * (tmpMultipliers[tmpUnit] || 1));
|
|
318
|
+
}
|
|
319
|
+
|
|
320
|
+
formatSize(pBytes)
|
|
321
|
+
{
|
|
322
|
+
if (pBytes === 0)
|
|
323
|
+
{
|
|
324
|
+
return '???';
|
|
325
|
+
}
|
|
326
|
+
if (pBytes < 1024)
|
|
327
|
+
{
|
|
328
|
+
return pBytes + ' B';
|
|
329
|
+
}
|
|
330
|
+
if (pBytes < 1024 * 1024)
|
|
331
|
+
{
|
|
332
|
+
return (pBytes / 1024).toFixed(1) + ' KB';
|
|
333
|
+
}
|
|
334
|
+
if (pBytes < 1024 * 1024 * 1024)
|
|
335
|
+
{
|
|
336
|
+
return (pBytes / (1024 * 1024)).toFixed(1) + ' MB';
|
|
337
|
+
}
|
|
338
|
+
return (pBytes / (1024 * 1024 * 1024)).toFixed(2) + ' GB';
|
|
339
|
+
}
|
|
340
|
+
|
|
341
|
+
// ================================================================
|
|
342
|
+
// HTTP Download
|
|
343
|
+
// ================================================================
|
|
344
|
+
|
|
345
|
+
/**
|
|
346
|
+
* Download a single URL to a local file path.
|
|
347
|
+
* Returns a Promise that resolves with { path, size }.
|
|
348
|
+
* Follows redirects up to 5 times.
|
|
349
|
+
*/
|
|
350
|
+
downloadFile(pUrl, pDestPath, pRedirects, pHeaders)
|
|
351
|
+
{
|
|
352
|
+
let tmpRedirects = pRedirects || 0;
|
|
353
|
+
if (tmpRedirects > 5)
|
|
354
|
+
{
|
|
355
|
+
return Promise.reject(new Error(`Too many redirects for ${pUrl}`));
|
|
356
|
+
}
|
|
357
|
+
|
|
358
|
+
let tmpSelf = this;
|
|
359
|
+
return new Promise(
|
|
360
|
+
(fResolve, fReject) =>
|
|
361
|
+
{
|
|
362
|
+
let tmpParsed = new URL(pUrl);
|
|
363
|
+
let tmpLib = tmpParsed.protocol === 'https:' ? libHttps : libHttp;
|
|
364
|
+
|
|
365
|
+
let tmpRequestOptions = (
|
|
366
|
+
{
|
|
367
|
+
headers: Object.assign(
|
|
368
|
+
{
|
|
369
|
+
'User-Agent': tmpSelf.options.UserAgent
|
|
370
|
+
}, pHeaders || {})
|
|
371
|
+
});
|
|
372
|
+
|
|
373
|
+
let tmpRequest = tmpLib.get(pUrl, tmpRequestOptions,
|
|
374
|
+
(pResponse) =>
|
|
375
|
+
{
|
|
376
|
+
// Follow redirects
|
|
377
|
+
if (pResponse.statusCode >= 300 && pResponse.statusCode < 400 && pResponse.headers.location)
|
|
378
|
+
{
|
|
379
|
+
let tmpRedirectUrl = pResponse.headers.location;
|
|
380
|
+
if (!tmpRedirectUrl.startsWith('http'))
|
|
381
|
+
{
|
|
382
|
+
tmpRedirectUrl = tmpParsed.protocol + '//' + tmpParsed.host + tmpRedirectUrl;
|
|
383
|
+
}
|
|
384
|
+
pResponse.resume();
|
|
385
|
+
return tmpSelf.downloadFile(tmpRedirectUrl, pDestPath, tmpRedirects + 1, pHeaders)
|
|
386
|
+
.then(fResolve)
|
|
387
|
+
.catch(fReject);
|
|
388
|
+
}
|
|
389
|
+
|
|
390
|
+
if (pResponse.statusCode !== 200)
|
|
391
|
+
{
|
|
392
|
+
pResponse.resume();
|
|
393
|
+
return fReject(new Error(`HTTP ${pResponse.statusCode} for ${pUrl}`));
|
|
394
|
+
}
|
|
395
|
+
|
|
396
|
+
// Ensure parent directory exists
|
|
397
|
+
let tmpDir = libPath.dirname(pDestPath);
|
|
398
|
+
if (!libFs.existsSync(tmpDir))
|
|
399
|
+
{
|
|
400
|
+
libFs.mkdirSync(tmpDir, { recursive: true });
|
|
401
|
+
}
|
|
402
|
+
|
|
403
|
+
let tmpFile = libFs.createWriteStream(pDestPath);
|
|
404
|
+
let tmpDownloaded = 0;
|
|
405
|
+
let tmpContentLength = parseInt(pResponse.headers['content-length'] || '0', 10);
|
|
406
|
+
let tmpLastProgress = 0;
|
|
407
|
+
|
|
408
|
+
pResponse.on('data',
|
|
409
|
+
(pChunk) =>
|
|
410
|
+
{
|
|
411
|
+
tmpDownloaded += pChunk.length;
|
|
412
|
+
|
|
413
|
+
if (tmpContentLength > 0)
|
|
414
|
+
{
|
|
415
|
+
let tmpProgress = Math.floor((tmpDownloaded / tmpContentLength) * 100);
|
|
416
|
+
if (tmpProgress >= tmpLastProgress + 10)
|
|
417
|
+
{
|
|
418
|
+
tmpLastProgress = tmpProgress;
|
|
419
|
+
process.stdout.write(` ... ${tmpProgress}% (${tmpSelf.formatSize(tmpDownloaded)})\r`);
|
|
420
|
+
}
|
|
421
|
+
}
|
|
422
|
+
});
|
|
423
|
+
|
|
424
|
+
pResponse.pipe(tmpFile);
|
|
425
|
+
|
|
426
|
+
tmpFile.on('finish',
|
|
427
|
+
() =>
|
|
428
|
+
{
|
|
429
|
+
tmpFile.close();
|
|
430
|
+
if (tmpContentLength > 0)
|
|
431
|
+
{
|
|
432
|
+
process.stdout.write(' \r');
|
|
433
|
+
}
|
|
434
|
+
fResolve({ path: pDestPath, size: tmpDownloaded });
|
|
435
|
+
});
|
|
436
|
+
|
|
437
|
+
tmpFile.on('error',
|
|
438
|
+
(pError) =>
|
|
439
|
+
{
|
|
440
|
+
libFs.unlink(pDestPath, () => {});
|
|
441
|
+
fReject(pError);
|
|
442
|
+
});
|
|
443
|
+
});
|
|
444
|
+
|
|
445
|
+
tmpRequest.on('error', fReject);
|
|
446
|
+
|
|
447
|
+
tmpRequest.setTimeout(300000,
|
|
448
|
+
() =>
|
|
449
|
+
{
|
|
450
|
+
tmpRequest.destroy();
|
|
451
|
+
fReject(new Error(`Timeout downloading ${pUrl}`));
|
|
452
|
+
});
|
|
453
|
+
});
|
|
454
|
+
}
|
|
455
|
+
|
|
456
|
+
/**
|
|
457
|
+
* Fetch a URL and return the parsed JSON body.
|
|
458
|
+
* Returns a Promise that resolves with the parsed object.
|
|
459
|
+
*/
|
|
460
|
+
fetchJson(pUrl, pHeaders)
|
|
461
|
+
{
|
|
462
|
+
let tmpSelf = this;
|
|
463
|
+
return new Promise(
|
|
464
|
+
(fResolve, fReject) =>
|
|
465
|
+
{
|
|
466
|
+
let tmpParsed = new URL(pUrl);
|
|
467
|
+
let tmpLib = tmpParsed.protocol === 'https:' ? libHttps : libHttp;
|
|
468
|
+
|
|
469
|
+
let tmpRequestHeaders = Object.assign(
|
|
470
|
+
{
|
|
471
|
+
'User-Agent': tmpSelf.options.UserAgent,
|
|
472
|
+
'Accept': 'application/json'
|
|
473
|
+
}, pHeaders || {});
|
|
474
|
+
|
|
475
|
+
let tmpRequestOptions = (
|
|
476
|
+
{
|
|
477
|
+
headers: tmpRequestHeaders
|
|
478
|
+
});
|
|
479
|
+
|
|
480
|
+
let tmpRequest = tmpLib.get(pUrl, tmpRequestOptions,
|
|
481
|
+
(pResponse) =>
|
|
482
|
+
{
|
|
483
|
+
// Follow redirects
|
|
484
|
+
if (pResponse.statusCode >= 300 && pResponse.statusCode < 400 && pResponse.headers.location)
|
|
485
|
+
{
|
|
486
|
+
let tmpRedirectUrl = pResponse.headers.location;
|
|
487
|
+
if (!tmpRedirectUrl.startsWith('http'))
|
|
488
|
+
{
|
|
489
|
+
tmpRedirectUrl = tmpParsed.protocol + '//' + tmpParsed.host + tmpRedirectUrl;
|
|
490
|
+
}
|
|
491
|
+
pResponse.resume();
|
|
492
|
+
return tmpSelf.fetchJson(tmpRedirectUrl, pHeaders)
|
|
493
|
+
.then(fResolve)
|
|
494
|
+
.catch(fReject);
|
|
495
|
+
}
|
|
496
|
+
|
|
497
|
+
if (pResponse.statusCode !== 200)
|
|
498
|
+
{
|
|
499
|
+
pResponse.resume();
|
|
500
|
+
return fReject(new Error(`HTTP ${pResponse.statusCode} for ${pUrl}`));
|
|
501
|
+
}
|
|
502
|
+
|
|
503
|
+
let tmpChunks = [];
|
|
504
|
+
pResponse.on('data', (pChunk) => { tmpChunks.push(pChunk); });
|
|
505
|
+
pResponse.on('end',
|
|
506
|
+
() =>
|
|
507
|
+
{
|
|
508
|
+
try
|
|
509
|
+
{
|
|
510
|
+
let tmpBody = Buffer.concat(tmpChunks).toString('utf8');
|
|
511
|
+
fResolve(JSON.parse(tmpBody));
|
|
512
|
+
}
|
|
513
|
+
catch (pError)
|
|
514
|
+
{
|
|
515
|
+
fReject(new Error(`Failed to parse JSON from ${pUrl}: ${pError.message}`));
|
|
516
|
+
}
|
|
517
|
+
});
|
|
518
|
+
pResponse.on('error', fReject);
|
|
519
|
+
});
|
|
520
|
+
|
|
521
|
+
tmpRequest.on('error', fReject);
|
|
522
|
+
|
|
523
|
+
tmpRequest.setTimeout(60000,
|
|
524
|
+
() =>
|
|
525
|
+
{
|
|
526
|
+
tmpRequest.destroy();
|
|
527
|
+
fReject(new Error(`Timeout fetching ${pUrl}`));
|
|
528
|
+
});
|
|
529
|
+
});
|
|
530
|
+
}
|
|
531
|
+
|
|
532
|
+
/**
|
|
533
|
+
* Fetch a URL and return the raw text body.
|
|
534
|
+
* Returns a Promise that resolves with a string.
|
|
535
|
+
*/
|
|
536
|
+
fetchText(pUrl, pHeaders)
|
|
537
|
+
{
|
|
538
|
+
let tmpSelf = this;
|
|
539
|
+
return new Promise(
|
|
540
|
+
(fResolve, fReject) =>
|
|
541
|
+
{
|
|
542
|
+
let tmpParsed = new URL(pUrl);
|
|
543
|
+
let tmpLib = tmpParsed.protocol === 'https:' ? libHttps : libHttp;
|
|
544
|
+
|
|
545
|
+
let tmpRequestHeaders = Object.assign(
|
|
546
|
+
{
|
|
547
|
+
'User-Agent': tmpSelf.options.UserAgent
|
|
548
|
+
}, pHeaders || {});
|
|
549
|
+
|
|
550
|
+
let tmpRequestOptions = (
|
|
551
|
+
{
|
|
552
|
+
headers: tmpRequestHeaders
|
|
553
|
+
});
|
|
554
|
+
|
|
555
|
+
let tmpRequest = tmpLib.get(pUrl, tmpRequestOptions,
|
|
556
|
+
(pResponse) =>
|
|
557
|
+
{
|
|
558
|
+
if (pResponse.statusCode >= 300 && pResponse.statusCode < 400 && pResponse.headers.location)
|
|
559
|
+
{
|
|
560
|
+
let tmpRedirectUrl = pResponse.headers.location;
|
|
561
|
+
if (!tmpRedirectUrl.startsWith('http'))
|
|
562
|
+
{
|
|
563
|
+
tmpRedirectUrl = tmpParsed.protocol + '//' + tmpParsed.host + tmpRedirectUrl;
|
|
564
|
+
}
|
|
565
|
+
pResponse.resume();
|
|
566
|
+
return tmpSelf.fetchText(tmpRedirectUrl, pHeaders)
|
|
567
|
+
.then(fResolve)
|
|
568
|
+
.catch(fReject);
|
|
569
|
+
}
|
|
570
|
+
|
|
571
|
+
if (pResponse.statusCode !== 200)
|
|
572
|
+
{
|
|
573
|
+
pResponse.resume();
|
|
574
|
+
return fReject(new Error(`HTTP ${pResponse.statusCode} for ${pUrl}`));
|
|
575
|
+
}
|
|
576
|
+
|
|
577
|
+
let tmpChunks = [];
|
|
578
|
+
pResponse.on('data', (pChunk) => { tmpChunks.push(pChunk); });
|
|
579
|
+
pResponse.on('end',
|
|
580
|
+
() =>
|
|
581
|
+
{
|
|
582
|
+
fResolve(Buffer.concat(tmpChunks).toString('utf8'));
|
|
583
|
+
});
|
|
584
|
+
pResponse.on('error', fReject);
|
|
585
|
+
});
|
|
586
|
+
|
|
587
|
+
tmpRequest.on('error', fReject);
|
|
588
|
+
|
|
589
|
+
tmpRequest.setTimeout(60000,
|
|
590
|
+
() =>
|
|
591
|
+
{
|
|
592
|
+
tmpRequest.destroy();
|
|
593
|
+
fReject(new Error(`Timeout fetching ${pUrl}`));
|
|
594
|
+
});
|
|
595
|
+
});
|
|
596
|
+
}
|
|
597
|
+
|
|
598
|
+
// ================================================================
|
|
599
|
+
// Archive Extraction
|
|
600
|
+
// ================================================================
|
|
601
|
+
|
|
602
|
+
extractArchive(pArchivePath, pDestDir)
|
|
603
|
+
{
|
|
604
|
+
return new Promise(
|
|
605
|
+
(fResolve, fReject) =>
|
|
606
|
+
{
|
|
607
|
+
let tmpFilename = libPath.basename(pArchivePath).toLowerCase();
|
|
608
|
+
let tmpCommand = null;
|
|
609
|
+
|
|
610
|
+
if (tmpFilename.endsWith('.zip'))
|
|
611
|
+
{
|
|
612
|
+
tmpCommand = `unzip -o -q "${pArchivePath}" -d "${pDestDir}"`;
|
|
613
|
+
}
|
|
614
|
+
else if (tmpFilename.endsWith('.tar.gz') || tmpFilename.endsWith('.tgz'))
|
|
615
|
+
{
|
|
616
|
+
tmpCommand = `tar -xzf "${pArchivePath}" -C "${pDestDir}"`;
|
|
617
|
+
}
|
|
618
|
+
else if (tmpFilename.endsWith('.tar.bz2') || tmpFilename.endsWith('.tbz2'))
|
|
619
|
+
{
|
|
620
|
+
tmpCommand = `tar -xjf "${pArchivePath}" -C "${pDestDir}"`;
|
|
621
|
+
}
|
|
622
|
+
else if (tmpFilename.endsWith('.tar.xz'))
|
|
623
|
+
{
|
|
624
|
+
tmpCommand = `tar -xJf "${pArchivePath}" -C "${pDestDir}"`;
|
|
625
|
+
}
|
|
626
|
+
else if (tmpFilename.endsWith('.tar.zst') || tmpFilename.endsWith('.tar.zstd'))
|
|
627
|
+
{
|
|
628
|
+
tmpCommand = `zstd -d "${pArchivePath}" --stdout | tar -xf - -C "${pDestDir}"`;
|
|
629
|
+
}
|
|
630
|
+
else if (tmpFilename.endsWith('.gz') && !tmpFilename.endsWith('.tar.gz'))
|
|
631
|
+
{
|
|
632
|
+
tmpCommand = `gunzip -k "${pArchivePath}"`;
|
|
633
|
+
}
|
|
634
|
+
else
|
|
635
|
+
{
|
|
636
|
+
return fResolve();
|
|
637
|
+
}
|
|
638
|
+
|
|
639
|
+
this.log.info(` Extracting: ${tmpFilename}`);
|
|
640
|
+
libChildProcess.exec(tmpCommand, { maxBuffer: 50 * 1024 * 1024 },
|
|
641
|
+
(pError, pStdout, pStderr) =>
|
|
642
|
+
{
|
|
643
|
+
if (pError)
|
|
644
|
+
{
|
|
645
|
+
this.log.warn(` Extract warning: ${pError.message}`);
|
|
646
|
+
}
|
|
647
|
+
fResolve();
|
|
648
|
+
});
|
|
649
|
+
});
|
|
650
|
+
}
|
|
651
|
+
|
|
652
|
+
// ================================================================
|
|
653
|
+
// Git Clone
|
|
654
|
+
// ================================================================
|
|
655
|
+
|
|
656
|
+
gitClone(pUrl, pDestDir)
|
|
657
|
+
{
|
|
658
|
+
let tmpSelf = this;
|
|
659
|
+
return new Promise(
|
|
660
|
+
(fResolve, fReject) =>
|
|
661
|
+
{
|
|
662
|
+
let tmpRepoName = libPath.basename(pUrl, '.git').replace(/\.git$/, '');
|
|
663
|
+
let tmpCloneTarget = libPath.join(pDestDir, tmpRepoName);
|
|
664
|
+
|
|
665
|
+
if (libFs.existsSync(tmpCloneTarget))
|
|
666
|
+
{
|
|
667
|
+
tmpSelf.log.info(` Updating existing clone: ${tmpRepoName}`);
|
|
668
|
+
libChildProcess.exec(`git -C "${tmpCloneTarget}" pull --ff-only`, { timeout: 120000 },
|
|
669
|
+
(pError) =>
|
|
670
|
+
{
|
|
671
|
+
if (pError)
|
|
672
|
+
{
|
|
673
|
+
tmpSelf.log.warn(` Git pull warning: ${pError.message}`);
|
|
674
|
+
}
|
|
675
|
+
fResolve({ path: tmpCloneTarget });
|
|
676
|
+
});
|
|
677
|
+
}
|
|
678
|
+
else
|
|
679
|
+
{
|
|
680
|
+
tmpSelf.log.info(` Cloning: ${pUrl}`);
|
|
681
|
+
libChildProcess.exec(`git clone --depth 1 "${pUrl}" "${tmpCloneTarget}"`, { timeout: 300000 },
|
|
682
|
+
(pError) =>
|
|
683
|
+
{
|
|
684
|
+
if (pError)
|
|
685
|
+
{
|
|
686
|
+
return fReject(new Error(`Git clone failed for ${pUrl}: ${pError.message}`));
|
|
687
|
+
}
|
|
688
|
+
fResolve({ path: tmpCloneTarget });
|
|
689
|
+
});
|
|
690
|
+
}
|
|
691
|
+
});
|
|
692
|
+
}
|
|
693
|
+
|
|
694
|
+
// ================================================================
|
|
695
|
+
// URL / Filename Helpers
|
|
696
|
+
// ================================================================
|
|
697
|
+
|
|
698
|
+
filenameFromUrl(pUrl)
|
|
699
|
+
{
|
|
700
|
+
try
|
|
701
|
+
{
|
|
702
|
+
let tmpParsed = new URL(pUrl);
|
|
703
|
+
let tmpPathname = tmpParsed.pathname;
|
|
704
|
+
let tmpFilename = libPath.basename(tmpPathname);
|
|
705
|
+
|
|
706
|
+
if (!tmpFilename || tmpFilename === '/')
|
|
707
|
+
{
|
|
708
|
+
tmpFilename = 'download_' + libCrypto.createHash('md5').update(pUrl).digest('hex').substring(0, 8);
|
|
709
|
+
}
|
|
710
|
+
|
|
711
|
+
if (!tmpFilename.match(/\.\w{2,5}$/))
|
|
712
|
+
{
|
|
713
|
+
if (pUrl.indexOf('.csv') > -1)
|
|
714
|
+
{
|
|
715
|
+
tmpFilename += '.csv';
|
|
716
|
+
}
|
|
717
|
+
else if (pUrl.indexOf('.json') > -1)
|
|
718
|
+
{
|
|
719
|
+
tmpFilename += '.json';
|
|
720
|
+
}
|
|
721
|
+
else if (pUrl.indexOf('.zip') > -1)
|
|
722
|
+
{
|
|
723
|
+
tmpFilename += '.zip';
|
|
724
|
+
}
|
|
725
|
+
}
|
|
726
|
+
|
|
727
|
+
return tmpFilename;
|
|
728
|
+
}
|
|
729
|
+
catch (pError)
|
|
730
|
+
{
|
|
731
|
+
return 'download_' + libCrypto.createHash('md5').update(pUrl).digest('hex').substring(0, 8);
|
|
732
|
+
}
|
|
733
|
+
}
|
|
734
|
+
|
|
735
|
+
isDirectDownloadUrl(pUrl)
|
|
736
|
+
{
|
|
737
|
+
let tmpLower = pUrl.toLowerCase();
|
|
738
|
+
|
|
739
|
+
if (tmpLower.match(/\.(zip|gz|tgz|tar|csv|tsv|json|xml|txt|xls|xlsx|dat|bz2|xz|zst|zstd|sqlite|sql)$/))
|
|
740
|
+
{
|
|
741
|
+
return true;
|
|
742
|
+
}
|
|
743
|
+
|
|
744
|
+
if (tmpLower.indexOf('/download') > -1 || tmpLower.indexOf('/data/') > -1)
|
|
745
|
+
{
|
|
746
|
+
return true;
|
|
747
|
+
}
|
|
748
|
+
|
|
749
|
+
if (tmpLower.indexOf('raw.githubusercontent.com') > -1 || tmpLower.indexOf('/releases/download/') > -1)
|
|
750
|
+
{
|
|
751
|
+
return true;
|
|
752
|
+
}
|
|
753
|
+
|
|
754
|
+
if (tmpLower.indexOf('download.geonames.org') > -1 || tmpLower.indexOf('datasets.imdbws.com') > -1)
|
|
755
|
+
{
|
|
756
|
+
return true;
|
|
757
|
+
}
|
|
758
|
+
|
|
759
|
+
if (tmpLower.indexOf('datahub.io') > -1 && tmpLower.indexOf('/r/') > -1)
|
|
760
|
+
{
|
|
761
|
+
return true;
|
|
762
|
+
}
|
|
763
|
+
|
|
764
|
+
if (tmpLower.indexOf('standards-oui.ieee.org') > -1)
|
|
765
|
+
{
|
|
766
|
+
return true;
|
|
767
|
+
}
|
|
768
|
+
if (tmpLower.indexOf('rfc-editor.org/rfc-index') > -1)
|
|
769
|
+
{
|
|
770
|
+
return true;
|
|
771
|
+
}
|
|
772
|
+
if (tmpLower.indexOf('data.iana.org') > -1)
|
|
773
|
+
{
|
|
774
|
+
return true;
|
|
775
|
+
}
|
|
776
|
+
if (tmpLower.indexOf('nasdaqtrader.com') > -1)
|
|
777
|
+
{
|
|
778
|
+
return true;
|
|
779
|
+
}
|
|
780
|
+
if (tmpLower.indexOf('files.usaspending.gov') > -1)
|
|
781
|
+
{
|
|
782
|
+
return true;
|
|
783
|
+
}
|
|
784
|
+
if (tmpLower.indexOf('sec.gov/files/') > -1)
|
|
785
|
+
{
|
|
786
|
+
return true;
|
|
787
|
+
}
|
|
788
|
+
if (tmpLower.indexOf('openlibrary.org/data/') > -1)
|
|
789
|
+
{
|
|
790
|
+
return true;
|
|
791
|
+
}
|
|
792
|
+
if (tmpLower.indexOf('gutenberg.org/cache/') > -1)
|
|
793
|
+
{
|
|
794
|
+
return true;
|
|
795
|
+
}
|
|
796
|
+
if (tmpLower.indexOf('ourairports.com/data/') > -1)
|
|
797
|
+
{
|
|
798
|
+
return true;
|
|
799
|
+
}
|
|
800
|
+
if (tmpLower.indexOf('iso639-3.sil.org') > -1 && tmpLower.indexOf('/downloads/') > -1)
|
|
801
|
+
{
|
|
802
|
+
return true;
|
|
803
|
+
}
|
|
804
|
+
|
|
805
|
+
if (tmpLower.match(/[?&]format=(csv|json|xml)/))
|
|
806
|
+
{
|
|
807
|
+
return true;
|
|
808
|
+
}
|
|
809
|
+
if (tmpLower.indexOf('goldencopy.gleif.org') > -1)
|
|
810
|
+
{
|
|
811
|
+
return true;
|
|
812
|
+
}
|
|
813
|
+
if (tmpLower.indexOf('enterpriseefiling.fcc.gov') > -1)
|
|
814
|
+
{
|
|
815
|
+
return true;
|
|
816
|
+
}
|
|
817
|
+
if (tmpLower.indexOf('irs.gov/pub/') > -1)
|
|
818
|
+
{
|
|
819
|
+
return true;
|
|
820
|
+
}
|
|
821
|
+
if (tmpLower.indexOf('onetcenter.org/dl_files/') > -1)
|
|
822
|
+
{
|
|
823
|
+
return true;
|
|
824
|
+
}
|
|
825
|
+
if (tmpLower.indexOf('downloads.dbpedia.org') > -1)
|
|
826
|
+
{
|
|
827
|
+
return true;
|
|
828
|
+
}
|
|
829
|
+
if (tmpLower.indexOf('data.metabrainz.org') > -1)
|
|
830
|
+
{
|
|
831
|
+
return true;
|
|
832
|
+
}
|
|
833
|
+
if (tmpLower.indexOf('archive.org/download/') > -1)
|
|
834
|
+
{
|
|
835
|
+
return true;
|
|
836
|
+
}
|
|
837
|
+
if (tmpLower.indexOf('planet.openstreetmap.org') > -1)
|
|
838
|
+
{
|
|
839
|
+
return true;
|
|
840
|
+
}
|
|
841
|
+
if (tmpLower.indexOf('static.openfoodfacts.org') > -1)
|
|
842
|
+
{
|
|
843
|
+
return true;
|
|
844
|
+
}
|
|
845
|
+
if (tmpLower.indexOf('static.openbeautyfacts.org') > -1)
|
|
846
|
+
{
|
|
847
|
+
return true;
|
|
848
|
+
}
|
|
849
|
+
if (tmpLower.indexOf('static.openpetfoodfacts.org') > -1)
|
|
850
|
+
{
|
|
851
|
+
return true;
|
|
852
|
+
}
|
|
853
|
+
if (tmpLower.indexOf('api.crossref.org/snapshots/') > -1)
|
|
854
|
+
{
|
|
855
|
+
return true;
|
|
856
|
+
}
|
|
857
|
+
if (tmpLower.indexOf('loc.gov/cds/downloads/') > -1)
|
|
858
|
+
{
|
|
859
|
+
return true;
|
|
860
|
+
}
|
|
861
|
+
if (tmpLower.indexOf('sec.gov/Archives/') > -1)
|
|
862
|
+
{
|
|
863
|
+
return true;
|
|
864
|
+
}
|
|
865
|
+
if (tmpLower.indexOf('discogs-data-dumps') > -1)
|
|
866
|
+
{
|
|
867
|
+
return true;
|
|
868
|
+
}
|
|
869
|
+
if (tmpLower.indexOf('databus.dbpedia.org') > -1)
|
|
870
|
+
{
|
|
871
|
+
return true;
|
|
872
|
+
}
|
|
873
|
+
if (tmpLower.indexOf('aqs.epa.gov/aqsweb/airdata/') > -1)
|
|
874
|
+
{
|
|
875
|
+
return true;
|
|
876
|
+
}
|
|
877
|
+
if (tmpLower.indexOf('download.open.fda.gov') > -1)
|
|
878
|
+
{
|
|
879
|
+
return true;
|
|
880
|
+
}
|
|
881
|
+
if (tmpLower.indexOf('fdc.nal.usda.gov/fdc-datasets/') > -1)
|
|
882
|
+
{
|
|
883
|
+
return true;
|
|
884
|
+
}
|
|
885
|
+
if (tmpLower.indexOf('ncei.noaa.gov/pub/data/') > -1)
|
|
886
|
+
{
|
|
887
|
+
return true;
|
|
888
|
+
}
|
|
889
|
+
if (tmpLower.indexOf('fenixservices.fao.org') > -1)
|
|
890
|
+
{
|
|
891
|
+
return true;
|
|
892
|
+
}
|
|
893
|
+
if (tmpLower.indexOf('ftp.cdc.gov/pub/') > -1)
|
|
894
|
+
{
|
|
895
|
+
return true;
|
|
896
|
+
}
|
|
897
|
+
if (tmpLower.indexOf('accessdata.fda.gov/cder/') > -1)
|
|
898
|
+
{
|
|
899
|
+
return true;
|
|
900
|
+
}
|
|
901
|
+
if (tmpLower.indexOf('cdstar.eva.mpg.de') > -1)
|
|
902
|
+
{
|
|
903
|
+
return true;
|
|
904
|
+
}
|
|
905
|
+
if (tmpLower.indexOf('nces.ed.gov/ipeds/datacenter/data/') > -1)
|
|
906
|
+
{
|
|
907
|
+
return true;
|
|
908
|
+
}
|
|
909
|
+
if (tmpLower.indexOf('storage.googleapis.com/pantheon-public-data') > -1)
|
|
910
|
+
{
|
|
911
|
+
return true;
|
|
912
|
+
}
|
|
913
|
+
if (tmpLower.indexOf('ndownloader.figshare.com') > -1)
|
|
914
|
+
{
|
|
915
|
+
return true;
|
|
916
|
+
}
|
|
917
|
+
if (tmpLower.indexOf('gist.githubusercontent.com') > -1)
|
|
918
|
+
{
|
|
919
|
+
return true;
|
|
920
|
+
}
|
|
921
|
+
if (tmpLower.indexOf('cms.gov/files/zip/') > -1)
|
|
922
|
+
{
|
|
923
|
+
return true;
|
|
924
|
+
}
|
|
925
|
+
if (tmpLower.indexOf('data.bls.gov/cew/') > -1)
|
|
926
|
+
{
|
|
927
|
+
return true;
|
|
928
|
+
}
|
|
929
|
+
if (tmpLower.indexOf('dumps.wikimedia.org') > -1)
|
|
930
|
+
{
|
|
931
|
+
return true;
|
|
932
|
+
}
|
|
933
|
+
if (tmpLower.indexOf('cbwinslow/baseballdatabank') > -1)
|
|
934
|
+
{
|
|
935
|
+
return true;
|
|
936
|
+
}
|
|
937
|
+
if (tmpLower.indexOf('onetcenter.org/taxonomy/') > -1)
|
|
938
|
+
{
|
|
939
|
+
return true;
|
|
940
|
+
}
|
|
941
|
+
if (tmpLower.indexOf('bulks-faostat.fao.org') > -1)
|
|
942
|
+
{
|
|
943
|
+
return true;
|
|
944
|
+
}
|
|
945
|
+
if (tmpLower.indexOf('nces.ed.gov/ccd/') > -1)
|
|
946
|
+
{
|
|
947
|
+
return true;
|
|
948
|
+
}
|
|
949
|
+
if (tmpLower.indexOf('nces.ed.gov/surveys/pss/') > -1)
|
|
950
|
+
{
|
|
951
|
+
return true;
|
|
952
|
+
}
|
|
953
|
+
|
|
954
|
+
return false;
|
|
955
|
+
}
|
|
956
|
+
|
|
957
|
+
isArchiveFilename(pFilename)
|
|
958
|
+
{
|
|
959
|
+
let tmpLower = pFilename.toLowerCase();
|
|
960
|
+
return tmpLower.endsWith('.zip') ||
|
|
961
|
+
tmpLower.endsWith('.tar.gz') ||
|
|
962
|
+
tmpLower.endsWith('.tgz') ||
|
|
963
|
+
tmpLower.endsWith('.tar.bz2') ||
|
|
964
|
+
tmpLower.endsWith('.tar.xz') ||
|
|
965
|
+
tmpLower.endsWith('.tar.zst') ||
|
|
966
|
+
tmpLower.endsWith('.tar.zstd') ||
|
|
967
|
+
tmpLower.endsWith('.gz');
|
|
968
|
+
}
|
|
969
|
+
|
|
970
|
+
// ================================================================
|
|
971
|
+
// Download Method Handlers
|
|
972
|
+
// ================================================================
|
|
973
|
+
|
|
974
|
+
async downloadHttpFiles(pDataset, pDatasetDir, pManifest)
|
|
975
|
+
{
|
|
976
|
+
let tmpUrls = pDataset.urls || [];
|
|
977
|
+
let tmpDatasetHeaders = pDataset.headers || null;
|
|
978
|
+
|
|
979
|
+
for (let i = 0; i < tmpUrls.length; i++)
|
|
980
|
+
{
|
|
981
|
+
let tmpUrl = tmpUrls[i];
|
|
982
|
+
|
|
983
|
+
if (tmpUrl.indexOf('{') > -1)
|
|
984
|
+
{
|
|
985
|
+
this.log.info(` Skipping template URL: ${tmpUrl}`);
|
|
986
|
+
continue;
|
|
987
|
+
}
|
|
988
|
+
|
|
989
|
+
let tmpFilename = this.filenameFromUrl(tmpUrl);
|
|
990
|
+
let tmpDestPath = libPath.join(pDatasetDir, tmpFilename);
|
|
991
|
+
|
|
992
|
+
this.log.info(` Downloading: ${tmpFilename}`);
|
|
993
|
+
try
|
|
994
|
+
{
|
|
995
|
+
let tmpResult = await this.downloadFile(tmpUrl, tmpDestPath, 0, tmpDatasetHeaders);
|
|
996
|
+
pManifest.urls_downloaded.push(tmpUrl);
|
|
997
|
+
this.log.info(` OK: ${this.formatSize(tmpResult.size)}`);
|
|
998
|
+
}
|
|
999
|
+
catch (pError)
|
|
1000
|
+
{
|
|
1001
|
+
this.log.error(` Failed: ${tmpUrl} — ${pError.message}`);
|
|
1002
|
+
}
|
|
1003
|
+
}
|
|
1004
|
+
}
|
|
1005
|
+
|
|
1006
|
+
async downloadHttpArchives(pDataset, pDatasetDir, pManifest)
|
|
1007
|
+
{
|
|
1008
|
+
let tmpUrls = pDataset.urls || [];
|
|
1009
|
+
let tmpDatasetHeaders = pDataset.headers || null;
|
|
1010
|
+
|
|
1011
|
+
for (let i = 0; i < tmpUrls.length; i++)
|
|
1012
|
+
{
|
|
1013
|
+
let tmpUrl = tmpUrls[i];
|
|
1014
|
+
|
|
1015
|
+
if (!this.isDirectDownloadUrl(tmpUrl))
|
|
1016
|
+
{
|
|
1017
|
+
this.log.info(` Skipping non-direct URL (browse manually): ${tmpUrl}`);
|
|
1018
|
+
continue;
|
|
1019
|
+
}
|
|
1020
|
+
|
|
1021
|
+
let tmpFilename = this.filenameFromUrl(tmpUrl);
|
|
1022
|
+
let tmpDestPath = libPath.join(pDatasetDir, tmpFilename);
|
|
1023
|
+
|
|
1024
|
+
this.log.info(` Downloading: ${tmpFilename}`);
|
|
1025
|
+
try
|
|
1026
|
+
{
|
|
1027
|
+
let tmpResult = await this.downloadFile(tmpUrl, tmpDestPath, 0, tmpDatasetHeaders);
|
|
1028
|
+
pManifest.urls_downloaded.push(tmpUrl);
|
|
1029
|
+
this.log.info(` OK: ${this.formatSize(tmpResult.size)}`);
|
|
1030
|
+
|
|
1031
|
+
if (this.isArchiveFilename(tmpFilename) && !pDataset.skip_extract)
|
|
1032
|
+
{
|
|
1033
|
+
await this.extractArchive(tmpDestPath, pDatasetDir);
|
|
1034
|
+
}
|
|
1035
|
+
}
|
|
1036
|
+
catch (pError)
|
|
1037
|
+
{
|
|
1038
|
+
this.log.error(` Failed: ${tmpUrl} — ${pError.message}`);
|
|
1039
|
+
}
|
|
1040
|
+
}
|
|
1041
|
+
}
|
|
1042
|
+
|
|
1043
|
+
async downloadGitClone(pDataset, pDatasetDir, pManifest)
|
|
1044
|
+
{
|
|
1045
|
+
let tmpUrls = pDataset.urls || [];
|
|
1046
|
+
|
|
1047
|
+
for (let i = 0; i < tmpUrls.length; i++)
|
|
1048
|
+
{
|
|
1049
|
+
let tmpUrl = tmpUrls[i];
|
|
1050
|
+
|
|
1051
|
+
if (!tmpUrl.match(/github\.com|gitlab\.com|bitbucket\.org/))
|
|
1052
|
+
{
|
|
1053
|
+
this.log.info(` Skipping non-git URL: ${tmpUrl}`);
|
|
1054
|
+
continue;
|
|
1055
|
+
}
|
|
1056
|
+
|
|
1057
|
+
try
|
|
1058
|
+
{
|
|
1059
|
+
await this.gitClone(tmpUrl, pDatasetDir);
|
|
1060
|
+
pManifest.urls_downloaded.push(tmpUrl);
|
|
1061
|
+
this.log.info(` OK: cloned`);
|
|
1062
|
+
}
|
|
1063
|
+
catch (pError)
|
|
1064
|
+
{
|
|
1065
|
+
this.log.error(` Failed: ${tmpUrl} — ${pError.message}`);
|
|
1066
|
+
}
|
|
1067
|
+
}
|
|
1068
|
+
}
|
|
1069
|
+
|
|
1070
|
+
// ================================================================
|
|
1071
|
+
// REST API Download (fetch_steps)
|
|
1072
|
+
// ================================================================
|
|
1073
|
+
|
|
1074
|
+
/**
|
|
1075
|
+
* Download a dataset via its fetch_steps DSL.
|
|
1076
|
+
*/
|
|
1077
|
+
async downloadRestApi(pDataset, pDatasetDir, pManifest)
|
|
1078
|
+
{
|
|
1079
|
+
let tmpSteps = pDataset.fetch_steps;
|
|
1080
|
+
if (!Array.isArray(tmpSteps) || tmpSteps.length === 0)
|
|
1081
|
+
{
|
|
1082
|
+
this.log.warn(` No fetch_steps defined for ${pDataset.id} — skipping`);
|
|
1083
|
+
return;
|
|
1084
|
+
}
|
|
1085
|
+
|
|
1086
|
+
await this.executeFetchSteps(tmpSteps, pDatasetDir, pManifest);
|
|
1087
|
+
}
|
|
1088
|
+
|
|
1089
|
+
/**
|
|
1090
|
+
* Execute an array of fetch_steps sequentially.
|
|
1091
|
+
* Each step can reference files created by previous steps.
|
|
1092
|
+
*/
|
|
1093
|
+
async executeFetchSteps(pSteps, pDatasetDir, pManifest)
|
|
1094
|
+
{
|
|
1095
|
+
for (let i = 0; i < pSteps.length; i++)
|
|
1096
|
+
{
|
|
1097
|
+
let tmpStep = pSteps[i];
|
|
1098
|
+
let tmpAction = tmpStep.action;
|
|
1099
|
+
|
|
1100
|
+
if (tmpAction === 'get_json')
|
|
1101
|
+
{
|
|
1102
|
+
await this.executeGetJson(tmpStep, pDatasetDir, pManifest);
|
|
1103
|
+
}
|
|
1104
|
+
else if (tmpAction === 'get_text')
|
|
1105
|
+
{
|
|
1106
|
+
await this.executeGetText(tmpStep, pDatasetDir, pManifest);
|
|
1107
|
+
}
|
|
1108
|
+
else if (tmpAction === 'for_each')
|
|
1109
|
+
{
|
|
1110
|
+
await this.executeForEach(tmpStep, pDatasetDir, pManifest);
|
|
1111
|
+
}
|
|
1112
|
+
else if (tmpAction === 'paginate')
|
|
1113
|
+
{
|
|
1114
|
+
await this.executePaginate(tmpStep, pDatasetDir, pManifest);
|
|
1115
|
+
}
|
|
1116
|
+
else if (tmpAction === 'merge_pages')
|
|
1117
|
+
{
|
|
1118
|
+
await this.executeMergePages(tmpStep, pDatasetDir, pManifest);
|
|
1119
|
+
}
|
|
1120
|
+
else if (tmpAction === 'for_each_in_pages')
|
|
1121
|
+
{
|
|
1122
|
+
await this.executeForEachInPages(tmpStep, pDatasetDir, pManifest);
|
|
1123
|
+
}
|
|
1124
|
+
else
|
|
1125
|
+
{
|
|
1126
|
+
this.log.warn(` Unknown fetch_step action: ${tmpAction}`);
|
|
1127
|
+
}
|
|
1128
|
+
}
|
|
1129
|
+
}
|
|
1130
|
+
|
|
1131
|
+
/**
|
|
1132
|
+
* get_json: Fetch a URL, save the JSON response to a file.
|
|
1133
|
+
*/
|
|
1134
|
+
async executeGetJson(pStep, pDatasetDir, pManifest)
|
|
1135
|
+
{
|
|
1136
|
+
let tmpUrl = pStep.url;
|
|
1137
|
+
let tmpSaveAs = pStep.save_as;
|
|
1138
|
+
let tmpHeaders = pStep.headers || {};
|
|
1139
|
+
|
|
1140
|
+
this.log.info(` Fetching: ${tmpUrl}`);
|
|
1141
|
+
|
|
1142
|
+
let tmpData = await this.fetchJson(tmpUrl, tmpHeaders);
|
|
1143
|
+
let tmpDestPath = libPath.join(pDatasetDir, tmpSaveAs);
|
|
1144
|
+
|
|
1145
|
+
// Ensure subdirectory exists
|
|
1146
|
+
let tmpDir = libPath.dirname(tmpDestPath);
|
|
1147
|
+
if (!libFs.existsSync(tmpDir))
|
|
1148
|
+
{
|
|
1149
|
+
libFs.mkdirSync(tmpDir, { recursive: true });
|
|
1150
|
+
}
|
|
1151
|
+
|
|
1152
|
+
libFs.writeFileSync(tmpDestPath, JSON.stringify(tmpData, null, '\t'), 'utf8');
|
|
1153
|
+
pManifest.urls_downloaded.push(tmpUrl);
|
|
1154
|
+
this.log.info(` Saved: ${tmpSaveAs} (${this.formatSize(Buffer.byteLength(JSON.stringify(tmpData, null, '\t')))})`);
|
|
1155
|
+
|
|
1156
|
+
if (pStep.delay_ms)
|
|
1157
|
+
{
|
|
1158
|
+
await this.delay(pStep.delay_ms);
|
|
1159
|
+
}
|
|
1160
|
+
}
|
|
1161
|
+
|
|
1162
|
+
/**
|
|
1163
|
+
* get_text: Fetch a URL, save the raw text to a file.
|
|
1164
|
+
*/
|
|
1165
|
+
async executeGetText(pStep, pDatasetDir, pManifest)
|
|
1166
|
+
{
|
|
1167
|
+
let tmpUrl = pStep.url;
|
|
1168
|
+
let tmpSaveAs = pStep.save_as;
|
|
1169
|
+
let tmpHeaders = pStep.headers || {};
|
|
1170
|
+
|
|
1171
|
+
this.log.info(` Fetching: ${tmpUrl}`);
|
|
1172
|
+
|
|
1173
|
+
let tmpData = await this.fetchText(tmpUrl, tmpHeaders);
|
|
1174
|
+
let tmpDestPath = libPath.join(pDatasetDir, tmpSaveAs);
|
|
1175
|
+
|
|
1176
|
+
let tmpDir = libPath.dirname(tmpDestPath);
|
|
1177
|
+
if (!libFs.existsSync(tmpDir))
|
|
1178
|
+
{
|
|
1179
|
+
libFs.mkdirSync(tmpDir, { recursive: true });
|
|
1180
|
+
}
|
|
1181
|
+
|
|
1182
|
+
libFs.writeFileSync(tmpDestPath, tmpData, 'utf8');
|
|
1183
|
+
pManifest.urls_downloaded.push(tmpUrl);
|
|
1184
|
+
this.log.info(` Saved: ${tmpSaveAs} (${this.formatSize(Buffer.byteLength(tmpData))})`);
|
|
1185
|
+
|
|
1186
|
+
if (pStep.delay_ms)
|
|
1187
|
+
{
|
|
1188
|
+
await this.delay(pStep.delay_ms);
|
|
1189
|
+
}
|
|
1190
|
+
}
|
|
1191
|
+
|
|
1192
|
+
/**
|
|
1193
|
+
* for_each: Read a previously-saved JSON array file, iterate items,
|
|
1194
|
+
* fetch a templated URL per item, and save results.
|
|
1195
|
+
*/
|
|
1196
|
+
async executeForEach(pStep, pDatasetDir, pManifest)
|
|
1197
|
+
{
|
|
1198
|
+
let tmpSourcePath = libPath.join(pDatasetDir, pStep.source_file);
|
|
1199
|
+
if (!libFs.existsSync(tmpSourcePath))
|
|
1200
|
+
{
|
|
1201
|
+
this.log.error(` for_each: source file not found: ${pStep.source_file}`);
|
|
1202
|
+
return;
|
|
1203
|
+
}
|
|
1204
|
+
|
|
1205
|
+
let tmpSourceData = JSON.parse(libFs.readFileSync(tmpSourcePath, 'utf8'));
|
|
1206
|
+
if (!Array.isArray(tmpSourceData))
|
|
1207
|
+
{
|
|
1208
|
+
this.log.error(` for_each: source file is not a JSON array: ${pStep.source_file}`);
|
|
1209
|
+
return;
|
|
1210
|
+
}
|
|
1211
|
+
|
|
1212
|
+
let tmpField = pStep.field;
|
|
1213
|
+
let tmpUrlTemplate = pStep.url_template;
|
|
1214
|
+
let tmpSaveAsTemplate = pStep.save_as;
|
|
1215
|
+
let tmpDelayMs = pStep.delay_ms || 100;
|
|
1216
|
+
let tmpHeaders = pStep.headers || {};
|
|
1217
|
+
let tmpSuccessCount = 0;
|
|
1218
|
+
let tmpErrorCount = 0;
|
|
1219
|
+
|
|
1220
|
+
this.log.info(` Iterating ${tmpSourceData.length} items from ${pStep.source_file}...`);
|
|
1221
|
+
|
|
1222
|
+
for (let i = 0; i < tmpSourceData.length; i++)
|
|
1223
|
+
{
|
|
1224
|
+
let tmpItem = tmpSourceData[i];
|
|
1225
|
+
let tmpValue = tmpField ? tmpItem[tmpField] : (typeof tmpItem === 'string' ? tmpItem : JSON.stringify(tmpItem));
|
|
1226
|
+
|
|
1227
|
+
// Expand template variables
|
|
1228
|
+
let tmpUrl = this.expandTemplate(tmpUrlTemplate, tmpItem, tmpValue);
|
|
1229
|
+
let tmpSaveAs = this.expandTemplate(tmpSaveAsTemplate, tmpItem, tmpValue);
|
|
1230
|
+
let tmpDestPath = libPath.join(pDatasetDir, tmpSaveAs);
|
|
1231
|
+
|
|
1232
|
+
// Ensure subdirectory exists
|
|
1233
|
+
let tmpDir = libPath.dirname(tmpDestPath);
|
|
1234
|
+
if (!libFs.existsSync(tmpDir))
|
|
1235
|
+
{
|
|
1236
|
+
libFs.mkdirSync(tmpDir, { recursive: true });
|
|
1237
|
+
}
|
|
1238
|
+
|
|
1239
|
+
let tmpSkipExisting = pStep.skip_existing !== false;
|
|
1240
|
+
let tmpSkipCount = 0;
|
|
1241
|
+
|
|
1242
|
+
try
|
|
1243
|
+
{
|
|
1244
|
+
// Skip if file already exists (for resumability)
|
|
1245
|
+
if (tmpSkipExisting && libFs.existsSync(tmpDestPath))
|
|
1246
|
+
{
|
|
1247
|
+
tmpSkipCount++;
|
|
1248
|
+
if ((i + 1) % 100 === 0 || i === tmpSourceData.length - 1)
|
|
1249
|
+
{
|
|
1250
|
+
process.stdout.write(` ... ${i + 1}/${tmpSourceData.length} (${tmpSuccessCount} ok, ${tmpSkipCount} cached, ${tmpErrorCount} err)\r`);
|
|
1251
|
+
}
|
|
1252
|
+
continue;
|
|
1253
|
+
}
|
|
1254
|
+
|
|
1255
|
+
let tmpData = await this.fetchJson(tmpUrl, tmpHeaders);
|
|
1256
|
+
libFs.writeFileSync(tmpDestPath, JSON.stringify(tmpData, null, '\t'), 'utf8');
|
|
1257
|
+
pManifest.urls_downloaded.push(tmpUrl);
|
|
1258
|
+
tmpSuccessCount++;
|
|
1259
|
+
|
|
1260
|
+
// Progress every 10 items or on last item
|
|
1261
|
+
if ((i + 1) % 10 === 0 || i === tmpSourceData.length - 1)
|
|
1262
|
+
{
|
|
1263
|
+
process.stdout.write(` ... ${i + 1}/${tmpSourceData.length} (${tmpSuccessCount} ok, ${tmpSkipCount} cached, ${tmpErrorCount} err)\r`);
|
|
1264
|
+
}
|
|
1265
|
+
}
|
|
1266
|
+
catch (pError)
|
|
1267
|
+
{
|
|
1268
|
+
tmpErrorCount++;
|
|
1269
|
+
this.log.warn(` Failed: ${tmpValue} — ${pError.message}`);
|
|
1270
|
+
}
|
|
1271
|
+
|
|
1272
|
+
if (tmpDelayMs > 0 && i < tmpSourceData.length - 1)
|
|
1273
|
+
{
|
|
1274
|
+
await this.delay(tmpDelayMs);
|
|
1275
|
+
}
|
|
1276
|
+
}
|
|
1277
|
+
|
|
1278
|
+
process.stdout.write(' \r');
|
|
1279
|
+
this.log.info(` for_each complete: ${tmpSuccessCount} ok, ${tmpSkipCount} cached, ${tmpErrorCount} errors`);
|
|
1280
|
+
}
|
|
1281
|
+
|
|
1282
|
+
/**
|
|
1283
|
+
* paginate: Fetch pages from a URL using offset or page-based pagination.
|
|
1284
|
+
*/
|
|
1285
|
+
async executePaginate(pStep, pDatasetDir, pManifest)
|
|
1286
|
+
{
|
|
1287
|
+
let tmpUrlTemplate = pStep.url_template;
|
|
1288
|
+
let tmpSaveAsTemplate = pStep.save_as;
|
|
1289
|
+
let tmpDelayMs = pStep.delay_ms || 100;
|
|
1290
|
+
let tmpHeaders = pStep.headers || {};
|
|
1291
|
+
let tmpMaxPages = pStep.max_pages || 1000;
|
|
1292
|
+
let tmpStartPage = pStep.start_page || 1;
|
|
1293
|
+
let tmpPageSize = pStep.page_size || 100;
|
|
1294
|
+
let tmpResultField = pStep.result_field;
|
|
1295
|
+
let tmpStopWhenEmpty = pStep.stop_when_empty !== false;
|
|
1296
|
+
|
|
1297
|
+
this.log.info(` Paginating: ${tmpUrlTemplate}`);
|
|
1298
|
+
|
|
1299
|
+
let tmpPage = tmpStartPage;
|
|
1300
|
+
let tmpOffset = 0;
|
|
1301
|
+
let tmpTotalSaved = 0;
|
|
1302
|
+
|
|
1303
|
+
while (tmpPage < tmpStartPage + tmpMaxPages)
|
|
1304
|
+
{
|
|
1305
|
+
let tmpUrl = tmpUrlTemplate
|
|
1306
|
+
.replace(/\{page\}/g, String(tmpPage))
|
|
1307
|
+
.replace(/\{offset\}/g, String(tmpOffset))
|
|
1308
|
+
.replace(/\{page_size\}/g, String(tmpPageSize));
|
|
1309
|
+
|
|
1310
|
+
let tmpSaveAs = tmpSaveAsTemplate
|
|
1311
|
+
.replace(/\{page\}/g, String(tmpPage))
|
|
1312
|
+
.replace(/\{offset\}/g, String(tmpOffset));
|
|
1313
|
+
|
|
1314
|
+
let tmpDestPath = libPath.join(pDatasetDir, tmpSaveAs);
|
|
1315
|
+
|
|
1316
|
+
let tmpDir = libPath.dirname(tmpDestPath);
|
|
1317
|
+
if (!libFs.existsSync(tmpDir))
|
|
1318
|
+
{
|
|
1319
|
+
libFs.mkdirSync(tmpDir, { recursive: true });
|
|
1320
|
+
}
|
|
1321
|
+
|
|
1322
|
+
// Skip if page file already exists (for resumability)
|
|
1323
|
+
if (libFs.existsSync(tmpDestPath))
|
|
1324
|
+
{
|
|
1325
|
+
tmpTotalSaved++;
|
|
1326
|
+
tmpPage++;
|
|
1327
|
+
tmpOffset += tmpPageSize;
|
|
1328
|
+
continue;
|
|
1329
|
+
}
|
|
1330
|
+
|
|
1331
|
+
try
|
|
1332
|
+
{
|
|
1333
|
+
let tmpData = await this.fetchJson(tmpUrl, tmpHeaders);
|
|
1334
|
+
libFs.writeFileSync(tmpDestPath, JSON.stringify(tmpData, null, '\t'), 'utf8');
|
|
1335
|
+
pManifest.urls_downloaded.push(tmpUrl);
|
|
1336
|
+
tmpTotalSaved++;
|
|
1337
|
+
|
|
1338
|
+
// Check if we should stop
|
|
1339
|
+
if (tmpStopWhenEmpty)
|
|
1340
|
+
{
|
|
1341
|
+
let tmpResults = tmpResultField ? tmpData[tmpResultField] : tmpData;
|
|
1342
|
+
if (Array.isArray(tmpResults) && tmpResults.length === 0)
|
|
1343
|
+
{
|
|
1344
|
+
this.log.info(` Pagination complete: empty page at page ${tmpPage}`);
|
|
1345
|
+
break;
|
|
1346
|
+
}
|
|
1347
|
+
}
|
|
1348
|
+
|
|
1349
|
+
process.stdout.write(` ... page ${tmpPage} (${tmpTotalSaved} saved)\r`);
|
|
1350
|
+
}
|
|
1351
|
+
catch (pError)
|
|
1352
|
+
{
|
|
1353
|
+
this.log.warn(` Pagination stopped at page ${tmpPage}: ${pError.message}`);
|
|
1354
|
+
break;
|
|
1355
|
+
}
|
|
1356
|
+
|
|
1357
|
+
tmpPage++;
|
|
1358
|
+
tmpOffset += tmpPageSize;
|
|
1359
|
+
|
|
1360
|
+
if (tmpDelayMs > 0)
|
|
1361
|
+
{
|
|
1362
|
+
await this.delay(tmpDelayMs);
|
|
1363
|
+
}
|
|
1364
|
+
}
|
|
1365
|
+
|
|
1366
|
+
process.stdout.write(' \r');
|
|
1367
|
+
this.log.info(` Pagination complete: ${tmpTotalSaved} pages saved`);
|
|
1368
|
+
}
|
|
1369
|
+
|
|
1370
|
+
/**
|
|
1371
|
+
* merge_pages: Read all paginated JSON files matching a glob pattern
|
|
1372
|
+
* and merge them into a single JSON array file.
|
|
1373
|
+
*/
|
|
1374
|
+
async executeMergePages(pStep, pDatasetDir, pManifest)
|
|
1375
|
+
{
|
|
1376
|
+
let tmpPattern = pStep.source_pattern; // e.g. "pages/shows_page_{page}.json"
|
|
1377
|
+
let tmpSaveAs = pStep.save_as;
|
|
1378
|
+
let tmpItemField = pStep.item_field; // optional: extract field from each page item (e.g. for search results)
|
|
1379
|
+
let tmpDestPath = libPath.join(pDatasetDir, tmpSaveAs);
|
|
1380
|
+
|
|
1381
|
+
// Skip if already exists
|
|
1382
|
+
if (libFs.existsSync(tmpDestPath))
|
|
1383
|
+
{
|
|
1384
|
+
this.log.info(` merge_pages: ${tmpSaveAs} already exists, skipping`);
|
|
1385
|
+
return;
|
|
1386
|
+
}
|
|
1387
|
+
|
|
1388
|
+
let tmpMerged = [];
|
|
1389
|
+
let tmpPage = 0;
|
|
1390
|
+
// Check if pages start at 0 or 1
|
|
1391
|
+
let tmpTestFile0 = libPath.join(pDatasetDir, tmpPattern.replace(/\{page\}/g, '0'));
|
|
1392
|
+
if (!libFs.existsSync(tmpTestFile0))
|
|
1393
|
+
{
|
|
1394
|
+
tmpPage = 1;
|
|
1395
|
+
}
|
|
1396
|
+
|
|
1397
|
+
while (true)
|
|
1398
|
+
{
|
|
1399
|
+
let tmpFilename = tmpPattern.replace(/\{page\}/g, String(tmpPage));
|
|
1400
|
+
let tmpFilePath = libPath.join(pDatasetDir, tmpFilename);
|
|
1401
|
+
|
|
1402
|
+
if (!libFs.existsSync(tmpFilePath))
|
|
1403
|
+
{
|
|
1404
|
+
break;
|
|
1405
|
+
}
|
|
1406
|
+
|
|
1407
|
+
let tmpData = JSON.parse(libFs.readFileSync(tmpFilePath, 'utf8'));
|
|
1408
|
+
if (Array.isArray(tmpData))
|
|
1409
|
+
{
|
|
1410
|
+
if (tmpItemField)
|
|
1411
|
+
{
|
|
1412
|
+
for (let i = 0; i < tmpData.length; i++)
|
|
1413
|
+
{
|
|
1414
|
+
tmpMerged.push(tmpData[i][tmpItemField] || tmpData[i]);
|
|
1415
|
+
}
|
|
1416
|
+
}
|
|
1417
|
+
else
|
|
1418
|
+
{
|
|
1419
|
+
tmpMerged = tmpMerged.concat(tmpData);
|
|
1420
|
+
}
|
|
1421
|
+
}
|
|
1422
|
+
|
|
1423
|
+
tmpPage++;
|
|
1424
|
+
}
|
|
1425
|
+
|
|
1426
|
+
libFs.writeFileSync(tmpDestPath, JSON.stringify(tmpMerged), 'utf8');
|
|
1427
|
+
this.log.info(` merge_pages: merged ${tmpPage} pages into ${tmpSaveAs} (${tmpMerged.length} items)`);
|
|
1428
|
+
}
|
|
1429
|
+
|
|
1430
|
+
/**
|
|
1431
|
+
* for_each_in_pages: Iterate over items across multiple paginated JSON files
|
|
1432
|
+
* and fetch a URL per item. Skips items whose output file already exists.
|
|
1433
|
+
*/
|
|
1434
|
+
async executeForEachInPages(pStep, pDatasetDir, pManifest)
|
|
1435
|
+
{
|
|
1436
|
+
let tmpPagePattern = pStep.source_pattern; // e.g. "pages/shows_page_{page}.json"
|
|
1437
|
+
let tmpField = pStep.field; // field to extract from each item for URL template
|
|
1438
|
+
let tmpItemField = pStep.item_field; // optional: unwrap item (e.g. search results have {show: {...}})
|
|
1439
|
+
let tmpUrlTemplate = pStep.url_template;
|
|
1440
|
+
let tmpSaveAsTemplate = pStep.save_as;
|
|
1441
|
+
let tmpDelayMs = pStep.delay_ms || 100;
|
|
1442
|
+
let tmpHeaders = pStep.headers || {};
|
|
1443
|
+
let tmpSuccessCount = 0;
|
|
1444
|
+
let tmpSkipCount = 0;
|
|
1445
|
+
let tmpErrorCount = 0;
|
|
1446
|
+
let tmpTotalItems = 0;
|
|
1447
|
+
|
|
1448
|
+
// Count total items first — try page 0 then page 1 to find the start
|
|
1449
|
+
let tmpPage = 0;
|
|
1450
|
+
let tmpAllItems = [];
|
|
1451
|
+
// Check if pages start at 0 or 1
|
|
1452
|
+
let tmpTestFile0 = libPath.join(pDatasetDir, tmpPagePattern.replace(/\{page\}/g, '0'));
|
|
1453
|
+
if (!libFs.existsSync(tmpTestFile0))
|
|
1454
|
+
{
|
|
1455
|
+
tmpPage = 1;
|
|
1456
|
+
}
|
|
1457
|
+
while (true)
|
|
1458
|
+
{
|
|
1459
|
+
let tmpFilename = tmpPagePattern.replace(/\{page\}/g, String(tmpPage));
|
|
1460
|
+
let tmpFilePath = libPath.join(pDatasetDir, tmpFilename);
|
|
1461
|
+
|
|
1462
|
+
if (!libFs.existsSync(tmpFilePath))
|
|
1463
|
+
{
|
|
1464
|
+
break;
|
|
1465
|
+
}
|
|
1466
|
+
|
|
1467
|
+
let tmpData = JSON.parse(libFs.readFileSync(tmpFilePath, 'utf8'));
|
|
1468
|
+
if (Array.isArray(tmpData))
|
|
1469
|
+
{
|
|
1470
|
+
for (let i = 0; i < tmpData.length; i++)
|
|
1471
|
+
{
|
|
1472
|
+
let tmpItem = tmpItemField ? (tmpData[i][tmpItemField] || tmpData[i]) : tmpData[i];
|
|
1473
|
+
tmpAllItems.push(tmpItem);
|
|
1474
|
+
}
|
|
1475
|
+
}
|
|
1476
|
+
tmpPage++;
|
|
1477
|
+
}
|
|
1478
|
+
|
|
1479
|
+
tmpTotalItems = tmpAllItems.length;
|
|
1480
|
+
this.log.info(` for_each_in_pages: ${tmpTotalItems} items across ${tmpPage} pages`);
|
|
1481
|
+
|
|
1482
|
+
for (let i = 0; i < tmpAllItems.length; i++)
|
|
1483
|
+
{
|
|
1484
|
+
let tmpItem = tmpAllItems[i];
|
|
1485
|
+
let tmpValue = tmpField ? tmpItem[tmpField] : (typeof tmpItem === 'string' ? tmpItem : JSON.stringify(tmpItem));
|
|
1486
|
+
|
|
1487
|
+
let tmpUrl = this.expandTemplate(tmpUrlTemplate, tmpItem, tmpValue);
|
|
1488
|
+
let tmpSaveAs = this.expandTemplate(tmpSaveAsTemplate, tmpItem, tmpValue);
|
|
1489
|
+
let tmpDestPath = libPath.join(pDatasetDir, tmpSaveAs);
|
|
1490
|
+
|
|
1491
|
+
// Ensure subdirectory exists
|
|
1492
|
+
let tmpDir = libPath.dirname(tmpDestPath);
|
|
1493
|
+
if (!libFs.existsSync(tmpDir))
|
|
1494
|
+
{
|
|
1495
|
+
libFs.mkdirSync(tmpDir, { recursive: true });
|
|
1496
|
+
}
|
|
1497
|
+
|
|
1498
|
+
// Skip if already exists
|
|
1499
|
+
if (libFs.existsSync(tmpDestPath))
|
|
1500
|
+
{
|
|
1501
|
+
tmpSkipCount++;
|
|
1502
|
+
if ((i + 1) % 500 === 0)
|
|
1503
|
+
{
|
|
1504
|
+
process.stdout.write(` ... ${i + 1}/${tmpTotalItems} (${tmpSuccessCount} ok, ${tmpSkipCount} cached, ${tmpErrorCount} err)\r`);
|
|
1505
|
+
}
|
|
1506
|
+
continue;
|
|
1507
|
+
}
|
|
1508
|
+
|
|
1509
|
+
try
|
|
1510
|
+
{
|
|
1511
|
+
let tmpData = await this.fetchJson(tmpUrl, tmpHeaders);
|
|
1512
|
+
libFs.writeFileSync(tmpDestPath, JSON.stringify(tmpData, null, '\t'), 'utf8');
|
|
1513
|
+
pManifest.urls_downloaded.push(tmpUrl);
|
|
1514
|
+
tmpSuccessCount++;
|
|
1515
|
+
|
|
1516
|
+
if ((i + 1) % 10 === 0 || i === tmpTotalItems - 1)
|
|
1517
|
+
{
|
|
1518
|
+
process.stdout.write(` ... ${i + 1}/${tmpTotalItems} (${tmpSuccessCount} ok, ${tmpSkipCount} cached, ${tmpErrorCount} err)\r`);
|
|
1519
|
+
}
|
|
1520
|
+
}
|
|
1521
|
+
catch (pError)
|
|
1522
|
+
{
|
|
1523
|
+
tmpErrorCount++;
|
|
1524
|
+
if (tmpErrorCount <= 10)
|
|
1525
|
+
{
|
|
1526
|
+
this.log.warn(` Failed: ${tmpValue} — ${pError.message}`);
|
|
1527
|
+
}
|
|
1528
|
+
}
|
|
1529
|
+
|
|
1530
|
+
if (tmpDelayMs > 0)
|
|
1531
|
+
{
|
|
1532
|
+
await this.delay(tmpDelayMs);
|
|
1533
|
+
}
|
|
1534
|
+
}
|
|
1535
|
+
|
|
1536
|
+
process.stdout.write(' \r');
|
|
1537
|
+
this.log.info(` for_each_in_pages complete: ${tmpSuccessCount} ok, ${tmpSkipCount} cached, ${tmpErrorCount} errors`);
|
|
1538
|
+
}
|
|
1539
|
+
|
|
1540
|
+
/**
|
|
1541
|
+
* Expand template variables in a string.
|
|
1542
|
+
* Supports {value}, {item.fieldName}
|
|
1543
|
+
*/
|
|
1544
|
+
expandTemplate(pTemplate, pItem, pValue)
|
|
1545
|
+
{
|
|
1546
|
+
let tmpResult = pTemplate.replace(/\{value\}/g, String(pValue));
|
|
1547
|
+
|
|
1548
|
+
// Replace {item.fieldName} patterns
|
|
1549
|
+
tmpResult = tmpResult.replace(/\{item\.(\w+)\}/g,
|
|
1550
|
+
(pMatch, pFieldName) =>
|
|
1551
|
+
{
|
|
1552
|
+
return pItem && pItem[pFieldName] !== undefined ? String(pItem[pFieldName]) : pMatch;
|
|
1553
|
+
});
|
|
1554
|
+
|
|
1555
|
+
return tmpResult;
|
|
1556
|
+
}
|
|
1557
|
+
|
|
1558
|
+
/**
|
|
1559
|
+
* Simple delay helper.
|
|
1560
|
+
*/
|
|
1561
|
+
delay(pMs)
|
|
1562
|
+
{
|
|
1563
|
+
return new Promise((fResolve) => { setTimeout(fResolve, pMs); });
|
|
1564
|
+
}
|
|
1565
|
+
|
|
1566
|
+
// ================================================================
|
|
1567
|
+
// Download Dispatcher
|
|
1568
|
+
// ================================================================
|
|
1569
|
+
|
|
1570
|
+
/**
|
|
1571
|
+
* Download a single dataset entry.
|
|
1572
|
+
* Returns a Promise that resolves with a manifest object.
|
|
1573
|
+
*/
|
|
1574
|
+
async downloadDataset(pEntry)
|
|
1575
|
+
{
|
|
1576
|
+
let tmpDataset = pEntry.dataset;
|
|
1577
|
+
let tmpDatasetDir = this.getDatasetDir(pEntry.categoryFolder, tmpDataset.id);
|
|
1578
|
+
let tmpManifest = (
|
|
1579
|
+
{
|
|
1580
|
+
id: tmpDataset.id,
|
|
1581
|
+
name: tmpDataset.name,
|
|
1582
|
+
category: pEntry.categoryKey,
|
|
1583
|
+
tier: tmpDataset.tier,
|
|
1584
|
+
method: tmpDataset.method,
|
|
1585
|
+
license: tmpDataset.license,
|
|
1586
|
+
urls_downloaded: [],
|
|
1587
|
+
download_date: new Date().toISOString(),
|
|
1588
|
+
files: [],
|
|
1589
|
+
total_size: 0,
|
|
1590
|
+
status: 'in_progress'
|
|
1591
|
+
});
|
|
1592
|
+
|
|
1593
|
+
// Ensure dataset directory exists
|
|
1594
|
+
if (!libFs.existsSync(tmpDatasetDir))
|
|
1595
|
+
{
|
|
1596
|
+
libFs.mkdirSync(tmpDatasetDir, { recursive: true });
|
|
1597
|
+
}
|
|
1598
|
+
|
|
1599
|
+
try
|
|
1600
|
+
{
|
|
1601
|
+
if (tmpDataset.method === 'http_file')
|
|
1602
|
+
{
|
|
1603
|
+
await this.downloadHttpFiles(tmpDataset, tmpDatasetDir, tmpManifest);
|
|
1604
|
+
}
|
|
1605
|
+
else if (tmpDataset.method === 'http_archive')
|
|
1606
|
+
{
|
|
1607
|
+
await this.downloadHttpArchives(tmpDataset, tmpDatasetDir, tmpManifest);
|
|
1608
|
+
}
|
|
1609
|
+
else if (tmpDataset.method === 'git_clone')
|
|
1610
|
+
{
|
|
1611
|
+
await this.downloadGitClone(tmpDataset, tmpDatasetDir, tmpManifest);
|
|
1612
|
+
}
|
|
1613
|
+
else if (tmpDataset.method === 'rest_api')
|
|
1614
|
+
{
|
|
1615
|
+
await this.downloadRestApi(tmpDataset, tmpDatasetDir, tmpManifest);
|
|
1616
|
+
}
|
|
1617
|
+
|
|
1618
|
+
// Build file inventory
|
|
1619
|
+
tmpManifest.files = this.inventoryFiles(tmpDatasetDir);
|
|
1620
|
+
tmpManifest.total_size = 0;
|
|
1621
|
+
for (let i = 0; i < tmpManifest.files.length; i++)
|
|
1622
|
+
{
|
|
1623
|
+
tmpManifest.total_size += tmpManifest.files[i].size;
|
|
1624
|
+
}
|
|
1625
|
+
|
|
1626
|
+
tmpManifest.status = 'complete';
|
|
1627
|
+
}
|
|
1628
|
+
catch (pError)
|
|
1629
|
+
{
|
|
1630
|
+
tmpManifest.status = 'error';
|
|
1631
|
+
tmpManifest.error = pError.message;
|
|
1632
|
+
this.log.error(` ERROR: ${pError.message}`);
|
|
1633
|
+
}
|
|
1634
|
+
|
|
1635
|
+
// Write manifest
|
|
1636
|
+
this.writeManifest(tmpDatasetDir, tmpManifest);
|
|
1637
|
+
return tmpManifest;
|
|
1638
|
+
}
|
|
1639
|
+
}
|
|
1640
|
+
|
|
1641
|
+
// Export the service class itself, with its default options attached to
// the export as `default_options`.
RetoldFactoDataLakeService.default_options = defaultDataLakeOptions;
module.exports = RetoldFactoDataLakeService;
|