retold-facto 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. package/.claude/launch.json +11 -0
  2. package/.dockerignore +8 -0
  3. package/.quackage.json +19 -0
  4. package/Dockerfile +26 -0
  5. package/bin/retold-facto.js +909 -0
  6. package/examples/facto-government-data.sqlite +0 -0
  7. package/examples/government-data-catalog.json +137 -0
  8. package/examples/government-data-loader.js +1432 -0
  9. package/package.json +91 -0
  10. package/scripts/facto-download.js +425 -0
  11. package/source/Retold-Facto.js +1042 -0
  12. package/source/services/Retold-Facto-BeaconProvider.js +511 -0
  13. package/source/services/Retold-Facto-CatalogManager.js +1252 -0
  14. package/source/services/Retold-Facto-DataLakeService.js +1642 -0
  15. package/source/services/Retold-Facto-DatasetManager.js +417 -0
  16. package/source/services/Retold-Facto-IngestEngine.js +1315 -0
  17. package/source/services/Retold-Facto-ProjectionEngine.js +3960 -0
  18. package/source/services/Retold-Facto-RecordManager.js +360 -0
  19. package/source/services/Retold-Facto-SchemaManager.js +1110 -0
  20. package/source/services/Retold-Facto-SourceFolderScanner.js +2243 -0
  21. package/source/services/Retold-Facto-SourceManager.js +730 -0
  22. package/source/services/Retold-Facto-StoreConnectionManager.js +441 -0
  23. package/source/services/Retold-Facto-ThroughputMonitor.js +478 -0
  24. package/source/services/web-app/codemirror-entry.js +7 -0
  25. package/source/services/web-app/pict-app/Pict-Application-Facto-Configuration.json +9 -0
  26. package/source/services/web-app/pict-app/Pict-Application-Facto.js +70 -0
  27. package/source/services/web-app/pict-app/Pict-Facto-Bundle.js +11 -0
  28. package/source/services/web-app/pict-app/providers/Pict-Provider-Facto-UI.js +66 -0
  29. package/source/services/web-app/pict-app/providers/Pict-Provider-Facto.js +69 -0
  30. package/source/services/web-app/pict-app/providers/facto-api/Facto-API-Catalog.js +93 -0
  31. package/source/services/web-app/pict-app/providers/facto-api/Facto-API-Connections.js +42 -0
  32. package/source/services/web-app/pict-app/providers/facto-api/Facto-API-Datasets.js +605 -0
  33. package/source/services/web-app/pict-app/providers/facto-api/Facto-API-Projections.js +188 -0
  34. package/source/services/web-app/pict-app/providers/facto-api/Facto-API-Scanner.js +80 -0
  35. package/source/services/web-app/pict-app/providers/facto-api/Facto-API-Schema.js +116 -0
  36. package/source/services/web-app/pict-app/providers/facto-api/Facto-API-Sources.js +104 -0
  37. package/source/services/web-app/pict-app/views/PictView-Facto-Catalog.js +526 -0
  38. package/source/services/web-app/pict-app/views/PictView-Facto-Datasets.js +173 -0
  39. package/source/services/web-app/pict-app/views/PictView-Facto-Ingest.js +259 -0
  40. package/source/services/web-app/pict-app/views/PictView-Facto-Layout.js +191 -0
  41. package/source/services/web-app/pict-app/views/PictView-Facto-Projections.js +231 -0
  42. package/source/services/web-app/pict-app/views/PictView-Facto-Records.js +326 -0
  43. package/source/services/web-app/pict-app/views/PictView-Facto-Scanner.js +624 -0
  44. package/source/services/web-app/pict-app/views/PictView-Facto-Sources.js +201 -0
  45. package/source/services/web-app/pict-app/views/PictView-Facto-Throughput.js +456 -0
  46. package/source/services/web-app/pict-app-full/Pict-Application-Facto-Full-Configuration.json +14 -0
  47. package/source/services/web-app/pict-app-full/Pict-Application-Facto-Full.js +391 -0
  48. package/source/services/web-app/pict-app-full/providers/PictRouter-Facto-Configuration.json +56 -0
  49. package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-BottomBar.js +68 -0
  50. package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-Connections.js +340 -0
  51. package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-Dashboard.js +149 -0
  52. package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-Dashboards.js +819 -0
  53. package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-Datasets.js +178 -0
  54. package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-IngestJobs.js +99 -0
  55. package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-Layout.js +62 -0
  56. package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-MappingEditor.js +158 -0
  57. package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-ProjectionDetail.js +1120 -0
  58. package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-Projections.js +172 -0
  59. package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-QueryPanel.js +119 -0
  60. package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-RecordViewer.js +663 -0
  61. package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-Records.js +648 -0
  62. package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-Scanner.js +1017 -0
  63. package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-SchemaDetail.js +1404 -0
  64. package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-SchemaDocEditor.js +1036 -0
  65. package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-SchemaEditor.js +636 -0
  66. package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-SchemaResearch.js +357 -0
  67. package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-SourceDetail.js +822 -0
  68. package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-SourceEditor.js +1036 -0
  69. package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-SourceResearch.js +487 -0
  70. package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-Sources.js +165 -0
  71. package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-Throughput.js +439 -0
  72. package/source/services/web-app/pict-app-full/views/PictView-Facto-Full-TopBar.js +335 -0
  73. package/source/services/web-app/pict-app-full/views/projections/Facto-Projections-Constants.js +71 -0
  74. package/source/services/web-app/web/chart.min.js +20 -0
  75. package/source/services/web-app/web/codemirror-bundle.js +30099 -0
  76. package/source/services/web-app/web/css/facto-themes.css +467 -0
  77. package/source/services/web-app/web/css/facto.css +502 -0
  78. package/source/services/web-app/web/index.html +28 -0
  79. package/source/services/web-app/web/retold-facto.js +12138 -0
  80. package/source/services/web-app/web/retold-facto.js.map +1 -0
  81. package/source/services/web-app/web/retold-facto.min.js +2 -0
  82. package/source/services/web-app/web/retold-facto.min.js.map +1 -0
  83. package/source/services/web-app/web/simple/index.html +17 -0
  84. package/test/Facto_Browser_Integration_tests.js +798 -0
  85. package/test/RetoldFacto_tests.js +4117 -0
  86. package/test/fixtures/weather-readings.csv +17 -0
  87. package/test/fixtures/weather-stations.csv +9 -0
  88. package/test/model/MeadowModel-Extended.json +8497 -0
  89. package/test/model/MeadowModel-PICT.json +1 -0
  90. package/test/model/MeadowModel.json +1355 -0
  91. package/test/model/ddl/Facto.ddl +225 -0
  92. package/test/model/fable-configuration.json +14 -0
@@ -0,0 +1,2243 @@
1
+ /**
2
+ * Retold Facto - Source Folder Scanner Service
3
+ *
4
+ * Scans folder trees to discover dataset research folders (identified by
5
+ * having a README.md). Parses README metadata, enumerates data files,
6
+ * and bridges discovered datasets into the facto database through
7
+ * provisioning and ingestion.
8
+ *
9
+ * Folder structure expected:
10
+ * {scan-root}/
11
+ * {dataset-id}/
12
+ * README.md — 12-section research documentation
13
+ * data/ — Raw data files (CSV, TSV, JSON, archives)
14
+ *
15
+ * @author Steven Velozo <steven@velozo.com>
16
+ */
17
+ const libFableServiceProviderBase = require('fable-serviceproviderbase');
18
+
19
+ const libPath = require('path');
20
+ const libFs = require('fs');
21
+ const libZlib = require('zlib');
22
+
23
// Option defaults; the scanner constructor layers caller-supplied options over these.
const defaultSourceFolderScannerOptions = { RoutePrefix: '/facto' };

// Translates the text of a README "## " heading into the structured section key.
// Where a heading has a long and a short spelling, both point at the same key.
const SECTION_MAP = {
	'Provider / Organization': 'Provider',
	'Provider': 'Provider',
	'Source URL': 'SourceURL',
	'License': 'License',
	'Description': 'Description',
	'Schema / Field Descriptions': 'Schema',
	'Schema': 'Schema',
	'Data Format': 'DataFormat',
	'Update Frequency': 'UpdateFrequency',
	'Record Count': 'RecordCount',
	'Known Issues / Quirks': 'KnownIssues',
	'Known Issues': 'KnownIssues',
	'Ingestion Notes': 'IngestionNotes',
	'Related Datasets': 'RelatedDatasets',
	'Documentation Links': 'DocumentationLinks'
};
47
+
48
+ class RetoldFactoSourceFolderScanner extends libFableServiceProviderBase
49
+ {
50
+ constructor(pFable, pOptions, pServiceHash)
51
+ {
52
+ let tmpOptions = Object.assign({}, defaultSourceFolderScannerOptions, pOptions);
53
+ super(pFable, tmpOptions, pServiceHash);
54
+
55
+ this.serviceType = 'RetoldFactoSourceFolderScanner';
56
+
57
+ // In-memory state
58
+ this.scanPaths = [];
59
+ this.discoveredDatasets = {};
60
+ }
61
+
62
+ // ================================================================
63
+ // README Parsing
64
+ // ================================================================
65
+
66
+ /**
67
+ * Parse a README.md content string into a structured metadata object.
68
+ *
69
+ * @param {string} pContent - The raw README.md content
70
+ * @returns {object} Parsed metadata object
71
+ */
72
+ parseReadme(pContent)
73
+ {
74
+ if (!pContent || typeof pContent !== 'string')
75
+ {
76
+ return { Title: '', Sections: {} };
77
+ }
78
+
79
+ let tmpResult = { Title: '', Sections: {} };
80
+ let tmpLines = pContent.split('\n');
81
+
82
+ // Extract title from the first # line
83
+ for (let i = 0; i < tmpLines.length; i++)
84
+ {
85
+ let tmpLine = tmpLines[i].trim();
86
+ if (tmpLine.startsWith('# ') && !tmpLine.startsWith('## '))
87
+ {
88
+ tmpResult.Title = tmpLine.substring(2).trim();
89
+ break;
90
+ }
91
+ }
92
+
93
+ // Split by ## headings
94
+ let tmpCurrentSection = null;
95
+ let tmpCurrentContent = [];
96
+
97
+ for (let i = 0; i < tmpLines.length; i++)
98
+ {
99
+ let tmpLine = tmpLines[i];
100
+ let tmpTrimmed = tmpLine.trim();
101
+
102
+ if (tmpTrimmed.startsWith('## '))
103
+ {
104
+ // Save previous section
105
+ if (tmpCurrentSection)
106
+ {
107
+ let tmpKey = SECTION_MAP[tmpCurrentSection] || tmpCurrentSection;
108
+ tmpResult.Sections[tmpKey] = tmpCurrentContent.join('\n').trim();
109
+ }
110
+
111
+ tmpCurrentSection = tmpTrimmed.substring(3).trim();
112
+ tmpCurrentContent = [];
113
+ }
114
+ else if (tmpCurrentSection)
115
+ {
116
+ tmpCurrentContent.push(tmpLine);
117
+ }
118
+ }
119
+
120
+ // Save last section
121
+ if (tmpCurrentSection)
122
+ {
123
+ let tmpKey = SECTION_MAP[tmpCurrentSection] || tmpCurrentSection;
124
+ tmpResult.Sections[tmpKey] = tmpCurrentContent.join('\n').trim();
125
+ }
126
+
127
+ // Convenience accessors
128
+ tmpResult.Provider = tmpResult.Sections.Provider || '';
129
+ tmpResult.SourceURL = this.extractFirstUrl(tmpResult.Sections.SourceURL || '');
130
+ tmpResult.License = tmpResult.Sections.License || '';
131
+ tmpResult.Description = tmpResult.Sections.Description || '';
132
+ tmpResult.Schema = tmpResult.Sections.Schema || '';
133
+ tmpResult.UpdateFrequency = tmpResult.Sections.UpdateFrequency || '';
134
+ tmpResult.RecordCount = tmpResult.Sections.RecordCount || '';
135
+ tmpResult.KnownIssues = tmpResult.Sections.KnownIssues || '';
136
+ tmpResult.IngestionNotes = tmpResult.Sections.IngestionNotes || '';
137
+ tmpResult.RelatedDatasets = tmpResult.Sections.RelatedDatasets || '';
138
+ tmpResult.DocumentationLinks = tmpResult.Sections.DocumentationLinks || '';
139
+
140
+ // Parse data format section for structured info
141
+ tmpResult.DataFormat = this.parseDataFormatSection(tmpResult.Sections.DataFormat || '');
142
+
143
+ // Extract all documentation URLs
144
+ tmpResult.DocumentationURLs = this.extractAllUrls(tmpResult.Sections.DocumentationLinks || '');
145
+
146
+ return tmpResult;
147
+ }
148
+
149
+ /**
150
+ * Extract the first URL from a markdown text section.
151
+ */
152
+ extractFirstUrl(pText)
153
+ {
154
+ if (!pText)
155
+ {
156
+ return '';
157
+ }
158
+ let tmpMatch = pText.match(/https?:\/\/[^\s)>\]]+/);
159
+ return tmpMatch ? tmpMatch[0] : '';
160
+ }
161
+
162
+ /**
163
+ * Extract all URLs from a markdown text section.
164
+ */
165
+ extractAllUrls(pText)
166
+ {
167
+ if (!pText)
168
+ {
169
+ return [];
170
+ }
171
+ let tmpMatches = pText.match(/https?:\/\/[^\s)>\]]+/g);
172
+ return tmpMatches || [];
173
+ }
174
+
175
+ /**
176
+ * Parse the Data Format section for structured format info.
177
+ */
178
+ parseDataFormatSection(pText)
179
+ {
180
+ if (!pText)
181
+ {
182
+ return { Format: 'unknown', Encoding: 'UTF-8', Delimiter: '', Compression: 'none' };
183
+ }
184
+
185
+ let tmpLower = pText.toLowerCase();
186
+ let tmpFormat = 'unknown';
187
+ let tmpDelimiter = '';
188
+ let tmpCompression = 'none';
189
+ let tmpEncoding = 'UTF-8';
190
+
191
+ // Format detection
192
+ if (tmpLower.indexOf('tab-separated') > -1 || tmpLower.indexOf('\ttsv') > -1 || tmpLower.match(/\btsv\b/))
193
+ {
194
+ tmpFormat = 'tsv';
195
+ tmpDelimiter = '\t';
196
+ }
197
+ else if (tmpLower.indexOf('comma-separated') > -1 || tmpLower.match(/\bcsv\b/))
198
+ {
199
+ tmpFormat = 'csv';
200
+ tmpDelimiter = ',';
201
+ }
202
+ else if (tmpLower.indexOf('pipe-delimited') > -1 || tmpLower.indexOf('pipe-separated') > -1)
203
+ {
204
+ tmpFormat = 'csv';
205
+ tmpDelimiter = '|';
206
+ }
207
+ else if (tmpLower.match(/\bjson\b/))
208
+ {
209
+ tmpFormat = 'json';
210
+ }
211
+ else if (tmpLower.match(/\bxml\b/))
212
+ {
213
+ tmpFormat = 'xml';
214
+ }
215
+ else if (tmpLower.match(/\brdf\b/) || tmpLower.match(/\bn-triples\b/))
216
+ {
217
+ tmpFormat = 'rdf';
218
+ }
219
+
220
+ // Compression detection
221
+ if (tmpLower.indexOf('gzip') > -1 || tmpLower.indexOf('.gz') > -1)
222
+ {
223
+ tmpCompression = 'gzip';
224
+ }
225
+ else if (tmpLower.indexOf('.zip') > -1 || tmpLower.indexOf('zip archive') > -1)
226
+ {
227
+ tmpCompression = 'zip';
228
+ }
229
+ else if (tmpLower.indexOf('.tar.xz') > -1 || tmpLower.indexOf('xz') > -1)
230
+ {
231
+ tmpCompression = 'tar.xz';
232
+ }
233
+ else if (tmpLower.indexOf('.tar.zst') > -1 || tmpLower.indexOf('zstandard') > -1)
234
+ {
235
+ tmpCompression = 'tar.zst';
236
+ }
237
+
238
+ // Encoding detection
239
+ if (tmpLower.indexOf('latin') > -1 || tmpLower.indexOf('iso-8859') > -1)
240
+ {
241
+ tmpEncoding = 'ISO-8859-1';
242
+ }
243
+ else if (tmpLower.indexOf('ascii') > -1)
244
+ {
245
+ tmpEncoding = 'ASCII';
246
+ }
247
+
248
+ return (
249
+ {
250
+ Format: tmpFormat,
251
+ Encoding: tmpEncoding,
252
+ Delimiter: tmpDelimiter,
253
+ Compression: tmpCompression,
254
+ RawText: pText
255
+ });
256
+ }
257
+
258
+ // ================================================================
259
+ // File System Scanning
260
+ // ================================================================
261
+
262
+ /**
263
+ * Detect file format from filename extension.
264
+ */
265
+ detectFileFormat(pFileName)
266
+ {
267
+ let tmpLower = pFileName.toLowerCase();
268
+
269
+ // Strip compression extensions to find the real format
270
+ let tmpBase = tmpLower;
271
+ let tmpCompressed = false;
272
+
273
+ if (tmpBase.endsWith('.gz'))
274
+ {
275
+ tmpBase = tmpBase.slice(0, -3);
276
+ tmpCompressed = true;
277
+ }
278
+ else if (tmpBase.endsWith('.zip'))
279
+ {
280
+ tmpBase = tmpBase.slice(0, -4);
281
+ tmpCompressed = true;
282
+ }
283
+ else if (tmpBase.endsWith('.bz2'))
284
+ {
285
+ tmpBase = tmpBase.slice(0, -4);
286
+ tmpCompressed = true;
287
+ }
288
+ else if (tmpBase.endsWith('.xz'))
289
+ {
290
+ tmpBase = tmpBase.slice(0, -3);
291
+ tmpCompressed = true;
292
+ }
293
+ else if (tmpBase.endsWith('.zst') || tmpBase.endsWith('.zstd'))
294
+ {
295
+ tmpBase = tmpBase.replace(/\.zstd?$/, '');
296
+ tmpCompressed = true;
297
+ }
298
+
299
+ // Detect archive formats
300
+ if (tmpBase.endsWith('.tar'))
301
+ {
302
+ return { Format: 'archive', Compressed: true };
303
+ }
304
+
305
+ // Detect data format from extension
306
+ if (tmpBase.endsWith('.tsv') || tmpBase.endsWith('.tab'))
307
+ {
308
+ return { Format: 'tsv', Compressed: tmpCompressed };
309
+ }
310
+ if (tmpBase.endsWith('.csv'))
311
+ {
312
+ return { Format: 'csv', Compressed: tmpCompressed };
313
+ }
314
+ if (tmpBase.endsWith('.json') || tmpBase.endsWith('.jsonl') || tmpBase.endsWith('.ndjson'))
315
+ {
316
+ return { Format: 'json', Compressed: tmpCompressed };
317
+ }
318
+ if (tmpBase.endsWith('.xml') || tmpBase.endsWith('.rdf'))
319
+ {
320
+ return { Format: 'xml', Compressed: tmpCompressed };
321
+ }
322
+ if (tmpBase.endsWith('.xlsx') || tmpBase.endsWith('.xls'))
323
+ {
324
+ return { Format: 'excel', Compressed: false };
325
+ }
326
+ if (tmpBase.endsWith('.sql') || tmpBase.endsWith('.sqlite'))
327
+ {
328
+ return { Format: 'sql', Compressed: tmpCompressed };
329
+ }
330
+ if (tmpBase.endsWith('.txt') || tmpBase.endsWith('.dat'))
331
+ {
332
+ return { Format: 'text', Compressed: tmpCompressed };
333
+ }
334
+
335
+ // If we stripped a compression extension but found no data format, it's an archive
336
+ if (tmpCompressed)
337
+ {
338
+ return { Format: 'archive', Compressed: true };
339
+ }
340
+
341
+ return { Format: 'unknown', Compressed: false };
342
+ }
343
+
344
+ /**
345
+ * Enumerate data files in a dataset's data/ subfolder.
346
+ *
347
+ * @param {string} pFolderPath - The dataset folder path
348
+ * @param {function} fCallback - Callback(pError, pFiles)
349
+ */
350
+ resolveDataFiles(pFolderPath, fCallback)
351
+ {
352
+ let tmpDataDir = libPath.join(pFolderPath, 'data');
353
+
354
+ if (!libFs.existsSync(tmpDataDir))
355
+ {
356
+ return fCallback(null, []);
357
+ }
358
+
359
+ let tmpFiles = [];
360
+
361
+ try
362
+ {
363
+ let tmpWalk = (pDir, pRelPrefix) =>
364
+ {
365
+ let tmpEntries = libFs.readdirSync(pDir, { withFileTypes: true });
366
+
367
+ for (let i = 0; i < tmpEntries.length; i++)
368
+ {
369
+ let tmpEntry = tmpEntries[i];
370
+ let tmpFullPath = libPath.join(pDir, tmpEntry.name);
371
+ let tmpRelPath = pRelPrefix ? `${pRelPrefix}/${tmpEntry.name}` : tmpEntry.name;
372
+
373
+ if (tmpEntry.name === '_manifest.json' || tmpEntry.name === '_ingestion.json' || tmpEntry.name === '.git' || tmpEntry.name === '.DS_Store')
374
+ {
375
+ continue;
376
+ }
377
+
378
+ if (tmpEntry.isDirectory())
379
+ {
380
+ tmpWalk(tmpFullPath, tmpRelPath);
381
+ }
382
+ else if (tmpEntry.isFile())
383
+ {
384
+ let tmpStat = libFs.statSync(tmpFullPath);
385
+ let tmpFormatInfo = this.detectFileFormat(tmpEntry.name);
386
+
387
+ tmpFiles.push(
388
+ {
389
+ FileName: tmpRelPath,
390
+ FullPath: tmpFullPath,
391
+ Size: tmpStat.size,
392
+ Format: tmpFormatInfo.Format,
393
+ Compressed: tmpFormatInfo.Compressed,
394
+ ModifiedAt: tmpStat.mtime.toISOString()
395
+ });
396
+ }
397
+ }
398
+ };
399
+
400
+ tmpWalk(tmpDataDir, '');
401
+ }
402
+ catch (pError)
403
+ {
404
+ return fCallback(pError);
405
+ }
406
+
407
+ // Sort by size descending (largest first)
408
+ tmpFiles.sort((a, b) => b.Size - a.Size);
409
+
410
+ return fCallback(null, tmpFiles);
411
+ }
412
+
413
+ /**
414
+ * Parse a single dataset folder and build a DiscoveredDataset object.
415
+ *
416
+ * @param {string} pFolderPath - Absolute path to the dataset folder
417
+ * @param {function} fCallback - Callback(pError, pDiscoveredDataset)
418
+ */
419
+ parseDatasetFolder(pFolderPath, fCallback)
420
+ {
421
+ let tmpReadmePath = libPath.join(pFolderPath, 'README.md');
422
+ let tmpFolderName = libPath.basename(pFolderPath);
423
+
424
+ // Read README
425
+ libFs.readFile(tmpReadmePath, 'utf8',
426
+ (pError, pContent) =>
427
+ {
428
+ if (pError)
429
+ {
430
+ return fCallback(null,
431
+ {
432
+ FolderPath: pFolderPath,
433
+ FolderName: tmpFolderName,
434
+ Title: tmpFolderName,
435
+ Status: 'Error',
436
+ Errors: [`Could not read README.md: ${pError.message}`],
437
+ DataFiles: [],
438
+ TotalDataSize: 0,
439
+ HasData: false,
440
+ NeedsDownload: true,
441
+ DiscoveredAt: new Date().toISOString()
442
+ });
443
+ }
444
+
445
+ let tmpParsed = this.parseReadme(pContent);
446
+
447
+ // Resolve data files
448
+ this.resolveDataFiles(pFolderPath,
449
+ (pFileError, pFiles) =>
450
+ {
451
+ let tmpDataFiles = pFiles || [];
452
+ let tmpTotalSize = 0;
453
+ for (let i = 0; i < tmpDataFiles.length; i++)
454
+ {
455
+ tmpTotalSize += tmpDataFiles[i].Size;
456
+ }
457
+
458
+ let tmpDataset = (
459
+ {
460
+ FolderPath: pFolderPath,
461
+ FolderName: tmpFolderName,
462
+
463
+ // README metadata
464
+ Title: tmpParsed.Title || tmpFolderName,
465
+ Provider: tmpParsed.Provider,
466
+ SourceURL: tmpParsed.SourceURL,
467
+ License: tmpParsed.License,
468
+ Description: tmpParsed.Description,
469
+ Schema: tmpParsed.Schema,
470
+ DataFormat: tmpParsed.DataFormat,
471
+ UpdateFrequency: tmpParsed.UpdateFrequency,
472
+ RecordCount: tmpParsed.RecordCount,
473
+ KnownIssues: tmpParsed.KnownIssues,
474
+ IngestionNotes: tmpParsed.IngestionNotes,
475
+ RelatedDatasets: tmpParsed.RelatedDatasets,
476
+ DocumentationLinks: tmpParsed.DocumentationLinks,
477
+ DocumentationURLs: tmpParsed.DocumentationURLs,
478
+
479
+ // Data files
480
+ DataFiles: tmpDataFiles,
481
+ TotalDataSize: tmpTotalSize,
482
+
483
+ // Status
484
+ Status: 'Discovered',
485
+ HasData: tmpDataFiles.length > 0,
486
+ NeedsDownload: tmpDataFiles.length === 0,
487
+
488
+ // Provisioning (set later)
489
+ IDSource: null,
490
+ IDDataset: null,
491
+
492
+ // Timestamps
493
+ DiscoveredAt: new Date().toISOString(),
494
+ ProvisionedAt: null,
495
+ IngestedAt: null,
496
+ LastScannedAt: new Date().toISOString(),
497
+
498
+ Errors: []
499
+ });
500
+
501
+ return fCallback(null, tmpDataset);
502
+ });
503
+ });
504
+ }
505
+
506
+ /**
507
+ * Recursively scan a folder tree for dataset folders (those containing README.md).
508
+ *
509
+ * @param {string} pPath - Absolute path to scan
510
+ * @param {function} fCallback - Callback(pError, pScanResult)
511
+ */
512
+ scanPath(pPath, fCallback)
513
+ {
514
+ let tmpAbsPath = libPath.resolve(pPath);
515
+ let tmpSelf = this;
516
+
517
+ if (!libFs.existsSync(tmpAbsPath))
518
+ {
519
+ return fCallback(new Error(`Scan path does not exist: ${tmpAbsPath}`));
520
+ }
521
+
522
+ let tmpStat = libFs.statSync(tmpAbsPath);
523
+ if (!tmpStat.isDirectory())
524
+ {
525
+ return fCallback(new Error(`Scan path is not a directory: ${tmpAbsPath}`));
526
+ }
527
+
528
+ let tmpFoldersScanned = 0;
529
+ let tmpDatasetsFound = 0;
530
+ let tmpErrors = [];
531
+
532
+ // Find all immediate child directories that contain a README.md
533
+ let tmpDatasetFolders = [];
534
+
535
+ try
536
+ {
537
+ let tmpEntries = libFs.readdirSync(tmpAbsPath, { withFileTypes: true });
538
+
539
+ for (let i = 0; i < tmpEntries.length; i++)
540
+ {
541
+ if (tmpEntries[i].isDirectory())
542
+ {
543
+ let tmpChildPath = libPath.join(tmpAbsPath, tmpEntries[i].name);
544
+ let tmpReadmePath = libPath.join(tmpChildPath, 'README.md');
545
+
546
+ tmpFoldersScanned++;
547
+
548
+ if (libFs.existsSync(tmpReadmePath))
549
+ {
550
+ tmpDatasetFolders.push(tmpChildPath);
551
+ }
552
+ }
553
+ }
554
+ }
555
+ catch (pError)
556
+ {
557
+ this.fable.log.error(`SourceFolderScanner: error reading directory ${tmpAbsPath}: ${pError.message}`);
558
+ return fCallback(pError);
559
+ }
560
+
561
+ this.fable.log.info(`SourceFolderScanner: found ${tmpDatasetFolders.length} dataset folder(s) with README.md in ${tmpAbsPath}`);
562
+
563
+ // Parse each dataset folder
564
+ let tmpAnticipate = this.fable.newAnticipate();
565
+ let tmpProcessed = 0;
566
+
567
+ for (let i = 0; i < tmpDatasetFolders.length; i++)
568
+ {
569
+ let tmpFolderPath = tmpDatasetFolders[i];
570
+
571
+ tmpAnticipate.anticipate(
572
+ (fStep) =>
573
+ {
574
+ tmpSelf.parseDatasetFolder(tmpFolderPath,
575
+ (pParseError, pDataset) =>
576
+ {
577
+ tmpProcessed++;
578
+ if (pParseError)
579
+ {
580
+ tmpErrors.push(`${tmpFolderPath}: ${pParseError.message}`);
581
+ }
582
+ else if (pDataset)
583
+ {
584
+ tmpSelf.discoveredDatasets[tmpFolderPath] = pDataset;
585
+ tmpDatasetsFound++;
586
+ }
587
+
588
+ // Progress log every 25 or on final
589
+ if (tmpProcessed % 25 === 0 || tmpProcessed === tmpDatasetFolders.length)
590
+ {
591
+ tmpSelf.fable.log.info(`SourceFolderScanner: parsed ${tmpProcessed}/${tmpDatasetFolders.length} dataset folders...`);
592
+ }
593
+ return fStep();
594
+ });
595
+ });
596
+ }
597
+
598
+ tmpAnticipate.wait(
599
+ (pError) =>
600
+ {
601
+ // Update scan path metadata
602
+ for (let i = 0; i < tmpSelf.scanPaths.length; i++)
603
+ {
604
+ if (tmpSelf.scanPaths[i].Path === tmpAbsPath)
605
+ {
606
+ tmpSelf.scanPaths[i].LastScannedAt = new Date().toISOString();
607
+ tmpSelf.scanPaths[i].DatasetCount = tmpDatasetsFound;
608
+ }
609
+ }
610
+
611
+ let tmpResult = (
612
+ {
613
+ Path: tmpAbsPath,
614
+ FoldersScanned: tmpFoldersScanned,
615
+ DatasetsFound: tmpDatasetsFound,
616
+ Errors: tmpErrors
617
+ });
618
+
619
+ tmpSelf.fable.log.info(`SourceFolderScanner: scanned ${tmpAbsPath} — ${tmpDatasetsFound} datasets discovered in ${tmpFoldersScanned} folders`);
620
+
621
+ return fCallback(null, tmpResult);
622
+ });
623
+ }
624
+
625
+ // ================================================================
626
+ // Scan Path Management
627
+ // ================================================================
628
+
629
+ /**
630
+ * Add a scan path and immediately scan it.
631
+ */
632
+ addScanPath(pPath, fCallback)
633
+ {
634
+ let tmpAbsPath = libPath.resolve(pPath);
635
+
636
+ // Check if already registered
637
+ for (let i = 0; i < this.scanPaths.length; i++)
638
+ {
639
+ if (this.scanPaths[i].Path === tmpAbsPath)
640
+ {
641
+ // Already registered — just rescan
642
+ return this.scanPath(tmpAbsPath, fCallback);
643
+ }
644
+ }
645
+
646
+ this.scanPaths.push(
647
+ {
648
+ Path: tmpAbsPath,
649
+ AddedAt: new Date().toISOString(),
650
+ LastScannedAt: null,
651
+ DatasetCount: 0
652
+ });
653
+
654
+ this.fable.log.info(`SourceFolderScanner: added scan path ${tmpAbsPath}`);
655
+
656
+ return this.scanPath(tmpAbsPath, fCallback);
657
+ }
658
+
659
+ /**
660
+ * Remove a scan path and all its discovered datasets.
661
+ */
662
+ removeScanPath(pPath, fCallback)
663
+ {
664
+ let tmpAbsPath = libPath.resolve(pPath);
665
+
666
+ // Remove from scanPaths
667
+ this.scanPaths = this.scanPaths.filter((pEntry) => pEntry.Path !== tmpAbsPath);
668
+
669
+ // Remove discovered datasets under this path
670
+ let tmpRemoved = 0;
671
+ let tmpKeys = Object.keys(this.discoveredDatasets);
672
+ for (let i = 0; i < tmpKeys.length; i++)
673
+ {
674
+ if (tmpKeys[i].startsWith(tmpAbsPath))
675
+ {
676
+ delete this.discoveredDatasets[tmpKeys[i]];
677
+ tmpRemoved++;
678
+ }
679
+ }
680
+
681
+ this.fable.log.info(`SourceFolderScanner: removed scan path ${tmpAbsPath} (${tmpRemoved} datasets removed)`);
682
+
683
+ return fCallback(null, { Path: tmpAbsPath, DatasetsRemoved: tmpRemoved });
684
+ }
685
+
686
+ /**
687
+ * Re-scan all registered paths.
688
+ */
689
+ scanAllPaths(fCallback)
690
+ {
691
+ let tmpAnticipate = this.fable.newAnticipate();
692
+ let tmpResults = [];
693
+ let tmpSelf = this;
694
+
695
+ for (let i = 0; i < this.scanPaths.length; i++)
696
+ {
697
+ let tmpPath = this.scanPaths[i].Path;
698
+
699
+ tmpAnticipate.anticipate(
700
+ (fStep) =>
701
+ {
702
+ tmpSelf.scanPath(tmpPath,
703
+ (pError, pResult) =>
704
+ {
705
+ if (pResult)
706
+ {
707
+ tmpResults.push(pResult);
708
+ }
709
+ return fStep();
710
+ });
711
+ });
712
+ }
713
+
714
+ tmpAnticipate.wait(
715
+ (pError) =>
716
+ {
717
+ return fCallback(pError, tmpResults);
718
+ });
719
+ }
720
+
721
+ // ================================================================
722
+ // Dataset Queries
723
+ // ================================================================
724
+
725
+ /**
726
+ * Get all discovered datasets, optionally filtered.
727
+ *
728
+ * @param {object} pFilterOptions - { status, search, hasData }
729
+ * @returns {Array} Array of DiscoveredDataset objects
730
+ */
731
+ getDiscoveredDatasets(pFilterOptions)
732
+ {
733
+ let tmpFilter = pFilterOptions || {};
734
+ let tmpKeys = Object.keys(this.discoveredDatasets);
735
+ let tmpResults = [];
736
+
737
+ for (let i = 0; i < tmpKeys.length; i++)
738
+ {
739
+ let tmpDataset = this.discoveredDatasets[tmpKeys[i]];
740
+
741
+ // Status filter
742
+ if (tmpFilter.status && tmpDataset.Status !== tmpFilter.status)
743
+ {
744
+ continue;
745
+ }
746
+
747
+ // Has data filter
748
+ if (tmpFilter.hasData !== undefined && tmpDataset.HasData !== tmpFilter.hasData)
749
+ {
750
+ continue;
751
+ }
752
+
753
+ // Search filter
754
+ if (tmpFilter.search)
755
+ {
756
+ let tmpSearchTerm = tmpFilter.search.toLowerCase();
757
+ let tmpSearchable = [
758
+ tmpDataset.Title,
759
+ tmpDataset.FolderName,
760
+ tmpDataset.Provider,
761
+ tmpDataset.Description
762
+ ].join(' ').toLowerCase();
763
+
764
+ if (tmpSearchable.indexOf(tmpSearchTerm) < 0)
765
+ {
766
+ continue;
767
+ }
768
+ }
769
+
770
+ tmpResults.push(tmpDataset);
771
+ }
772
+
773
+ // Sort by folder name
774
+ tmpResults.sort((a, b) => a.FolderName.localeCompare(b.FolderName));
775
+
776
+ return tmpResults;
777
+ }
778
+
779
+ /**
780
+ * Get a single discovered dataset by folder name.
781
+ *
782
+ * @param {string} pFolderName - The dataset folder name (e.g. 'iso-639-3')
783
+ * @returns {object|null} The DiscoveredDataset or null
784
+ */
785
+ getDiscoveredDatasetByName(pFolderName)
786
+ {
787
+ let tmpKeys = Object.keys(this.discoveredDatasets);
788
+
789
+ for (let i = 0; i < tmpKeys.length; i++)
790
+ {
791
+ if (this.discoveredDatasets[tmpKeys[i]].FolderName === pFolderName)
792
+ {
793
+ return this.discoveredDatasets[tmpKeys[i]];
794
+ }
795
+ }
796
+
797
+ return null;
798
+ }
799
+
800
+ // ================================================================
801
+ // Provisioning
802
+ // ================================================================
803
+
804
+ /**
805
+ * Provision a discovered dataset into the facto database.
806
+ * Creates Source, Dataset, DatasetSource, and SourceCatalogEntry records.
807
+ *
808
+ * @param {string} pFolderName - The dataset folder name
809
+ * @param {function} fCallback - Callback(pError, pResult)
810
+ */
811
+ provisionDataset(pFolderName, fCallback)
812
+ {
813
+ let tmpDataset = this.getDiscoveredDatasetByName(pFolderName);
814
+
815
+ if (!tmpDataset)
816
+ {
817
+ return fCallback(new Error(`Dataset not found: ${pFolderName}`));
818
+ }
819
+
820
+ if (!this.fable.RetoldFactoCatalogManager)
821
+ {
822
+ return fCallback(new Error('CatalogManager not initialized'));
823
+ }
824
+
825
+ let tmpAnticipate = this.fable.newAnticipate();
826
+ let tmpSource = null;
827
+ let tmpDatasetRecord = null;
828
+ let tmpDatasetSource = null;
829
+ let tmpSelf = this;
830
+
831
+ // Step 1: Find or create Source
832
+ tmpAnticipate.anticipate(
833
+ (fStep) =>
834
+ {
835
+ let tmpSourceName = tmpDataset.Provider || tmpDataset.FolderName;
836
+ tmpSelf.fable.RetoldFactoCatalogManager.findOrCreateSource(tmpSourceName,
837
+ {
838
+ Type: 'File',
839
+ URL: tmpDataset.SourceURL || '',
840
+ Protocol: 'HTTPS',
841
+ Description: tmpDataset.Description ? tmpDataset.Description.substring(0, 255) : ''
842
+ },
843
+ (pError, pSource) =>
844
+ {
845
+ if (pError)
846
+ {
847
+ return fStep(pError);
848
+ }
849
+ tmpSource = pSource;
850
+ return fStep();
851
+ });
852
+ });
853
+
854
+ // Step 2: Find or create Dataset
855
+ tmpAnticipate.anticipate(
856
+ (fStep) =>
857
+ {
858
+ tmpSelf.fable.RetoldFactoCatalogManager.findOrCreateDataset(tmpDataset.Title,
859
+ {
860
+ Type: 'Raw',
861
+ Description: tmpDataset.Description ? tmpDataset.Description.substring(0, 255) : '',
862
+ VersionPolicy: 'Append'
863
+ },
864
+ (pError, pDataset) =>
865
+ {
866
+ if (pError)
867
+ {
868
+ return fStep(pError);
869
+ }
870
+ tmpDatasetRecord = pDataset;
871
+ return fStep();
872
+ });
873
+ });
874
+
875
+ // Step 3: Ensure DatasetSource link
876
+ tmpAnticipate.anticipate(
877
+ (fStep) =>
878
+ {
879
+ if (!tmpSource || !tmpDatasetRecord)
880
+ {
881
+ return fStep();
882
+ }
883
+
884
+ tmpSelf.fable.RetoldFactoCatalogManager.ensureDatasetSourceLink(
885
+ tmpDatasetRecord.IDDataset,
886
+ tmpSource.IDSource,
887
+ (pError, pLink) =>
888
+ {
889
+ if (pError)
890
+ {
891
+ return fStep(pError);
892
+ }
893
+ tmpDatasetSource = pLink;
894
+ return fStep();
895
+ });
896
+ });
897
+
898
+ // Step 4: Create SourceCatalogEntry if DAL is available
899
+ tmpAnticipate.anticipate(
900
+ (fStep) =>
901
+ {
902
+ if (!tmpSelf.fable.DAL || !tmpSelf.fable.DAL.SourceCatalogEntry)
903
+ {
904
+ return fStep();
905
+ }
906
+
907
+ let tmpEntryData = (
908
+ {
909
+ Agency: tmpDataset.Provider || '',
910
+ Name: tmpDataset.Title || tmpDataset.FolderName,
911
+ Type: 'File',
912
+ URL: tmpDataset.SourceURL || '',
913
+ Protocol: 'HTTPS',
914
+ Category: '',
915
+ Region: '',
916
+ UpdateFrequency: tmpDataset.UpdateFrequency || '',
917
+ Description: tmpDataset.Description ? tmpDataset.Description.substring(0, 1000) : '',
918
+ Notes: `Discovered from folder: ${tmpDataset.FolderPath}`,
919
+ Verified: 1
920
+ });
921
+
922
+ let tmpQuery = tmpSelf.fable.DAL.SourceCatalogEntry.query.clone()
923
+ .addRecord(tmpEntryData);
924
+
925
+ tmpSelf.fable.DAL.SourceCatalogEntry.doCreate(tmpQuery,
926
+ (pError) =>
927
+ {
928
+ // Non-fatal if this fails
929
+ return fStep();
930
+ });
931
+ });
932
+
933
+ tmpAnticipate.wait(
934
+ (pError) =>
935
+ {
936
+ if (pError)
937
+ {
938
+ tmpDataset.Errors.push(`Provisioning failed: ${pError.message}`);
939
+ tmpDataset.Status = 'Error';
940
+ return fCallback(pError);
941
+ }
942
+
943
+ // Update discovered dataset with provisioning info
944
+ tmpDataset.Status = 'Provisioned';
945
+ tmpDataset.ProvisionedAt = new Date().toISOString();
946
+ tmpDataset.IDSource = tmpSource ? tmpSource.IDSource : null;
947
+ tmpDataset.IDDataset = tmpDatasetRecord ? tmpDatasetRecord.IDDataset : null;
948
+
949
+ tmpSelf.fable.log.info(`SourceFolderScanner: provisioned ${pFolderName} (Source: ${tmpDataset.IDSource}, Dataset: ${tmpDataset.IDDataset})`);
950
+
951
+ return fCallback(null,
952
+ {
953
+ Success: true,
954
+ FolderName: pFolderName,
955
+ Source: tmpSource,
956
+ Dataset: tmpDatasetRecord,
957
+ DatasetSource: tmpDatasetSource
958
+ });
959
+ });
960
+ }
961
+
962
+ // ================================================================
963
+ // Ingestion
964
+ // ================================================================
965
+
966
+ /**
967
+ * Select the best data file for ingestion from a dataset's files.
968
+ * Prefers uncompressed files, then largest compressed file.
969
+ *
970
+ * @param {Array} pDataFiles - Array of data file objects
971
+ * @returns {object|null} The selected file or null
972
+ */
973
+ selectBestDataFile(pDataFiles)
974
+ {
975
+ if (!pDataFiles || pDataFiles.length === 0)
976
+ {
977
+ return null;
978
+ }
979
+
980
+ // Prefer ingestable formats (csv, tsv, json) over archives
981
+ let tmpIngestable = pDataFiles.filter(
982
+ (pFile) =>
983
+ {
984
+ return pFile.Format === 'csv' || pFile.Format === 'tsv' || pFile.Format === 'json' || pFile.Format === 'text';
985
+ });
986
+
987
+ if (tmpIngestable.length > 0)
988
+ {
989
+ // Return the largest ingestable file
990
+ return tmpIngestable[0]; // Already sorted by size desc
991
+ }
992
+
993
+ // Fall back to the largest non-archive file
994
+ let tmpNonArchive = pDataFiles.filter(
995
+ (pFile) =>
996
+ {
997
+ return pFile.Format !== 'archive';
998
+ });
999
+
1000
+ return tmpNonArchive.length > 0 ? tmpNonArchive[0] : null;
1001
+ }
1002
+
1003
+ /**
1004
+ * Extract any archive files found in a dataset's DataFiles list.
1005
+ * After extraction, re-scans the data directory and updates DataFiles
1006
+ * on the dataset object so subsequent selection/planning sees the
1007
+ * extracted contents.
1008
+ *
1009
+ * Requires RetoldFactoDataLakeService to be available on fable.
1010
+ * If it is not, extraction is skipped with a warning.
1011
+ *
1012
+ * @param {object} pDataset - The discovered dataset object (mutated in place)
1013
+ * @param {function} fCallback - Callback(pError, pResult)
1014
+ */
1015
+ extractArchivesIfNeeded(pDataset, fCallback)
1016
+ {
1017
+ let tmpArchiveFiles = (pDataset.DataFiles || []).filter(
1018
+ (pFile) => pFile.Format === 'archive');
1019
+
1020
+ if (tmpArchiveFiles.length === 0)
1021
+ {
1022
+ return fCallback(null, { ExtractedCount: 0 });
1023
+ }
1024
+
1025
+ if (!this.fable.RetoldFactoDataLakeService)
1026
+ {
1027
+ this.fable.log.warn(`SourceFolderScanner: archives found in ${pDataset.FolderName} but DataLakeService not available — skipping extraction`);
1028
+ return fCallback(null, { ExtractedCount: 0 });
1029
+ }
1030
+
1031
+ let tmpDataLake = this.fable.RetoldFactoDataLakeService;
1032
+ let tmpDataDir = libPath.join(pDataset.FolderPath, 'data');
1033
+ let tmpSelf = this;
1034
+ let tmpExtracted = 0;
1035
+
1036
+ this.fable.log.info(`SourceFolderScanner: extracting ${tmpArchiveFiles.length} archive(s) in ${pDataset.FolderName}`);
1037
+
1038
+ let tmpAnticipate = this.fable.newAnticipate();
1039
+
1040
+ for (let i = 0; i < tmpArchiveFiles.length; i++)
1041
+ {
1042
+ let tmpArchiveFile = tmpArchiveFiles[i];
1043
+
1044
+ tmpAnticipate.anticipate(
1045
+ (fStep) =>
1046
+ {
1047
+ tmpDataLake.extractArchive(tmpArchiveFile.FullPath, tmpDataDir)
1048
+ .then(
1049
+ () =>
1050
+ {
1051
+ tmpExtracted++;
1052
+ tmpSelf.fable.log.info(`SourceFolderScanner: extracted ${tmpArchiveFile.FileName}`);
1053
+ return fStep();
1054
+ })
1055
+ .catch(
1056
+ (pError) =>
1057
+ {
1058
+ tmpSelf.fable.log.error(`SourceFolderScanner: failed to extract ${tmpArchiveFile.FileName}: ${pError.message}`);
1059
+ return fStep();
1060
+ });
1061
+ });
1062
+ }
1063
+
1064
+ tmpAnticipate.wait(
1065
+ (pError) =>
1066
+ {
1067
+ if (tmpExtracted === 0)
1068
+ {
1069
+ return fCallback(null, { ExtractedCount: 0 });
1070
+ }
1071
+
1072
+ // Re-scan so selectBestDataFile / ingestFromPlan see the extracted files
1073
+ tmpSelf.resolveDataFiles(pDataset.FolderPath,
1074
+ (pFileError, pFiles) =>
1075
+ {
1076
+ pDataset.DataFiles = pFiles || [];
1077
+ pDataset.TotalDataSize = 0;
1078
+ for (let i = 0; i < pDataset.DataFiles.length; i++)
1079
+ {
1080
+ pDataset.TotalDataSize += pDataset.DataFiles[i].Size;
1081
+ }
1082
+ pDataset.HasData = pDataset.DataFiles.length > 0;
1083
+
1084
+ return fCallback(null, { ExtractedCount: tmpExtracted });
1085
+ });
1086
+ });
1087
+ }
1088
+
1089
+ // ================================================================
1090
+ // Ingestion Plan
1091
+ // ================================================================
1092
+
1093
+ /**
1094
+ * Read an existing _ingestion.json from a dataset's data directory.
1095
+ *
1096
+ * @param {string} pDataDir - Path to the dataset's data/ directory
1097
+ * @returns {object|null} The parsed plan, or null if not found
1098
+ */
1099
+ readIngestionPlan(pDataDir)
1100
+ {
1101
+ let tmpPlanPath = libPath.join(pDataDir, '_ingestion.json');
1102
+
1103
+ if (!libFs.existsSync(tmpPlanPath))
1104
+ {
1105
+ return null;
1106
+ }
1107
+
1108
+ try
1109
+ {
1110
+ let tmpContent = libFs.readFileSync(tmpPlanPath, 'utf8');
1111
+ return JSON.parse(tmpContent);
1112
+ }
1113
+ catch (pError)
1114
+ {
1115
+ this.log.warn(`Failed to read ingestion plan: ${pError.message}`);
1116
+ return null;
1117
+ }
1118
+ }
1119
+
1120
+ /**
1121
+ * Write an ingestion plan to a dataset's data directory.
1122
+ *
1123
+ * @param {string} pDataDir - Path to the dataset's data/ directory
1124
+ * @param {object} pPlan - The plan object to write
1125
+ */
1126
+ writeIngestionPlan(pDataDir, pPlan)
1127
+ {
1128
+ let tmpPlanPath = libPath.join(pDataDir, '_ingestion.json');
1129
+
1130
+ try
1131
+ {
1132
+ libFs.writeFileSync(tmpPlanPath, JSON.stringify(pPlan, null, '\t'), 'utf8');
1133
+ }
1134
+ catch (pError)
1135
+ {
1136
+ this.log.error(`Failed to write ingestion plan: ${pError.message}`);
1137
+ }
1138
+ }
1139
+
1140
+ /**
1141
+ * Derive a record type name from a data file path.
1142
+ * Strips directory prefixes and file extensions.
1143
+ * e.g., "core/People.csv" → "People", "ml-32m/ratings.csv" → "ratings"
1144
+ *
1145
+ * @param {string} pFileName - Relative file path
1146
+ * @returns {string} The derived record type
1147
+ */
1148
+ deriveRecordType(pFileName)
1149
+ {
1150
+ let tmpBase = libPath.basename(pFileName);
1151
+
1152
+ // Strip all extensions (including compound ones like .csv.gz)
1153
+ let tmpName = tmpBase;
1154
+ while (libPath.extname(tmpName))
1155
+ {
1156
+ tmpName = tmpName.slice(0, tmpName.length - libPath.extname(tmpName).length);
1157
+ }
1158
+
1159
+ return tmpName;
1160
+ }
1161
+
1162
+ /**
1163
+ * Parse the README Schema section for per-file sub-headings.
1164
+ * Returns a map of lowercase filename → { primaryKey, fields }.
1165
+ *
1166
+ * @param {string} pSchemaText - The raw Schema section text
1167
+ * @returns {object} Map of filename → schema info
1168
+ */
1169
+ parseSchemaForFiles(pSchemaText)
1170
+ {
1171
+ if (!pSchemaText)
1172
+ {
1173
+ return {};
1174
+ }
1175
+
1176
+ let tmpResult = {};
1177
+ let tmpLines = pSchemaText.split('\n');
1178
+ let tmpCurrentFile = null;
1179
+ let tmpCurrentFields = [];
1180
+
1181
+ for (let i = 0; i < tmpLines.length; i++)
1182
+ {
1183
+ let tmpLine = tmpLines[i].trim();
1184
+
1185
+ // Look for ### or #### headings that name files or tables
1186
+ let tmpHeadingMatch = tmpLine.match(/^#{3,4}\s+(.+)/);
1187
+ if (tmpHeadingMatch)
1188
+ {
1189
+ // Save previous file entry
1190
+ if (tmpCurrentFile)
1191
+ {
1192
+ tmpResult[tmpCurrentFile] = { fields: tmpCurrentFields };
1193
+ }
1194
+
1195
+ // Extract the file/table name from the heading
1196
+ let tmpHeading = tmpHeadingMatch[1].trim();
1197
+
1198
+ // Strip markdown formatting (bold, code) and common suffixes
1199
+ tmpHeading = tmpHeading.replace(/[*`]/g, '').trim();
1200
+
1201
+ // Try to extract just a filename or table name
1202
+ // Common patterns: "ratings.csv", "People (player biographical data)"
1203
+ let tmpNameMatch = tmpHeading.match(/^(\S+\.(?:csv|tsv|json|txt|xml))/i);
1204
+ if (tmpNameMatch)
1205
+ {
1206
+ tmpCurrentFile = tmpNameMatch[1].toLowerCase();
1207
+ }
1208
+ else
1209
+ {
1210
+ // Use the first word as a table name
1211
+ let tmpFirstWord = tmpHeading.split(/[\s(]/)[0];
1212
+ if (tmpFirstWord.length > 1)
1213
+ {
1214
+ tmpCurrentFile = tmpFirstWord.toLowerCase();
1215
+ }
1216
+ else
1217
+ {
1218
+ tmpCurrentFile = null;
1219
+ }
1220
+ }
1221
+
1222
+ tmpCurrentFields = [];
1223
+ continue;
1224
+ }
1225
+
1226
+ // Look for table rows with field definitions: | `fieldName` | type | description |
1227
+ if (tmpCurrentFile && tmpLine.startsWith('|') && !tmpLine.match(/^\|[\s-]+\|/))
1228
+ {
1229
+ let tmpCells = tmpLine.split('|').map((pC) => pC.trim()).filter((pC) => pC.length > 0);
1230
+ if (tmpCells.length >= 2)
1231
+ {
1232
+ let tmpFieldName = tmpCells[0].replace(/`/g, '').trim();
1233
+ if (tmpFieldName.toLowerCase() !== 'field' && tmpFieldName.toLowerCase() !== 'table')
1234
+ {
1235
+ tmpCurrentFields.push(tmpFieldName);
1236
+ }
1237
+ }
1238
+ }
1239
+ }
1240
+
1241
+ // Save last file entry
1242
+ if (tmpCurrentFile)
1243
+ {
1244
+ tmpResult[tmpCurrentFile] = { fields: tmpCurrentFields };
1245
+ }
1246
+
1247
+ return tmpResult;
1248
+ }
1249
+
1250
+ /**
1251
+ * Parse the README IngestionNotes section for primary key, foreign key,
1252
+ * and load-order hints.
1253
+ *
1254
+ * @param {string} pIngestionText - The raw Ingestion Notes text
1255
+ * @returns {object} { primaryKeys: {table: key}, foreignKeys: {table: [keys]}, loadOrder: [names] }
1256
+ */
1257
+ parseIngestionNotes(pIngestionText)
1258
+ {
1259
+ let tmpResult = { primaryKeys: {}, foreignKeys: {}, loadOrder: [] };
1260
+
1261
+ if (!pIngestionText)
1262
+ {
1263
+ return tmpResult;
1264
+ }
1265
+
1266
+ let tmpLines = pIngestionText.split('\n');
1267
+
1268
+ for (let i = 0; i < tmpLines.length; i++)
1269
+ {
1270
+ let tmpLine = tmpLines[i].trim();
1271
+
1272
+ // Look for primary key mentions: "Primary key: fieldName for TableName"
1273
+ // or "playerID for People" patterns or "`movieId` for movies.csv"
1274
+ let tmpPKMatch = tmpLine.match(/[Pp]rimary\s+key[s]?[:\s]+`?(\w+)`?\s+(?:for|in)\s+(\w[\w.]*)/);
1275
+ if (tmpPKMatch)
1276
+ {
1277
+ let tmpKey = tmpPKMatch[1];
1278
+ let tmpTable = tmpPKMatch[2].toLowerCase().replace(/\.csv$|\.tsv$|\.json$/, '');
1279
+ tmpResult.primaryKeys[tmpTable] = tmpKey;
1280
+ }
1281
+
1282
+ // Also look for composite key patterns:
1283
+ // "composite `userId`, `movieId`, `timestamp` for ratings.csv"
1284
+ let tmpCompositeMatch = tmpLine.match(/composite\s+([^f]+)\s+for\s+(\w[\w.]*)/i);
1285
+ if (tmpCompositeMatch)
1286
+ {
1287
+ let tmpKeys = tmpCompositeMatch[1].match(/`(\w+)`/g);
1288
+ let tmpTable = tmpCompositeMatch[2].toLowerCase().replace(/\.csv$|\.tsv$|\.json$/, '');
1289
+ if (tmpKeys)
1290
+ {
1291
+ tmpResult.primaryKeys[tmpTable] = tmpKeys.map((pK) => pK.replace(/`/g, '')).join(', ');
1292
+ }
1293
+ }
1294
+
1295
+ // Look for foreign key mentions: "foreignKey links/references TableName"
1296
+ let tmpFKMatch = tmpLine.match(/`(\w+)`\s+(?:links?|references?|joins?)\s+(?:\w+\s+)?(?:to\s+)?(\w[\w.]*)/i);
1297
+ if (tmpFKMatch)
1298
+ {
1299
+ let tmpKey = tmpFKMatch[1];
1300
+ let tmpRefTable = tmpFKMatch[2].toLowerCase().replace(/\.csv$|\.tsv$|\.json$/, '');
1301
+ if (!tmpResult.foreignKeys[tmpRefTable])
1302
+ {
1303
+ tmpResult.foreignKeys[tmpRefTable] = [];
1304
+ }
1305
+ tmpResult.foreignKeys[tmpRefTable].push(tmpKey);
1306
+ }
1307
+
1308
+ // Look for load-order mentions: "Load X first ... then Y"
1309
+ let tmpLoadOrderMatch = tmpLine.match(/[Ll]oad\s+(\w[\w.]*)\s+first/);
1310
+ if (tmpLoadOrderMatch)
1311
+ {
1312
+ let tmpFirst = tmpLoadOrderMatch[1].toLowerCase().replace(/\.csv$|\.tsv$|\.json$/, '');
1313
+ if (tmpResult.loadOrder.indexOf(tmpFirst) < 0)
1314
+ {
1315
+ tmpResult.loadOrder.push(tmpFirst);
1316
+ }
1317
+ }
1318
+ }
1319
+
1320
+ return tmpResult;
1321
+ }
1322
+
1323
+ /**
1324
+ * Generate an ingestion plan for a dataset.
1325
+ * If a plan already exists on disk, returns it.
1326
+ * Otherwise, auto-generates from file list and README metadata.
1327
+ *
1328
+ * @param {string} pFolderName - The dataset folder name
1329
+ * @param {function} fCallback - Callback(pError, pPlan)
1330
+ */
1331
+ generateIngestionPlan(pFolderName, fCallback)
1332
+ {
1333
+ let tmpDataset = this.getDiscoveredDatasetByName(pFolderName);
1334
+
1335
+ if (!tmpDataset)
1336
+ {
1337
+ return fCallback(new Error(`Dataset not found: ${pFolderName}`));
1338
+ }
1339
+
1340
+ let tmpDataDir = libPath.join(tmpDataset.FolderPath, 'data');
1341
+
1342
+ // Check for existing plan
1343
+ let tmpExistingPlan = this.readIngestionPlan(tmpDataDir);
1344
+ if (tmpExistingPlan)
1345
+ {
1346
+ return fCallback(null, tmpExistingPlan);
1347
+ }
1348
+
1349
+ let tmpSelf = this;
1350
+
1351
+ // Resolve data files
1352
+ this.resolveDataFiles(tmpDataset.FolderPath,
1353
+ (pError, pFiles) =>
1354
+ {
1355
+ if (pError)
1356
+ {
1357
+ return fCallback(pError);
1358
+ }
1359
+
1360
+ if (!pFiles || pFiles.length === 0)
1361
+ {
1362
+ return fCallback(null,
1363
+ {
1364
+ version: 1,
1365
+ generatedAt: new Date().toISOString(),
1366
+ modifiedAt: null,
1367
+ autoGenerated: true,
1368
+ files: []
1369
+ });
1370
+ }
1371
+
1372
+ // Build base plan entries from file list
1373
+ let tmpIngestableFormats = { csv: true, tsv: true, json: true, text: true, xml: true, excel: true };
1374
+ let tmpPlanFiles = [];
1375
+
1376
+ for (let i = 0; i < pFiles.length; i++)
1377
+ {
1378
+ let tmpFile = pFiles[i];
1379
+ let tmpInclude = !!tmpIngestableFormats[tmpFile.Format];
1380
+ let tmpRecordType = tmpSelf.deriveRecordType(tmpFile.FileName);
1381
+ let tmpDelimiter = '';
1382
+
1383
+ if (tmpFile.Format === 'csv')
1384
+ {
1385
+ tmpDelimiter = ',';
1386
+ }
1387
+ else if (tmpFile.Format === 'tsv')
1388
+ {
1389
+ tmpDelimiter = '\t';
1390
+ }
1391
+
1392
+ tmpPlanFiles.push(
1393
+ {
1394
+ fileName: tmpFile.FileName,
1395
+ include: tmpInclude,
1396
+ format: tmpFile.Format,
1397
+ delimiter: tmpDelimiter,
1398
+ recordType: tmpRecordType,
1399
+ order: i + 1,
1400
+ primaryKey: '',
1401
+ foreignKeys: [],
1402
+ notes: ''
1403
+ });
1404
+ }
1405
+
1406
+ // Enrich from README if available
1407
+ try
1408
+ {
1409
+ let tmpReadmePath = libPath.join(tmpDataset.FolderPath, 'README.md');
1410
+ if (libFs.existsSync(tmpReadmePath))
1411
+ {
1412
+ let tmpReadmeContent = libFs.readFileSync(tmpReadmePath, 'utf8');
1413
+ let tmpReadme = tmpSelf.parseReadme(tmpReadmeContent);
1414
+
1415
+ // Parse schema for per-file metadata
1416
+ let tmpSchemaFiles = tmpSelf.parseSchemaForFiles(tmpReadme.Schema);
1417
+
1418
+ // Parse ingestion notes for keys and load order
1419
+ let tmpIngestionHints = tmpSelf.parseIngestionNotes(tmpReadme.IngestionNotes);
1420
+
1421
+ // Match schema info to plan entries
1422
+ for (let i = 0; i < tmpPlanFiles.length; i++)
1423
+ {
1424
+ let tmpEntry = tmpPlanFiles[i];
1425
+ let tmpLowerType = tmpEntry.recordType.toLowerCase();
1426
+ let tmpLowerFileName = tmpEntry.fileName.toLowerCase();
1427
+ let tmpBaseFileName = libPath.basename(tmpLowerFileName);
1428
+
1429
+ // Try to match by recordType or full filename
1430
+ let tmpSchemaMatch = tmpSchemaFiles[tmpLowerType]
1431
+ || tmpSchemaFiles[tmpBaseFileName]
1432
+ || tmpSchemaFiles[tmpBaseFileName.replace(/\.\w+$/, '')];
1433
+
1434
+ if (tmpSchemaMatch)
1435
+ {
1436
+ // Schema info found — no direct mapping to primaryKey here,
1437
+ // but confirms this is a recognized file
1438
+ }
1439
+
1440
+ // Match primary key from ingestion notes
1441
+ if (tmpIngestionHints.primaryKeys[tmpLowerType])
1442
+ {
1443
+ tmpEntry.primaryKey = tmpIngestionHints.primaryKeys[tmpLowerType];
1444
+ }
1445
+
1446
+ // Match foreign keys
1447
+ if (tmpIngestionHints.foreignKeys[tmpLowerType])
1448
+ {
1449
+ tmpEntry.foreignKeys = tmpIngestionHints.foreignKeys[tmpLowerType];
1450
+ }
1451
+ }
1452
+
1453
+ // Apply load order if available
1454
+ if (tmpIngestionHints.loadOrder.length > 0)
1455
+ {
1456
+ let tmpOrderMap = {};
1457
+ for (let i = 0; i < tmpIngestionHints.loadOrder.length; i++)
1458
+ {
1459
+ tmpOrderMap[tmpIngestionHints.loadOrder[i]] = i + 1;
1460
+ }
1461
+
1462
+ // Reorder: items in loadOrder get low order numbers, others follow
1463
+ let tmpNextOrder = tmpIngestionHints.loadOrder.length + 1;
1464
+ for (let i = 0; i < tmpPlanFiles.length; i++)
1465
+ {
1466
+ let tmpLowerType = tmpPlanFiles[i].recordType.toLowerCase();
1467
+ if (tmpOrderMap[tmpLowerType] !== undefined)
1468
+ {
1469
+ tmpPlanFiles[i].order = tmpOrderMap[tmpLowerType];
1470
+ }
1471
+ else
1472
+ {
1473
+ tmpPlanFiles[i].order = tmpNextOrder++;
1474
+ }
1475
+ }
1476
+ }
1477
+
1478
+ // Override delimiter from README DataFormat if all files share a format
1479
+ if (tmpReadme.DataFormat && tmpReadme.DataFormat.Delimiter)
1480
+ {
1481
+ for (let i = 0; i < tmpPlanFiles.length; i++)
1482
+ {
1483
+ if (tmpPlanFiles[i].format === 'csv' || tmpPlanFiles[i].format === 'tsv' || tmpPlanFiles[i].format === 'text')
1484
+ {
1485
+ tmpPlanFiles[i].delimiter = tmpReadme.DataFormat.Delimiter;
1486
+ }
1487
+ }
1488
+ }
1489
+ }
1490
+ }
1491
+ catch (pEnrichError)
1492
+ {
1493
+ tmpSelf.log.warn(`Ingestion plan enrichment failed for ${pFolderName}: ${pEnrichError.message}`);
1494
+ }
1495
+
1496
+ // Sort by order
1497
+ tmpPlanFiles.sort((a, b) => a.order - b.order);
1498
+
1499
+ let tmpPlan = {
1500
+ version: 1,
1501
+ generatedAt: new Date().toISOString(),
1502
+ modifiedAt: null,
1503
+ autoGenerated: true,
1504
+ files: tmpPlanFiles
1505
+ };
1506
+
1507
+ // Write the plan to disk
1508
+ if (libFs.existsSync(tmpDataDir))
1509
+ {
1510
+ tmpSelf.writeIngestionPlan(tmpDataDir, tmpPlan);
1511
+ }
1512
+
1513
+ return fCallback(null, tmpPlan);
1514
+ });
1515
+ }
1516
+
1517
+ /**
1518
+ * Ingest a single data file with decompression support.
1519
+ *
1520
+ * @param {object} pFile - Data file object from resolveDataFiles
1521
+ * @param {object} pDataset - The discovered dataset
1522
+ * @param {object} pOptions - Ingestion options { format, delimiter, type }
1523
+ * @param {function} fCallback - Callback(pError, pResult)
1524
+ */
1525
+ ingestSingleFile(pFile, pDataset, pOptions, fCallback)
1526
+ {
1527
+ let tmpSelf = this;
1528
+ let tmpFilePath = pFile.FullPath;
1529
+
1530
+ this.fable.log.info(`SourceFolderScanner: ingesting ${pFile.FileName} from ${pDataset.FolderName}`);
1531
+
1532
+ // Handle compressed .gz files
1533
+ if (pFile.Compressed && tmpFilePath.endsWith('.gz') && !tmpFilePath.endsWith('.tar.gz'))
1534
+ {
1535
+ let tmpDecompressedPath = tmpFilePath.slice(0, -3);
1536
+
1537
+ let tmpInput = libFs.createReadStream(tmpFilePath);
1538
+ let tmpGunzip = libZlib.createGunzip();
1539
+ let tmpOutput = libFs.createWriteStream(tmpDecompressedPath);
1540
+
1541
+ tmpInput.pipe(tmpGunzip).pipe(tmpOutput);
1542
+
1543
+ tmpOutput.on('finish',
1544
+ () =>
1545
+ {
1546
+ tmpSelf.fable.RetoldFactoIngestEngine.ingestFile(
1547
+ tmpDecompressedPath,
1548
+ pDataset.IDDataset,
1549
+ pDataset.IDSource,
1550
+ pOptions,
1551
+ fCallback);
1552
+ });
1553
+
1554
+ tmpOutput.on('error', (pErr) => fCallback(pErr));
1555
+ tmpGunzip.on('error', (pErr) => fCallback(pErr));
1556
+ }
1557
+ else if (!pFile.Compressed || pFile.Format !== 'archive')
1558
+ {
1559
+ this.fable.RetoldFactoIngestEngine.ingestFile(
1560
+ tmpFilePath,
1561
+ pDataset.IDDataset,
1562
+ pDataset.IDSource,
1563
+ pOptions,
1564
+ fCallback);
1565
+ }
1566
+ else
1567
+ {
1568
+ return fCallback(new Error(`Cannot directly ingest archive file ${pFile.FileName}. Extract first.`));
1569
+ }
1570
+ }
1571
+
1572
+ /**
1573
+ * Ingest a dataset's data files into the facto database.
1574
+ * Supports both single-file (legacy) and multi-file plan-based ingestion.
1575
+ *
1576
+ * @param {string} pFolderName - The dataset folder name
1577
+ * @param {object} pOptions - { fileName, format, useIngestionPlan }
1578
+ * @param {function} fCallback - Callback(pError, pResult)
1579
+ */
1580
+ ingestDataset(pFolderName, pOptions, fCallback)
1581
+ {
1582
+ let tmpDataset = this.getDiscoveredDatasetByName(pFolderName);
1583
+
1584
+ if (!tmpDataset)
1585
+ {
1586
+ return fCallback(new Error(`Dataset not found: ${pFolderName}`));
1587
+ }
1588
+
1589
+ if (!tmpDataset.IDDataset || !tmpDataset.IDSource)
1590
+ {
1591
+ // Auto-provision first
1592
+ return this.provisionDataset(pFolderName,
1593
+ (pProvError) =>
1594
+ {
1595
+ if (pProvError)
1596
+ {
1597
+ return fCallback(pProvError);
1598
+ }
1599
+ return this.ingestDataset(pFolderName, pOptions, fCallback);
1600
+ });
1601
+ }
1602
+
1603
+ if (!tmpDataset.HasData)
1604
+ {
1605
+ return fCallback(new Error(`Dataset ${pFolderName} has no data files. Download first.`));
1606
+ }
1607
+
1608
+ if (!this.fable.RetoldFactoIngestEngine)
1609
+ {
1610
+ return fCallback(new Error('IngestEngine not initialized'));
1611
+ }
1612
+
1613
+ let tmpOptions = pOptions || {};
1614
+ let tmpSelf = this;
1615
+ let tmpDataDir = libPath.join(tmpDataset.FolderPath, 'data');
1616
+
1617
+ // Pre-ingest: extract any archive files so selectBestDataFile /
1618
+ // ingestFromPlan see the extracted contents rather than the archives.
1619
+ this.extractArchivesIfNeeded(tmpDataset,
1620
+ (pExtractError) =>
1621
+ {
1622
+ if (pExtractError)
1623
+ {
1624
+ tmpSelf.fable.log.warn(`SourceFolderScanner: archive extraction error for ${pFolderName}: ${pExtractError.message}`);
1625
+ }
1626
+
1627
+ // Check if we should use the ingestion plan
1628
+ let tmpExistingPlan = tmpSelf.readIngestionPlan(tmpDataDir);
1629
+ let tmpUsePlan = tmpOptions.useIngestionPlan || (tmpExistingPlan && !tmpOptions.fileName);
1630
+
1631
+ if (tmpUsePlan)
1632
+ {
1633
+ return tmpSelf.ingestFromPlan(pFolderName, tmpExistingPlan, fCallback);
1634
+ }
1635
+
1636
+ // Legacy single-file ingestion path
1637
+ let tmpFile = null;
1638
+ if (tmpOptions.fileName)
1639
+ {
1640
+ tmpFile = tmpDataset.DataFiles.find((f) => f.FileName === tmpOptions.fileName);
1641
+ }
1642
+ else
1643
+ {
1644
+ tmpFile = tmpSelf.selectBestDataFile(tmpDataset.DataFiles);
1645
+ }
1646
+
1647
+ if (!tmpFile)
1648
+ {
1649
+ return fCallback(new Error(`No suitable data file found in ${pFolderName}`));
1650
+ }
1651
+
1652
+ tmpSelf.ingestSingleFile(tmpFile, tmpDataset, tmpOptions,
1653
+ (pIngestError, pResult) =>
1654
+ {
1655
+ if (!pIngestError)
1656
+ {
1657
+ tmpDataset.Status = 'Ingested';
1658
+ tmpDataset.IngestedAt = new Date().toISOString();
1659
+ }
1660
+ return fCallback(pIngestError, pResult);
1661
+ });
1662
+ });
1663
+ }
1664
+
1665
+ /**
1666
+ * Ingest multiple files from a dataset using its ingestion plan.
1667
+ *
1668
+ * @param {string} pFolderName - The dataset folder name
1669
+ * @param {object|null} pPlan - Existing plan, or null to auto-generate
1670
+ * @param {function} fCallback - Callback(pError, pResult)
1671
+ */
1672
+ ingestFromPlan(pFolderName, pPlan, fCallback)
1673
+ {
1674
+ let tmpSelf = this;
1675
+ let tmpDataset = this.getDiscoveredDatasetByName(pFolderName);
1676
+
1677
+ if (!tmpDataset)
1678
+ {
1679
+ return fCallback(new Error(`Dataset not found: ${pFolderName}`));
1680
+ }
1681
+
1682
+ let tmpExecutePlan = (pPlanToUse) =>
1683
+ {
1684
+ // Filter to included files and sort by order
1685
+ let tmpIncluded = (pPlanToUse.files || [])
1686
+ .filter((pEntry) => pEntry.include)
1687
+ .sort((a, b) => (a.order || 0) - (b.order || 0));
1688
+
1689
+ if (tmpIncluded.length === 0)
1690
+ {
1691
+ return fCallback(new Error(`Ingestion plan for ${pFolderName} has no included files.`));
1692
+ }
1693
+
1694
+ tmpSelf.fable.log.info(`SourceFolderScanner: ingesting ${tmpIncluded.length} files from plan for ${pFolderName}`);
1695
+
1696
+ let tmpAnticipate = tmpSelf.fable.newAnticipate();
1697
+ let tmpFileResults = [];
1698
+ let tmpFilesIngested = 0;
1699
+ let tmpFilesErrored = 0;
1700
+ let tmpTotalRecords = 0;
1701
+
1702
+ for (let i = 0; i < tmpIncluded.length; i++)
1703
+ {
1704
+ let tmpPlanEntry = tmpIncluded[i];
1705
+
1706
+ tmpAnticipate.anticipate(
1707
+ (fStep) =>
1708
+ {
1709
+ // Find the matching data file
1710
+ let tmpFile = tmpDataset.DataFiles.find(
1711
+ (pF) => pF.FileName === tmpPlanEntry.fileName);
1712
+
1713
+ if (!tmpFile)
1714
+ {
1715
+ tmpSelf.fable.log.warn(` Skipping ${tmpPlanEntry.fileName}: file not found`);
1716
+ tmpFilesErrored++;
1717
+ tmpFileResults.push(
1718
+ {
1719
+ fileName: tmpPlanEntry.fileName,
1720
+ recordType: tmpPlanEntry.recordType,
1721
+ recordsIngested: 0,
1722
+ error: 'File not found'
1723
+ });
1724
+ return fStep();
1725
+ }
1726
+
1727
+ let tmpFileOptions = {
1728
+ format: tmpPlanEntry.format,
1729
+ type: tmpPlanEntry.recordType
1730
+ };
1731
+ if (tmpPlanEntry.delimiter)
1732
+ {
1733
+ tmpFileOptions.delimiter = tmpPlanEntry.delimiter;
1734
+ }
1735
+
1736
+ tmpSelf.fable.log.info(` [${tmpFilesIngested + tmpFilesErrored + 1}/${tmpIncluded.length}] ${tmpPlanEntry.fileName} as "${tmpPlanEntry.recordType}"`);
1737
+
1738
+ tmpSelf.ingestSingleFile(tmpFile, tmpDataset, tmpFileOptions,
1739
+ (pIngestError, pResult) =>
1740
+ {
1741
+ if (pIngestError)
1742
+ {
1743
+ tmpFilesErrored++;
1744
+ tmpFileResults.push(
1745
+ {
1746
+ fileName: tmpPlanEntry.fileName,
1747
+ recordType: tmpPlanEntry.recordType,
1748
+ recordsIngested: 0,
1749
+ error: pIngestError.message
1750
+ });
1751
+ }
1752
+ else
1753
+ {
1754
+ tmpFilesIngested++;
1755
+ let tmpCount = (pResult && pResult.Ingested) ? pResult.Ingested : 0;
1756
+ tmpTotalRecords += tmpCount;
1757
+ tmpFileResults.push(
1758
+ {
1759
+ fileName: tmpPlanEntry.fileName,
1760
+ recordType: tmpPlanEntry.recordType,
1761
+ recordsIngested: tmpCount,
1762
+ error: null
1763
+ });
1764
+ }
1765
+ return fStep();
1766
+ });
1767
+ });
1768
+ }
1769
+
1770
+ tmpAnticipate.wait(
1771
+ (pError) =>
1772
+ {
1773
+ if (tmpFilesIngested > 0)
1774
+ {
1775
+ tmpDataset.Status = 'Ingested';
1776
+ tmpDataset.IngestedAt = new Date().toISOString();
1777
+ }
1778
+
1779
+ return fCallback(null,
1780
+ {
1781
+ Success: tmpFilesIngested > 0,
1782
+ FolderName: pFolderName,
1783
+ FilesIngested: tmpFilesIngested,
1784
+ FilesErrored: tmpFilesErrored,
1785
+ TotalRecords: tmpTotalRecords,
1786
+ FileResults: tmpFileResults
1787
+ });
1788
+ });
1789
+ };
1790
+
1791
+ // Use provided plan or generate one
1792
+ if (pPlan)
1793
+ {
1794
+ return tmpExecutePlan(pPlan);
1795
+ }
1796
+
1797
+ this.generateIngestionPlan(pFolderName,
1798
+ (pGenError, pGeneratedPlan) =>
1799
+ {
1800
+ if (pGenError)
1801
+ {
1802
+ return fCallback(pGenError);
1803
+ }
1804
+ return tmpExecutePlan(pGeneratedPlan);
1805
+ });
1806
+ }
1807
+
1808
+ // ================================================================
1809
+ // Download Support
1810
+ // ================================================================
1811
+
1812
+ /**
1813
+ * Download data files for a dataset that has no cached data.
1814
+ * Matches the folder name to the download-catalog.json.
1815
+ *
1816
+ * @param {string} pFolderName - The dataset folder name
1817
+ * @param {function} fCallback - Callback(pError, pResult)
1818
+ */
1819
+ downloadDataset(pFolderName, fCallback)
1820
+ {
1821
+ let tmpDataset = this.getDiscoveredDatasetByName(pFolderName);
1822
+
1823
+ if (!tmpDataset)
1824
+ {
1825
+ return fCallback(new Error(`Dataset not found: ${pFolderName}`));
1826
+ }
1827
+
1828
+ if (!this.fable.RetoldFactoDataLakeService)
1829
+ {
1830
+ return fCallback(new Error('DataLakeService not initialized'));
1831
+ }
1832
+
1833
+ let tmpDataLake = this.fable.RetoldFactoDataLakeService;
1834
+ let tmpSelf = this;
1835
+
1836
+ // Try to match folder name to a catalog entry
1837
+ let tmpCatalog = tmpDataLake.loadCatalog();
1838
+ if (!tmpCatalog)
1839
+ {
1840
+ return fCallback(new Error('Could not load download catalog'));
1841
+ }
1842
+
1843
+ let tmpEntries = tmpDataLake.flattenCatalog(tmpCatalog, { id: pFolderName });
1844
+
1845
+ if (tmpEntries.length === 0)
1846
+ {
1847
+ return fCallback(new Error(`No catalog entry found for ${pFolderName}. Cannot auto-download.`));
1848
+ }
1849
+
1850
+ let tmpEntry = tmpEntries[0];
1851
+
1852
+ // Override the data directory to point to our source_research data/ folder
1853
+ let tmpDataDir = libPath.join(tmpDataset.FolderPath, 'data');
1854
+ if (!libFs.existsSync(tmpDataDir))
1855
+ {
1856
+ libFs.mkdirSync(tmpDataDir, { recursive: true });
1857
+ }
1858
+
1859
+ this.fable.log.info(`SourceFolderScanner: downloading ${pFolderName} via DataLakeService`);
1860
+
1861
+ // Use DataLakeService's download methods directly
1862
+ tmpDataLake.downloadDataset(tmpEntry,
1863
+ (pError, pManifest) =>
1864
+ {
1865
+ if (pError)
1866
+ {
1867
+ return fCallback(pError);
1868
+ }
1869
+
1870
+ // Re-scan the data files
1871
+ tmpSelf.resolveDataFiles(tmpDataset.FolderPath,
1872
+ (pFileError, pFiles) =>
1873
+ {
1874
+ tmpDataset.DataFiles = pFiles || [];
1875
+ tmpDataset.TotalDataSize = 0;
1876
+ for (let i = 0; i < tmpDataset.DataFiles.length; i++)
1877
+ {
1878
+ tmpDataset.TotalDataSize += tmpDataset.DataFiles[i].Size;
1879
+ }
1880
+ tmpDataset.HasData = tmpDataset.DataFiles.length > 0;
1881
+ tmpDataset.NeedsDownload = !tmpDataset.HasData;
1882
+
1883
+ return fCallback(null,
1884
+ {
1885
+ Success: true,
1886
+ FolderName: pFolderName,
1887
+ FilesDownloaded: tmpDataset.DataFiles.length,
1888
+ TotalSize: tmpDataset.TotalDataSize
1889
+ });
1890
+ });
1891
+ });
1892
+ }
1893
+
1894
+ // ================================================================
1895
+ // REST API Routes
1896
+ // ================================================================
1897
+
1898
+ /**
1899
+ * Format a byte size for display.
1900
+ */
1901
+ formatSize(pBytes)
1902
+ {
1903
+ if (pBytes === 0)
1904
+ {
1905
+ return '0 B';
1906
+ }
1907
+ let tmpUnits = ['B', 'KB', 'MB', 'GB', 'TB'];
1908
+ let tmpIndex = Math.floor(Math.log(pBytes) / Math.log(1024));
1909
+ return `${(pBytes / Math.pow(1024, tmpIndex)).toFixed(1)} ${tmpUnits[tmpIndex]}`;
1910
+ }
1911
+
1912
/**
 * Connect REST API routes.
 *
 * Registers the scanner endpoints under `${RoutePrefix}/scanner/...`:
 *   GET    /scanner/paths                               list scan paths
 *   POST   /scanner/path                                add a scan path (body.Path)
 *   DELETE /scanner/path                                remove a scan path (body.Path)
 *   POST   /scanner/rescan                              re-scan body.Path, or every path
 *   GET    /scanner/datasets                            summary list (status/search/hasData filters)
 *   GET    /scanner/dataset/:FolderName                 full dataset detail
 *   POST   /scanner/dataset/:FolderName/provision       provision one dataset
 *   GET    /scanner/dataset/:FolderName/ingestion-plan  get or generate the plan
 *   PUT    /scanner/dataset/:FolderName/ingestion-plan  save an edited plan
 *   POST   /scanner/dataset/:FolderName/ingest          ingest one dataset
 *   POST   /scanner/dataset/:FolderName/download        download data if missing
 *   POST   /scanner/provision-all                       provision every 'Discovered' dataset
 *
 * Failures are reported in the response body as `{ Error: <message> }`.
 *
 * @param {object} pOratorServiceServer - The Orator service server instance
 */
connectRoutes(pOratorServiceServer)
{
	let tmpRoutePrefix = this.options.RoutePrefix;
	let tmpSelf = this;

	// Send an error payload and finish the request.
	let fSendError = (pResponse, fNext, pMessage) =>
	{
		pResponse.send({ Error: pMessage });
		return fNext();
	};

	// Build the standard (pError, pResult) completion callback for a route:
	// errors become { Error: message }; results are sent as-is, or wrapped
	// by fShapeResult when the route uses an envelope.
	let fRespond = (pResponse, fNext, fShapeResult) =>
	{
		return (pError, pResult) =>
		{
			if (pError)
			{
				return fSendError(pResponse, fNext, pError.message);
			}
			pResponse.send(fShapeResult ? fShapeResult(pResult) : pResult);
			return fNext();
		};
	};

	// GET /facto/scanner/paths — list all scan paths
	pOratorServiceServer.doGet(`${tmpRoutePrefix}/scanner/paths`,
		(pRequest, pResponse, fNext) =>
		{
			pResponse.send(
				{
					Count: tmpSelf.scanPaths.length,
					Paths: tmpSelf.scanPaths
				});
			return fNext();
		});

	// POST /facto/scanner/path — add a scan path
	pOratorServiceServer.doPost(`${tmpRoutePrefix}/scanner/path`,
		(pRequest, pResponse, fNext) =>
		{
			let tmpPath = (pRequest.body || {}).Path;

			if (!tmpPath)
			{
				return fSendError(pResponse, fNext, 'Path is required');
			}

			tmpSelf.addScanPath(tmpPath,
				fRespond(pResponse, fNext, (pResult) => ({ Success: true, ScanResult: pResult })));
		});

	// DELETE /facto/scanner/path — remove a scan path
	pOratorServiceServer.doDel(`${tmpRoutePrefix}/scanner/path`,
		(pRequest, pResponse, fNext) =>
		{
			let tmpPath = (pRequest.body || {}).Path;

			if (!tmpPath)
			{
				return fSendError(pResponse, fNext, 'Path is required');
			}

			tmpSelf.removeScanPath(tmpPath,
				fRespond(pResponse, fNext, (pResult) => ({ Success: true, Result: pResult })));
		});

	// POST /facto/scanner/rescan — re-scan all paths or a specific one
	pOratorServiceServer.doPost(`${tmpRoutePrefix}/scanner/rescan`,
		(pRequest, pResponse, fNext) =>
		{
			let tmpBody = pRequest.body || {};

			if (tmpBody.Path)
			{
				tmpSelf.scanPath(tmpBody.Path,
					fRespond(pResponse, fNext, (pResult) => ({ Success: true, ScanResult: pResult })));
			}
			else
			{
				tmpSelf.scanAllPaths(
					fRespond(pResponse, fNext, (pResults) => ({ Success: true, ScanResults: pResults })));
			}
		});

	// GET /facto/scanner/datasets — list all discovered datasets
	pOratorServiceServer.doGet(`${tmpRoutePrefix}/scanner/datasets`,
		(pRequest, pResponse, fNext) =>
		{
			let tmpQuery = pRequest.query || {};
			let tmpFilter = {};

			if (tmpQuery.status)
			{
				tmpFilter.status = tmpQuery.status;
			}
			if (tmpQuery.search)
			{
				tmpFilter.search = tmpQuery.search;
			}
			if (tmpQuery.hasData !== undefined)
			{
				// Query values arrive as strings; only the literal 'true' means true.
				tmpFilter.hasData = tmpQuery.hasData === 'true';
			}

			let tmpDatasets = tmpSelf.getDiscoveredDatasets(tmpFilter);

			// Return summary view (omit large schema/description fields)
			let tmpSummaries = tmpDatasets.map(
				(pDs) =>
				{
					return (
						{
							FolderName: pDs.FolderName,
							Title: pDs.Title,
							Provider: pDs.Provider,
							License: pDs.License,
							Status: pDs.Status,
							HasData: pDs.HasData,
							NeedsDownload: pDs.NeedsDownload,
							// Tolerate a dataset whose DataFiles list is not set.
							DataFileCount: (pDs.DataFiles || []).length,
							TotalDataSize: pDs.TotalDataSize,
							TotalDataSizeFormatted: tmpSelf.formatSize(pDs.TotalDataSize),
							DataFormat: pDs.DataFormat,
							RecordCount: pDs.RecordCount,
							IDSource: pDs.IDSource,
							IDDataset: pDs.IDDataset,
							DiscoveredAt: pDs.DiscoveredAt
						});
				});

			pResponse.send({ Count: tmpSummaries.length, Datasets: tmpSummaries });
			return fNext();
		});

	// GET /facto/scanner/dataset/:FolderName — get full dataset detail
	pOratorServiceServer.doGet(`${tmpRoutePrefix}/scanner/dataset/:FolderName`,
		(pRequest, pResponse, fNext) =>
		{
			let tmpDataset = tmpSelf.getDiscoveredDatasetByName(pRequest.params.FolderName);

			if (!tmpDataset)
			{
				return fSendError(pResponse, fNext, `Dataset not found: ${pRequest.params.FolderName}`);
			}

			pResponse.send(tmpDataset);
			return fNext();
		});

	// POST /facto/scanner/dataset/:FolderName/provision — provision a dataset
	pOratorServiceServer.doPost(`${tmpRoutePrefix}/scanner/dataset/:FolderName/provision`,
		(pRequest, pResponse, fNext) =>
		{
			tmpSelf.provisionDataset(pRequest.params.FolderName, fRespond(pResponse, fNext));
		});

	// GET /facto/scanner/dataset/:FolderName/ingestion-plan — get or generate ingestion plan
	pOratorServiceServer.doGet(`${tmpRoutePrefix}/scanner/dataset/:FolderName/ingestion-plan`,
		(pRequest, pResponse, fNext) =>
		{
			tmpSelf.generateIngestionPlan(pRequest.params.FolderName, fRespond(pResponse, fNext));
		});

	// PUT /facto/scanner/dataset/:FolderName/ingestion-plan — save edited ingestion plan
	pOratorServiceServer.doPut(`${tmpRoutePrefix}/scanner/dataset/:FolderName/ingestion-plan`,
		(pRequest, pResponse, fNext) =>
		{
			let tmpDataset = tmpSelf.getDiscoveredDatasetByName(pRequest.params.FolderName);

			if (!tmpDataset)
			{
				return fSendError(pResponse, fNext, `Dataset not found: ${pRequest.params.FolderName}`);
			}

			let tmpPlan = pRequest.body;

			if (!tmpPlan || !Array.isArray(tmpPlan.files))
			{
				return fSendError(pResponse, fNext, 'Invalid plan: files array is required');
			}

			// Mark as user-modified
			tmpPlan.autoGenerated = false;
			tmpPlan.modifiedAt = new Date().toISOString();

			let tmpDataDir = libPath.join(tmpDataset.FolderPath, 'data');
			try
			{
				// Called synchronously with no callback; if it throws, answer
				// the request with the error instead of leaving it unanswered.
				tmpSelf.writeIngestionPlan(tmpDataDir, tmpPlan);
			}
			catch (pWriteError)
			{
				return fSendError(pResponse, fNext, pWriteError.message);
			}

			pResponse.send({ Success: true, Plan: tmpPlan });
			return fNext();
		});

	// POST /facto/scanner/dataset/:FolderName/ingest — ingest a dataset
	pOratorServiceServer.doPost(`${tmpRoutePrefix}/scanner/dataset/:FolderName/ingest`,
		(pRequest, pResponse, fNext) =>
		{
			let tmpOptions = pRequest.body || {};

			tmpSelf.ingestDataset(pRequest.params.FolderName, tmpOptions, fRespond(pResponse, fNext));
		});

	// POST /facto/scanner/dataset/:FolderName/download — download data if missing
	pOratorServiceServer.doPost(`${tmpRoutePrefix}/scanner/dataset/:FolderName/download`,
		(pRequest, pResponse, fNext) =>
		{
			tmpSelf.downloadDataset(pRequest.params.FolderName, fRespond(pResponse, fNext));
		});

	// POST /facto/scanner/provision-all — provision all discovered datasets
	pOratorServiceServer.doPost(`${tmpRoutePrefix}/scanner/provision-all`,
		(pRequest, pResponse, fNext) =>
		{
			let tmpDatasets = tmpSelf.getDiscoveredDatasets({ status: 'Discovered' });
			let tmpAnticipate = tmpSelf.fable.newAnticipate();
			let tmpProvisioned = 0;
			let tmpErrors = 0;

			for (let i = 0; i < tmpDatasets.length; i++)
			{
				let tmpFolderName = tmpDatasets[i].FolderName;

				tmpAnticipate.anticipate(
					(fStep) =>
					{
						tmpSelf.provisionDataset(tmpFolderName,
							(pError) =>
							{
								// A failed dataset is tallied, not forwarded, so
								// the remaining datasets are still attempted.
								if (pError)
								{
									tmpErrors++;
								}
								else
								{
									tmpProvisioned++;
								}
								return fStep();
							});
					});
			}

			tmpAnticipate.wait(
				() =>
				{
					pResponse.send(
						{
							Success: true,
							Provisioned: tmpProvisioned,
							Errors: tmpErrors,
							Total: tmpDatasets.length
						});
					return fNext();
				});
		});

	this.fable.log.info(`SourceFolderScanner routes connected at ${tmpRoutePrefix}/scanner/*`);
}
2239
+ }
2240
+
2241
// Export the scanner class; the service type name and the default options
// are attached to the class itself so they travel with the export.
module.exports = RetoldFactoSourceFolderScanner;
RetoldFactoSourceFolderScanner.serviceType = 'RetoldFactoSourceFolderScanner';
RetoldFactoSourceFolderScanner.default_configuration = defaultSourceFolderScannerOptions;