@sanity/export 6.0.2 → 6.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/bin/detect-corrupt.js +109 -0
- package/dist/detectCorruption.d.ts +64 -0
- package/dist/detectCorruption.d.ts.map +1 -0
- package/dist/detectCorruption.js +193 -0
- package/dist/detectCorruption.js.map +1 -0
- package/dist/util/streamHelpers.d.ts.map +1 -1
- package/dist/util/streamHelpers.js +5 -1
- package/dist/util/streamHelpers.js.map +1 -1
- package/package.json +8 -2
- package/src/detectCorruption.ts +258 -0
- package/src/util/streamHelpers.ts +6 -1
package/LICENSE
CHANGED
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* CLI tool to detect potentially corrupted export files caused by UTF-8
|
|
5
|
+
* multi-byte characters being split across chunk boundaries during streaming.
|
|
6
|
+
*
|
|
7
|
+
* Usage:
|
|
8
|
+
* npx -p @sanity/export detect-corrupt <file.ndjson|file.tar.gz|directory>
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import {existsSync} from 'node:fs'
|
|
12
|
+
import {detectCorruption} from '../dist/detectCorruption.js'
|
|
13
|
+
|
|
14
|
+
const REPLACEMENT_CHAR_DISPLAY = '�'
|
|
15
|
+
|
|
16
|
+
/**
 * Prints the CLI help text to stdout.
 *
 * The examples use `npx -p @sanity/export detect-corrupt …`. With the shorter
 * `npx @sanity/export detect-corrupt <path>`, npx resolves the package's bin
 * itself and forwards the literal word `detect-corrupt` as the first argument,
 * which this script would then treat as the path to scan.
 */
function printUsage() {
  console.log(`
Usage: detect-corrupt <file.ndjson|file.tar.gz|directory>

Detects potentially corrupted export files caused by UTF-8 multi-byte
characters being split across chunk boundaries during streaming.

The corruption manifests as U+FFFD replacement characters (${REPLACEMENT_CHAR_DISPLAY}) appearing
where valid multi-byte characters should be.

Supported inputs:
- .tar.gz or .tgz archive (scans data.ndjson and assets.json inside)
- .ndjson file
- Directory containing data.ndjson and/or assets.json

Examples:
npx -p @sanity/export detect-corrupt export.tar.gz
npx -p @sanity/export detect-corrupt data.ndjson
npx -p @sanity/export detect-corrupt ./my-export-folder
`)
}
|
|
37
|
+
|
|
38
|
+
/**
 * Prints the corruption findings for a single scanned file to stdout.
 *
 * Only the first ten findings are listed in detail; any remainder is
 * summarised in one trailing line so large reports stay readable.
 *
 * @param filename - Name of the file the findings belong to
 * @param corruptions - Findings for that file (line, column, count, context)
 */
function printReport(filename, corruptions) {
  const limit = 10
  const hiddenCount = corruptions.length - limit
  // Control characters that would break the single-line context display
  const escapes = {'\n': '\\n', '\r': '\\r', '\t': '\\t'}

  console.log(`\n ${filename}:`)

  corruptions.slice(0, limit).forEach(({line, column, count, context}) => {
    console.log(` Line ${line}, col ${column}: ${count} replacement char(s)`)
    const printable = context.replace(/[\n\r\t]/g, (ch) => escapes[ch])
    console.log(` Context: "...${printable}..."`)
  })

  if (hiddenCount > 0) {
    console.log(` ... and ${hiddenCount} more occurrences`)
  }
}
|
|
59
|
+
|
|
60
|
+
/**
 * CLI entry point.
 *
 * Shows the help text when called without arguments or with `--help`/`-h`,
 * otherwise scans the path given as the first argument and prints a report.
 *
 * Exit codes: 0 when help was shown or no corruption was found; 1 when the
 * path does not exist, corruption was found, or the scan itself failed.
 */
async function main() {
  const cliArgs = process.argv.slice(2)
  const wantsHelp =
    cliArgs.length === 0 || cliArgs.some((arg) => arg === '--help' || arg === '-h')

  if (wantsHelp) {
    printUsage()
    process.exit(0)
  }

  const [filePath] = cliArgs

  if (!existsSync(filePath)) {
    console.error(`Error: File not found: ${filePath}`)
    process.exit(1)
  }

  console.log(`Scanning ${filePath} for UTF-8 corruption...`)

  try {
    const {scannedFiles, corrupted, files, totalCorruptedLines} = await detectCorruption(filePath)

    // List what was actually looked at, so "no corruption" is meaningful
    if (scannedFiles.length > 0) {
      console.log(`\nScanned files:`)
      scannedFiles.forEach((file) => {
        console.log(` - ${file}`)
      })
    }

    if (!corrupted) {
      console.log('\n✓ No corruption detected')
      process.exit(0)
    }

    console.log(`\n✗ Found potential corruption in ${files.size} file(s):`)

    files.forEach((corruptions, filename) => {
      printReport(filename, corruptions)
    })

    console.log(`\nTotal: ${totalCorruptedLines} line(s) with replacement characters`)
    console.log('\nNote: U+FFFD replacement characters indicate where multi-byte')
    console.log('UTF-8 sequences were corrupted during export streaming.')
    process.exit(1)
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err)
    console.error('Error:', message)
    process.exit(1)
  }
}

main()
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Information about corruption found on a specific line
|
|
3
|
+
* @public
|
|
4
|
+
*/
|
|
5
|
+
export interface CorruptionInfo {
|
|
6
|
+
/** Line number (1-indexed) */
|
|
7
|
+
line: number;
|
|
8
|
+
/** Column position of first replacement char */
|
|
9
|
+
column: number;
|
|
10
|
+
/** Surrounding text for context */
|
|
11
|
+
context: string;
|
|
12
|
+
/** Number of replacement chars on this line */
|
|
13
|
+
count: number;
|
|
14
|
+
}
|
|
15
|
+
/**
|
|
16
|
+
* Result of scanning a file for corruption
|
|
17
|
+
* @public
|
|
18
|
+
*/
|
|
19
|
+
export interface ScanResult {
|
|
20
|
+
/** Whether corruption was detected */
|
|
21
|
+
corrupted: boolean;
|
|
22
|
+
/** Map of filename to corruption info (for tar.gz, multiple files may be scanned) */
|
|
23
|
+
files: Map<string, CorruptionInfo[]>;
|
|
24
|
+
/** Total number of corrupted lines across all files */
|
|
25
|
+
totalCorruptedLines: number;
|
|
26
|
+
/** List of files that were scanned */
|
|
27
|
+
scannedFiles: string[];
|
|
28
|
+
}
|
|
29
|
+
/**
|
|
30
|
+
* Scans an NDJSON file for UTF-8 corruption
|
|
31
|
+
*
|
|
32
|
+
* @param filePath - Path to the ndjson file
|
|
33
|
+
* @returns Scan result with corruption information
|
|
34
|
+
* @public
|
|
35
|
+
*/
|
|
36
|
+
export declare function scanNdjsonFile(filePath: string): Promise<ScanResult>;
|
|
37
|
+
/**
|
|
38
|
+
* Scans a tar.gz archive for UTF-8 corruption in data.ndjson and asset.json files
|
|
39
|
+
*
|
|
40
|
+
* @param filePath - Path to the tar.gz file
|
|
41
|
+
* @returns Scan result with corruption information
|
|
42
|
+
* @public
|
|
43
|
+
*/
|
|
44
|
+
export declare function scanTarGz(filePath: string): Promise<ScanResult>;
|
|
45
|
+
/**
|
|
46
|
+
* Scans a directory for UTF-8 corruption in data.ndjson and assets.json files
|
|
47
|
+
*
|
|
48
|
+
* @param dirPath - Path to the directory
|
|
49
|
+
* @returns Scan result with corruption information
|
|
50
|
+
* @public
|
|
51
|
+
*/
|
|
52
|
+
export declare function scanDirectory(dirPath: string): Promise<ScanResult>;
|
|
53
|
+
/**
|
|
54
|
+
* Detects UTF-8 corruption in an export file (ndjson, tar.gz, or directory)
|
|
55
|
+
*
|
|
56
|
+
* The corruption manifests as U+FFFD replacement characters appearing
|
|
57
|
+
* where valid multi-byte characters (CJK, emoji, etc.) should be.
|
|
58
|
+
*
|
|
59
|
+
* @param filePath - Path to the file or directory to scan
|
|
60
|
+
* @returns Scan result with corruption information
|
|
61
|
+
* @public
|
|
62
|
+
*/
|
|
63
|
+
export declare function detectCorruption(filePath: string): Promise<ScanResult>;
|
|
64
|
+
//# sourceMappingURL=detectCorruption.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"detectCorruption.d.ts","sourceRoot":"","sources":["../src/detectCorruption.ts"],"names":[],"mappings":"AAWA;;;GAGG;AACH,MAAM,WAAW,cAAc;IAC7B,8BAA8B;IAC9B,IAAI,EAAE,MAAM,CAAA;IACZ,gDAAgD;IAChD,MAAM,EAAE,MAAM,CAAA;IACd,mCAAmC;IACnC,OAAO,EAAE,MAAM,CAAA;IACf,+CAA+C;IAC/C,KAAK,EAAE,MAAM,CAAA;CACd;AAED;;;GAGG;AACH,MAAM,WAAW,UAAU;IACzB,sCAAsC;IACtC,SAAS,EAAE,OAAO,CAAA;IAClB,qFAAqF;IACrF,KAAK,EAAE,GAAG,CAAC,MAAM,EAAE,cAAc,EAAE,CAAC,CAAA;IACpC,uDAAuD;IACvD,mBAAmB,EAAE,MAAM,CAAA;IAC3B,sCAAsC;IACtC,YAAY,EAAE,MAAM,EAAE,CAAA;CACvB;AAmDD;;;;;;GAMG;AACH,wBAAsB,cAAc,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,UAAU,CAAC,CAe1E;AAED;;;;;;GAMG;AACH,wBAAsB,SAAS,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,UAAU,CAAC,CAsErE;AAED;;;;;;GAMG;AACH,wBAAsB,aAAa,CAAC,OAAO,EAAE,MAAM,GAAG,OAAO,CAAC,UAAU,CAAC,CAoCxE;AAED;;;;;;;;;GASG;AACH,wBAAsB,gBAAgB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,UAAU,CAAC,CAS5E"}
|
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
import { createReadStream, existsSync, statSync } from 'node:fs';
|
|
2
|
+
import { basename, join } from 'node:path';
|
|
3
|
+
import { createInterface } from 'node:readline';
|
|
4
|
+
import { createGunzip } from 'node:zlib';
|
|
5
|
+
import tarStream from 'tar-stream';
|
|
6
|
+
// U+FFFD replacement character - appears when invalid UTF-8 sequences are decoded
|
|
7
|
+
const REPLACEMENT_CHAR = '\uFFFD';
|
|
8
|
+
/**
 * Scans a single line for U+FFFD replacement characters.
 *
 * @param line - Text of the line, without its line terminator
 * @param lineNumber - Line number to report in the result
 * @returns Position, count and surrounding context of the replacement
 *   characters on the line, or `null` when the line contains none
 */
function scanLine(line, lineNumber) {
    const index = line.indexOf(REPLACEMENT_CHAR);
    if (index === -1)
        return null;
    // Count total replacement chars on this line
    let count = 0;
    for (const char of line) {
        if (char === REPLACEMENT_CHAR)
            count++;
    }
    const isHighSurrogate = (code) => code >= 0xd800 && code <= 0xdbff;
    const isLowSurrogate = (code) => code >= 0xdc00 && code <= 0xdfff;
    // Extract context around the corruption: 20 code units before the first
    // replacement char and 30 code units from it onward.
    let contextStart = Math.max(0, index - 20);
    let contextEnd = Math.min(line.length, index + 30);
    // Widen the window by one code unit when a boundary falls inside a
    // surrogate pair. A lone surrogate in the excerpt would itself be rendered
    // as U+FFFD when printed, which looks like corruption that is not in the file.
    if (contextStart > 0 &&
        isLowSurrogate(line.charCodeAt(contextStart)) &&
        isHighSurrogate(line.charCodeAt(contextStart - 1))) {
        contextStart--;
    }
    if (contextEnd < line.length &&
        isHighSurrogate(line.charCodeAt(contextEnd - 1)) &&
        isLowSurrogate(line.charCodeAt(contextEnd))) {
        contextEnd++;
    }
    const context = line.slice(contextStart, contextEnd);
    return {
        line: lineNumber,
        // Columns are reported 1-indexed
        column: index + 1,
        context,
        count,
    };
}
|
|
32
|
+
/**
 * Reads a stream (expected to carry UTF-8 text) line by line and collects
 * the findings for every line that contains a replacement character.
 *
 * @param stream - Readable stream to consume
 * @returns Findings in the order the lines were read; empty when the stream is clean
 */
async function scanStream(stream) {
    const lineReader = createInterface({ input: stream, crlfDelay: Infinity });
    const found = [];
    let currentLine = 0;
    for await (const text of lineReader) {
        currentLine += 1;
        const info = scanLine(text, currentLine);
        if (!info)
            continue;
        found.push(info);
    }
    return found;
}
|
|
51
|
+
/**
 * Scans one NDJSON file on disk for UTF-8 corruption.
 *
 * The file is read as UTF-8 and checked line by line. The returned `files`
 * map contains an entry for the file only when at least one line is affected.
 *
 * @param filePath - Path to the ndjson file
 * @returns Scan result with corruption information
 * @public
 */
export async function scanNdjsonFile(filePath) {
    const corruptions = await scanStream(createReadStream(filePath, { encoding: 'utf8' }));
    const hasCorruption = corruptions.length > 0;
    const result = {
        corrupted: hasCorruption,
        files: new Map(),
        totalCorruptedLines: corruptions.length,
        scannedFiles: [filePath],
    };
    if (hasCorruption) {
        result.files.set(filePath, corruptions);
    }
    return result;
}
|
|
72
|
+
/**
 * Scans a tar.gz archive for UTF-8 corruption in its data.ndjson and
 * assets.json entries.
 *
 * Entries are matched by basename, so they are found in whichever folder they
 * sit inside the archive. Each matching entry is buffered fully in memory,
 * decoded as UTF-8 and then checked line by line; all other entries are
 * drained and skipped.
 *
 * @param filePath - Path to the tar.gz file
 * @returns Scan result with corruption information; rejects if reading,
 *   decompressing or untarring the archive fails
 * @public
 */
export async function scanTarGz(filePath) {
    const extract = tarStream.extract();
    const results = new Map();
    const scannedFiles = [];
    // `assets.json` is the name scanDirectory and the CLI help refer to. The
    // earlier `asset.json` spelling is still accepted so nothing that matched
    // before stops matching.
    const targetFiles = ['data.ndjson', 'assets.json', 'asset.json'];
    return new Promise((resolve, reject) => {
        extract.on('entry', (header, stream, next) => {
            const fileBasename = basename(header.name);
            if (targetFiles.includes(fileBasename)) {
                scannedFiles.push(header.name);
                const chunks = [];
                stream.on('data', (chunk) => {
                    chunks.push(chunk);
                });
                stream.on('end', () => {
                    // Decode only after concatenating, so multi-byte sequences that
                    // straddle chunk boundaries are decoded intact.
                    const content = Buffer.concat(chunks).toString('utf8');
                    const lines = content.split(/\r?\n/);
                    const corruptions = [];
                    for (let i = 0; i < lines.length; i++) {
                        const line = lines[i];
                        if (line !== undefined && line.length > 0) {
                            const corruption = scanLine(line, i + 1);
                            if (corruption) {
                                corruptions.push(corruption);
                            }
                        }
                    }
                    if (corruptions.length > 0) {
                        results.set(header.name, corruptions);
                    }
                    next();
                });
                stream.on('error', reject);
            }
            else {
                // Skip this entry: it must still be drained for extraction to continue
                stream.on('end', next);
                stream.resume();
            }
        });
        extract.on('finish', () => {
            let totalCorruptedLines = 0;
            for (const corruptions of results.values()) {
                totalCorruptedLines += corruptions.length;
            }
            resolve({
                corrupted: results.size > 0,
                files: results,
                totalCorruptedLines,
                scannedFiles,
            });
        });
        extract.on('error', reject);
        const gunzip = createGunzip();
        gunzip.on('error', reject);
        // pipe() does not forward errors downstream, so a failing read (missing
        // or unreadable file) has to reject the promise explicitly; without a
        // listener it would surface as an unhandled 'error' event instead.
        const source = createReadStream(filePath);
        source.on('error', reject);
        source.pipe(gunzip).pipe(extract);
    });
}
|
|
138
|
+
/**
 * Scans an export directory for UTF-8 corruption.
 *
 * Looks for data.ndjson and assets.json directly inside the directory and
 * scans each one that exists, merging the per-file results.
 *
 * @param dirPath - Path to the directory
 * @returns Scan result with corruption information
 * @throws Error when the directory contains neither of the two files
 * @public
 */
export async function scanDirectory(dirPath) {
    const foundFiles = ['data.ndjson', 'assets.json']
        .map((filename) => join(dirPath, filename))
        .filter((candidate) => existsSync(candidate));
    if (foundFiles.length === 0) {
        throw new Error(`No data.ndjson or assets.json found in directory: ${dirPath}`);
    }
    const results = new Map();
    const scannedFiles = [];
    let totalCorruptedLines = 0;
    // Files are scanned one after another, in the order listed above
    for (const filePath of foundFiles) {
        const { scannedFiles: scanned, files } = await scanNdjsonFile(filePath);
        scannedFiles.push(...scanned);
        files.forEach((corruptions, file) => {
            results.set(file, corruptions);
            totalCorruptedLines += corruptions.length;
        });
    }
    return {
        corrupted: results.size > 0,
        files: results,
        totalCorruptedLines,
        scannedFiles,
    };
}
|
|
175
|
+
/**
 * Detects UTF-8 corruption in an export file (ndjson, tar.gz, or directory)
 *
 * The corruption manifests as U+FFFD replacement characters appearing
 * where valid multi-byte characters (CJK, emoji, etc.) should be.
 *
 * Dispatch: directories go to `scanDirectory`, paths ending in `.tar.gz` or
 * `.tgz` (in any letter case) go to `scanTarGz`, everything else is scanned
 * as a plain line-based text file.
 *
 * @param filePath - Path to the file or directory to scan
 * @returns Scan result with corruption information; rejects if the path
 *   cannot be stat'ed
 * @public
 */
export async function detectCorruption(filePath) {
    const stat = statSync(filePath);
    if (stat.isDirectory()) {
        return scanDirectory(filePath);
    }
    // Compare the extension case-insensitively so archives named e.g.
    // `EXPORT.TGZ` are not scanned as if they were plain text.
    const lowerCasedPath = filePath.toLowerCase();
    const isGzip = lowerCasedPath.endsWith('.tar.gz') || lowerCasedPath.endsWith('.tgz');
    return isGzip ? scanTarGz(filePath) : scanNdjsonFile(filePath);
}
|
|
193
|
+
//# sourceMappingURL=detectCorruption.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"detectCorruption.js","sourceRoot":"","sources":["../src/detectCorruption.ts"],"names":[],"mappings":"AAAA,OAAO,EAAC,gBAAgB,EAAE,UAAU,EAAE,QAAQ,EAAC,MAAM,SAAS,CAAA;AAC9D,OAAO,EAAC,QAAQ,EAAE,IAAI,EAAC,MAAM,WAAW,CAAA;AACxC,OAAO,EAAC,eAAe,EAAC,MAAM,eAAe,CAAA;AAE7C,OAAO,EAAC,YAAY,EAAC,MAAM,WAAW,CAAA;AAEtC,OAAO,SAAS,MAAM,YAAY,CAAA;AAElC,kFAAkF;AAClF,MAAM,gBAAgB,GAAG,QAAQ,CAAA;AAgCjC;;GAEG;AACH,SAAS,QAAQ,CAAC,IAAY,EAAE,UAAkB;IAChD,MAAM,KAAK,GAAG,IAAI,CAAC,OAAO,CAAC,gBAAgB,CAAC,CAAA;IAC5C,IAAI,KAAK,KAAK,CAAC,CAAC;QAAE,OAAO,IAAI,CAAA;IAE7B,6CAA6C;IAC7C,IAAI,KAAK,GAAG,CAAC,CAAA;IACb,KAAK,MAAM,IAAI,IAAI,IAAI,EAAE,CAAC;QACxB,IAAI,IAAI,KAAK,gBAAgB;YAAE,KAAK,EAAE,CAAA;IACxC,CAAC;IAED,wCAAwC;IACxC,MAAM,YAAY,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,KAAK,GAAG,EAAE,CAAC,CAAA;IAC5C,MAAM,UAAU,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,EAAE,KAAK,GAAG,EAAE,CAAC,CAAA;IACpD,MAAM,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,YAAY,EAAE,UAAU,CAAC,CAAA;IAEpD,OAAO;QACL,IAAI,EAAE,UAAU;QAChB,MAAM,EAAE,KAAK,GAAG,CAAC;QACjB,OAAO;QACP,KAAK;KACN,CAAA;AACH,CAAC;AAED;;GAEG;AACH,KAAK,UAAU,UAAU,CAAC,MAAgB;IACxC,MAAM,WAAW,GAAqB,EAAE,CAAA;IACxC,IAAI,UAAU,GAAG,CAAC,CAAA;IAElB,MAAM,EAAE,GAAG,eAAe,CAAC;QACzB,KAAK,EAAE,MAAM;QACb,SAAS,EAAE,QAAQ;KACpB,CAAC,CAAA;IAEF,IAAI,KAAK,EAAE,MAAM,IAAI,IAAI,EAAE,EAAE,CAAC;QAC5B,UAAU,EAAE,CAAA;QACZ,MAAM,UAAU,GAAG,QAAQ,CAAC,IAAI,EAAE,UAAU,CAAC,CAAA;QAC7C,IAAI,UAAU,EAAE,CAAC;YACf,WAAW,CAAC,IAAI,CAAC,UAAU,CAAC,CAAA;QAC9B,CAAC;IACH,CAAC;IAED,OAAO,WAAW,CAAA;AACpB,CAAC;AAED;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,cAAc,CAAC,QAAgB;IACnD,MAAM,MAAM,GAAG,gBAAgB,CAAC,QAAQ,EAAE,EAAC,QAAQ,EAAE,MAAM,EAAC,CAAC,CAAA;IAC7D,MAAM,WAAW,GAAG,MAAM,UAAU,CAAC,MAAM,CAAC,CAAA;IAE5C,MAAM,KAAK,GAAG,IAAI,GAAG,EAA4B,CAAA;IACjD,IAAI,WAAW,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC3B,KAAK,CAAC,GAAG,CAAC,QAAQ,EAAE,WAAW,CAAC,CAAA;IAClC,CAAC;IAED,OAAO;QACL,SAAS,EAAE,WAAW,CAAC,MAAM,GAAG,CAAC;QACjC,KAAK;QACL,mBAAmB,EAAE,WAAW,CAAC,MAAM;QACvC,YAAY,EAAE,CAAC,QAAQ,CAAC;KACzB,CAAA;AACH,CAAC;AAED;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,SAAS,
CAAC,QAAgB;IAC9C,MAAM,OAAO,GAAG,SAAS,CAAC,OAAO,EAAE,CAAA;IAEnC,MAAM,OAAO,GAAG,IAAI,GAAG,EAA4B,CAAA;IACnD,MAAM,YAAY,GAAa,EAAE,CAAA;IACjC,MAAM,WAAW,GAAG,CAAC,aAAa,EAAE,YAAY,CAAC,CAAA;IAEjD,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;QACrC,OAAO,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,IAAI,EAAE,EAAE;YAC3C,MAAM,YAAY,GAAG,QAAQ,CAAC,MAAM,CAAC,IAAI,CAAC,CAAA;YAE1C,IAAI,WAAW,CAAC,QAAQ,CAAC,YAAY,CAAC,EAAE,CAAC;gBACvC,YAAY,CAAC,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,CAAA;gBAC9B,MAAM,MAAM,GAAa,EAAE,CAAA;gBAE3B,MAAM,CAAC,EAAE,CAAC,MAAM,EAAE,CAAC,KAAa,EAAE,EAAE;oBAClC,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAA;gBACpB,CAAC,CAAC,CAAA;gBAEF,MAAM,CAAC,EAAE,CAAC,KAAK,EAAE,GAAG,EAAE;oBACpB,2CAA2C;oBAC3C,MAAM,OAAO,GAAG,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAA;oBACtD,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,OAAO,CAAC,CAAA;oBACpC,MAAM,WAAW,GAAqB,EAAE,CAAA;oBAExC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;wBACtC,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,CAAA;wBACrB,IAAI,IAAI,KAAK,SAAS,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;4BAC1C,MAAM,UAAU,GAAG,QAAQ,CAAC,IAAI,EAAE,CAAC,GAAG,CAAC,CAAC,CAAA;4BACxC,IAAI,UAAU,EAAE,CAAC;gCACf,WAAW,CAAC,IAAI,CAAC,UAAU,CAAC,CAAA;4BAC9B,CAAC;wBACH,CAAC;oBACH,CAAC;oBAED,IAAI,WAAW,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;wBAC3B,OAAO,CAAC,GAAG,CAAC,MAAM,CAAC,IAAI,EAAE,WAAW,CAAC,CAAA;oBACvC,CAAC;oBACD,IAAI,EAAE,CAAA;gBACR,CAAC,CAAC,CAAA;gBAEF,MAAM,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,CAAC,CAAA;YAC5B,CAAC;iBAAM,CAAC;gBACN,kBAAkB;gBAClB,MAAM,CAAC,EAAE,CAAC,KAAK,EAAE,IAAI,CAAC,CAAA;gBACtB,MAAM,CAAC,MAAM,EAAE,CAAA;YACjB,CAAC;QACH,CAAC,CAAC,CAAA;QAEF,OAAO,CAAC,EAAE,CAAC,QAAQ,EAAE,GAAG,EAAE;YACxB,IAAI,mBAAmB,GAAG,CAAC,CAAA;YAC3B,KAAK,MAAM,WAAW,IAAI,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC;gBAC3C,mBAAmB,IAAI,WAAW,CAAC,MAAM,CAAA;YAC3C,CAAC;YAED,OAAO,CAAC;gBACN,SAAS,EAAE,OAAO,CAAC,IAAI,GAAG,CAAC;gBAC3B,KAAK,EAAE,OAAO;gBACd,mBAAmB;gBACnB,YAAY;aACb,CAAC,CAAA;QACJ,CAAC,CAAC,CAAA;QAEF,OAAO,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,CAAC,CAAA;QAE3B,MAAM,MAAM,GAAG,YAAY,EAAE,CAAA;QA
C7B,MAAM,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,CAAC,CAAA;QAE1B,gBAAgB,CAAC,QAAQ,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,CAAA;IACvD,CAAC,CAAC,CAAA;AACJ,CAAC;AAED;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,aAAa,CAAC,OAAe;IACjD,MAAM,WAAW,GAAG,CAAC,aAAa,EAAE,aAAa,CAAC,CAAA;IAClD,MAAM,UAAU,GAAa,EAAE,CAAA;IAE/B,KAAK,MAAM,QAAQ,IAAI,WAAW,EAAE,CAAC;QACnC,MAAM,QAAQ,GAAG,IAAI,CAAC,OAAO,EAAE,QAAQ,CAAC,CAAA;QACxC,IAAI,UAAU,CAAC,QAAQ,CAAC,EAAE,CAAC;YACzB,UAAU,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAA;QAC3B,CAAC;IACH,CAAC;IAED,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC5B,MAAM,IAAI,KAAK,CACb,qDAAqD,OAAO,EAAE,CAC/D,CAAA;IACH,CAAC;IAED,MAAM,OAAO,GAAG,IAAI,GAAG,EAA4B,CAAA;IACnD,MAAM,YAAY,GAAa,EAAE,CAAA;IACjC,IAAI,mBAAmB,GAAG,CAAC,CAAA;IAE3B,KAAK,MAAM,QAAQ,IAAI,UAAU,EAAE,CAAC;QAClC,MAAM,MAAM,GAAG,MAAM,cAAc,CAAC,QAAQ,CAAC,CAAA;QAC7C,YAAY,CAAC,IAAI,CAAC,GAAG,MAAM,CAAC,YAAY,CAAC,CAAA;QACzC,KAAK,MAAM,CAAC,IAAI,EAAE,WAAW,CAAC,IAAI,MAAM,CAAC,KAAK,EAAE,CAAC;YAC/C,OAAO,CAAC,GAAG,CAAC,IAAI,EAAE,WAAW,CAAC,CAAA;YAC9B,mBAAmB,IAAI,WAAW,CAAC,MAAM,CAAA;QAC3C,CAAC;IACH,CAAC;IAED,OAAO;QACL,SAAS,EAAE,OAAO,CAAC,IAAI,GAAG,CAAC;QAC3B,KAAK,EAAE,OAAO;QACd,mBAAmB;QACnB,YAAY;KACb,CAAA;AACH,CAAC;AAED;;;;;;;;;GASG;AACH,MAAM,CAAC,KAAK,UAAU,gBAAgB,CAAC,QAAgB;IACrD,MAAM,IAAI,GAAG,QAAQ,CAAC,QAAQ,CAAC,CAAA;IAE/B,IAAI,IAAI,CAAC,WAAW,EAAE,EAAE,CAAC;QACvB,OAAO,aAAa,CAAC,QAAQ,CAAC,CAAA;IAChC,CAAC;IAED,MAAM,MAAM,GAAG,QAAQ,CAAC,QAAQ,CAAC,SAAS,CAAC,IAAI,QAAQ,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAA;IACxE,OAAO,MAAM,CAAC,CAAC,CAAC,SAAS,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,cAAc,CAAC,QAAQ,CAAC,CAAA;AAChE,CAAC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"streamHelpers.d.ts","sourceRoot":"","sources":["../../src/util/streamHelpers.ts"],"names":[],"mappings":"AAAA,OAAO,EAAC,SAAS,EAAE,KAAK,iBAAiB,EAAE,KAAK,QAAQ,EAAC,MAAM,aAAa,CAAA;
|
|
1
|
+
{"version":3,"file":"streamHelpers.d.ts","sourceRoot":"","sources":["../../src/util/streamHelpers.ts"],"names":[],"mappings":"AAAA,OAAO,EAAC,SAAS,EAAE,KAAK,iBAAiB,EAAE,KAAK,QAAQ,EAAC,MAAM,aAAa,CAAA;AAG5E,KAAK,iBAAiB,GAAG,CACvB,KAAK,EAAE,MAAM,EACb,QAAQ,EAAE,cAAc,EACxB,QAAQ,EAAE,iBAAiB,KACxB,IAAI,CAAA;AAET,KAAK,oBAAoB,CAAC,CAAC,EAAE,CAAC,IAAI,CAChC,KAAK,EAAE,CAAC,EACR,QAAQ,EAAE,cAAc,EACxB,QAAQ,EAAE,iBAAiB,KACxB,CAAC,CAAA;AAEN,wBAAgB,OAAO,CAAC,WAAW,EAAE,iBAAiB,GAAG,SAAS,CAMjE;AAED,wBAAgB,UAAU,CAAC,CAAC,GAAG,OAAO,EAAE,CAAC,GAAG,IAAI,EAC9C,WAAW,EAAE,oBAAoB,CAAC,CAAC,EAAE,CAAC,CAAC,GACtC,SAAS,CAOX;AAED,wBAAgB,gBAAgB,CAAC,GAAG,EAAE,OAAO,GAAG,GAAG,IAAI,QAAQ,CAW9D;AAED,wBAAgB,MAAM,CAAC,MAAM,EAAE,CAAC,MAAM,EAAE,OAAO,EAAE,KAAK,IAAI,GAAG,SAAS,CAiBrE;AAED,wBAAgB,KAAK,CAAC,WAAW,CAAC,EAAE,CAAC,IAAI,EAAE,MAAM,KAAK,OAAO,GAAG,SAAS,CA0DxE"}
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { Transform } from 'node:stream';
|
|
2
|
+
import { StringDecoder } from 'node:string_decoder';
|
|
2
3
|
export function through(transformFn) {
|
|
3
4
|
return new Transform({
|
|
4
5
|
transform(chunk, encoding, callback) {
|
|
@@ -46,10 +47,11 @@ export function concat(onData) {
|
|
|
46
47
|
export function split(transformFn) {
|
|
47
48
|
let buffer = '';
|
|
48
49
|
const splitRegex = /\r?\n/;
|
|
50
|
+
const decoder = new StringDecoder('utf8');
|
|
49
51
|
return new Transform({
|
|
50
52
|
objectMode: Boolean(transformFn),
|
|
51
53
|
transform(chunk, _encoding, callback) {
|
|
52
|
-
buffer +=
|
|
54
|
+
buffer += decoder.write(chunk);
|
|
53
55
|
const lines = buffer.split(splitRegex);
|
|
54
56
|
// Keep the last line in buffer as it might be incomplete
|
|
55
57
|
buffer = lines.pop() ?? '';
|
|
@@ -75,6 +77,8 @@ export function split(transformFn) {
|
|
|
75
77
|
callback();
|
|
76
78
|
},
|
|
77
79
|
flush(callback) {
|
|
80
|
+
// Flush any remaining bytes from the decoder
|
|
81
|
+
buffer += decoder.end();
|
|
78
82
|
if (buffer.length === 0) {
|
|
79
83
|
callback();
|
|
80
84
|
return;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"streamHelpers.js","sourceRoot":"","sources":["../../src/util/streamHelpers.ts"],"names":[],"mappings":"AAAA,OAAO,EAAC,SAAS,EAAwC,MAAM,aAAa,CAAA;
|
|
1
|
+
{"version":3,"file":"streamHelpers.js","sourceRoot":"","sources":["../../src/util/streamHelpers.ts"],"names":[],"mappings":"AAAA,OAAO,EAAC,SAAS,EAAwC,MAAM,aAAa,CAAA;AAC5E,OAAO,EAAC,aAAa,EAAC,MAAM,qBAAqB,CAAA;AAcjD,MAAM,UAAU,OAAO,CAAC,WAA8B;IACpD,OAAO,IAAI,SAAS,CAAC;QACnB,SAAS,CAAC,KAAa,EAAE,QAAwB,EAAE,QAA2B;YAC5E,WAAW,CAAC,KAAK,EAAE,QAAQ,EAAE,QAAQ,CAAC,CAAA;QACxC,CAAC;KACF,CAAC,CAAA;AACJ,CAAC;AAED,MAAM,UAAU,UAAU,CACxB,WAAuC;IAEvC,OAAO,IAAI,SAAS,CAAC;QACnB,UAAU,EAAE,IAAI;QAChB,SAAS,CAAC,KAAQ,EAAE,QAAwB,EAAE,QAA2B;YACvE,WAAW,CAAC,KAAK,EAAE,QAAQ,EAAE,QAAQ,CAAC,CAAA;QACxC,CAAC;KACF,CAAC,CAAA;AACJ,CAAC;AAED,MAAM,UAAU,gBAAgB,CAAC,GAAY;IAC3C,OAAO,CACL,GAAG,KAAK,IAAI;QACZ,OAAO,GAAG,KAAK,QAAQ;QACvB,MAAM,IAAI,GAAG;QACb,OAAO,GAAG,CAAC,IAAI,KAAK,UAAU;QAC9B,QAAQ,IAAI,GAAG;QACf,OAAO,GAAG,CAAC,MAAM,KAAK,UAAU;QAChC,gBAAgB,IAAI,GAAG;QACvB,OAAO,GAAG,CAAC,cAAc,KAAK,QAAQ,CACvC,CAAA;AACH,CAAC;AAED,MAAM,UAAU,MAAM,CAAC,MAAmC;IACxD,MAAM,MAAM,GAAc,EAAE,CAAA;IAC5B,OAAO,IAAI,SAAS,CAAC;QACnB,UAAU,EAAE,IAAI;QAChB,SAAS,CAAC,KAAc,EAAE,SAAyB,EAAE,QAA2B;YAC9E,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAA;YAClB,QAAQ,EAAE,CAAA;QACZ,CAAC;QACD,KAAK,CAAC,QAA2B;YAC/B,IAAI,CAAC;gBACH,MAAM,CAAC,MAAM,CAAC,CAAA;gBACd,QAAQ,EAAE,CAAA;YACZ,CAAC;YAAC,OAAO,GAAG,EAAE,CAAC;gBACb,QAAQ,CAAC,GAAY,CAAC,CAAA;YACxB,CAAC;QACH,CAAC;KACF,CAAC,CAAA;AACJ,CAAC;AAED,MAAM,UAAU,KAAK,CAAC,WAAuC;IAC3D,IAAI,MAAM,GAAG,EAAE,CAAA;IACf,MAAM,UAAU,GAAG,OAAO,CAAA;IAC1B,MAAM,OAAO,GAAG,IAAI,aAAa,CAAC,MAAM,CAAC,CAAA;IAEzC,OAAO,IAAI,SAAS,CAAC;QACnB,UAAU,EAAE,OAAO,CAAC,WAAW,CAAC;QAChC,SAAS,CAAC,KAAa,EAAE,SAAyB,EAAE,QAA2B;YAC7E,MAAM,IAAI,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,CAAA;YAC9B,MAAM,KAAK,GAAG,MAAM,CAAC,KAAK,CAAC,UAAU,CAAC,CAAA;YAEtC,yDAAyD;YACzD,MAAM,GAAG,KAAK,CAAC,GAAG,EAAE,IAAI,EAAE,CAAA;YAE1B,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;gBACzB,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;oBAAE,SAAQ;gBAE/B,IAAI,WAAW,EAAE,CAAC;oBAChB,IAAI,CAAC;wBACH,MAAM,MAAM,GAAG,WAAW,CAAC,IAAI,CAAC,CAAA;wBAChC,IAAI,MAAM,KAAK,SAAS,EAAE,CAAC;4BACzB,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;wBACnB,CAAC;o
BACH,CAAC;oBAAC,OAAO,GAAG,EAAE,CAAC;wBACb,QAAQ,CAAC,GAAY,CAAC,CAAA;wBACtB,OAAM;oBACR,CAAC;gBACH,CAAC;qBAAM,CAAC;oBACN,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;gBACjB,CAAC;YACH,CAAC;YACD,QAAQ,EAAE,CAAA;QACZ,CAAC;QACD,KAAK,CAAC,QAA2B;YAC/B,6CAA6C;YAC7C,MAAM,IAAI,OAAO,CAAC,GAAG,EAAE,CAAA;YAEvB,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBACxB,QAAQ,EAAE,CAAA;gBACV,OAAM;YACR,CAAC;YAED,IAAI,CAAC,WAAW,EAAE,CAAC;gBACjB,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC,CAAA;gBACtB,OAAM;YACR,CAAC;YAED,IAAI,CAAC;gBACH,MAAM,MAAM,GAAG,WAAW,CAAC,MAAM,CAAC,CAAA;gBAClC,IAAI,MAAM,KAAK,SAAS,EAAE,CAAC;oBACzB,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;gBACnB,CAAC;gBACD,QAAQ,EAAE,CAAA;YACZ,CAAC;YAAC,OAAO,GAAG,EAAE,CAAC;gBACb,QAAQ,CAAC,GAAY,CAAC,CAAA;YACxB,CAAC;QACH,CAAC;KACF,CAAC,CAAA;AACJ,CAAC"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@sanity/export",
|
|
3
|
-
"version": "6.0.2",
|
|
3
|
+
"version": "6.0.4",
|
|
4
4
|
"description": "Export Sanity documents and assets",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"sanity",
|
|
@@ -30,7 +30,11 @@
|
|
|
30
30
|
},
|
|
31
31
|
"main": "./dist/index.js",
|
|
32
32
|
"types": "./dist/index.d.ts",
|
|
33
|
+
"bin": {
|
|
34
|
+
"detect-corrupt": "./bin/detect-corrupt.js"
|
|
35
|
+
},
|
|
33
36
|
"files": [
|
|
37
|
+
"bin",
|
|
34
38
|
"dist",
|
|
35
39
|
"src"
|
|
36
40
|
],
|
|
@@ -47,7 +51,8 @@
|
|
|
47
51
|
"debug": "^4.3.4",
|
|
48
52
|
"get-it": "^8.6.10",
|
|
49
53
|
"json-stream-stringify": "^3.1.6",
|
|
50
|
-
"p-queue": "^9.0.1"
|
|
54
|
+
"p-queue": "^9.0.1",
|
|
55
|
+
"tar-stream": "^3.1.7"
|
|
51
56
|
},
|
|
52
57
|
"devDependencies": {
|
|
53
58
|
"@eslint/js": "^9.39.1",
|
|
@@ -56,6 +61,7 @@
|
|
|
56
61
|
"@types/archiver": "^7.0.0",
|
|
57
62
|
"@types/debug": "^4.1.12",
|
|
58
63
|
"@types/node": "^20.19.0",
|
|
64
|
+
"@types/tar-stream": "^3.1.3",
|
|
59
65
|
"@vitest/coverage-v8": "^4.0.15",
|
|
60
66
|
"eslint": "^8.57.0",
|
|
61
67
|
"eslint-config-prettier": "^9.1.0",
|
|
@@ -0,0 +1,258 @@
|
|
|
1
|
+
import {createReadStream, existsSync, statSync} from 'node:fs'
|
|
2
|
+
import {basename, join} from 'node:path'
|
|
3
|
+
import {createInterface} from 'node:readline'
|
|
4
|
+
import type {Readable} from 'node:stream'
|
|
5
|
+
import {createGunzip} from 'node:zlib'
|
|
6
|
+
|
|
7
|
+
import tarStream from 'tar-stream'
|
|
8
|
+
|
|
9
|
+
// U+FFFD replacement character - appears when invalid UTF-8 sequences are decoded
|
|
10
|
+
const REPLACEMENT_CHAR = '\uFFFD'
|
|
11
|
+
|
|
12
|
+
/**
|
|
13
|
+
* Describes the replacement characters found on one line of a scanned file
|
|
14
|
+
* @public
|
|
15
|
+
*/
|
|
16
|
+
export interface CorruptionInfo {
|
|
17
|
+
/** Line number (1-indexed) */
|
|
18
|
+
line: number
|
|
19
|
+
/** 1-indexed position (in UTF-16 code units) of the first replacement char on the line */
|
|
20
|
+
column: number
|
|
21
|
+
/** Excerpt of the line around the first replacement char, for display */
|
|
22
|
+
context: string
|
|
23
|
+
/** Number of replacement chars on this line */
|
|
24
|
+
count: number
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
/**
|
|
28
|
+
* Result of scanning a file, archive or directory for corruption
|
|
29
|
+
* @public
|
|
30
|
+
*/
|
|
31
|
+
export interface ScanResult {
|
|
32
|
+
/** Whether corruption was detected in at least one scanned file */
|
|
33
|
+
corrupted: boolean
|
|
34
|
+
/** Map of filename to corruption info; only files with at least one corrupted line have an entry */
|
|
35
|
+
files: Map<string, CorruptionInfo[]>
|
|
36
|
+
/** Total number of corrupted lines across all files */
|
|
37
|
+
totalCorruptedLines: number
|
|
38
|
+
/** List of files that were scanned, including those without corruption */
|
|
39
|
+
scannedFiles: string[]
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
/**
 * Scans a single line for U+FFFD replacement characters.
 *
 * @param line - Text of the line, without its line terminator
 * @param lineNumber - Line number to report in the result
 * @returns Position, count and surrounding context of the replacement
 *   characters on the line, or `null` when the line contains none
 */
function scanLine(line: string, lineNumber: number): CorruptionInfo | null {
  const index = line.indexOf(REPLACEMENT_CHAR)
  if (index === -1) return null

  // Count total replacement chars on this line
  let count = 0
  for (const char of line) {
    if (char === REPLACEMENT_CHAR) count++
  }

  const isHighSurrogate = (code: number): boolean => code >= 0xd800 && code <= 0xdbff
  const isLowSurrogate = (code: number): boolean => code >= 0xdc00 && code <= 0xdfff

  // Extract context around the corruption: 20 code units before the first
  // replacement char and 30 code units from it onward.
  let contextStart = Math.max(0, index - 20)
  let contextEnd = Math.min(line.length, index + 30)

  // Widen the window by one code unit when a boundary falls inside a
  // surrogate pair. A lone surrogate in the excerpt would itself be rendered
  // as U+FFFD when printed, which looks like corruption that is not in the file.
  if (
    contextStart > 0 &&
    isLowSurrogate(line.charCodeAt(contextStart)) &&
    isHighSurrogate(line.charCodeAt(contextStart - 1))
  ) {
    contextStart--
  }
  if (
    contextEnd < line.length &&
    isHighSurrogate(line.charCodeAt(contextEnd - 1)) &&
    isLowSurrogate(line.charCodeAt(contextEnd))
  ) {
    contextEnd++
  }

  const context = line.slice(contextStart, contextEnd)

  return {
    line: lineNumber,
    // Columns are reported 1-indexed
    column: index + 1,
    context,
    count,
  }
}
|
|
67
|
+
|
|
68
|
+
/**
 * Reads a stream (expected to carry UTF-8 text) line by line and collects
 * the findings for every line that contains a replacement character.
 *
 * @param stream - Readable stream to consume
 * @returns Findings in the order the lines were read; empty when the stream is clean
 */
async function scanStream(stream: Readable): Promise<CorruptionInfo[]> {
  const lineReader = createInterface({input: stream, crlfDelay: Infinity})
  const found: CorruptionInfo[] = []
  let currentLine = 0

  for await (const text of lineReader) {
    currentLine += 1
    const info = scanLine(text, currentLine)
    if (!info) continue
    found.push(info)
  }

  return found
}
|
|
90
|
+
|
|
91
|
+
/**
 * Scans one NDJSON file on disk for UTF-8 corruption.
 *
 * The file is read as UTF-8 and checked line by line. The returned `files`
 * map contains an entry for the file only when at least one line is affected.
 *
 * @param filePath - Path to the ndjson file
 * @returns Scan result with corruption information
 * @public
 */
export async function scanNdjsonFile(filePath: string): Promise<ScanResult> {
  const corruptions = await scanStream(createReadStream(filePath, {encoding: 'utf8'}))
  const hasCorruption = corruptions.length > 0

  const result: ScanResult = {
    corrupted: hasCorruption,
    files: new Map<string, CorruptionInfo[]>(),
    totalCorruptedLines: corruptions.length,
    scannedFiles: [filePath],
  }

  if (hasCorruption) {
    result.files.set(filePath, corruptions)
  }

  return result
}
|
|
114
|
+
|
|
115
|
+
/**
 * Scans a tar.gz archive for UTF-8 corruption in data.ndjson and assets.json files
 *
 * Archive entries are matched by basename, so the target files are picked up
 * regardless of the directory they sit in inside the archive. All other
 * entries are drained and ignored.
 *
 * @param filePath - Path to the tar.gz file
 * @returns Scan result with corruption information. `scannedFiles` and the
 *   keys of `files` are the entry names as stored in the archive.
 * @throws Rejects if the archive cannot be read, gunzipped or extracted
 * @public
 */
export async function scanTarGz(filePath: string): Promise<ScanResult> {
  const extract = tarStream.extract()

  const results = new Map<string, CorruptionInfo[]>()
  const scannedFiles: string[] = []
  // Must match the file names scanDirectory looks for
  const targetFiles = ['data.ndjson', 'assets.json']

  return new Promise((resolve, reject) => {
    const source = createReadStream(filePath)
    const gunzip = createGunzip()

    // Tear down the whole pipeline on failure so the file descriptor and the
    // zlib handle are released instead of lingering after the rejection.
    const fail = (err: Error): void => {
      source.destroy()
      gunzip.destroy()
      extract.destroy()
      reject(err)
    }

    extract.on('entry', (header, stream, next) => {
      const fileBasename = basename(header.name)

      if (targetFiles.includes(fileBasename)) {
        scannedFiles.push(header.name)
        const chunks: Buffer[] = []

        stream.on('data', (chunk: Buffer) => {
          chunks.push(chunk)
        })

        stream.on('end', () => {
          // Decode once over the full content so that a multi-byte character
          // spanning two chunks is not decoded in halves
          const content = Buffer.concat(chunks).toString('utf8')
          const lines = content.split(/\r?\n/)
          const corruptions: CorruptionInfo[] = []

          for (const [index, line] of lines.entries()) {
            // Empty lines are not scanned but still count towards line numbers
            if (line.length > 0) {
              const corruption = scanLine(line, index + 1)
              if (corruption) {
                corruptions.push(corruption)
              }
            }
          }

          if (corruptions.length > 0) {
            results.set(header.name, corruptions)
          }
          next()
        })

        stream.on('error', fail)
      } else {
        // Skip this entry
        stream.on('end', next)
        stream.resume()
      }
    })

    extract.on('finish', () => {
      let totalCorruptedLines = 0
      for (const corruptions of results.values()) {
        totalCorruptedLines += corruptions.length
      }

      resolve({
        corrupted: results.size > 0,
        files: results,
        totalCorruptedLines,
        scannedFiles,
      })
    })

    extract.on('error', fail)
    gunzip.on('error', fail)
    // pipe() does not forward errors, so the source needs its own listener;
    // otherwise an unreadable path raises an unhandled 'error' event
    source.on('error', fail)

    source.pipe(gunzip).pipe(extract)
  })
}
|
|
193
|
+
|
|
194
|
+
/**
 * Scans an export directory for UTF-8 corruption
 *
 * Looks for `data.ndjson` and `assets.json` directly inside the directory,
 * scans each one that exists with `scanNdjsonFile`, and merges the results.
 *
 * @param dirPath - Path to the directory
 * @returns Scan result with corruption information
 * @throws Error when neither of the two files exists in the directory
 * @public
 */
export async function scanDirectory(dirPath: string): Promise<ScanResult> {
  const present = ['data.ndjson', 'assets.json']
    .map((name) => join(dirPath, name))
    .filter((candidate) => existsSync(candidate))

  if (present.length === 0) {
    throw new Error(
      `No data.ndjson or assets.json found in directory: ${dirPath}`,
    )
  }

  const merged = new Map<string, CorruptionInfo[]>()
  const scanned: string[] = []
  let flaggedLines = 0

  // Files are scanned one after another, in the order listed above
  for (const target of present) {
    const partial = await scanNdjsonFile(target)
    scanned.push(...partial.scannedFiles)

    for (const [name, found] of partial.files) {
      merged.set(name, found)
      flaggedLines += found.length
    }
  }

  return {
    corrupted: merged.size > 0,
    files: merged,
    totalCorruptedLines: flaggedLines,
    scannedFiles: scanned,
  }
}
|
|
238
|
+
|
|
239
|
+
/**
 * Detects UTF-8 corruption in an export file (ndjson, tar.gz, or directory)
 *
 * The corruption manifests as U+FFFD replacement characters appearing
 * where valid multi-byte characters (CJK, emoji, etc.) should be.
 *
 * Dispatch rules: a directory goes to `scanDirectory`, a path ending in
 * `.tar.gz` or `.tgz` goes to `scanTarGz`, and anything else is scanned as
 * an NDJSON file.
 *
 * @param filePath - Path to the file or directory to scan
 * @returns Scan result with corruption information
 * @public
 */
export async function detectCorruption(filePath: string): Promise<ScanResult> {
  if (statSync(filePath).isDirectory()) {
    return scanDirectory(filePath)
  }

  const archiveSuffixes = ['.tar.gz', '.tgz']
  if (archiveSuffixes.some((suffix) => filePath.endsWith(suffix))) {
    return scanTarGz(filePath)
  }

  return scanNdjsonFile(filePath)
}
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import {Transform, type TransformCallback, type Writable} from 'node:stream'
|
|
2
|
+
import {StringDecoder} from 'node:string_decoder'
|
|
2
3
|
|
|
3
4
|
type TransformFunction = (
|
|
4
5
|
chunk: Buffer,
|
|
@@ -66,11 +67,12 @@ export function concat(onData: (chunks: unknown[]) => void): Transform {
|
|
|
66
67
|
export function split(transformFn?: (line: string) => unknown): Transform {
|
|
67
68
|
let buffer = ''
|
|
68
69
|
const splitRegex = /\r?\n/
|
|
70
|
+
const decoder = new StringDecoder('utf8')
|
|
69
71
|
|
|
70
72
|
return new Transform({
|
|
71
73
|
objectMode: Boolean(transformFn),
|
|
72
74
|
transform(chunk: Buffer, _encoding: BufferEncoding, callback: TransformCallback) {
|
|
73
|
-
buffer +=
|
|
75
|
+
buffer += decoder.write(chunk)
|
|
74
76
|
const lines = buffer.split(splitRegex)
|
|
75
77
|
|
|
76
78
|
// Keep the last line in buffer as it might be incomplete
|
|
@@ -96,6 +98,9 @@ export function split(transformFn?: (line: string) => unknown): Transform {
|
|
|
96
98
|
callback()
|
|
97
99
|
},
|
|
98
100
|
flush(callback: TransformCallback) {
|
|
101
|
+
// Flush any remaining bytes from the decoder
|
|
102
|
+
buffer += decoder.end()
|
|
103
|
+
|
|
99
104
|
if (buffer.length === 0) {
|
|
100
105
|
callback()
|
|
101
106
|
return
|