@ebowwa/jsonl-hft 1.0.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +220 -37
- package/dist/index.d.ts +4 -1
- package/native/libjsonl_hft.dylib +0 -0
- package/package.json +15 -5
package/README.md
CHANGED
|
@@ -1,16 +1,29 @@
|
|
|
1
1
|
# @ebowwa/jsonl-hft
|
|
2
2
|
|
|
3
|
-
Generic HFT-grade JSONL parser with sub-10µs latency.
|
|
3
|
+
Generic HFT-grade JSONL parser with sub-microsecond latency.
|
|
4
4
|
|
|
5
5
|
**NO HARDCODED FIELDS** - Consumer defines what fields to extract.
|
|
6
6
|
|
|
7
|
+
## Features
|
|
8
|
+
|
|
9
|
+
- **Parallel Parsing**: Multi-core processing with rayon
|
|
10
|
+
- **GZIP Support**: Read and write compressed files
|
|
11
|
+
- **Schema Validation**: Validate JSONL against type schemas
|
|
12
|
+
- **Type Inference**: Automatic field type detection
|
|
13
|
+
- **Error Recovery**: Continue parsing on malformed lines
|
|
14
|
+
- **Statistics**: Real-time parsing metrics
|
|
15
|
+
- **Memory-Mapped I/O**: Efficient large file handling
|
|
16
|
+
- **SIMD Optimized**: memchr-accelerated byte search (~19 GiB/s)
|
|
17
|
+
|
|
7
18
|
## Performance
|
|
8
19
|
|
|
9
|
-
|
|
|
10
|
-
|
|
11
|
-
|
|
|
12
|
-
|
|
|
13
|
-
|
|
|
20
|
+
| Benchmark | Time | Throughput |
|
|
21
|
+
|-----------|------|------------|
|
|
22
|
+
| parse_line | 232 ns | ~606 MiB/s |
|
|
23
|
+
| find_field (first) | 15 ns | ~5 GiB/s |
|
|
24
|
+
| find_field (last) | 58 ns | ~1.3 GiB/s |
|
|
25
|
+
| pool_parser (1000 lines) | 113 µs | ~435 MiB/s |
|
|
26
|
+
| memchr_search | 3.9 ns | ~19.7 GiB/s |
|
|
14
27
|
|
|
15
28
|
## Installation
|
|
16
29
|
|
|
@@ -20,6 +33,8 @@ bun add @ebowwa/jsonl-hft
|
|
|
20
33
|
|
|
21
34
|
## Usage
|
|
22
35
|
|
|
36
|
+
### Basic Parsing
|
|
37
|
+
|
|
23
38
|
```typescript
|
|
24
39
|
import { parseDir, parseFile, parseBuffer, getVersion } from "@ebowwa/jsonl-hft";
|
|
25
40
|
|
|
@@ -28,7 +43,6 @@ const fields = ["session_id", "timestamp", "role", "message.content"];
|
|
|
28
43
|
|
|
29
44
|
// Parse a directory (recursive, parallel)
|
|
30
45
|
const entries = parseDir("/path/to/jsonl/files", fields);
|
|
31
|
-
// entries: Array<{ session_id: string; timestamp: string; role: string; "message.content": string }>
|
|
32
46
|
|
|
33
47
|
// Parse a single file
|
|
34
48
|
const fileEntries = parseFile("/path/to/file.jsonl", fields);
|
|
@@ -37,54 +51,219 @@ const fileEntries = parseFile("/path/to/file.jsonl", fields);
|
|
|
37
51
|
const bufferEntries = parseBuffer(jsonlBuffer, fields);
|
|
38
52
|
|
|
39
53
|
// Get version
|
|
40
|
-
console.log(getVersion()); // "1.0.0"
|
|
54
|
+
console.log(getVersion()); // "1.2.0"
|
|
41
55
|
```
|
|
42
56
|
|
|
43
|
-
|
|
57
|
+
### GZIP Support
|
|
44
58
|
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
- Nested (dot notation): `"message.content"`, `"metadata.user.id"`
|
|
59
|
+
```typescript
|
|
60
|
+
import { isGzip, parseGzipFile, writeGzip } from "@ebowwa/jsonl-hft";
|
|
48
61
|
|
|
49
|
-
|
|
62
|
+
// Check if file is gzip compressed
|
|
63
|
+
if (isGzip("/path/to/file.jsonl.gz")) {
|
|
64
|
+
// Parse compressed file
|
|
65
|
+
const entries = parseGzipFile("/path/to/file.jsonl.gz", fields);
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
// Write to gzip file
|
|
69
|
+
const result = writeGzip("/path/to/output.jsonl.gz", jsonlData, 9);
|
|
70
|
+
console.log(`Compression ratio: ${result.compressionRatio}`);
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
### Parallel Parsing
|
|
74
|
+
|
|
75
|
+
```typescript
|
|
76
|
+
import { parseFileParallel, parseFilesParallel, parseDirParallel } from "@ebowwa/jsonl-hft";
|
|
77
|
+
|
|
78
|
+
// Parse single file with parallel chunks
|
|
79
|
+
const result = parseFileParallel("/path/to/large.jsonl", fields, 0); // 0 = auto chunk size
|
|
80
|
+
console.log(`Parsed ${result.linesProcessed} lines in ${result.parseTimeNs}ns`);
|
|
81
|
+
|
|
82
|
+
// Parse multiple files in parallel
|
|
83
|
+
const files = ["/path/a.jsonl", "/path/b.jsonl", "/path/c.jsonl"];
|
|
84
|
+
const multiResult = parseFilesParallel(files, fields);
|
|
85
|
+
|
|
86
|
+
// Parse directory in parallel
|
|
87
|
+
const dirResult = parseDirParallel("/path/to/jsonl/dir", fields);
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
### Schema Validation
|
|
91
|
+
|
|
92
|
+
```typescript
|
|
93
|
+
import { validateFile, FieldType } from "@ebowwa/jsonl-hft";
|
|
94
|
+
|
|
95
|
+
const schema = [
|
|
96
|
+
{ name: "session_id", expectedType: FieldType.String, required: true },
|
|
97
|
+
{ name: "timestamp", expectedType: FieldType.String, required: true },
|
|
98
|
+
{ name: "value", expectedType: FieldType.Number, required: false },
|
|
99
|
+
];
|
|
100
|
+
|
|
101
|
+
const validation = validateFile("/path/to/data.jsonl", schema);
|
|
102
|
+
if (!validation.isValid) {
|
|
103
|
+
console.log(`Found ${validation.errorCount} errors`);
|
|
104
|
+
validation.errors.forEach(err => {
|
|
105
|
+
console.log(`Line ${err.lineNumber}: ${err.errorMessage}`);
|
|
106
|
+
});
|
|
107
|
+
}
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### Type Inference
|
|
111
|
+
|
|
112
|
+
```typescript
|
|
113
|
+
import { inferFieldTypes, FieldType } from "@ebowwa/jsonl-hft";
|
|
114
|
+
|
|
115
|
+
const line = Buffer.from('{"id":"123","count":42,"active":true}');
|
|
116
|
+
const fields = ["id", "count", "active"];
|
|
117
|
+
const types = inferFieldTypes(line, fields);
|
|
118
|
+
|
|
119
|
+
// types[0] = FieldType.String
|
|
120
|
+
// types[1] = FieldType.Number
|
|
121
|
+
// types[2] = FieldType.Boolean
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
### Statistics
|
|
125
|
+
|
|
126
|
+
```typescript
|
|
127
|
+
import { getStats, resetStats, parseFileWithStats } from "@ebowwa/jsonl-hft";
|
|
128
|
+
|
|
129
|
+
// Parse with stats collection
|
|
130
|
+
const result = parseFileWithStats("/path/to/file.jsonl", fields);
|
|
131
|
+
|
|
132
|
+
// Get global stats
|
|
133
|
+
const stats = getStats();
|
|
134
|
+
console.log(`Throughput: ${stats.throughputMiBs} MiB/s`);
|
|
135
|
+
console.log(`Avg latency: ${stats.avgLatencyNs} ns`);
|
|
136
|
+
|
|
137
|
+
// Reset stats
|
|
138
|
+
resetStats();
|
|
139
|
+
```
|
|
50
140
|
|
|
51
|
-
|
|
141
|
+
### Error Recovery
|
|
52
142
|
|
|
53
|
-
|
|
54
|
-
|
|
143
|
+
```typescript
|
|
144
|
+
import { parseFileWithRecovery } from "@ebowwa/jsonl-hft";
|
|
145
|
+
|
|
146
|
+
const result = parseFileWithRecovery("/path/to/dirty.jsonl", fields);
|
|
147
|
+
console.log(`Successful: ${result.stats.successfulLines}`);
|
|
148
|
+
console.log(`Failed: ${result.stats.failedLines}`);
|
|
55
149
|
|
|
56
|
-
|
|
57
|
-
|
|
150
|
+
// Access errors
|
|
151
|
+
result.errors.forEach(err => {
|
|
152
|
+
console.log(`Line ${err.lineNumber}: ${err.errorType}`);
|
|
153
|
+
});
|
|
154
|
+
```
|
|
58
155
|
|
|
59
|
-
###
|
|
60
|
-
Parse JSONL data from a buffer or string.
|
|
156
|
+
### Batch Parsing
|
|
61
157
|
|
|
62
|
-
|
|
63
|
-
|
|
158
|
+
```typescript
|
|
159
|
+
import { parseBatch, freeBatch } from "@ebowwa/jsonl-hft";
|
|
64
160
|
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
161
|
+
const buffers = [buf1, buf2, buf3];
|
|
162
|
+
const lengths = [buf1.length, buf2.length, buf3.length];
|
|
163
|
+
const result = parseBatch(buffers, lengths, fields);
|
|
164
|
+
console.log(`Parsed ${result.count} entries`);
|
|
69
165
|
|
|
70
|
-
|
|
166
|
+
// Cleanup
|
|
167
|
+
freeBatch(result);
|
|
168
|
+
```
|
|
71
169
|
|
|
72
|
-
|
|
170
|
+
### Streaming API
|
|
73
171
|
|
|
74
172
|
```typescript
|
|
75
|
-
import {
|
|
173
|
+
import { parseStream } from "@ebowwa/jsonl-hft";
|
|
174
|
+
|
|
175
|
+
// Process each line with a callback
|
|
176
|
+
parseStream("/path/to/large.jsonl", fields, (entry, lineNumber) => {
|
|
177
|
+
// Process entry
|
|
178
|
+
console.log(`Line ${lineNumber}:`, entry);
|
|
179
|
+
return true; // Continue processing
|
|
180
|
+
});
|
|
181
|
+
```
|
|
76
182
|
|
|
77
|
-
|
|
78
|
-
initRingBuffer(1024 * 1024);
|
|
183
|
+
## Field Specification
|
|
79
184
|
|
|
80
|
-
|
|
81
|
-
|
|
185
|
+
Fields can be:
|
|
186
|
+
- **Simple**: `"session_id"`, `"timestamp"`, `"role"`
|
|
187
|
+
- **Nested (dot notation)**: `"message.content"`, `"metadata.user.id"`
|
|
82
188
|
|
|
83
|
-
|
|
84
|
-
const pending = ringPending();
|
|
189
|
+
The parser extracts only the fields you request - no wasted parsing.
|
|
85
190
|
|
|
86
|
-
|
|
87
|
-
|
|
191
|
+
## API Reference
|
|
192
|
+
|
|
193
|
+
### Parsing Functions
|
|
194
|
+
| Function | Description |
|
|
195
|
+
|----------|-------------|
|
|
196
|
+
| `parseDir` | Parse all JSONL files in directory recursively |
|
|
197
|
+
| `parseFile` | Parse single file with memory-mapped I/O |
|
|
198
|
+
| `parseBuffer` | Parse from buffer/string |
|
|
199
|
+
| `parseFileParallel` | Parallel file parsing with chunks |
|
|
200
|
+
| `parseFilesParallel` | Parse multiple files in parallel |
|
|
201
|
+
| `parseDirParallel` | Parallel directory parsing |
|
|
202
|
+
| `parseGzipFile` | Parse gzip-compressed file |
|
|
203
|
+
| `parseFileWithStats` | Parse with statistics collection |
|
|
204
|
+
| `parseFileWithRecovery` | Parse with error recovery |
|
|
205
|
+
| `parseBatch` | Parse multiple buffers |
|
|
206
|
+
| `parseStream` | Streaming callback-based parsing |
|
|
207
|
+
|
|
208
|
+
### Validation Functions
|
|
209
|
+
| Function | Description |
|
|
210
|
+
|----------|-------------|
|
|
211
|
+
| `validateFile` | Validate JSONL against schema |
|
|
212
|
+
| `inferFieldTypes` | Infer types for fields |
|
|
213
|
+
|
|
214
|
+
### Output Functions
|
|
215
|
+
| Function | Description |
|
|
216
|
+
|----------|-------------|
|
|
217
|
+
| `writeGzip` | Write to gzip-compressed file |
|
|
218
|
+
| `writeFile` | Write to uncompressed file |
|
|
219
|
+
|
|
220
|
+
### Utility Functions
|
|
221
|
+
| Function | Description |
|
|
222
|
+
|----------|-------------|
|
|
223
|
+
| `getVersion` | Get library version |
|
|
224
|
+
| `isGzip` | Check if file is gzip compressed |
|
|
225
|
+
| `getStats` | Get global parsing statistics |
|
|
226
|
+
| `resetStats` | Reset statistics counters |
|
|
227
|
+
|
|
228
|
+
### Types
|
|
229
|
+
|
|
230
|
+
```typescript
|
|
231
|
+
enum FieldType {
|
|
232
|
+
Unknown = 0,
|
|
233
|
+
String = 1,
|
|
234
|
+
Number = 2,
|
|
235
|
+
Boolean = 3,
|
|
236
|
+
Null = 4,
|
|
237
|
+
Array = 5,
|
|
238
|
+
Object = 6,
|
|
239
|
+
}
|
|
240
|
+
|
|
241
|
+
interface StatsResult {
|
|
242
|
+
totalLines: bigint;
|
|
243
|
+
successfulLines: bigint;
|
|
244
|
+
failedLines: bigint;
|
|
245
|
+
totalBytes: bigint;
|
|
246
|
+
parseTimeNs: bigint;
|
|
247
|
+
avgLatencyNs: bigint;
|
|
248
|
+
minLatencyNs: bigint;
|
|
249
|
+
maxLatencyNs: bigint;
|
|
250
|
+
throughputMiBs: number;
|
|
251
|
+
}
|
|
252
|
+
|
|
253
|
+
interface ParallelResult {
|
|
254
|
+
entries: GenericEntry[];
|
|
255
|
+
count: number;
|
|
256
|
+
linesProcessed: bigint;
|
|
257
|
+
parseTimeNs: bigint;
|
|
258
|
+
}
|
|
259
|
+
|
|
260
|
+
interface WriteResult {
|
|
261
|
+
success: boolean;
|
|
262
|
+
bytesWritten: bigint;
|
|
263
|
+
compressedBytes: bigint;
|
|
264
|
+
compressionRatio: number;
|
|
265
|
+
errorMessage?: string;
|
|
266
|
+
}
|
|
88
267
|
```
|
|
89
268
|
|
|
90
269
|
## Build
|
|
@@ -107,6 +286,10 @@ bun run build
|
|
|
107
286
|
│ │ • Parallel processing (rayon) │ │
|
|
108
287
|
│ │ • Zero allocation hot path │ │
|
|
109
288
|
│ │ • SIMD-friendly byte scanning (memchr) │ │
|
|
289
|
+
│ │ • GZIP compression (flate2/zlib-ng) │ │
|
|
290
|
+
│ │ • Schema validation │ │
|
|
291
|
+
│ │ • Type inference │ │
|
|
292
|
+
│ │ • Statistics collection │ │
|
|
110
293
|
│ └─────────────────────────────────────────────────┘ │
|
|
111
294
|
└─────────────────────────────────────────────────────────┘
|
|
112
295
|
```
|
package/dist/index.d.ts
CHANGED
|
@@ -29,8 +29,11 @@
|
|
|
29
29
|
export type FieldSpec = string;
|
|
30
30
|
/**
|
|
31
31
|
* Generic entry - a record of field name to string value
|
|
32
|
+
* When using parseDir, includes source_file metadata
|
|
32
33
|
*/
|
|
33
|
-
export type GenericEntry = Record<string, string>;
|
|
34
|
+
export type GenericEntry = Record<string, string> & {
|
|
35
|
+
source_file?: string;
|
|
36
|
+
};
|
|
34
37
|
/**
|
|
35
38
|
* Parse result with metadata
|
|
36
39
|
*/
|
|
Binary file
|
package/package.json
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ebowwa/jsonl-hft",
|
|
3
|
-
"version": "1.0.0",
|
|
3
|
+
"version": "1.3.0",
|
|
4
4
|
"author": "ebowwa",
|
|
5
|
-
"description": "Generic HFT-grade JSONL parser - NO hardcoded fields, consumer defines what to extract. Sub-10µs latency.",
|
|
5
|
+
"description": "Generic HFT-grade JSONL parser - NO hardcoded fields, consumer defines what to extract. Sub-10µs latency with memchr SIMD optimization. Parallel parsing, gzip support, schema validation.",
|
|
6
6
|
"main": "dist/index.js",
|
|
7
7
|
"types": "dist/index.d.ts",
|
|
8
8
|
"exports": {
|
|
@@ -15,7 +15,7 @@
|
|
|
15
15
|
"build": "CARGO_TARGET_DIR=target cargo build --release && bun build ./src/index.ts --outdir ./dist --target=bun && tsc --emitDeclarationOnly --declaration --outDir ./dist",
|
|
16
16
|
"build:rust": "CARGO_TARGET_DIR=target cargo build --release",
|
|
17
17
|
"test": "bun test",
|
|
18
|
-
"bench": "
|
|
18
|
+
"bench": "cargo bench"
|
|
19
19
|
},
|
|
20
20
|
"keywords": [
|
|
21
21
|
"jsonl",
|
|
@@ -24,7 +24,13 @@
|
|
|
24
24
|
"hft",
|
|
25
25
|
"low-latency",
|
|
26
26
|
"configurable-fields",
|
|
27
|
-
"zero-copy"
|
|
27
|
+
"zero-copy",
|
|
28
|
+
"simd",
|
|
29
|
+
"memchr",
|
|
30
|
+
"parallel",
|
|
31
|
+
"gzip",
|
|
32
|
+
"schema-validation",
|
|
33
|
+
"type-inference"
|
|
28
34
|
],
|
|
29
35
|
"license": "MIT",
|
|
30
36
|
"files": [
|
|
@@ -40,7 +46,11 @@
|
|
|
40
46
|
"responsibilities": [
|
|
41
47
|
"generic-jsonl-parsing",
|
|
42
48
|
"high-performance-parsing",
|
|
43
|
-
"field-extraction"
|
|
49
|
+
"field-extraction",
|
|
50
|
+
"simd-optimization",
|
|
51
|
+
"parallel-processing",
|
|
52
|
+
"gzip-compression",
|
|
53
|
+
"schema-validation"
|
|
44
54
|
]
|
|
45
55
|
}
|
|
46
56
|
}
|