@ebowwa/jsonl-hft 1.0.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,16 +1,29 @@
1
1
  # @ebowwa/jsonl-hft
2
2
 
3
- Generic HFT-grade JSONL parser with sub-10us latency.
3
+ Generic HFT-grade JSONL parser with sub-microsecond latency.
4
4
 
5
5
  **NO HARDCODED FIELDS** - Consumer defines what fields to extract.
6
6
 
7
+ ## Features
8
+
9
+ - **Parallel Parsing**: Multi-core processing with rayon
10
+ - **GZIP Support**: Read and write compressed files
11
+ - **Schema Validation**: Validate JSONL against type schemas
12
+ - **Type Inference**: Automatic field type detection
13
+ - **Error Recovery**: Continue parsing on malformed lines
14
+ - **Statistics**: Real-time parsing metrics
15
+ - **Memory-Mapped I/O**: Efficient large file handling
16
+ - **SIMD Optimized**: memchr-accelerated byte search (~19 GiB/s)
17
+
7
18
  ## Performance
8
19
 
9
- | Metric | Value |
10
- |--------|-------|
11
- | Latency per entry | 6.21us |
12
- | Throughput | ~161K entries/sec |
13
- | vs old npm package | 1.13x faster |
20
+ | Benchmark | Time | Throughput |
21
+ |-----------|------|------------|
22
+ | parse_line | 232 ns | ~606 MiB/s |
23
+ | find_field (first) | 15 ns | ~5 GiB/s |
24
+ | find_field (last) | 58 ns | ~1.3 GiB/s |
25
+ | pool_parser (1000 lines) | 113 µs | ~435 MiB/s |
26
+ | memchr_search | 3.9 ns | ~19.7 GiB/s |
14
27
 
15
28
  ## Installation
16
29
 
@@ -20,6 +33,8 @@ bun add @ebowwa/jsonl-hft
20
33
 
21
34
  ## Usage
22
35
 
36
+ ### Basic Parsing
37
+
23
38
  ```typescript
24
39
  import { parseDir, parseFile, parseBuffer, getVersion } from "@ebowwa/jsonl-hft";
25
40
 
@@ -28,7 +43,6 @@ const fields = ["session_id", "timestamp", "role", "message.content"];
28
43
 
29
44
  // Parse a directory (recursive, parallel)
30
45
  const entries = parseDir("/path/to/jsonl/files", fields);
31
- // entries: Array<{ session_id: string; timestamp: string; role: string; "message.content": string }>
32
46
 
33
47
  // Parse a single file
34
48
  const fileEntries = parseFile("/path/to/file.jsonl", fields);
@@ -37,54 +51,219 @@ const fileEntries = parseFile("/path/to/file.jsonl", fields);
37
51
  const bufferEntries = parseBuffer(jsonlBuffer, fields);
38
52
 
39
53
  // Get version
40
- console.log(getVersion()); // "1.0.0"
54
+ console.log(getVersion()); // "1.3.0"
41
55
  ```
42
56
 
43
- ## Field Specification
57
+ ### GZIP Support
44
58
 
45
- Fields can be:
46
- - Simple: `"session_id"`, `"timestamp"`, `"role"`
47
- - Nested (dot notation): `"message.content"`, `"metadata.user.id"`
59
+ ```typescript
60
+ import { isGzip, parseGzipFile, writeGzip } from "@ebowwa/jsonl-hft";
48
61
 
49
- The parser extracts only the fields you request - no wasted parsing.
62
+ // Check if file is gzip compressed
63
+ if (isGzip("/path/to/file.jsonl.gz")) {
64
+ // Parse compressed file
65
+ const entries = parseGzipFile("/path/to/file.jsonl.gz", fields);
66
+ }
67
+
68
+ // Write to gzip file
69
+ const result = writeGzip("/path/to/output.jsonl.gz", jsonlData, 9);
70
+ console.log(`Compression ratio: ${result.compressionRatio}`);
71
+ ```
72
+
73
+ ### Parallel Parsing
74
+
75
+ ```typescript
76
+ import { parseFileParallel, parseFilesParallel, parseDirParallel } from "@ebowwa/jsonl-hft";
77
+
78
+ // Parse single file with parallel chunks
79
+ const result = parseFileParallel("/path/to/large.jsonl", fields, 0); // 0 = auto chunk size
80
+ console.log(`Parsed ${result.linesProcessed} lines in ${result.parseTimeNs}ns`);
81
+
82
+ // Parse multiple files in parallel
83
+ const files = ["/path/a.jsonl", "/path/b.jsonl", "/path/c.jsonl"];
84
+ const multiResult = parseFilesParallel(files, fields);
85
+
86
+ // Parse directory in parallel
87
+ const dirResult = parseDirParallel("/path/to/jsonl/dir", fields);
88
+ ```
89
+
90
+ ### Schema Validation
91
+
92
+ ```typescript
93
+ import { validateFile, FieldType } from "@ebowwa/jsonl-hft";
94
+
95
+ const schema = [
96
+ { name: "session_id", expectedType: FieldType.String, required: true },
97
+ { name: "timestamp", expectedType: FieldType.String, required: true },
98
+ { name: "value", expectedType: FieldType.Number, required: false },
99
+ ];
100
+
101
+ const validation = validateFile("/path/to/data.jsonl", schema);
102
+ if (!validation.isValid) {
103
+ console.log(`Found ${validation.errorCount} errors`);
104
+ validation.errors.forEach(err => {
105
+ console.log(`Line ${err.lineNumber}: ${err.errorMessage}`);
106
+ });
107
+ }
108
+ ```
109
+
110
+ ### Type Inference
111
+
112
+ ```typescript
113
+ import { inferFieldTypes, FieldType } from "@ebowwa/jsonl-hft";
114
+
115
+ const line = Buffer.from('{"id":"123","count":42,"active":true}');
116
+ const fields = ["id", "count", "active"];
117
+ const types = inferFieldTypes(line, fields);
118
+
119
+ // types[0] = FieldType.String
120
+ // types[1] = FieldType.Number
121
+ // types[2] = FieldType.Boolean
122
+ ```
123
+
124
+ ### Statistics
125
+
126
+ ```typescript
127
+ import { getStats, resetStats, parseFileWithStats } from "@ebowwa/jsonl-hft";
128
+
129
+ // Parse with stats collection
130
+ const result = parseFileWithStats("/path/to/file.jsonl", fields);
131
+
132
+ // Get global stats
133
+ const stats = getStats();
134
+ console.log(`Throughput: ${stats.throughputMiBs} MiB/s`);
135
+ console.log(`Avg latency: ${stats.avgLatencyNs} ns`);
136
+
137
+ // Reset stats
138
+ resetStats();
139
+ ```
50
140
 
51
- ## API
141
+ ### Error Recovery
52
142
 
53
- ### `parseDir<T>(dirPath: string, fields: FieldSpec[]): T[]`
54
- Parse all JSONL files in a directory recursively. Files are parsed in parallel using rayon.
143
+ ```typescript
144
+ import { parseFileWithRecovery } from "@ebowwa/jsonl-hft";
145
+
146
+ const result = parseFileWithRecovery("/path/to/dirty.jsonl", fields);
147
+ console.log(`Successful: ${result.stats.successfulLines}`);
148
+ console.log(`Failed: ${result.stats.failedLines}`);
55
149
 
56
- ### `parseFile<T>(filePath: string, fields: FieldSpec[]): T[]`
57
- Parse a single JSONL file using memory-mapped I/O.
150
+ // Access errors
151
+ result.errors.forEach(err => {
152
+ console.log(`Line ${err.lineNumber}: ${err.errorType}`);
153
+ });
154
+ ```
58
155
 
59
- ### `parseBuffer<T>(input: Buffer | Uint8Array | string, fields: FieldSpec[]): T[]`
60
- Parse JSONL data from a buffer or string.
156
+ ### Batch Parsing
61
157
 
62
- ### `getVersion(): string`
63
- Get the library version.
158
+ ```typescript
159
+ import { parseBatch, freeBatch } from "@ebowwa/jsonl-hft";
64
160
 
65
- ### Preset Field Arrays
66
- - `CLAUDE_CODE_FIELDS` - For Claude Code history format
67
- - `TRADE_FIELDS` - For trading/market data
68
- - `LOG_FIELDS` - For log file parsing
161
+ const buffers = [buf1, buf2, buf3];
162
+ const lengths = [buf1.length, buf2.length, buf3.length];
163
+ const result = parseBatch(buffers, lengths, fields);
164
+ console.log(`Parsed ${result.count} entries`);
69
165
 
70
- ## Ring Buffer (HFT Streaming)
166
+ // Cleanup
167
+ freeBatch(result);
168
+ ```
71
169
 
72
- For high-frequency streaming scenarios:
170
+ ### Streaming API
73
171
 
74
172
  ```typescript
75
- import { initRingBuffer, ringWrite, ringRead, ringPending } from "@ebowwa/jsonl-hft";
173
+ import { parseStream } from "@ebowwa/jsonl-hft";
174
+
175
+ // Process each line with a callback
176
+ parseStream("/path/to/large.jsonl", fields, (entry, lineNumber) => {
177
+ // Process entry
178
+ console.log(`Line ${lineNumber}:`, entry);
179
+ return true; // Continue processing
180
+ });
181
+ ```
76
182
 
77
- // Initialize 1MB ring buffer
78
- initRingBuffer(1024 * 1024);
183
+ ## Field Specification
79
184
 
80
- // Write data
81
- ringWrite(data);
185
+ Fields can be:
186
+ - **Simple**: `"session_id"`, `"timestamp"`, `"role"`
187
+ - **Nested (dot notation)**: `"message.content"`, `"metadata.user.id"`
82
188
 
83
- // Check pending bytes
84
- const pending = ringPending();
189
+ The parser extracts only the fields you request - no wasted parsing.
85
190
 
86
- // Read data
87
- const readData = ringRead(maxLen);
191
+ ## API Reference
192
+
193
+ ### Parsing Functions
194
+ | Function | Description |
195
+ |----------|-------------|
196
+ | `parseDir` | Parse all JSONL files in directory recursively |
197
+ | `parseFile` | Parse single file with memory-mapped I/O |
198
+ | `parseBuffer` | Parse from buffer/string |
199
+ | `parseFileParallel` | Parallel file parsing with chunks |
200
+ | `parseFilesParallel` | Parse multiple files in parallel |
201
+ | `parseDirParallel` | Parallel directory parsing |
202
+ | `parseGzipFile` | Parse gzip-compressed file |
203
+ | `parseFileWithStats` | Parse with statistics collection |
204
+ | `parseFileWithRecovery` | Parse with error recovery |
205
+ | `parseBatch` | Parse multiple buffers |
206
+ | `parseStream` | Streaming callback-based parsing |
207
+
208
+ ### Validation Functions
209
+ | Function | Description |
210
+ |----------|-------------|
211
+ | `validateFile` | Validate JSONL against schema |
212
+ | `inferFieldTypes` | Infer types for fields |
213
+
214
+ ### Output Functions
215
+ | Function | Description |
216
+ |----------|-------------|
217
+ | `writeGzip` | Write to gzip-compressed file |
218
+ | `writeFile` | Write to uncompressed file |
219
+
220
+ ### Utility Functions
221
+ | Function | Description |
222
+ |----------|-------------|
223
+ | `getVersion` | Get library version |
224
+ | `isGzip` | Check if file is gzip compressed |
225
+ | `getStats` | Get global parsing statistics |
226
+ | `resetStats` | Reset statistics counters |
227
+
228
+ ### Types
229
+
230
+ ```typescript
231
+ enum FieldType {
232
+ Unknown = 0,
233
+ String = 1,
234
+ Number = 2,
235
+ Boolean = 3,
236
+ Null = 4,
237
+ Array = 5,
238
+ Object = 6,
239
+ }
240
+
241
+ interface StatsResult {
242
+ totalLines: bigint;
243
+ successfulLines: bigint;
244
+ failedLines: bigint;
245
+ totalBytes: bigint;
246
+ parseTimeNs: bigint;
247
+ avgLatencyNs: bigint;
248
+ minLatencyNs: bigint;
249
+ maxLatencyNs: bigint;
250
+ throughputMiBs: number;
251
+ }
252
+
253
+ interface ParallelResult {
254
+ entries: GenericEntry[];
255
+ count: number;
256
+ linesProcessed: bigint;
257
+ parseTimeNs: bigint;
258
+ }
259
+
260
+ interface WriteResult {
261
+ success: boolean;
262
+ bytesWritten: bigint;
263
+ compressedBytes: bigint;
264
+ compressionRatio: number;
265
+ errorMessage?: string;
266
+ }
88
267
  ```
89
268
 
90
269
  ## Build
@@ -107,6 +286,10 @@ bun run build
107
286
  │ │ • Parallel processing (rayon) │ │
108
287
  │ │ • Zero allocation hot path │ │
109
288
  │ │ • SIMD-friendly byte scanning (memchr) │ │
289
+ │ │ • GZIP compression (flate2/zlib-ng) │ │
290
+ │ │ • Schema validation │ │
291
+ │ │ • Type inference │ │
292
+ │ │ • Statistics collection │ │
110
293
  │ └─────────────────────────────────────────────────┘ │
111
294
  └─────────────────────────────────────────────────────────┘
112
295
  ```
package/dist/index.d.ts CHANGED
@@ -29,8 +29,11 @@
29
29
  export type FieldSpec = string;
30
30
  /**
31
31
  * Generic entry - a record of field name to string value
32
+ * When using parseDir, includes source_file metadata
32
33
  */
33
- export type GenericEntry = Record<string, string>;
34
+ export type GenericEntry = Record<string, string> & {
35
+ source_file?: string;
36
+ };
34
37
  /**
35
38
  * Parse result with metadata
36
39
  */
Binary file
package/package.json CHANGED
@@ -1,8 +1,8 @@
1
1
  {
2
2
  "name": "@ebowwa/jsonl-hft",
3
- "version": "1.0.0",
3
+ "version": "1.3.0",
4
4
  "author": "ebowwa",
5
- "description": "Generic HFT-grade JSONL parser - NO hardcoded fields, consumer defines what to extract. Sub-10µs latency.",
5
+ "description": "Generic HFT-grade JSONL parser - NO hardcoded fields, consumer defines what to extract. Sub-microsecond latency with memchr SIMD optimization. Parallel parsing, gzip support, schema validation.",
6
6
  "main": "dist/index.js",
7
7
  "types": "dist/index.d.ts",
8
8
  "exports": {
@@ -15,7 +15,7 @@
15
15
  "build": "CARGO_TARGET_DIR=target cargo build --release && bun build ./src/index.ts --outdir ./dist --target=bun && tsc --emitDeclarationOnly --declaration --outDir ./dist",
16
16
  "build:rust": "CARGO_TARGET_DIR=target cargo build --release",
17
17
  "test": "bun test",
18
- "bench": "bun run benchmark.ts"
18
+ "bench": "cargo bench"
19
19
  },
20
20
  "keywords": [
21
21
  "jsonl",
@@ -24,7 +24,13 @@
24
24
  "hft",
25
25
  "low-latency",
26
26
  "configurable-fields",
27
- "zero-copy"
27
+ "zero-copy",
28
+ "simd",
29
+ "memchr",
30
+ "parallel",
31
+ "gzip",
32
+ "schema-validation",
33
+ "type-inference"
28
34
  ],
29
35
  "license": "MIT",
30
36
  "files": [
@@ -40,7 +46,11 @@
40
46
  "responsibilities": [
41
47
  "generic-jsonl-parsing",
42
48
  "high-performance-parsing",
43
- "field-extraction"
49
+ "field-extraction",
50
+ "simd-optimization",
51
+ "parallel-processing",
52
+ "gzip-compression",
53
+ "schema-validation"
44
54
  ]
45
55
  }
46
56
  }