biosyntax 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,413 @@
1
+ #ifndef BIOSYNTAX_H
2
+ #define BIOSYNTAX_H
3
+
4
+ /*
5
+ * libbiosyntax: dependency-free C tokenizer/highlighter core for biological files.
6
+ * The core performs no IO; callers pass one already-read text line at a time.
7
+ * SPDX-License-Identifier: GPL-3.0-only
8
+ */
9
+
10
+ #include <stddef.h>
11
+ #include <stdint.h>
12
+
13
+ #ifdef __cplusplus
14
+ extern "C" {
15
+ #endif
16
+
17
+ #if defined(BIOSYN_STATIC) /* BIOSYN_API selection: static builds get no import/export decoration */
18
+ # define BIOSYN_API /* expands to nothing */
19
+ #elif defined(_WIN32) || defined(__CYGWIN__)
20
+ # if defined(BIOSYN_BUILDING_LIBRARY) /* NOTE(review): presumably defined only by the library's own build - confirm in the build files */
21
+ # define BIOSYN_API __declspec(dllexport)
22
+ # else /* BIOSYN_BUILDING_LIBRARY not defined: declarations are imported */
23
+ # define BIOSYN_API __declspec(dllimport)
24
+ # endif
25
+ #else
26
+ # if defined(BIOSYN_BUILDING_LIBRARY) && defined(__GNUC__)
27
+ # define BIOSYN_API __attribute__((visibility("default"))) /* marks the declaration with default ELF-style visibility */
28
+ # else
29
+ # define BIOSYN_API /* expands to nothing for consumers and for non-GNU compilers */
30
+ # endif
31
+ #endif
32
+
33
+ #define BIOSYN_VERSION_MAJOR 0 /* the three components below are joined into BIOSYN_VERSION_STRING */
34
+ #define BIOSYN_VERSION_MINOR 1
35
+ #define BIOSYN_VERSION_PATCH 0
36
+ #define BIOSYN_ABI_VERSION 1u /* unsigned constant; NOTE(review): presumably what biosyn_abi_version() reports and what biosyn_state_t.abi_version holds - confirm in the implementation */
37
+
38
+ #define BIOSYN_STRINGIFY_DETAIL(x) #x /* turns the argument's spelling into a string literal */
39
+ #define BIOSYN_STRINGIFY(x) BIOSYN_STRINGIFY_DETAIL(x) /* extra indirection so a macro argument is expanded before it is stringized */
40
+ #define BIOSYN_VERSION_STRING \
41
+ BIOSYN_STRINGIFY(BIOSYN_VERSION_MAJOR) "." \
42
+ BIOSYN_STRINGIFY(BIOSYN_VERSION_MINOR) "." \
43
+ BIOSYN_STRINGIFY(BIOSYN_VERSION_PATCH) /* adjacent literals concatenate; with the values in this header the result is "0.1.0" */
44
+
45
+ typedef uint32_t biosyn_format_t; /* fixed 32-bit integer type; NOTE(review): presumably carries BIOSYN_FORMAT_* values - confirm */
46
+ typedef uint32_t biosyn_class_t; /* fixed 32-bit integer type; NOTE(review): presumably carries BIOSYN_CLASS_* values - confirm */
47
+
48
+ enum { /* anonymous enum of format IDs; values are explicit and contiguous from 0 */
49
+ BIOSYN_FORMAT_UNKNOWN = 0,
50
+ BIOSYN_FORMAT_FASTA = 1,
51
+ BIOSYN_FORMAT_FASTQ = 2,
52
+ BIOSYN_FORMAT_SAM = 3,
53
+ BIOSYN_FORMAT_VCF = 4,
54
+ BIOSYN_FORMAT_BED = 5,
55
+ BIOSYN_FORMAT_GTF = 6,
56
+ BIOSYN_FORMAT_GFF = 7,
57
+ BIOSYN_FORMAT_PDB = 8,
58
+ BIOSYN_FORMAT_CLUSTAL = 9,
59
+ BIOSYN_FORMAT_FAIDX = 10,
60
+ BIOSYN_FORMAT_FLAGSTAT = 11,
61
+ BIOSYN_FORMAT_WIG = 12,
62
+ BIOSYN_FORMAT_FASTA_NT = 13, /* 13-19 are the FASTA_* variants; NOTE(review): presumably alternative FASTA colorings - confirm in the implementation */
63
+ BIOSYN_FORMAT_FASTA_HC = 14,
64
+ BIOSYN_FORMAT_FASTA_CLUSTAL = 15,
65
+ BIOSYN_FORMAT_FASTA_HYDRO = 16,
66
+ BIOSYN_FORMAT_FASTA_TAYLOR = 17,
67
+ BIOSYN_FORMAT_FASTA_ZAPPO = 18,
68
+ BIOSYN_FORMAT_FASTA_ORF = 19,
69
+ BIOSYN_FORMAT__COUNT /* implicit value 20: one past the highest format ID above */
70
+ };
71
+
72
+ enum { /* anonymous enum of highlight class IDs; values are explicit and contiguous from 0 */
73
+ BIOSYN_CLASS_PLAIN = 0,
74
+ BIOSYN_CLASS_HEADER = 1,
75
+ BIOSYN_CLASS_COMMENT = 2,
76
+ BIOSYN_CLASS_CHROM = 3,
77
+ BIOSYN_CLASS_POSITION = 4,
78
+ BIOSYN_CLASS_NAME = 5,
79
+ BIOSYN_CLASS_SAMPLE = 6,
80
+ BIOSYN_CLASS_SOFTWARE = 7,
81
+ BIOSYN_CLASS_COMMANDLINE = 8,
82
+ BIOSYN_CLASS_STRING = 9,
83
+ BIOSYN_CLASS_QUOTED_STRING = 10,
84
+ BIOSYN_CLASS_NUMBER = 11,
85
+ BIOSYN_CLASS_NUMBER_ALT = 12,
86
+ BIOSYN_CLASS_URL = 13,
87
+ BIOSYN_CLASS_GOOD = 14,
88
+ BIOSYN_CLASS_BAD = 15,
89
+ BIOSYN_CLASS_KEYWORD = 16,
90
+ BIOSYN_CLASS_KEYWORD2 = 17,
91
+ BIOSYN_CLASS_KEYWORD3 = 18,
92
+ BIOSYN_CLASS_KEYWORD4 = 19,
93
+ BIOSYN_CLASS_KEYWORD5 = 20,
94
+ BIOSYN_CLASS_KEYWORD6 = 21,
95
+ BIOSYN_CLASS_STRAND_PLUS = 22,
96
+ BIOSYN_CLASS_STRAND_MINUS = 23,
97
+ BIOSYN_CLASS_STRAND_NONE = 24,
98
+ BIOSYN_CLASS_FEATURE_GENE = 25, /* FEATURE_* block: 25-33 */
99
+ BIOSYN_CLASS_FEATURE_TRANSCRIPT = 26,
100
+ BIOSYN_CLASS_FEATURE_EXON = 27,
101
+ BIOSYN_CLASS_FEATURE_CDS = 28,
102
+ BIOSYN_CLASS_FEATURE_START_CODON = 29,
103
+ BIOSYN_CLASS_FEATURE_STOP_CODON = 30,
104
+ BIOSYN_CLASS_FEATURE_UTR = 31,
105
+ BIOSYN_CLASS_FEATURE_INTER = 32,
106
+ BIOSYN_CLASS_FEATURE_INTRON_CNS = 33,
107
+ BIOSYN_CLASS_SEQUENCE_LENGTH = 34,
108
+ BIOSYN_CLASS_FILE_OFFSET = 35,
109
+ BIOSYN_CLASS_LINE_BASES = 36,
110
+ BIOSYN_CLASS_LINE_WIDTH = 37,
111
+ BIOSYN_CLASS_QC_PASSED = 38,
112
+ BIOSYN_CLASS_QC_FAILED = 39,
113
+ BIOSYN_CLASS_PERCENT = 40,
114
+ BIOSYN_CLASS_NT_A = 41, /* NT_* block: 41-57, one ID per letter suffix */
115
+ BIOSYN_CLASS_NT_C = 42,
116
+ BIOSYN_CLASS_NT_G = 43,
117
+ BIOSYN_CLASS_NT_T = 44,
118
+ BIOSYN_CLASS_NT_U = 45,
119
+ BIOSYN_CLASS_NT_N = 46,
120
+ BIOSYN_CLASS_NT_R = 47,
121
+ BIOSYN_CLASS_NT_Y = 48,
122
+ BIOSYN_CLASS_NT_S = 49,
123
+ BIOSYN_CLASS_NT_W = 50,
124
+ BIOSYN_CLASS_NT_M = 51,
125
+ BIOSYN_CLASS_NT_K = 52,
126
+ BIOSYN_CLASS_NT_D = 53,
127
+ BIOSYN_CLASS_NT_B = 54,
128
+ BIOSYN_CLASS_NT_V = 55,
129
+ BIOSYN_CLASS_NT_H = 56,
130
+ BIOSYN_CLASS_NT_X = 57,
131
+ BIOSYN_CLASS_AMINO_HYDRO = 58, /* AMINO_* block: 58-65 */
132
+ BIOSYN_CLASS_AMINO_POS = 59,
133
+ BIOSYN_CLASS_AMINO_NEG = 60,
134
+ BIOSYN_CLASS_AMINO_POLAR = 61,
135
+ BIOSYN_CLASS_AMINO_CYS = 62,
136
+ BIOSYN_CLASS_AMINO_GLY = 63,
137
+ BIOSYN_CLASS_AMINO_PRO = 64,
138
+ BIOSYN_CLASS_AMINO_ARO = 65,
139
+ BIOSYN_CLASS_AA_A = 66, /* AA_* block: 66-88, one ID per letter suffix */
140
+ BIOSYN_CLASS_AA_R = 67,
141
+ BIOSYN_CLASS_AA_N = 68,
142
+ BIOSYN_CLASS_AA_D = 69,
143
+ BIOSYN_CLASS_AA_C = 70,
144
+ BIOSYN_CLASS_AA_Q = 71,
145
+ BIOSYN_CLASS_AA_E = 72,
146
+ BIOSYN_CLASS_AA_G = 73,
147
+ BIOSYN_CLASS_AA_H = 74,
148
+ BIOSYN_CLASS_AA_I = 75,
149
+ BIOSYN_CLASS_AA_L = 76,
150
+ BIOSYN_CLASS_AA_K = 77,
151
+ BIOSYN_CLASS_AA_M = 78,
152
+ BIOSYN_CLASS_AA_F = 79,
153
+ BIOSYN_CLASS_AA_P = 80,
154
+ BIOSYN_CLASS_AA_S = 81,
155
+ BIOSYN_CLASS_AA_T = 82,
156
+ BIOSYN_CLASS_AA_W = 83,
157
+ BIOSYN_CLASS_AA_Y = 84,
158
+ BIOSYN_CLASS_AA_V = 85,
159
+ BIOSYN_CLASS_AA_B = 86,
160
+ BIOSYN_CLASS_AA_X = 87,
161
+ BIOSYN_CLASS_AA_Z = 88,
162
+ BIOSYN_CLASS_ZAPPO_A = 89, /* ZAPPO_* block: 89-111, same letter order as AA_* */
163
+ BIOSYN_CLASS_ZAPPO_R = 90,
164
+ BIOSYN_CLASS_ZAPPO_N = 91,
165
+ BIOSYN_CLASS_ZAPPO_D = 92,
166
+ BIOSYN_CLASS_ZAPPO_C = 93,
167
+ BIOSYN_CLASS_ZAPPO_Q = 94,
168
+ BIOSYN_CLASS_ZAPPO_E = 95,
169
+ BIOSYN_CLASS_ZAPPO_G = 96,
170
+ BIOSYN_CLASS_ZAPPO_H = 97,
171
+ BIOSYN_CLASS_ZAPPO_I = 98,
172
+ BIOSYN_CLASS_ZAPPO_L = 99,
173
+ BIOSYN_CLASS_ZAPPO_K = 100,
174
+ BIOSYN_CLASS_ZAPPO_M = 101,
175
+ BIOSYN_CLASS_ZAPPO_F = 102,
176
+ BIOSYN_CLASS_ZAPPO_P = 103,
177
+ BIOSYN_CLASS_ZAPPO_S = 104,
178
+ BIOSYN_CLASS_ZAPPO_T = 105,
179
+ BIOSYN_CLASS_ZAPPO_W = 106,
180
+ BIOSYN_CLASS_ZAPPO_Y = 107,
181
+ BIOSYN_CLASS_ZAPPO_V = 108,
182
+ BIOSYN_CLASS_ZAPPO_B = 109,
183
+ BIOSYN_CLASS_ZAPPO_X = 110,
184
+ BIOSYN_CLASS_ZAPPO_Z = 111,
185
+ BIOSYN_CLASS_TAYLOR_A = 112, /* TAYLOR_* block: 112-134, same letter order as AA_* */
186
+ BIOSYN_CLASS_TAYLOR_R = 113,
187
+ BIOSYN_CLASS_TAYLOR_N = 114,
188
+ BIOSYN_CLASS_TAYLOR_D = 115,
189
+ BIOSYN_CLASS_TAYLOR_C = 116,
190
+ BIOSYN_CLASS_TAYLOR_Q = 117,
191
+ BIOSYN_CLASS_TAYLOR_E = 118,
192
+ BIOSYN_CLASS_TAYLOR_G = 119,
193
+ BIOSYN_CLASS_TAYLOR_H = 120,
194
+ BIOSYN_CLASS_TAYLOR_I = 121,
195
+ BIOSYN_CLASS_TAYLOR_L = 122,
196
+ BIOSYN_CLASS_TAYLOR_K = 123,
197
+ BIOSYN_CLASS_TAYLOR_M = 124,
198
+ BIOSYN_CLASS_TAYLOR_F = 125,
199
+ BIOSYN_CLASS_TAYLOR_P = 126,
200
+ BIOSYN_CLASS_TAYLOR_S = 127,
201
+ BIOSYN_CLASS_TAYLOR_T = 128,
202
+ BIOSYN_CLASS_TAYLOR_W = 129,
203
+ BIOSYN_CLASS_TAYLOR_Y = 130,
204
+ BIOSYN_CLASS_TAYLOR_V = 131,
205
+ BIOSYN_CLASS_TAYLOR_B = 132,
206
+ BIOSYN_CLASS_TAYLOR_X = 133,
207
+ BIOSYN_CLASS_TAYLOR_Z = 134,
208
+ BIOSYN_CLASS_HYDRO_A = 135, /* HYDRO_* block: 135-157, same letter order as AA_* */
209
+ BIOSYN_CLASS_HYDRO_R = 136,
210
+ BIOSYN_CLASS_HYDRO_N = 137,
211
+ BIOSYN_CLASS_HYDRO_D = 138,
212
+ BIOSYN_CLASS_HYDRO_C = 139,
213
+ BIOSYN_CLASS_HYDRO_Q = 140,
214
+ BIOSYN_CLASS_HYDRO_E = 141,
215
+ BIOSYN_CLASS_HYDRO_G = 142,
216
+ BIOSYN_CLASS_HYDRO_H = 143,
217
+ BIOSYN_CLASS_HYDRO_I = 144,
218
+ BIOSYN_CLASS_HYDRO_L = 145,
219
+ BIOSYN_CLASS_HYDRO_K = 146,
220
+ BIOSYN_CLASS_HYDRO_M = 147,
221
+ BIOSYN_CLASS_HYDRO_F = 148,
222
+ BIOSYN_CLASS_HYDRO_P = 149,
223
+ BIOSYN_CLASS_HYDRO_S = 150,
224
+ BIOSYN_CLASS_HYDRO_T = 151,
225
+ BIOSYN_CLASS_HYDRO_W = 152,
226
+ BIOSYN_CLASS_HYDRO_Y = 153,
227
+ BIOSYN_CLASS_HYDRO_V = 154,
228
+ BIOSYN_CLASS_HYDRO_B = 155,
229
+ BIOSYN_CLASS_HYDRO_X = 156,
230
+ BIOSYN_CLASS_HYDRO_Z = 157,
231
+ BIOSYN_CLASS_HC_A = 158, /* HC_* block: 158-175; letter order differs from NT_* */
232
+ BIOSYN_CLASS_HC_T = 159,
233
+ BIOSYN_CLASS_HC_G = 160,
234
+ BIOSYN_CLASS_HC_C = 161,
235
+ BIOSYN_CLASS_HC_U = 162,
236
+ BIOSYN_CLASS_HC_R = 163,
237
+ BIOSYN_CLASS_HC_Y = 164,
238
+ BIOSYN_CLASS_HC_S = 165,
239
+ BIOSYN_CLASS_HC_W = 166,
240
+ BIOSYN_CLASS_HC_M = 167,
241
+ BIOSYN_CLASS_HC_K = 168,
242
+ BIOSYN_CLASS_HC_D = 169,
243
+ BIOSYN_CLASS_HC_B = 170,
244
+ BIOSYN_CLASS_HC_V = 171,
245
+ BIOSYN_CLASS_HC_H = 172,
246
+ BIOSYN_CLASS_HC_N = 173,
247
+ BIOSYN_CLASS_HC_X = 174,
248
+ BIOSYN_CLASS_HC_GAP = 175,
249
+ BIOSYN_CLASS_CIGAR_MATCH = 176, /* CIGAR_* block: 176-180 */
250
+ BIOSYN_CLASS_CIGAR_MISMATCH = 177,
251
+ BIOSYN_CLASS_CIGAR_CLIP = 178,
252
+ BIOSYN_CLASS_CIGAR_INSERTION = 179,
253
+ BIOSYN_CLASS_CIGAR_DELETION = 180,
254
+ BIOSYN_CLASS_QUAL_1 = 181, /* QUAL_* block: 181-187; the suffixes are not consecutive numbers */
255
+ BIOSYN_CLASS_QUAL_3 = 182,
256
+ BIOSYN_CLASS_QUAL_6 = 183,
257
+ BIOSYN_CLASS_QUAL_8 = 184,
258
+ BIOSYN_CLASS_QUAL_10 = 185,
259
+ BIOSYN_CLASS_QUAL_10B = 186,
260
+ BIOSYN_CLASS_QUAL_10I = 187,
261
+ BIOSYN_CLASS_GRAD_0 = 188, /* GRAD_* block: 188-198, so GRAD_k == 188 + k */
262
+ BIOSYN_CLASS_GRAD_1 = 189,
263
+ BIOSYN_CLASS_GRAD_2 = 190,
264
+ BIOSYN_CLASS_GRAD_3 = 191,
265
+ BIOSYN_CLASS_GRAD_4 = 192,
266
+ BIOSYN_CLASS_GRAD_5 = 193,
267
+ BIOSYN_CLASS_GRAD_6 = 194,
268
+ BIOSYN_CLASS_GRAD_7 = 195,
269
+ BIOSYN_CLASS_GRAD_8 = 196,
270
+ BIOSYN_CLASS_GRAD_9 = 197,
271
+ BIOSYN_CLASS_GRAD_10 = 198,
272
+ BIOSYN_CLASS_GRADBW_0 = 199, /* GRADBW_* block: 199-209, so GRADBW_k == 199 + k */
273
+ BIOSYN_CLASS_GRADBW_1 = 200,
274
+ BIOSYN_CLASS_GRADBW_2 = 201,
275
+ BIOSYN_CLASS_GRADBW_3 = 202,
276
+ BIOSYN_CLASS_GRADBW_4 = 203,
277
+ BIOSYN_CLASS_GRADBW_5 = 204,
278
+ BIOSYN_CLASS_GRADBW_6 = 205,
279
+ BIOSYN_CLASS_GRADBW_7 = 206,
280
+ BIOSYN_CLASS_GRADBW_8 = 207,
281
+ BIOSYN_CLASS_GRADBW_9 = 208,
282
+ BIOSYN_CLASS_GRADBW_10 = 209,
283
+ BIOSYN_CLASS_GAP = 210,
284
+ BIOSYN_CLASS_NULL = 211,
285
+ BIOSYN_CLASS_ERROR = 212,
286
+ BIOSYN_CLASS_ORF_START = 213,
287
+ BIOSYN_CLASS_ORF_CODING = 214,
288
+ BIOSYN_CLASS_ORF_STOP = 215,
289
+ BIOSYN_CLASS__COUNT /* implicit value 216: one past the highest class ID above */
290
+ };
291
+
292
+ typedef struct biosyn_span { /* one highlighted span; arrays of these are filled by the highlight functions and read by the ANSI renderers */
293
+ uint64_t start; /* NOTE(review): presumably the span's offset within the line - confirm origin and units */
294
+ uint64_t length; /* NOTE(review): presumably the span's extent counted from start - confirm units */
295
+ biosyn_class_t class_id; /* class assigned to the span */
296
+ uint32_t reserved; /* no meaning is given in this header; occupies the 4 bytes after class_id */
297
+ } biosyn_span_t;
298
+
299
+ typedef struct biosyn_state { /* per-stream state passed to biosyn_highlight_next_line */
300
+ uint32_t abi_version; /* NOTE(review): presumably filled with BIOSYN_ABI_VERSION by biosyn_state_init - confirm */
301
+ biosyn_format_t format; /* NOTE(review): presumably the format given to biosyn_state_init / biosyn_state_new - confirm */
302
+ uint64_t line_no; /* NOTE(review): presumably counts lines consumed so far - confirm whether zero-based */
303
+ uint32_t fastq_phase; /* FASTQ-related carry-over between lines; encoding is not visible in this header */
304
+ uint32_t wig_mode; /* WIG-related carry-over between lines; encoding is not visible in this header */
305
+ uint32_t reserved[7]; /* 28 bytes with no meaning given here; NOTE(review): presumably room for future fields - confirm */
306
+ } biosyn_state_t;
307
+
308
+ typedef struct biosyn_class_info { /* output record of biosyn_class_info(); fields parallel the biosyn_class_* getters */
309
+ const char *name; /* NOTE(review): string ownership/lifetime is not stated in this header - presumably static storage, confirm */
310
+ const char *scope;
311
+ const char *foreground; /* NOTE(review): color string format is not visible here - confirm */
312
+ const char *background;
313
+ const char *font_style;
314
+ const char *ansi_sgr; /* NOTE(review): presumably an SGR fragment in the same form as biosyn_ansi_style_t.ansi_sgr - confirm */
315
+ } biosyn_class_info_t;
316
+
317
+ typedef struct biosyn_format_info { /* output record of biosyn_format_info() */
318
+ const char *name;
319
+ const char *description;
320
+ uint32_t stateful; /* NOTE(review): presumably nonzero for formats that need the stateful API (FASTQ, WIG) - confirm */
321
+ uint32_t reserved; /* no meaning is given in this header */
322
+ } biosyn_format_info_t;
323
+
324
+ typedef struct biosyn_ansi_style { /* one style override entry for biosyn_render_ansi_line_with_styles */
325
+ biosyn_class_t class_id; /* class whose style is overridden */
326
+ uint32_t reserved; /* no meaning is given in this header; sits between class_id and the pointer */
327
+ const char *ansi_sgr; /* ANSI SGR fragment, e.g. "38;2;255;0;0" or "1;31" */
328
+ } biosyn_ansi_style_t;
329
+
330
+ BIOSYN_API uint32_t biosyn_abi_version(void); /* NOTE(review): presumably returns the linked library's BIOSYN_ABI_VERSION, for comparison with the header's value - confirm */
331
+ BIOSYN_API const char *biosyn_version(void); /* NOTE(review): presumably returns the linked library's version string - confirm format and lifetime */
332
+
333
+ BIOSYN_API biosyn_format_t biosyn_format_from_name(const char *name); /* NOTE(review): result for an unrecognized or NULL name is not stated here - presumably BIOSYN_FORMAT_UNKNOWN, confirm */
334
+ BIOSYN_API biosyn_format_t biosyn_guess_format_from_path(const char *path_or_extension); /* accepts a path or a bare extension, per the parameter name; matching rules are not visible here */
335
+ BIOSYN_API const char *biosyn_format_name(biosyn_format_t format); /* NOTE(review): result for an out-of-range format is not stated here - confirm */
336
+ BIOSYN_API uint32_t biosyn_format_count(void); /* NOTE(review): presumably BIOSYN_FORMAT__COUNT as compiled into the library - confirm */
337
+ BIOSYN_API int biosyn_format_info(biosyn_format_t format, biosyn_format_info_t *out); /* fills *out; NOTE(review): the int return convention is not visible in this header - confirm */
338
+
339
+ BIOSYN_API const char *biosyn_class_name(biosyn_class_t class_id); /* per-class string getters; NOTE(review): results for an out-of-range class_id and string lifetime are not stated here - confirm */
340
+ BIOSYN_API const char *biosyn_class_scope(biosyn_class_t class_id);
341
+ BIOSYN_API const char *biosyn_class_ansi_sgr(biosyn_class_t class_id);
342
+ BIOSYN_API const char *biosyn_class_default_foreground(biosyn_class_t class_id);
343
+ BIOSYN_API const char *biosyn_class_default_background(biosyn_class_t class_id);
344
+ BIOSYN_API const char *biosyn_class_default_font_style(biosyn_class_t class_id);
345
+ BIOSYN_API uint32_t biosyn_class_count(void); /* NOTE(review): presumably BIOSYN_CLASS__COUNT as compiled into the library - confirm */
346
+ BIOSYN_API int biosyn_class_info(biosyn_class_t class_id, biosyn_class_info_t *out); /* fills *out; NOTE(review): the int return convention is not visible in this header - confirm */
347
+
348
+ BIOSYN_API void biosyn_state_init(biosyn_state_t *state, biosyn_format_t format); /* initializes caller-provided storage for the given format */
349
+ BIOSYN_API biosyn_state_t *biosyn_state_new(biosyn_format_t format); /* NOTE(review): presumably allocates a state that must be released with biosyn_state_free and may return NULL on failure - confirm */
350
+ BIOSYN_API void biosyn_state_free(biosyn_state_t *state); /* NOTE(review): presumably only for states obtained from biosyn_state_new - confirm NULL handling */
351
+
352
+ /*
353
+ * Returns the number of spans required. If out_cap is smaller than the return
354
+ * value, only the first out_cap spans are written. The line need not be NUL
355
+ * terminated. Newline/CRLF terminators are ignored for tokenization.
356
+ *
357
+ * Format IDs describe text syntax families. Compressed or binary containers
358
+ * such as BAM, CRAM, and BCF must be decoded by the caller before highlighting.
359
+ */
360
+ BIOSYN_API uint64_t biosyn_highlight_line( /* returns the number of spans required for the line */
361
+ biosyn_format_t format, /* text syntax family to tokenize as */
362
+ const char *line, /* one already-read text line; need not be NUL terminated */
363
+ uint64_t len, /* length of line; a trailing newline/CRLF is ignored for tokenization */
364
+ uint64_t zero_based_line_no, /* position of this line in its file, counted from 0 per the parameter name */
365
+ biosyn_span_t *out, /* receives at most out_cap spans */
366
+ uint64_t out_cap /* capacity of out in spans; a larger return value means the output was truncated */
367
+ );
368
+
369
+ /*
370
+ * Stateful API. Prefer this for FASTQ and WIG streams. If out_cap is too small,
371
+ * the required span count is returned and state is left unchanged so callers can
372
+ * retry with a larger buffer. State advances only when the returned count is
373
+ * less than or equal to out_cap.
374
+ */
375
+ BIOSYN_API uint64_t biosyn_highlight_next_line( /* returns the required span count */
376
+ biosyn_state_t *state, /* advanced only when the returned count is <= out_cap; otherwise left unchanged for a retry */
377
+ const char *line, /* next line of the stream */
378
+ uint64_t len, /* length of line */
379
+ biosyn_span_t *out, /* span buffer supplied by the caller */
380
+ uint64_t out_cap /* capacity of out in spans */
381
+ );
382
+
383
+ /* ANSI renderer for examples/CLI. Returns bytes required, excluding NUL. */
384
+ BIOSYN_API uint64_t biosyn_render_ansi_line( /* returns the bytes required for the rendered text, excluding the NUL */
385
+ const char *line, /* text the spans refer to */
386
+ uint64_t len, /* length of line */
387
+ const biosyn_span_t *spans, /* spans to render; read-only */
388
+ uint64_t span_count, /* number of entries in spans */
389
+ char *out, /* destination buffer */
390
+ uint64_t out_cap /* NOTE(review): presumably the size of out in bytes including room for the NUL - confirm truncation behavior */
391
+ );
392
+
393
+ /*
394
+ * ANSI renderer with optional style overrides. Each style entry maps class_id
395
+ * to an ANSI SGR fragment such as "38;2;255;0;0" or "1;31". Unknown classes
396
+ * and missing overrides use the built-in default style.
397
+ */
398
+ BIOSYN_API uint64_t biosyn_render_ansi_line_with_styles( /* NOTE(review): return value presumably matches biosyn_render_ansi_line (bytes required, excluding NUL) - confirm */
399
+ const char *line, /* text the spans refer to */
400
+ uint64_t len, /* length of line */
401
+ const biosyn_span_t *spans, /* spans to render; read-only */
402
+ uint64_t span_count, /* number of entries in spans */
403
+ const biosyn_ansi_style_t *styles, /* optional overrides mapping class_id to an SGR fragment; classes without an entry use the built-in default */
404
+ uint64_t style_count, /* number of entries in styles */
405
+ char *out, /* destination buffer */
406
+ uint64_t out_cap /* NOTE(review): presumably the size of out in bytes including room for the NUL - confirm truncation behavior */
407
+ );
408
+
409
+ #ifdef __cplusplus
410
+ }
411
+ #endif
412
+
413
+ #endif /* BIOSYNTAX_H */