biosyntax 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/biosyntax.rb ADDED
@@ -0,0 +1,538 @@
1
+ require_relative 'biosyntax/version'
2
+ require_relative 'biosyntax/biosyntax_ext'
3
+
4
+ # Ruby bindings for the vendored `libbiosyntax` tokenizer/highlighter.
5
+ #
6
+ # BioSyntax highlights biological text formats one line at a time. The parser
7
+ # and ANSI renderer are implemented by a native C extension; this module exposes
8
+ # Ruby value objects and convenience factories around that extension.
9
+ #
10
+ # @example Highlight a VCF line and inspect spans
11
+ # highlighter = BioSyntax.vcf
12
+ # line = "chr1\t42\trs1\tA\tT\t99\tPASS\tDP=10\n"
13
+ # highlighter.highlight(line).each do |span|
14
+ # puts [span.start, span.end, span.kind_name, span.scope].join("\t")
15
+ # end
16
+ #
17
+ # @example Render ANSI-colored output
18
+ # highlighter = BioSyntax.fastq
19
+ # File.foreach("reads.fastq", chomp: false) do |line|
20
+ # print highlighter.colorize(line)
21
+ # end
22
+ module BioSyntax
23
+ # Base error class for BioSyntax exceptions.
24
class Error < StandardError
end

# Signals that a requested format name or id could not be resolved to a
# supported format.
class UnsupportedFormatError < Error
end

# Signals that a requested token kind name or id could not be resolved.
class UnknownKindError < Error
end
31
+
32
class << self
  private

  # Canonical lookup key for a format or kind name: the string form of
  # +value+, lower-cased and converted to a Symbol.
  #
  # @param value [#to_s] name in any letter case
  # @return [Symbol]
  def normalize_name(value)
    lowered = value.to_s.downcase
    lowered.to_sym
  end

  # Derive a constant-style identifier from a name.
  #
  # Each run of characters other than ASCII letters and digits becomes one
  # underscore, the result is upper-cased, and underscores at either end are
  # removed; for example "fasta-nt" yields "FASTA_NT".
  #
  # @param value [#to_s]
  # @return [String] may be empty or start with a digit, depending on input
  def constant_name(value)
    underscored = value.to_s.gsub(/[^0-9A-Za-z]+/, '_')
    underscored.upcase.gsub(/\A_+|_+\z/, '')
  end
end
43
+
44
+ # Metadata for a supported input format.
45
+ #
46
+ # Format objects are generated from the native extension at load time. They are
47
+ # immutable and comparable by native format id.
48
+ #
49
+ # @see BioSyntax::FORMATS
50
+ # @see BioSyntax.format
51
class Format
  # @return [Integer] native format id
  attr_reader :id

  # @return [Symbol] canonical format name, such as `:vcf` or `:"fasta-nt"`
  attr_reader :name

  # @return [String] human-readable format description (frozen)
  attr_reader :description

  # Build an immutable format record.
  #
  # @api private
  # @param id [Integer, #to_int, String] native format id, coerced with `Integer()`
  # @param name [#to_s] format name; normalized to a lower-case Symbol
  # @param description [#to_s] description text; stored as a frozen copy
  # @param stateful [Object] truthiness decides {#stateful?}
  # @raise [ArgumentError, TypeError] if +id+ cannot be converted to an Integer
  def initialize(id:, name:, description:, stateful:)
    @id = Integer(id)
    @name = BioSyntax.__send__(:normalize_name, name)
    # Copy before freezing: String() hands back the very same object when it
    # is already a String, and the caller's string must not become frozen.
    @description = String(description).dup.freeze
    @stateful = !!stateful
    freeze
  end

  # @return [Boolean] true when highlighting depends on previous lines
  def stateful?
    @stateful
  end

  # Ruby factory method name for this format.
  #
  # Hyphenated format names are exposed with underscores, for example
  # `:"fasta-nt"` becomes `:fasta_nt`.
  #
  # @return [Symbol]
  def method_name
    name.to_s.tr('-', '_').to_sym
  end

  # @return [String] canonical format name
  def to_s
    name.to_s
  end

  # @return [Hash] serializable metadata for this format
  def to_h
    {
      id: id,
      name: name,
      description: description,
      stateful: stateful?
    }
  end

  # Formats are equal when both are Format instances (subclasses included)
  # sharing the same native id.
  #
  # @param other [Object]
  # @return [Boolean]
  def ==(other)
    other.is_a?(Format) && other.id == id
  end
  alias eql? ==

  # Hash code consistent with {#eql?}.
  #
  # The base class is mixed in rather than `self.class`, because {#eql?}
  # treats a subclass instance with the same id as equal and equal objects
  # must produce equal hash codes to work as Hash keys.
  #
  # @return [Integer]
  def hash
    [Format, id].hash
  end

  # @return [String]
  def inspect
    "#<#{self.class} name=#{name.inspect} id=#{id} stateful=#{stateful?}>"
  end
end
113
+
114
+ # Metadata for a semantic token kind.
115
+ #
116
+ # Kinds describe spans returned by {Highlighter#highlight}. They include a
117
+ # TextMate-style scope and the ANSI SGR sequence used by {Highlighter#colorize}.
118
+ #
119
+ # @see BioSyntax::KINDS
120
+ # @see BioSyntax.kind
121
class Kind
  # @return [Integer] native kind id
  attr_reader :id

  # @return [Symbol] canonical kind name, such as `:chrom`
  attr_reader :name

  # @return [String] semantic scope, such as `"biosyntax.chrom"`
  attr_reader :scope

  # @return [String] foreground color as a hex string, or an empty string
  attr_reader :foreground

  # @return [String] background color as a hex string, or an empty string
  attr_reader :background

  # @return [String] font style, or an empty string
  attr_reader :font_style

  # @return [String] ANSI SGR sequence without the surrounding escape bytes
  attr_reader :ansi_sgr

  # Build an immutable kind record.
  #
  # All string attributes are stored as frozen copies so that the objects
  # passed in by the caller are left untouched.
  #
  # @api private
  # @param id [Integer, #to_int, String] native kind id, coerced with `Integer()`
  # @param name [#to_s] kind name; normalized to a lower-case Symbol
  # @param scope [#to_s]
  # @param foreground [#to_s]
  # @param background [#to_s]
  # @param font_style [#to_s]
  # @param ansi_sgr [#to_s]
  # @raise [ArgumentError, TypeError] if +id+ cannot be converted to an Integer
  def initialize(id:, name:, scope:, foreground:, background:, font_style:, ansi_sgr:)
    @id = Integer(id)
    @name = BioSyntax.__send__(:normalize_name, name)
    # String() returns its argument unchanged when it is already a String, so
    # duplicate before freezing to avoid freezing the caller's objects.
    @scope = String(scope).dup.freeze
    @foreground = String(foreground).dup.freeze
    @background = String(background).dup.freeze
    @font_style = String(font_style).dup.freeze
    @ansi_sgr = String(ansi_sgr).dup.freeze
    freeze
  end

  # @return [String] canonical kind name
  def to_s
    name.to_s
  end

  # @return [Hash] serializable metadata for this kind
  def to_h
    {
      id: id,
      name: name,
      scope: scope,
      foreground: foreground,
      background: background,
      font_style: font_style,
      ansi_sgr: ansi_sgr
    }
  end

  # Kinds are equal when both are Kind instances (subclasses included)
  # sharing the same native id.
  #
  # @param other [Object]
  # @return [Boolean]
  def ==(other)
    other.is_a?(Kind) && other.id == id
  end
  alias eql? ==

  # Hash code consistent with {#eql?}.
  #
  # Uses the base class instead of `self.class`: {#eql?} accepts subclass
  # instances with the same id, and objects that are `eql?` must hash alike.
  #
  # @return [Integer]
  def hash
    [Kind, id].hash
  end

  # @return [String]
  def inspect
    "#<#{self.class} name=#{name.inspect} id=#{id} scope=#{scope.inspect}>"
  end
end
178
+
179
+ # A highlighted byte range within one input line.
180
+ #
181
+ # Offsets are byte offsets into the original line, not character indexes. This
182
+ # matches the native C API and keeps slicing correct for arbitrary encodings.
183
+ #
184
+ # @example Extract the text covered by a span
185
+ # text = line.byteslice(span.start, span.length)
186
class Span
  # @return [Integer] byte offset at the start of the span
  attr_reader :start

  # @return [Integer] byte length of the span
  attr_reader :length

  # @return [Integer] native kind id for this span
  attr_reader :kind_id

  # Build an immutable span; every argument is coerced with `Integer()`.
  #
  # @api private
  # @param start [Integer, #to_int, String]
  # @param length [Integer, #to_int, String]
  # @param kind_id [Integer, #to_int, String]
  # @raise [ArgumentError, TypeError] if an argument cannot be converted
  def initialize(start, length, kind_id)
    @start = Integer(start)
    @length = Integer(length)
    @kind_id = Integer(kind_id)
    freeze
  end

  # @return [Integer] byte offset just after the span
  def end
    @start + @length
  end

  # Looked up through {BioSyntax.kind} on every call; nothing is cached
  # because the span is frozen.
  #
  # @return [Kind] token kind metadata for this span
  def kind
    BioSyntax.kind(@kind_id)
  end

  # @return [Symbol] token kind name
  def kind_name
    kind.name
  end

  # @return [String] semantic scope for this span
  def scope
    kind.scope
  end

  # @return [Range<Integer>] byte range covered by this span (end-exclusive)
  def range
    @start...self.end
  end

  # @return [Array(Integer, Integer, Symbol)] start offset, end offset, and kind name
  def to_a
    [@start, self.end, kind.name]
  end

  # Array pattern matching support, e.g. `in [start, stop, :chrom]`.
  #
  # @return [Array(Integer, Integer, Symbol)]
  def deconstruct
    to_a
  end

  # Hash pattern matching support, e.g. `in {start:, kind: :chrom}`.
  #
  # The requested keys are ignored and the full {#to_h} hash is returned,
  # which the pattern matching protocol permits.
  #
  # @param _keys [Array<Symbol>, nil] keys requested by the pattern
  # @return [Hash]
  def deconstruct_keys(_keys)
    to_h
  end

  # @return [Hash] serializable metadata for this span
  def to_h
    resolved = kind # resolve once; both :kind and :scope come from it
    {
      start: @start,
      end: self.end,
      length: @length,
      kind: resolved.name,
      kind_id: @kind_id,
      scope: resolved.scope
    }
  end

  # Spans are equal when both are Span instances (subclasses included) with
  # the same start, length, and kind id.
  #
  # @param other [Object]
  # @return [Boolean]
  def ==(other)
    other.is_a?(Span) &&
      other.start == @start &&
      other.length == @length &&
      other.kind_id == @kind_id
  end
  alias eql? ==

  # Hash code consistent with {#eql?}.
  #
  # Uses the base class instead of `self.class`, since {#eql?} accepts
  # subclass instances and objects that are `eql?` must hash alike.
  #
  # @return [Integer]
  def hash
    [Span, @start, @length, @kind_id].hash
  end

  # @return [String]
  def inspect
    "#<#{self.class} start=#{@start} end=#{self.end} kind=#{kind.name.inspect}>"
  end
end
269
+
270
+ # Stateful highlighter for one input format.
271
+ #
272
+ # Reuse one highlighter for one logical input stream. Some formats, such as
273
+ # FASTQ and WIG, need line-to-line state. Call {#reset} before reusing the
274
+ # object for another stream.
275
+ #
276
+ # @example
277
+ # highlighter = BioSyntax[:vcf]
278
+ # File.foreach("sample.vcf", chomp: false) do |line|
279
+ # print highlighter.colorize(line)
280
+ # end
281
class Highlighter
  # @return [Format] metadata of the format this highlighter was built for
  attr_reader :format

  # Resolve the requested format and allocate native state for its id.
  #
  # @param format [Format, Symbol, String, Integer] anything {BioSyntax.format} accepts
  # @raise [UnsupportedFormatError] if the format cannot be resolved
  def initialize(format)
    resolved = BioSyntax.format(format)
    @format = resolved
    @state = Native::State.new(resolved.id)
  end

  # @return [Symbol] canonical name of the format
  def format_name
    @format.name
  end

  # Pass one input line to the native state and return its spans.
  #
  # @param line [String] one input line
  # @return [Array<Span>] highlighted spans for the line
  def highlight(line)
    @state.highlight(line)
  end
  alias_method :highlight_line, :highlight

  # Pass one input line to the native state and return it with ANSI colors.
  #
  # @param line [String] one input line
  # @return [String] line with ANSI SGR escape sequences
  def colorize(line)
    @state.colorize(line)
  end
  alias_method :colorize_line, :colorize
  alias_method :render_ansi, :colorize
  alias_method :render_ansi_line, :colorize

  # Re-initialize the native state for this highlighter's format id, so the
  # object can be used on another input stream.
  #
  # @return [Highlighter] self, to allow chaining
  def reset
    @state.reset(@format.id)
    self
  end

  # Line counter as reported by the native state.
  #
  # @return [Integer]
  def line_no
    @state.line_no
  end

  # @return [Boolean] whether the format is flagged as depending on previous lines
  def stateful?
    @format.stateful?
  end

  # @return [String] short description with the format name and line counter
  def inspect
    shown_name = format_name.inspect
    "#<#{self.class} format=#{shown_name} line_no=#{line_no}>"
  end
end
342
+
343
+ # Version string reported by the vendored native `libbiosyntax` core.
344
+ # @return [String]
345
LIBBIOSYNTAX_VERSION = Native.libbiosyntax_version.freeze

# ABI version reported by the vendored native `libbiosyntax` core.
# @return [Integer]
LIBBIOSYNTAX_ABI_VERSION = Native.abi_version

# Metadata rows exactly as handed over by the native extension. They are kept
# private; the public tables below wrap each row in a Format or Kind object.
RAW_FORMATS = Native.formats_raw.freeze
RAW_KINDS = Native.kinds_raw.freeze
private_constant :RAW_FORMATS, :RAW_KINDS

# Supported formats keyed by canonical format name.
#
# Rows whose native id is zero are left out of the table.
#
# @return [Hash{Symbol => Format}]
FORMATS = RAW_FORMATS.filter_map do |row|
  next if row.fetch(:id).zero?

  entry = Format.new(
    id: row.fetch(:id),
    name: row.fetch(:name),
    description: row.fetch(:description),
    stateful: row.fetch(:stateful)
  )
  [entry.name, entry]
end.to_h.freeze

# Known token kinds keyed by canonical kind name.
#
# @return [Hash{Symbol => Kind}]
KINDS = RAW_KINDS.to_h do |row|
  entry = Kind.new(
    id: row.fetch(:id),
    name: row.fetch(:name),
    scope: row.fetch(:scope),
    foreground: row.fetch(:foreground),
    background: row.fetch(:background),
    font_style: row.fetch(:font_style),
    ansi_sgr: row.fetch(:ansi_sgr)
  )
  [entry.name, entry]
end.freeze

# @return [Array<Symbol>] supported canonical format names
FORMAT_NAMES = FORMATS.keys.freeze

# @return [Array<Symbol>] known canonical kind names
KIND_NAMES = KINDS.keys.freeze

# Reverse indexes from native id to the wrapped object; internal use only.
FORMATS_BY_ID = FORMATS.values.each_with_object({}) { |entry, by_id| by_id[entry.id] = entry }.freeze
KINDS_BY_ID = KINDS.values.each_with_object({}) { |entry, by_id| by_id[entry.id] = entry }.freeze
private_constant :FORMATS_BY_ID, :KINDS_BY_ID

# Token kinds grouped by semantic scope.
#
# Both the outer hash and each per-scope array are frozen.
#
# @return [Hash{String => Array<Kind>}]
SCOPES = KINDS.values.group_by(&:scope).transform_values(&:freeze).freeze
403
+
404
class << self
  # Build a highlighter for the given format.
  #
  # @param format [Format, Symbol, String, Integer] format object, name, alias, or native id
  # @return [Highlighter]
  # @raise [UnsupportedFormatError] if the format is not supported
  def [](format)
    Highlighter.new(format)
  end
  alias_method :highlighter, :[]

  # @return [Array<Symbol>] supported canonical format names
  def formats
    FORMAT_NAMES
  end

  # @return [Array<Symbol>] known canonical kind names
  def kinds
    KIND_NAMES
  end

  # Turn a name, alias, id, or existing object into a Format.
  #
  # Integers are looked up by native id. Anything else is converted to a
  # lower-case string and tried as a canonical name, then with underscores
  # replaced by hyphens, and finally through the native name lookup.
  #
  # @param value [Format, Symbol, String, Integer]
  # @return [Format]
  # @raise [UnsupportedFormatError] if the format is not supported
  def format(value)
    return value if value.is_a?(Format)

    match =
      if value.is_a?(Integer)
        FORMATS_BY_ID[value]
      else
        key = value.to_s.downcase
        FORMATS[key.to_sym] ||
          FORMATS[key.tr('_', '-').to_sym] ||
          FORMATS_BY_ID[Native.format_id_from_name(key)]
      end

    match || raise(UnsupportedFormatError, "unsupported format: #{value.inspect}")
  end

  # Canonical name for a format, or nil when it cannot be resolved.
  #
  # @param value [Format, Symbol, String, Integer]
  # @return [Symbol, nil]
  def format_name(value)
    format(value).name
  rescue UnsupportedFormatError
    nil
  end

  # @param value [Format, Symbol, String, Integer]
  # @return [Boolean] whether {format} can resolve the value
  def format_supported?(value)
    format_name(value) ? true : false
  end

  # Turn a name, id, or existing object into a Kind.
  #
  # Integers are looked up by native id. Anything else is tried as a
  # normalized name and then with hyphens replaced by underscores.
  #
  # @param value [Kind, Symbol, String, Integer]
  # @return [Kind]
  # @raise [UnknownKindError] if the kind is not known
  def kind(value)
    return value if value.is_a?(Kind)

    match =
      if value.is_a?(Integer)
        KINDS_BY_ID[value]
      else
        KINDS[normalize_name(value)] || KINDS[value.to_s.downcase.tr('-', '_').to_sym]
      end

    match || raise(UnknownKindError, "unknown kind: #{value.inspect}")
  end

  # Canonical name for a token kind, or nil when it cannot be resolved.
  #
  # @param value [Kind, Symbol, String, Integer]
  # @return [Symbol, nil]
  def kind_name(value)
    kind(value).name
  rescue UnknownKindError
    nil
  end

  # @param value [Kind, Symbol, String, Integer]
  # @return [Boolean] whether {kind} can resolve the value
  def kind_known?(value)
    kind_name(value) ? true : false
  end

  # Ask the native extension to guess a format from a path or extension.
  #
  # @param path_or_extension [String, #to_s]
  # @return [Symbol, nil] canonical format name if the guessed id is a known format
  def guess_format(path_or_extension)
    guessed_id = Native.guess_format_id(path_or_extension.to_s)
    FORMATS_BY_ID[guessed_id]&.name
  end

  # Guess a format from a path or extension and build a highlighter for it.
  #
  # @param path_or_extension [String, #to_s]
  # @return [Highlighter, nil] nil when no format was recognized
  def guess(path_or_extension)
    detected = guess_format(path_or_extension)
    Highlighter.new(detected) if detected
  end
end
519
+
520
# Expose every format as a constant on Format (e.g. Format::FASTA_NT) and as
# a module-level factory method (e.g. BioSyntax.fasta_nt).
FORMATS.each_value do |format|
  const_name = constant_name(format.name)
  # constant_name can yield an empty string or a name starting with a digit;
  # const_defined? and const_set raise NameError for those, which would abort
  # loading the library. Such names are skipped, mirroring the method-name
  # check below.
  if const_name.match?(/\A[A-Z][A-Z0-9_]*\z/) && !Format.const_defined?(const_name, false)
    Format.const_set(const_name, format)
  end

  method_name = format.method_name
  next unless method_name.to_s.match?(/\A[a-z_]\w*\z/)
  # Never replace a module method that already exists, public or private.
  next if singleton_class.method_defined?(method_name) ||
          singleton_class.private_method_defined?(method_name)

  define_singleton_method(method_name) do
    Highlighter.new(format)
  end
end

# Expose every token kind as a constant on Kind (e.g. Kind::CHROM), skipping
# names that are not valid Ruby constant names.
KINDS.each_value do |kind|
  const_name = constant_name(kind.name)
  next unless const_name.match?(/\A[A-Z][A-Z0-9_]*\z/)

  Kind.const_set(const_name, kind) unless Kind.const_defined?(const_name, false)
end
538
+ end
metadata ADDED
@@ -0,0 +1,47 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: biosyntax
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - kojix2
8
+ bindir: bin
9
+ cert_chain: []
10
+ date: 1980-01-02 00:00:00.000000000 Z
11
+ dependencies: []
12
+ email: 2xijok@gmail.com
13
+ executables: []
14
+ extensions:
15
+ - ext/biosyntax/extconf.rb
16
+ extra_rdoc_files: []
17
+ files:
18
+ - LICENSE.md
19
+ - README.md
20
+ - ext/biosyntax/biosyntax.c
21
+ - ext/biosyntax/biosyntax.h
22
+ - ext/biosyntax/biosyntax_ext.c
23
+ - ext/biosyntax/extconf.rb
24
+ - lib/biosyntax.rb
25
+ - lib/biosyntax/version.rb
26
+ homepage: https://github.com/kojix2/biosyntax
27
+ licenses:
28
+ - GPL-3.0-only
29
+ metadata: {}
30
+ rdoc_options: []
31
+ require_paths:
32
+ - lib
33
+ required_ruby_version: !ruby/object:Gem::Requirement
34
+ requirements:
35
+ - - ">="
36
+ - !ruby/object:Gem::Version
37
+ version: '3.1'
38
+ required_rubygems_version: !ruby/object:Gem::Requirement
39
+ requirements:
40
+ - - ">="
41
+ - !ruby/object:Gem::Version
42
+ version: '0'
43
+ requirements: []
44
+ rubygems_version: 4.0.10
45
+ specification_version: 4
46
+ summary: Ruby native binding for libbiosyntax
47
+ test_files: []