arcana 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/lib/arcana.rb +542 -0
  3. metadata +47 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: f0a10f52dd42cc9e11a7fa6418595c7eb1f965ff8300dc360710afa796a730cd
4
+ data.tar.gz: 359c8880bfe4beaf147fa7cd324436508f8f25c46dc5a49e6d66a3ed5009362e
5
+ SHA512:
6
+ metadata.gz: 5738e67b9051481f8a9fe42f861c973dcc9637c3bb1baeabf9e433ea655a53e26d3bc0e4a689ecaa4838e1717931f3e6a38bea10e3b4ab64b877c533e91fd418
7
+ data.tar.gz: 8cf7e4af39af148a55932e115b0b26d86ac017cde9e5db8847d06c7448fd3212e340d6630a3545053f6ac5bb963c69fc2c909a63a3f156f5396e59d9a7f0f0d8
data/lib/arcana.rb ADDED
@@ -0,0 +1,542 @@
1
# Pure-Ruby, best-effort interpreter for file(1)/libmagic "magic" rule files.
#
# Typical flow: Arcana::File (or Arcana::Magdir) parses rule files into Rule
# trees, a RuleSet runs every top-level Rule against a buffer, and each
# successful path through the tree is reported as a Result.
class Arcana
  # Shared frozen "no match" return value, so failed rules allocate nothing.
  EMPTY_ARRAY = [].freeze

  # A read position over an in-memory buffer.
  #
  # `offset` is always an absolute byte index into `buf`. `@base` is the
  # origin that Cursor#seek_pos adds to rule-relative positions (it is moved
  # by #mark_base when a named rule is invoked through "use").
  class Cursor
    attr_reader :buf, :offset

    # buf - String holding the data to inspect. It is viewed as binary so
    #       that every index and length below is a byte count and comparisons
    #       against binary test strings are encoding-safe.
    def initialize(buf)
      @buf = buf.encoding == Encoding::BINARY ? buf : buf.b
      @base = @offset = 0
    end

    # True once the position is at or beyond the end of the buffer.
    def eof?
      @offset >= @buf.size
    end

    # Returns up to n bytes at the current position and advances by n
    # (the position advances by n even when fewer bytes were available).
    def read(n)
      ret = peek(n)
      seek_relative(n)
      ret
    end

    # Returns up to n bytes at the current position without moving.
    def peek(n)
      @buf[@offset, n]
    end

    # Makes the current position the origin for subsequent #seek_pos calls.
    # `@offset` is absolute, so the new base is simply the current offset
    # (adding it to the previous base would count that base twice).
    def mark_base
      @base = @offset
    end

    # Moves to an absolute byte index; a negative index counts back from the
    # end of the buffer. Returns the new offset.
    def seek_absolute(offset)
      @offset = offset < 0 ? @buf.size + offset : offset
    end

    # Moves to `offset` relative to the current base. Returns the new offset.
    def seek_pos(offset)
      seek_absolute(@base + offset)
    end

    # Moves by `offset` bytes from the current position.
    def seek_relative(offset)
      @offset += offset
    end

    # Runs the block and afterwards puts offset and base back to what they
    # were, even if the block raises or returns early.
    def restore
      prev = @offset, @base
      yield
    ensure
      @offset, @base = prev
    end

    def inspect
      "#<#{self.class} offset=#{@offset}>"
    end
  end

  # The offset column of a magic rule: an exact number, an indirect
  # "(x.t+y)" expression, or a relative "&..." offset (not supported yet).
  class Offset
    EXACT = /\A(?:-?[0-9]+|0x[0-9a-fA-F]+)\z/.freeze
    INDIRECT = /\A\((?<at>[0-9]+|0x[0-9a-fA-F]+)(?<sign>[.,])(?<type>[bBcCeEfFgGhHiIlLmsSqQ])(?<add>[+-](?:[0-9]+|0x[0-9a-fA-F]+))?\)\z/.freeze

    # Indirect type letter => [byte width, signed directive, unsigned directive]
    # for String#unpack.
    INDIRECT_FORMATS = {
      "b" => [1, "c", "C"], "c" => [1, "c", "C"],
      "B" => [1, "c", "C"], "C" => [1, "c", "C"],
      "h" => [2, "s<", "S<"], "s" => [2, "s<", "S<"],
      "H" => [2, "s>", "S>"], "S" => [2, "s>", "S>"],
      "l" => [4, "l<", "L<"], "L" => [4, "l>", "L>"],
      "q" => [8, "q<", "Q<"], "Q" => [8, "q>", "Q>"]
    }.freeze

    def initialize(str)
      @str = str
    end

    # True for a plain decimal (optionally negative) or 0x-prefixed offset.
    def exact?
      @str.match?(EXACT)
    end

    # True for a parenthesised offset that must be read from the data.
    def indirect?
      @str.start_with?("(")
    end

    # True for an offset relative to the previous match ("&...").
    def relative?
      @str.start_with?("&")
    end

    # Positions `input` at this offset.
    #
    # Returns the new cursor offset, or nil (leaving the cursor wherever the
    # resolution attempt left it) when the offset cannot be resolved.
    def seek(input)
      pos = position(input)
      return if pos.nil?
      input.seek_pos(pos)
    end

    # Resolves the offset to an Integer position, reading from `input` for
    # indirect offsets. Returns nil when the offset form is unsupported or
    # the indirect value cannot be read.
    def position(input)
      return Integer(@str) if exact?
      return unless indirect?

      parts = INDIRECT.match(@str)
      return unless parts

      value = read_indirect(
        input,
        offset: Integer(parts[:at]),
        signed: parts[:sign] == ",",
        type: parts[:type]
      )
      return unless value

      value + (parts[:add] ? Integer(parts[:add]) : 0)
    end

    def to_s
      @str
    end

    private

    # Reads the integer stored at absolute `offset` in `input`.
    #
    # type   - one-letter size/endianness specifier from the offset expression.
    # signed - true for the "," form (signed read), false for "." (unsigned).
    #
    # Returns nil when the data ends before the value; raises RuntimeError
    # for specifiers that are not implemented.
    def read_indirect(input, offset:, type:, signed:)
      input.seek_absolute(offset)
      return if input.eof?

      if type == "I"
        # Big-endian ID3 "synchsafe" length: four bytes of 7 significant bits.
        # https://stackoverflow.com/questions/5223025/why-do-mp3-files-use-synchsafe-integers
        bytes = input.read(4).bytes
        return unless bytes.size == 4
        return bytes[0] << 21 | bytes[1] << 14 | bytes[2] << 7 | bytes[3]
      end

      width, signed_fmt, unsigned_fmt = INDIRECT_FORMATS[type]
      raise "unsupported indirect type: #{type}" unless width

      data = input.read(width)
      return unless data && data.bytesize == width

      data.unpack(signed ? signed_fmt : unsigned_fmt)[0]
    end
  end

  # The type and test columns of a magic rule.
  #
  # `type` is the bare type name, `flags` are the "/"-separated modifiers
  # that followed it, and `value` is the raw test string.
  class Pattern
    # Numeric type name => [String#unpack directive, byte width].
    INTEGER_FORMATS = {
      "byte" => ["c", 1], "ubyte" => ["C", 1],
      "short" => ["s", 2], "ushort" => ["S", 2],
      "long" => ["l", 4], "ulong" => ["L", 4],
      "quad" => ["q", 8], "uquad" => ["Q", 8],
      "leshort" => ["s<", 2], "uleshort" => ["S<", 2],
      "beshort" => ["s>", 2], "ubeshort" => ["S>", 2],
      "lelong" => ["l<", 4], "ulelong" => ["L<", 4],
      "belong" => ["l>", 4], "ubelong" => ["L>", 4],
      "bequad" => ["q>", 8], "ubequad" => ["Q>", 8],
      "lequad" => ["q<", 8], "ulequad" => ["Q<", 8]
    }.freeze

    # Types treated as always matching (FIXME: not really implemented).
    ALWAYS_MATCHING_TYPES = %w[default clear].freeze

    # Types that never match here: either handled by Rule ("name", "use",
    # "indirect") or simply not implemented yet (FIXME).
    NEVER_MATCHING_TYPES = %w[
      pstring guid der lestring16 name use indirect
      ledate bedate beldate beqdate lefloat
    ].freeze

    # Bytes examined by a regex test when no usable length flag is given.
    DEFAULT_REGEX_WINDOW = 8196

    STRING_ESCAPE = /\\(?:(?<oct>[0-7]{1,3})|x(?<hex>[0-9a-fA-F]{2})|(?<chr>.))/.freeze
    SIMPLE_ESCAPES = { "n" => "\n", "t" => "\t", "f" => "\f", "r" => "\r" }.freeze

    TYPE_OP = /\A(?<op>[&%])?(?<num>0x[0-9a-fA-F]+|-?[0-9]+)[lL]?\z/.freeze
    INTEGER_TEST = /\A(?<op>[=><!&^])? ?(?<num>0x[0-9a-fA-F]+|-?[0-9]+)[lL]?\z/.freeze

    attr_reader :type, :flags, :value

    # type  - type column, e.g. "string", "search/100/b", "ubyte&0x0f".
    # value - test column, e.g. "GIF8", ">0", "x".
    def initialize(type, value)
      type, *@flags = type.split("/")
      @type, *@type_ops = type.split(/(?=[&%+-])/)
      @value = value
    end

    # Tests the data at the cursor's current position.
    #
    # A test value of "x" always matches. Otherwise returns a falsy value
    # when `input` is nil or at EOF, true/false for the test result, and
    # raises RuntimeError for a type name this class does not know at all.
    # Reading tests advance the cursor.
    def match?(input)
      return true if @value == "x"

      return unless input
      return if input.eof?

      integer_format = INTEGER_FORMATS[@type]
      return match_packed_integer?(input, *integer_format) if integer_format

      case @type
      when "string", "ustring" then match_string?(input)
      when "regex" then match_regex?(input)
      when "search" then match_search?(input)
      when "offset" then match_integer?(input.offset)
      when *ALWAYS_MATCHING_TYPES then true
      when *NEVER_MATCHING_TYPES then false
      else
        raise "Unsupported match type: #{@type}"
      end
    end

    private

    # Literal comparison; "!" negates and "=" is an explicit equality prefix.
    # FIXME: the b/t/w/W/c/C flags are accepted but have no effect.
    def match_string?(input)
      negated = @value.start_with?("!")
      literal = negated || @value.start_with?("=") ? @value[1..] : @value
      expected = parse_string(literal)
      actual = input.read(expected.length)
      negated ? actual != expected : actual == expected
    end

    # Regex test over a bounded window starting at the cursor.
    # FIXME: does not move the cursor to the match location.
    def match_regex?(input)
      window = regex_window
      return false unless window

      input.peek(window).match?(parse_string(@value))
    rescue RegexpError
      # A pattern Ruby's regex engine rejects cannot match anything.
      false
    end

    # Window size from the first flag: "<n>l" style (line counts) falls back
    # to the default, digits are a byte count, anything else is unsupported.
    def regex_window
      spec = @flags.first
      return DEFAULT_REGEX_WINDOW if spec.nil? || spec.end_with?("l")
      return unless spec.match?(/\A[0-9]+\z/)

      Integer(spec, exception: false)
    end

    # Substring search: the needle may start at any of `range` positions
    # from the cursor, where `range` is the numeric flag (default 1).
    # FIXME: b/t/c/C flags are ignored and the cursor is not moved.
    def match_search?(input)
      needle = parse_string(@value)
      # Work on derived values only; @flags must stay intact between calls.
      range = @flags.map { |flag| Integer(flag, exception: false) }.compact.first || 1
      haystack = input.peek(needle.bytesize + range - 1)
      haystack ? haystack.include?(needle) : false
    end

    # Expands magic-file escapes (\NNN octal, \xHH hex, \n \t \f \r, and
    # "\<c>" => c) into a binary String. Done in one pass so that bytes
    # produced by one escape are never re-read as the start of another.
    def parse_string(value)
      value.b.gsub(STRING_ESCAPE) do
        found = Regexp.last_match
        if found[:oct]
          # Three octal digits can exceed a byte; keep the low 8 bits.
          (Integer(found[:oct], 8) & 0xFF).chr
        elsif found[:hex]
          Integer(found[:hex], 16).chr
        else
          SIMPLE_ESCAPES.fetch(found[:chr], found[:chr])
        end
      end
    end

    # Reads `length` bytes, unpacks them with `pack_str` and runs the integer
    # test. False when fewer than `length` bytes are available.
    def match_packed_integer?(input, pack_str, length)
      input = input.read(length)
      return false unless input && input.length == length
      val = input.unpack(pack_str)[0]
      match_integer?(val, bitwidth: length*8)
    end

    # Compares `val` (after applying the type's "&"/"%" operators) against
    # the test value. Supported test operators: = (default), <, >, !,
    # & (all bits set) and ^ (all bits clear). Returns false for test values
    # this method cannot parse (FIXME).
    def match_integer?(val, bitwidth: 64, match_value: @value)
      return true if match_value == "x"
      return false unless val

      val = apply_type_ops(val)

      test = INTEGER_TEST.match(match_value)
      return false unless test

      comparison = Integer(test[:num])

      if test[:num].start_with?("0x") && !@type.start_with?("u")
        # A hex literal with the top bit set is the two's-complement spelling
        # of a negative number for a signed type.
        if comparison.anybits?(1 << (bitwidth - 1))
          comparison = -(((1 << bitwidth) - 1) ^ comparison) - 1
        end
      end

      # After masking, val is compared as an unsigned bit pattern.
      comparison &= (1 << bitwidth) - 1 if @type_ops.any?

      case test[:op]
      when "=", nil then val == comparison
      when "<" then val < comparison
      when ">" then val > comparison
      when "!" then val != comparison
      when "&" then (val & comparison) == comparison
      when "^" then (val & comparison) == 0
      end
    end

    # Applies the operators attached to the type name to `val`.
    # Only "&" and "%" have an effect; an operand with no recognised
    # operator is accepted and ignored. Raises RuntimeError for operators
    # that cannot be parsed.
    def apply_type_ops(val)
      @type_ops.inject(val) do |acc, op|
        parsed = TYPE_OP.match(op)
        raise "unsupported type operator: #{op}" unless parsed

        operand = Integer(parsed[:num])
        case parsed[:op]
        when "&" then acc & operand
        when "%" then acc % operand
        else acc
        end
      end
    end
  end

  # An immutable path of matched rules, outermost first.
  class Result
    attr_reader :ruleset

    def initialize(ruleset, stack=[])
      @ruleset = ruleset
      @stack = stack
    end

    # Returns a new Result with `rule` appended; the receiver is unchanged.
    def add(rule)
      Result.new(ruleset, @stack + [rule])
    end

    # MIME type declared by the innermost matched rule that has one, or nil.
    def mime_type
      @stack.map(&:mime_type).compact.last
    end

    # Messages of all matched rules joined with spaces.
    def full_message
      @stack.map(&:message).compact.join(" ")
    end

    def last_rule
      @stack.last
    end

    def inspect
      "#<Arcana::Result mime=#{mime_type.inspect} message=#{full_message.inspect} stack=#{@stack.inspect}>"
    end
  end

  # One line of a magic file plus its "!:" extras and nested child rules.
  class Rule
    attr_reader :offset, :pattern, :message, :extras, :children

    def initialize(offset, pattern, message)
      @offset = offset
      @pattern = pattern
      @message = message
      @extras = {}
      @children = []
    end

    # Evaluates this rule and, on success, its children.
    #
    # input - String or Cursor to test.
    # match - Result accumulated by the ancestors of this rule.
    #
    # Returns an Array of Results from matching descendants, a single Result
    # when this rule matched but no child did, or EMPTY_ARRAY on no match.
    def match(input, match)
      return EMPTY_ARRAY if @offset.relative?
      ruleset = match.ruleset

      input = Cursor.new(input) unless Cursor === input
      # An offset that cannot be resolved must not be tested at whatever
      # position the cursor happens to be at.
      return EMPTY_ARRAY if @offset.seek(input).nil?

      if pattern.type == "use"
        return EMPTY_ARRAY if pattern.value.start_with?("\\^") # FIXME: endianness swap
        named = ruleset.names[pattern.value]
        # The name may live in a file that was not loaded into this ruleset.
        return EMPTY_ARRAY unless named
        input.restore do
          # Offsets inside the named rule are relative to the current position.
          input.mark_base
          return named.visit_children(input, match)
        end
      elsif pattern.type == "indirect"
        # FIXME: do this better
        # Re-running the ruleset on the identical buffer would reach this
        # rule again and never terminate.
        return EMPTY_ARRAY if input.offset.zero?
        remainder = input.buf[input.offset..]
        return EMPTY_ARRAY if remainder.nil?
        return ruleset.match(remainder, match)
      end

      return EMPTY_ARRAY unless @pattern.match?(input)

      match = match.add(self)
      child_matches = visit_children(input, match)
      child_matches.any? ? child_matches : match
    end

    # Runs every child against `input`, restoring the cursor after each one,
    # and returns all of their results in one flat Array.
    def visit_children(input, match)
      children.flat_map do |child|
        input.restore do
          child.match(input, match)
        end
      end
    end

    # Value of the "!:mime" extra, if one was attached to this rule.
    def mime_type
      @extras["mime"]
    end

    def inspect
      "#<#{self.class} #{@offset} #{@pattern.inspect} #{@message}>"
    end
  end

  # An ordered collection of top-level rules.
  class RuleSet
    def initialize(rules)
      @rules = rules
    end

    # Runs every top-level rule against `string` (a String or Cursor) and
    # returns the flat Array of Results.
    def match(string, result=Result.new(self))
      @rules.flat_map do |rule|
        rule.match(string, result)
      end
    end

    # Hash of name => Rule for the top-level "name" rules, built on first use.
    # When a name is defined more than once the last definition wins.
    def names
      @names ||= @rules
        .select { |rule| rule.pattern.type == "name" }
        .each_with_object({}) { |rule, index| index[rule.pattern.value] = rule }
    end

    def inspect
      "#<#{self.class} #{@rules.size} rules>"
    end
  end

  # A single magic file, parsed eagerly on construction.
  class File
    DIRECTIVE = /\A!:([a-z]+)\s+(.*)\z/.freeze
    BLANK = /\A\s*\z/.freeze
    # Whitespace separates fields unless it follows a backslash or a "<" / ">"
    # (so "> 0" style tests and escaped spaces stay in one field).
    FIELD_SEPARATOR = /(?<![\\<>])\s+/.freeze

    def initialize(path)
      @path = path
      @rules = parse
    end

    # The parsed top-level Rule objects.
    def raw_rules
      @rules
    end

    # A new RuleSet over this file's rules.
    def rules
      RuleSet.new(@rules)
    end

    # Reads @path and returns the Array of top-level rules, with nested
    # (">"-prefixed) lines attached as children and "!:" lines stored in the
    # extras of the rule that precedes them.
    #
    # Raises RuntimeError for a line that cannot be parsed.
    def parse
      rules = []
      stack = []

      ::File.foreach(@path) do |raw|
        # chomp first so the final line parses even without a newline.
        line = raw.chomp
        next if line.start_with?("#") || line.match?(BLANK)

        if line.start_with?("!")
          directive = DIRECTIVE.match(line)
          raise "couldn't parse #{line}" unless directive
          raise "directive without a preceding rule: #{line}" if stack.empty?
          stack.last.extras[directive[1]] = directive[2]
          next
        end

        offset, type, test, message = line.split(FIELD_SEPARATOR, 4)
        raise "couldn't parse #{line}" unless offset && type

        nesting = offset[/\A>*/].size
        # Drop rules deeper than or equal to this level; what is left ends
        # with this rule's parent.
        stack = stack[0, nesting]

        rule = Rule.new(Offset.new(offset[nesting..]), Pattern.new(type, test), message)
        if stack.empty?
          rules << rule
        else
          stack.last.children << rule
        end
        stack << rule
      end

      rules
    end
  end

  # A directory of magic files.
  class Magdir
    def initialize(dir)
      @dir = dir
    end

    # Parses the file `path` (relative to the directory).
    def open(path)
      Arcana::File.new(::File.join(@dir, path))
    end

    # Parses every regular file in the directory, in name order so the rule
    # order does not depend on the filesystem. Subdirectories are skipped.
    def files
      Dir.children(@dir)
        .sort
        .select { |name| ::File.file?(::File.join(@dir, name)) }
        .map { |name| open(name) }
    end

    # One RuleSet containing the rules of every file.
    def rules
      RuleSet.new(files.flat_map(&:raw_rules))
    end
  end

  # NOTE(review): relative path, so it resolves against the process working
  # directory when the database is actually read — confirm intended layout.
  DB_PATH = "../file/magic/Magdir"
  DB = Magdir.new(DB_PATH)
end
metadata ADDED
@@ -0,0 +1,47 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: arcana
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - John Hawthorn
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2021-04-05 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description:
14
+ email: john@hawthorn.email
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - lib/arcana.rb
20
+ homepage: https://github.com/jhawthorn/arcana
21
+ licenses:
22
+ - BSD-2-Clause
23
+ metadata:
24
+ bug_tracker_uri: https://github.com/jhawthorn/arcana/issues
25
+ source_code_uri: https://github.com/jhawthorn/arcana
26
+ documentation_uri: https://www.rubydoc.info/gems/arcana
27
+ post_install_message:
28
+ rdoc_options: []
29
+ require_paths:
30
+ - lib
31
+ - lib
32
+ required_ruby_version: !ruby/object:Gem::Requirement
33
+ requirements:
34
+ - - ">="
35
+ - !ruby/object:Gem::Version
36
+ version: '0'
37
+ required_rubygems_version: !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - ">="
40
+ - !ruby/object:Gem::Version
41
+ version: '0'
42
+ requirements: []
43
+ rubygems_version: 3.2.3
44
+ signing_key:
45
+ specification_version: 4
46
+ summary: file/libmagic based mime sniffing in pure Ruby
47
+ test_files: []