arcana 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/lib/arcana.rb +542 -0
  3. metadata +47 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: f0a10f52dd42cc9e11a7fa6418595c7eb1f965ff8300dc360710afa796a730cd
4
+ data.tar.gz: 359c8880bfe4beaf147fa7cd324436508f8f25c46dc5a49e6d66a3ed5009362e
5
+ SHA512:
6
+ metadata.gz: 5738e67b9051481f8a9fe42f861c973dcc9637c3bb1baeabf9e433ea655a53e26d3bc0e4a689ecaa4838e1717931f3e6a38bea10e3b4ab64b877c533e91fd418
7
+ data.tar.gz: 8cf7e4af39af148a55932e115b0b26d86ac017cde9e5db8847d06c7448fd3212e340d6630a3545053f6ac5bb963c69fc2c909a63a3f156f5396e59d9a7f0f0d8
data/lib/arcana.rb ADDED
@@ -0,0 +1,542 @@
1
+ class Arcana
2
+ EMPTY_ARRAY = [].freeze
3
+
4
# A seekable read position over an in-memory String, addressed in bytes.
#
# Two positions are tracked:
# * +offset+ - the absolute byte index at which the next #peek / #read happens;
# * a private base - the origin that #seek_pos offsets are measured from
#   (moved by #mark_base).
class Cursor
  attr_reader :buf, :offset

  # @param buf [String] the data to scan; it is stored as given (not copied).
  def initialize(buf)
    @buf = buf
    @base = @offset = 0
  end

  # @return [Boolean] true once the offset is at or past the last byte.
  def eof?
    @offset >= @buf.bytesize
  end

  # Returns up to +n+ bytes at the current offset and advances the offset by
  # +n+ (even when fewer bytes were available).
  #
  # @param n [Integer] number of bytes wanted
  # @return [String, nil] see #peek
  def read(n)
    ret = peek(n)
    seek_relative(n)
    ret
  end

  # Returns up to +n+ bytes at the current offset without moving.
  #
  # Bytes (not characters) are sliced so that offsets mean the same thing
  # whatever encoding the buffer is tagged with, and the slice is returned as
  # a binary string so it can be compared with binary test strings.
  #
  # @param n [Integer] number of bytes wanted
  # @return [String, nil] binary string, or nil when the offset lies beyond
  #   the end of the buffer
  def peek(n)
    @buf.byteslice(@offset, n)&.b
  end

  # Makes the current position the origin for subsequent #seek_pos calls.
  #
  # +@offset+ is already absolute, so it becomes the base as-is; adding it to
  # the previous base would count that base twice when called while a base is
  # already set. The read position itself is left where it is.
  def mark_base
    @base = @offset
  end

  # Moves to an absolute byte index; a negative +offset+ counts back from the
  # end of the buffer.
  def seek_absolute(offset)
    if offset < 0
      @offset = @buf.bytesize + offset
    else
      @offset = offset
    end
  end

  # Moves to +offset+ measured from the current base (see #mark_base).
  def seek_pos(offset)
    seek_absolute(@base + offset)
  end

  # Moves by +offset+ bytes from the current position.
  def seek_relative(offset)
    @offset += offset
  end

  # Runs the block and then puts offset and base back to what they were,
  # including when the block exits early (return/raise).
  def restore
    prev = @offset, @base
    yield
  ensure
    @offset, @base = prev
  end

  def inspect
    "#<#{self.class} offset=#{@offset}>"
  end
end
58
+
59
# The offset column of a magic rule (with the leading ">" nesting markers
# already removed).
#
# Three textual forms are recognised:
# * exact    - a decimal (optionally negative) or 0x-prefixed number;
# * indirect - "(x.t+y)": read a value of type t stored at x, then add y;
# * relative - starts with "&" (recognised but not resolved here).
class Offset
  EXACT = /\A(?:-?[0-9]+|0x[0-9a-fA-F]+)\z/.freeze
  INDIRECT = /\A\(([0-9]+|0x[0-9a-fA-F]+)([.,])([bBcCeEfFgGhHiIlLmsSqQ])([+-](?:[0-9]+|0x[0-9a-fA-F]+))?\)\z/.freeze

  # Indirect type letter => [byte length, unsigned directive, signed directive]
  # (directives are String#unpack formats). Lower-case letters are
  # little-endian, upper-case ones big-endian.
  INDIRECT_FORMATS = {
    "b" => [1, "C", "c"],
    "B" => [1, "C", "c"],
    "c" => [1, "C", "c"],
    "C" => [1, "C", "c"],
    "h" => [2, "S<", "s<"],
    "s" => [2, "S<", "s<"],
    "H" => [2, "S>", "s>"],
    "S" => [2, "S>", "s>"],
    "l" => [4, "L<", "l<"],
    "L" => [4, "L>", "l>"],
    "q" => [8, "Q<", "q<"],
    "Q" => [8, "Q>", "q>"],
  }.freeze

  # @param str [String] the offset expression
  def initialize(str)
    @str = str
  end

  def exact?
    @str.match?(EXACT)
  end

  def indirect?
    @str.start_with?("(")
  end

  def relative?
    @str.start_with?("&")
  end

  # Moves +input+ to the resolved position. When the position cannot be
  # resolved the cursor is left untouched and nil is returned.
  #
  # @param input [#seek_pos] the cursor to move
  def seek(input)
    pos = position(input)
    return if pos.nil? # FIXME: raise?
    input.seek_pos(pos)
  end

  # Resolves the expression to a number.
  #
  # @param input [#seek_absolute, #eof?, #read] cursor used to fetch the
  #   stored value of an indirect offset (its position is moved while reading)
  # @return [Integer, nil] nil for relative offsets, for indirect expressions
  #   this class cannot parse, and when the stored value cannot be read
  # @raise [RuntimeError] for an indirect type letter that is not implemented
  def position(input)
    return Integer(@str) if exact?
    return unless indirect?

    parts = INDIRECT.match(@str)
    return unless parts

    add = parts[4] ? Integer(parts[4]) : 0
    # "," asks for a signed read, "." for an unsigned one.
    value = read_indirect(input, offset: Integer(parts[1]), signed: (parts[2] == ","), type: parts[3])
    return unless value # fixme

    value + add
  end

  def to_s
    @str
  end

  private

  # Reads the value an indirect offset points at.
  #
  # @return [Integer, nil] nil when +offset+ is at/after the end of the data
  #   or fewer bytes than the type needs are available
  def read_indirect(input, offset:, type:, signed:)
    input.seek_absolute(offset)
    return if input.eof? # FIXME

    return read_synchsafe(input) if type == "I"

    length, unsigned_format, signed_format = INDIRECT_FORMATS.fetch(type) do
      raise "unsupported indirect type: #{type}"
    end

    data = input.read(length)
    return unless data && data.bytesize == length

    data.unpack1(signed ? signed_format : unsigned_format)
  end

  # Decodes a 4-byte big-endian "synchsafe" integer (7 payload bits per byte).
  # https://stackoverflow.com/questions/5223025/why-do-mp3-files-use-synchsafe-integers
  def read_synchsafe(input)
    bytes = input.read(4)&.bytes
    return unless bytes && bytes.size == 4

    bytes[0] << 21 | bytes[1] << 14 | bytes[2] << 7 | bytes[3]
  end
end
129
+
130
# The type and test columns of a magic rule, e.g. type "belong&0xff00" with
# test ">0x100", or type "search/b/100" with test "PK\003\004".
#
# The type column is split into:
# * +type+  - the bare type name ("belong", "string", "search", ...);
# * +flags+ - everything after each "/" ("b", "100", ...);
# * type operators - "&N", "%N", "+N", "-N" applied to the value that was
#   read before it is compared.
class Pattern
  attr_reader :type, :flags, :value

  # Fixed-width integer types => [String#unpack directive, width in bytes].
  INTEGER_FORMATS = {
    "byte" => ["c", 1],
    "ubyte" => ["C", 1],
    "short" => ["s", 2],
    "ushort" => ["S", 2],
    "long" => ["l", 4],
    "ulong" => ["L", 4],
    "quad" => ["q", 8],
    "uquad" => ["Q", 8],
    "leshort" => ["s<", 2],
    "uleshort" => ["S<", 2],
    "beshort" => ["s>", 2],
    "ubeshort" => ["S>", 2],
    "lelong" => ["l<", 4],
    "ulelong" => ["L<", 4],
    "belong" => ["l>", 4],
    "ubelong" => ["L>", 4],
    "bequad" => ["q>", 8],
    "ubequad" => ["Q>", 8],
    "lequad" => ["q<", 8],
    "ulequad" => ["Q<", 8],
  }.freeze

  # Types that currently always succeed. FIXME: not really implemented.
  ALWAYS_MATCH_TYPES = %w[default clear].freeze

  # Types that never match here: "name"/"use" are resolved by the rule that
  # owns the pattern, the rest are not implemented yet (FIXME).
  NEVER_MATCH_TYPES = %w[
    pstring guid der lestring16 name use indirect
    ledate bedate beldate beqdate lefloat
  ].freeze

  # search flags that are accepted and ignored (b/t: force binary/text,
  # c/C: case-insensitivity, FIXME).
  IGNORED_SEARCH_FLAGS = %w[b t c C].freeze

  # Bytes examined by a regex test without a numeric range.
  # NOTE(review): 8196 looks like a typo for 8192 - confirm against libmagic.
  REGEX_WINDOW = 8196

  TYPE_OPERATOR = /\A([&%+-])(0x[0-9a-fA-F]+|[0-9]+)[lL]?\z/.freeze
  COMPARISON = /\A([=><!&^])? ?(0x[0-9a-fA-F]+|-?[0-9]+)[lL]?\z/.freeze
  ESCAPE = /\\(?:([0-7]{1,3})|x([0-9a-fA-F]{1,2})|(.))/.freeze
  SIMPLE_ESCAPES = {
    "a" => "\a", "b" => "\b", "f" => "\f", "n" => "\n",
    "r" => "\r", "t" => "\t", "v" => "\v",
  }.freeze

  # @param type [String] the type column, including flags and operators
  # @param value [String] the test column
  def initialize(type, value)
    type, *@flags = type.split("/")
    @type, *@type_ops = type.split(/(?=[&%+-])/)
    @value = value
  end

  # Tests the data at the cursor's current position. Reading types advance
  # the cursor; "regex" and "search" only peek.
  #
  # @param input [#eof?, #read, #peek, #offset, nil] the cursor
  # @return [Boolean, nil] nil (falsy) when there is no input or it is
  #   exhausted; a test value of "x" matches regardless of the input
  # @raise [RuntimeError] for a type this class does not know
  def match?(input)
    return true if @value == "x"

    return if !input
    return if input.eof?

    integer_format = INTEGER_FORMATS[@type]
    return match_packed_integer?(input, *integer_format) if integer_format

    case @type
    when "string", "ustring"
      match_string?(input)
    when "regex"
      match_regex?(input)
    when "search"
      match_search?(input)
    when "offset"
      match_integer?(input.offset)
    when *ALWAYS_MATCH_TYPES
      true
    when *NEVER_MATCH_TYPES
      false
    else
      raise "Unsupported match type: #{@type}"
    end
  end

  private

  # string/ustring: compares the bytes at the cursor with the decoded test
  # string. A leading "!" inverts the test, a leading "=" is the explicit
  # form of the default equality. All string flags (b, t, w, W, c, C) are
  # currently ignored (FIXME).
  def match_string?(input)
    negated = @value.start_with?("!")
    literal = @value.start_with?("!", "=") ? @value[1..] : @value
    expected = parse_string(literal)
    actual = input.read(expected.length)&.b

    negated ? actual != expected : actual == expected
  end

  # regex: matches the decoded test string, used as a regular expression,
  # against the bytes following the cursor.
  # FIXME: seek input to result location
  def match_regex?(input)
    length = regex_window(@flags.first)
    return false unless length

    input.peek(length).b.match?(parse_string(@value))
  end

  # Size of the window a regex test looks at: the numeric first flag, or
  # REGEX_WINDOW when there is no flag or it counts lines ("...l").
  # Returns nil for any other first flag (FIXME: unsupported).
  def regex_window(range)
    return REGEX_WINDOW if range.nil? || range.end_with?("l")

    Integer(range) if range.match?(/\A[0-9]+\z/)
  end

  # search/N: looks for the decoded test string starting anywhere in the
  # next N bytes (N defaults to 1).
  # FIXME: seek input to result location
  def match_search?(input)
    needle = parse_string(@value)
    # Work on a filtered copy: the flags are part of the pattern's public
    # state and must not change as a side effect of matching.
    range = (@flags - IGNORED_SEARCH_FLAGS).first || "1"
    # The window is sized from the decoded needle; the raw test string is
    # longer whenever it contains escapes.
    window = needle.bytesize + Integer(range) - 1

    input.peek(window).b.include?(needle)
  end

  # Decodes the escapes of a magic test string into a binary String:
  # \NNN (1-3 octal digits, truncated to one byte), \xH / \xHH, the
  # single-letter escapes in SIMPLE_ESCAPES, and "\c" -> "c" for any other
  # character. Everything is decoded in a single left-to-right pass so that
  # the output of one escape (e.g. an escaped backslash) is never re-read as
  # the start of another.
  def parse_string(value)
    value.b.gsub(ESCAPE) do
      escape = Regexp.last_match
      if escape[1]
        (Integer(escape[1], 8) & 0xFF).chr
      elsif escape[2]
        Integer(escape[2], 16).chr
      else
        SIMPLE_ESCAPES.fetch(escape[3], escape[3])
      end
    end
  end

  # Reads +length+ bytes, unpacks them with +pack_str+ and compares the
  # result with the test value. A short read never matches.
  def match_packed_integer?(input, pack_str, length)
    data = input.read(length)
    return false unless data && data.length == length

    match_integer?(data.unpack1(pack_str), bitwidth: length * 8)
  end

  # Applies the type operators to +val+ and compares it with +match_value+,
  # which is an optional operator (= < > ! & ^) followed by a number.
  #
  # @return [Boolean] false when +match_value+ is not a numeric test
  # @raise [RuntimeError] for a type operator that cannot be parsed
  def match_integer?(val, bitwidth: 64, match_value: @value)
    return true if match_value == "x"
    return false unless val

    mask = (1 << bitwidth) - 1
    @type_ops.each do |op|
      parsed = TYPE_OPERATOR.match(op)
      raise "Unsupported type operator: #{op.inspect} in #{@type}" unless parsed

      operand = Integer(parsed[2])
      case parsed[1]
      when "&" then val &= operand
      when "%" then val %= operand
      # Arithmetic wraps around at the width of the type.
      when "+" then val = (val + operand) & mask
      when "-" then val = (val - operand) & mask
      end
    end

    test = COMPARISON.match(match_value)
    return false unless test # FIXME: unsupported test value

    operator = test[1]
    comparison = Integer(test[2])

    # A hex literal tested against a signed type names a bit pattern:
    # reinterpret it as negative when its sign bit is set.
    if test[2].start_with?("0x") && !@type.start_with?("u") &&
       comparison.anybits?(1 << (bitwidth - 1))
      comparison = -(mask ^ comparison) - 1
    end

    # After a type operator the comparison is done on the unsigned bit pattern.
    comparison &= mask if @type_ops.any?

    case operator
    when "=", nil then val == comparison
    when "<" then val < comparison
    when ">" then val > comparison
    when "!" then val != comparison
    when "&" then (val & comparison) == comparison
    when "^" then (val & comparison) == 0
    end
  end
end
349
+
350
# An immutable chain of the rules that matched on the way down a rule tree,
# together with the rule set the match was started from.
class Result
  attr_reader :ruleset

  # @param ruleset [Object] the rule set this result belongs to
  # @param stack [Array] matched rules, outermost first
  def initialize(ruleset, stack=[])
    @ruleset = ruleset
    @stack = stack
  end

  # @return [Result] a new result with +rule+ appended; the receiver is
  #   left unchanged
  def add(rule)
    extended = [*@stack, rule]
    Result.new(ruleset, extended)
  end

  # @return [Object, nil] the mime type of the innermost rule that has one
  def mime_type
    found = nil
    @stack.each do |rule|
      candidate = rule.mime_type
      found = candidate unless candidate.nil?
    end
    found
  end

  # @return [String] the messages of all rules that have one, outermost
  #   first, separated by single spaces
  def full_message
    messages = @stack.map { |rule| rule.message }
    messages.reject(&:nil?).join(" ")
  end

  # @return [Object, nil] the innermost matched rule
  def last_rule
    @stack[-1]
  end

  def inspect
    mime = mime_type.inspect
    message = full_message.inspect
    "#<Arcana::Result mime=#{mime} message=#{message} stack=#{@stack.inspect}>"
  end
end
378
+
379
# One line of a magic file: where to look (+offset+), what to test
# (+pattern+), what to report (+message+), any "!:" annotations (+extras+)
# and the more deeply nested rules that are only tried once this one has
# matched (+children+).
class Rule
  attr_reader :offset, :pattern, :message, :extras, :children

  def initialize(offset, pattern, message)
    @offset = offset
    @pattern = pattern
    @message = message
    @extras = {}
    @children = []
  end

  # Tries this rule, and on success its children, against +input+.
  #
  # @param input [Cursor, String] data to test; a String is wrapped in a
  #   fresh Cursor
  # @param match [Result] the rules matched so far
  # @return [Array, Result] EMPTY_ARRAY when nothing matched, the children's
  #   results when any child matched, otherwise a single Result ending in
  #   this rule. (Callers in this file combine the values with flat_map,
  #   which accepts both shapes.)
  def match(input, match)
    return EMPTY_ARRAY if @offset.relative?
    #return EMPTY_ARRAY unless @offset.exact?
    ruleset = match.ruleset

    input = Cursor.new(input) unless Cursor === input
    @offset.seek(input)

    case pattern.type
    when "use"
      return match_use(input, match, ruleset)
    when "indirect"
      return match_indirect(input, match, ruleset)
    end

    return EMPTY_ARRAY unless @pattern.match?(input)

    match = match.add(self)
    child_matches = visit_children(input, match)
    child_matches.any? ? child_matches : match
  end

  # Tries every child at the current position; the cursor's state is put
  # back after each child.
  def visit_children(input, match)
    children.flat_map do |child|
      input.restore do
        child.match(input, match)
      end
    end
  end

  def mime_type
    @extras["mime"]
  end

  def inspect
    "#<#{self.class} #{@offset} #{@pattern.inspect} #{@message}>"
  end

  private

  # "use": runs the children of the named rule with offsets measured from
  # the current position.
  #
  # @raise [KeyError] when no rule with that name exists
  def match_use(input, match, ruleset)
    return EMPTY_ARRAY if pattern.value.start_with?("\\^") # FIXME: endianness swap

    use = ruleset.names.fetch(pattern.value)
    input.restore do
      input.mark_base # FIXME: no idea if this works
      use.visit_children(input, match)
    end
  end

  # "indirect": matches the whole rule set again on the data that starts at
  # the current position.
  # FIXME: do this better
  def match_indirect(input, match, ruleset)
    # Re-scanning from offset 0 would test the identical buffer again and
    # reach this rule again, recursing without end.
    return EMPTY_ARRAY if input.offset.zero?

    # Asking the cursor for "everything that is left" keeps the slice in
    # whatever unit the cursor itself uses for offsets.
    rest = input.peek(input.buf.bytesize)
    return EMPTY_ARRAY if rest.nil? # position is beyond the end of the data

    ruleset.match(rest, match)
  end
end
440
+
441
# A flat list of top-level rules that can be matched as a whole, plus a
# lazily built index of the rules whose pattern type is "name".
class RuleSet
  # @param rules [Array] the top-level rules
  def initialize(rules)
    @rules = rules
  end

  # Runs every top-level rule against +string+ and collects what they return
  # into one flat list.
  #
  # @param string [Object] the data to test, handed to each rule as-is
  # @param result [Object] the result the rules extend (a new empty one for
  #   this rule set by default)
  # @return [Array]
  def match(string, result=Result.new(self))
    @rules.flat_map { |candidate| candidate.match(string, result) }
  end

  # @return [Hash] pattern value => rule, for every rule of type "name";
  #   built on first use and then reused
  def names
    unless defined?(@names)
      @names = {}
      @rules.each do |candidate|
        pattern = candidate.pattern
        @names[pattern.value] = candidate if pattern.type == "name"
      end
    end
    @names
  end

  def inspect
    count = @rules.size
    "#<#{self.class} #{count} rules>"
  end
end
466
+
467
# One magic source file, parsed into a tree of rules when it is constructed.
class File
  DIRECTIVE = /\A!:([a-z]+)\s+(.*)\z/.freeze

  # @param path [String] path of the magic file; it is read immediately
  # @raise [RuntimeError] when a line cannot be parsed
  def initialize(path)
    @path = path
    @rules = parse
  end

  # @return [Array<Rule>] the top-level rules of this file
  def raw_rules
    @rules
  end

  # @return [RuleSet] a new rule set over this file's top-level rules
  def rules
    RuleSet.new(@rules)
  end

  # Reads the file line by line and builds the rule tree.
  #
  # * lines starting with "#" and blank lines are skipped;
  # * "!:key value" lines set +extras[key]+ on the most recent rule;
  # * any other line is a rule "offset type test message"; the number of
  #   leading ">" in the offset is its nesting depth, and the rule becomes a
  #   child of the latest rule one level up.
  #
  # @return [Array<Rule>] the top-level rules
  # @raise [RuntimeError] on a malformed directive, a directive that comes
  #   before any rule, or a rule line without a type column
  def parse
    rules = []
    stack = [] # the chain of rules leading to the most recent one

    ::File.foreach(@path) do |raw|
      # Strip the line terminator up front so that a final line without a
      # trailing newline is treated like every other line.
      line = raw.chomp

      next if line.start_with?("#") # comment
      next if line.match?(/\A\s*\z/) # blank

      if line.start_with?("!")
        directive = DIRECTIVE.match(line)
        raise "couldn't parse #{line}" unless directive
        raise "directive before any rule: #{line}" if stack.empty?

        stack.last.extras[directive[1]] = directive[2]
        next
      end

      # Columns are separated by whitespace that is not preceded by a
      # backslash, "<" or ">"; the message (4th column) keeps its spaces.
      offset, type, test, message = line.split(/(?<![\\<>])\s+/, 4)
      raise "couldn't parse #{line}" if type.nil?

      nesting = offset[/\A>*/].size
      stack = stack[0, nesting]

      rule = Rule.new(Offset.new(offset[nesting..]), Pattern.new(type, test), message)
      if stack.empty?
        rules << rule
      else
        stack.last.children << rule
      end
      stack << rule
    end

    rules
  end
end
519
+
520
# A directory of magic source files.
class Magdir
  # @param dir [String] path of the directory
  def initialize(dir)
    @dir = dir
  end

  # Parses one file of the directory.
  #
  # @param path [String] file name relative to the directory
  # @return [Arcana::File]
  def open(path)
    Arcana::File.new(::File.join(@dir, path))
  end

  # Parses every regular file directly inside the directory.
  #
  # Entries are visited in sorted order because the order Dir.children
  # returns them in is platform-dependent, and it determines the order of
  # the combined rule list. Sub-directories are skipped since they cannot be
  # read as magic files.
  #
  # @return [Array<Arcana::File>]
  def files
    Dir.children(@dir)
       .select { |name| ::File.file?(::File.join(@dir, name)) }
       .sort
       .map { |name| open(name) }
  end

  # @return [RuleSet] one rule set over the top-level rules of all files
  def rules
    RuleSet.new(files.flat_map(&:raw_rules))
  end
end
539
+
540
+ DB_PATH = "../file/magic/Magdir"
541
+ DB = Magdir.new(DB_PATH)
542
+ end
metadata ADDED
@@ -0,0 +1,47 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: arcana
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - John Hawthorn
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2021-04-05 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description:
14
+ email: john@hawthorn.email
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - lib/arcana.rb
20
+ homepage: https://github.com/jhawthorn/arcana
21
+ licenses:
22
+ - BSD-2-Clause
23
+ metadata:
24
+ bug_tracker_uri: https://github.com/jhawthorn/arcana/issues
25
+ source_code_uri: https://github.com/jhawthorn/arcana
26
+ documentation_uri: https://www.rubydoc.info/gems/arcana
27
+ post_install_message:
28
+ rdoc_options: []
29
+ require_paths:
30
+ - lib
31
+ - lib
32
+ required_ruby_version: !ruby/object:Gem::Requirement
33
+ requirements:
34
+ - - ">="
35
+ - !ruby/object:Gem::Version
36
+ version: '0'
37
+ required_rubygems_version: !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - ">="
40
+ - !ruby/object:Gem::Version
41
+ version: '0'
42
+ requirements: []
43
+ rubygems_version: 3.2.3
44
+ signing_key:
45
+ specification_version: 4
46
+ summary: file/libmagic based mime sniffing in pure Ruby
47
+ test_files: []