magika 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -1
- data/README.md +4 -1
- data/lib/magika.rb +69 -59
- data/magika.gemspec +3 -2
- data/test/fixtures/padded-long-whitespace.txt +1 -0
- data/test/helper.rb +5 -0
- data/test/test_magika.rb +16 -3
- data/test/test_package.rb +1 -1
- data/test/test_smoke.rb +4 -4
- metadata +17 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 1242b81f1cb54d8d60b1849246ebdecfc288fad8e4521c98e2d51ceb00d6d699
|
|
4
|
+
data.tar.gz: 0aacd82439630b208f1d87c912276a3aff364b8acb3514678d5a06f55fef1cb0
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: bfabdaeb971e6564385dbde08c0b5fb7269b2ad97f0e02eebf7010ea95349997e5fadb8e9c7d828c2f69ee724b5e7e165576e67e5e798ccda696be0452b321ac
|
|
7
|
+
data.tar.gz: 716c087f89004d48096592fc27c6120ee14ed0a9fcb1b78fe3561e4acaf1e7f861544de5a3c9fcb9f7e46bc779e4643ee4e55dc9b76f30c2aab95837331d446f
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
|
@@ -12,7 +12,10 @@ SYNOPSIS
|
|
|
12
12
|
require "magika"
|
|
13
13
|
|
|
14
14
|
magika = Magika.new
|
|
15
|
-
magika.
|
|
15
|
+
magika.identify_path("path/to/file") # => #<Magika::FileType label=onnx score=0.9998390674591064 application/octet-stream "Open Neural Network Exchange" (archive)>
|
|
16
|
+
|
|
17
|
+
require "stringio"
|
|
18
|
+
magika.identify_io(StringIO.new("some text content")) # => #<Magika::FileType label=txt score=0.8081967830657959 text/plain "Generic text document" (text)>
|
|
16
19
|
```
|
|
17
20
|
|
|
18
21
|
INSTALLATION
|
data/lib/magika.rb
CHANGED
|
@@ -11,8 +11,7 @@ class Magika
|
|
|
11
11
|
|
|
12
12
|
MODEL_CONFIG = JSON.load_file(MODEL_CONFIG_PATH, symbolize_names: true)
|
|
13
13
|
|
|
14
|
-
TARGET_LABELS_SPACE = MODEL_CONFIG[:target_labels_space]
|
|
15
|
-
TARGET_LABELS_SPACE.collect!(&:to_sym)
|
|
14
|
+
TARGET_LABELS_SPACE = Numo::RObject[MODEL_CONFIG[:target_labels_space].collect!(&:to_sym)]
|
|
16
15
|
MIN_FILE_SIZE_FOR_DL = MODEL_CONFIG[:min_file_size_for_dl]
|
|
17
16
|
VERSION_MAJOR = MODEL_CONFIG[:version_major]
|
|
18
17
|
|
|
@@ -20,13 +19,23 @@ class Magika
|
|
|
20
19
|
@session = OnnxRuntime::InferenceSession.new(MODEL_PATH)
|
|
21
20
|
end
|
|
22
21
|
|
|
23
|
-
def
|
|
24
|
-
|
|
22
|
+
def identify_path(path)
|
|
23
|
+
stat = File.lstat(path)
|
|
24
|
+
if stat.directory?
|
|
25
|
+
return FileType.new(:directory, 1.0)
|
|
26
|
+
end
|
|
27
|
+
if stat.symlink?
|
|
28
|
+
return FileType.new(:symlink, 1.0)
|
|
29
|
+
end
|
|
30
|
+
File.open(path, "rb", encoding: Encoding::BINARY) {|file|
|
|
31
|
+
identify_io(file)
|
|
32
|
+
}
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
def identify_io(io)
|
|
36
|
+
features, label = extract_features_or_label(io)
|
|
25
37
|
if features
|
|
26
|
-
|
|
27
|
-
index = target_label.argmax
|
|
28
|
-
score = target_label[index]
|
|
29
|
-
FileType.new(TARGET_LABELS_SPACE[index], score)
|
|
38
|
+
FileType.from_inference(infer_from_features(features))
|
|
30
39
|
else
|
|
31
40
|
FileType.new(label, 1.0)
|
|
32
41
|
end
|
|
@@ -41,75 +50,68 @@ class Magika
|
|
|
41
50
|
BLOCK_SIZE = MODEL_CONFIG[:block_size]
|
|
42
51
|
PADDING_TOKEN = MODEL_CONFIG[:padding_token]
|
|
43
52
|
|
|
44
|
-
def extract_features_or_label(
|
|
45
|
-
|
|
46
|
-
if
|
|
47
|
-
return nil, :directory
|
|
48
|
-
end
|
|
49
|
-
if stat.symlink?
|
|
50
|
-
return nil, :symlink
|
|
51
|
-
end
|
|
52
|
-
file_size = stat.size
|
|
53
|
-
if file_size == 0
|
|
53
|
+
def extract_features_or_label(io)
|
|
54
|
+
io_size = io.size
|
|
55
|
+
if io_size == 0
|
|
54
56
|
return nil, :empty
|
|
55
57
|
end
|
|
56
|
-
if
|
|
57
|
-
return
|
|
58
|
+
if io_size < MIN_FILE_SIZE_FOR_DL
|
|
59
|
+
return nil, detect_small_content_label(io.read)
|
|
58
60
|
end
|
|
59
|
-
features =
|
|
61
|
+
features = extract_features_from_io(io)
|
|
60
62
|
if features[MIN_FILE_SIZE_FOR_DL - 1] == PADDING_TOKEN
|
|
61
|
-
return
|
|
63
|
+
return nil, detect_small_content_label(features[0...io_size].to_binary)
|
|
62
64
|
else
|
|
63
65
|
return features, nil
|
|
64
66
|
end
|
|
65
67
|
end
|
|
66
68
|
|
|
67
|
-
def
|
|
68
|
-
content
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
return
|
|
72
|
-
rescue EncodingError
|
|
73
|
-
return nil, :unknown
|
|
69
|
+
def detect_small_content_label(content)
|
|
70
|
+
if content.force_encoding(Encoding::UTF_8).valid_encoding?
|
|
71
|
+
return :txt
|
|
72
|
+
else
|
|
73
|
+
return :unknown
|
|
74
74
|
end
|
|
75
75
|
end
|
|
76
76
|
|
|
77
|
-
def
|
|
77
|
+
def extract_features_from_io(io)
|
|
78
78
|
features = Numo::Int32.new(FEATURES_SIZE).fill(PADDING_TOKEN)
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
end
|
|
79
|
+
io_size = io.size
|
|
80
|
+
return features if io_size == 0
|
|
81
|
+
|
|
82
|
+
block_size = [BLOCK_SIZE, io_size].min
|
|
83
|
+
|
|
84
|
+
beg_block = io.read(block_size)
|
|
85
|
+
# String#lstrip strips null characters, which reduces accuracy
|
|
86
|
+
beg_str = beg_block.sub(/\A\s+/, "").freeze
|
|
87
|
+
unless beg_str.empty?
|
|
88
|
+
beg_content = Numo::UInt8.from_binary(beg_str)
|
|
89
|
+
beg_size = [BEG_SIZE, beg_content.size].min
|
|
90
|
+
features[0...beg_size] = beg_content[0...beg_size]
|
|
91
|
+
end
|
|
93
92
|
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
end
|
|
93
|
+
# Skip middle feature
|
|
94
|
+
|
|
95
|
+
end_block = if block_size == io_size
|
|
96
|
+
beg_block
|
|
97
|
+
else
|
|
98
|
+
io.seek(-block_size, IO::SEEK_END)
|
|
99
|
+
io.read(block_size)
|
|
100
|
+
end
|
|
101
|
+
# String#rstrip strips null characters, which reduces accuracy
|
|
102
|
+
end_str = end_block.reverse.sub(/\A\s+/, "").reverse.freeze
|
|
103
|
+
unless end_str.empty?
|
|
104
|
+
end_content = Numo::UInt8.from_binary(end_str)
|
|
105
|
+
end_size = [END_SIZE, end_content.size].min
|
|
106
|
+
features[-end_size..] = end_content[-end_size..]
|
|
109
107
|
end
|
|
110
108
|
features
|
|
111
109
|
end
|
|
112
110
|
|
|
111
|
+
def infer_from_features(features)
|
|
112
|
+
@session.run([:target_label], {bytes: OnnxRuntime::OrtValue.from_numo(features.reshape(1, FEATURES_SIZE))}, output_type: :numo)[0]
|
|
113
|
+
end
|
|
114
|
+
|
|
113
115
|
class FileType
|
|
114
116
|
CONTENT_TYPES_PATH = File.join(ASSETS_DIR, "content_types_kb.min.json")
|
|
115
117
|
CONTENT_TYPES = JSON.load_file(CONTENT_TYPES_PATH, symbolize_names: true)
|
|
@@ -118,6 +120,14 @@ class Magika
|
|
|
118
120
|
|
|
119
121
|
attr_reader :score
|
|
120
122
|
|
|
123
|
+
class << self
|
|
124
|
+
def from_inference(inference)
|
|
125
|
+
index = inference.argmax
|
|
126
|
+
score = inference[index]
|
|
127
|
+
FileType.new(TARGET_LABELS_SPACE[index], score)
|
|
128
|
+
end
|
|
129
|
+
end
|
|
130
|
+
|
|
121
131
|
def initialize(label, score)
|
|
122
132
|
@inferred_label = label
|
|
123
133
|
@score = score
|
data/magika.gemspec
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
Gem::Specification.new do |s|
|
|
2
2
|
s.name = "magika"
|
|
3
|
-
s.version = "0.0.
|
|
3
|
+
s.version = "0.0.2"
|
|
4
4
|
s.authors = ["Kitaiti Makoto"]
|
|
5
5
|
s.email = ["KitaitiMakoto@gmail.com"]
|
|
6
6
|
s.summary = "Determines file content types using AI"
|
|
7
|
-
s.description = "Determines file content types using original
|
|
7
|
+
s.description = "Determines file content types using original Magika's ONNX model"
|
|
8
8
|
s.license = "Apache-2.0"
|
|
9
9
|
s.homepage = "https://gitlab.com/KitaitiMakoto/magika-rb"
|
|
10
10
|
|
|
@@ -25,4 +25,5 @@ Gem::Specification.new do |s|
|
|
|
25
25
|
s.add_development_dependency "test-unit"
|
|
26
26
|
s.add_development_dependency "test-unit-notify"
|
|
27
27
|
s.add_development_dependency "terminal-notifier" if RUBY_PLATFORM.match?(/darwin/)
|
|
28
|
+
s.add_development_dependency "simplecov"
|
|
28
29
|
end
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
hello
|
data/test/helper.rb
CHANGED
data/test/test_magika.rb
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
require_relative "helper"
|
|
2
|
+
require "stringio"
|
|
2
3
|
|
|
3
4
|
class TestMagika < TestBase
|
|
4
5
|
def test_initialize
|
|
@@ -6,16 +7,28 @@ class TestMagika < TestBase
|
|
|
6
7
|
assert_instance_of Magika, magika
|
|
7
8
|
end
|
|
8
9
|
|
|
9
|
-
|
|
10
10
|
data("empty file", [fixture_file_path("empty"), :empty])
|
|
11
11
|
data("small text file", [fixture_file_path("hello.txt"), :txt])
|
|
12
|
+
data("padded long whitespace text file", [fixture_file_path("padded-long-whitespace.txt"), :txt])
|
|
12
13
|
data("small binary fall", [fixture_file_path("small.bin"), :unknown])
|
|
13
14
|
data("ONNX model file", [Magika::MODEL_PATH, :onnx])
|
|
14
15
|
data("Ruby file", [__FILE__, :ruby])
|
|
15
16
|
data("directory", [__dir__, :directory])
|
|
16
17
|
data("symlink", [fixture_file_path("symlink"), :symlink])
|
|
17
|
-
def
|
|
18
|
+
def test_identify_path(data)
|
|
19
|
+
path, label = data
|
|
20
|
+
assert_equal label, magika.identify_path(path).label
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
data("empty file", [fixture_file_path("empty"), :empty])
|
|
24
|
+
data("small text file", [fixture_file_path("hello.txt"), :txt])
|
|
25
|
+
data("small binary fall", [fixture_file_path("small.bin"), :unknown])
|
|
26
|
+
data("ONNX model file", [Magika::MODEL_PATH, :onnx])
|
|
27
|
+
def test_identify_io(data)
|
|
18
28
|
path, label = data
|
|
19
|
-
|
|
29
|
+
File.open(path, "rb", encoding: Encoding::BINARY) do |io|
|
|
30
|
+
assert_equal label, magika.identify_io(io).label
|
|
31
|
+
end
|
|
32
|
+
assert_equal label, magika.identify_io(StringIO.new(File.binread(path))).label
|
|
20
33
|
end
|
|
21
34
|
end
|
data/test/test_package.rb
CHANGED
|
@@ -28,7 +28,7 @@ class TestPackage < Test::Unit::TestCase
|
|
|
28
28
|
|
|
29
29
|
lib_dir = File.join(gem_dir, "lib")
|
|
30
30
|
|
|
31
|
-
assert_equal "onnx", `ruby -I #{lib_dir.shellescape} -r magika -e 'print Magika.new.
|
|
31
|
+
assert_equal "onnx", `ruby -I #{lib_dir.shellescape} -r magika -e 'print Magika.new.identify_path("#{model_path.shellescape}").label'`
|
|
32
32
|
end
|
|
33
33
|
end
|
|
34
34
|
end
|
data/test/test_smoke.rb
CHANGED
|
@@ -5,16 +5,16 @@ class TestSmoke < TestBase
|
|
|
5
5
|
data do
|
|
6
6
|
JSON.load(`magika --recursive --json vendor/bundle`).collect {|result|
|
|
7
7
|
path = result["path"]
|
|
8
|
-
label = result["result"]["value"]["output"]["label"]
|
|
8
|
+
label = result["result"]["value"]["output"]["label"].to_sym
|
|
9
9
|
score = result["result"]["value"]["score"]
|
|
10
10
|
[path, [path, label, score]]
|
|
11
11
|
}.to_h
|
|
12
12
|
end
|
|
13
13
|
def test_smoke(data)
|
|
14
14
|
path, label, score = data
|
|
15
|
-
result = magika.
|
|
16
|
-
assert_equal label
|
|
17
|
-
|
|
15
|
+
result = magika.identify_path(path)
|
|
16
|
+
assert_equal label, result.label
|
|
17
|
+
assert_in_delta score, result.score
|
|
18
18
|
end
|
|
19
19
|
end
|
|
20
20
|
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: magika
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.0.
|
|
4
|
+
version: 0.0.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Kitaiti Makoto
|
|
@@ -107,7 +107,21 @@ dependencies:
|
|
|
107
107
|
- - ">="
|
|
108
108
|
- !ruby/object:Gem::Version
|
|
109
109
|
version: '0'
|
|
110
|
-
|
|
110
|
+
- !ruby/object:Gem::Dependency
|
|
111
|
+
name: simplecov
|
|
112
|
+
requirement: !ruby/object:Gem::Requirement
|
|
113
|
+
requirements:
|
|
114
|
+
- - ">="
|
|
115
|
+
- !ruby/object:Gem::Version
|
|
116
|
+
version: '0'
|
|
117
|
+
type: :development
|
|
118
|
+
prerelease: false
|
|
119
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
120
|
+
requirements:
|
|
121
|
+
- - ">="
|
|
122
|
+
- !ruby/object:Gem::Version
|
|
123
|
+
version: '0'
|
|
124
|
+
description: Determines file content types using original Magika's ONNX model
|
|
111
125
|
email:
|
|
112
126
|
- KitaitiMakoto@gmail.com
|
|
113
127
|
executables: []
|
|
@@ -126,6 +140,7 @@ files:
|
|
|
126
140
|
- magika.gemspec
|
|
127
141
|
- test/fixtures/empty
|
|
128
142
|
- test/fixtures/hello.txt
|
|
143
|
+
- test/fixtures/padded-long-whitespace.txt
|
|
129
144
|
- test/fixtures/small.bin
|
|
130
145
|
- test/fixtures/symlink
|
|
131
146
|
- test/helper.rb
|