magika 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d0fd99e8ceb2f0563b0afb70bebb2caf0b5aedcd776a05a512f47e39d86eac5b
4
- data.tar.gz: b1699a79078fde7fa3c007924d2603b1bf44849fd540fd166010240219710e22
3
+ metadata.gz: 1242b81f1cb54d8d60b1849246ebdecfc288fad8e4521c98e2d51ceb00d6d699
4
+ data.tar.gz: 0aacd82439630b208f1d87c912276a3aff364b8acb3514678d5a06f55fef1cb0
5
5
  SHA512:
6
- metadata.gz: 27b7cd2007f40ca98dae78691fdb1bd4360f7f194b86e7c128500b652ea093a556c26d1db50ad7fa8297a1ea6e511fefdb6dd56ae5dafe35cd2e0f685f3abbd0
7
- data.tar.gz: 8d069b640a54ac91690e67a2efb4b619987c3a9804a9098d3b179cf85ffee4aa7453e0bb939337c6fd4126b77e2cb20ad553435ee4b32474796ed0740609061d
6
+ metadata.gz: bfabdaeb971e6564385dbde08c0b5fb7269b2ad97f0e02eebf7010ea95349997e5fadb8e9c7d828c2f69ee724b5e7e165576e67e5e798ccda696be0452b321ac
7
+ data.tar.gz: 716c087f89004d48096592fc27c6120ee14ed0a9fcb1b78fe3561e4acaf1e7f861544de5a3c9fcb9f7e46bc779e4643ee4e55dc9b76f30c2aab95837331d446f
data/CHANGELOG.md CHANGED
@@ -1,4 +1,12 @@
1
- 0.1.0
1
+ 0.0.2
2
+ =====
3
+
4
+ * Pass Numo::NArray to OnnxRuntime
5
+ * Fix features type
6
+ * Rename method: Magika#identify -> Magika#identify_path
7
+ * Add Magika#identify_io
8
+
9
+ 0.0.1
2
10
  =====
3
11
 
4
12
  * Initial release
data/README.md CHANGED
@@ -12,7 +12,10 @@ SYNOPSIS
12
12
  require "magika"
13
13
 
14
14
  magika = Magika.new
15
- magika.identify("path/to/file") # => #<Magika::FileType label=onnx score=0.9998390674591064 application/octet-stream "Open Neural Network Exchange" (archive)>,
15
+ magika.identify_path("path/to/file") # => #<Magika::FileType label=onnx score=0.9998390674591064 application/octet-stream "Open Neural Network Exchange" (archive)>
16
+
17
+ require "stringio"
18
+ magika.identify_io(StringIO.new("some text content")) # => #<Magika::FileType label=txt score=0.8081967830657959 text/plain "Generic text document" (text)>
16
19
  ```
17
20
 
18
21
  INSTALLATION
data/lib/magika.rb CHANGED
@@ -11,8 +11,7 @@ class Magika
11
11
 
12
12
  MODEL_CONFIG = JSON.load_file(MODEL_CONFIG_PATH, symbolize_names: true)
13
13
 
14
- TARGET_LABELS_SPACE = MODEL_CONFIG[:target_labels_space]
15
- TARGET_LABELS_SPACE.collect!(&:to_sym)
14
+ TARGET_LABELS_SPACE = Numo::RObject[MODEL_CONFIG[:target_labels_space].collect!(&:to_sym)]
16
15
  MIN_FILE_SIZE_FOR_DL = MODEL_CONFIG[:min_file_size_for_dl]
17
16
  VERSION_MAJOR = MODEL_CONFIG[:version_major]
18
17
 
@@ -20,13 +19,23 @@ class Magika
20
19
  @session = OnnxRuntime::InferenceSession.new(MODEL_PATH)
21
20
  end
22
21
 
23
- def identify(path)
24
- features, label = extract_features_or_label(path)
22
+ def identify_path(path)
23
+ stat = File.lstat(path)
24
+ if stat.directory?
25
+ return FileType.new(:directory, 1.0)
26
+ end
27
+ if stat.symlink?
28
+ return FileType.new(:symlink, 1.0)
29
+ end
30
+ File.open(path, "rb", encoding: Encoding::BINARY) {|file|
31
+ identify_io(file)
32
+ }
33
+ end
34
+
35
+ def identify_io(io)
36
+ features, label = extract_features_or_label(io)
25
37
  if features
26
- target_label = @session.run([:target_label], {bytes: features.reshape(1, FEATURES_SIZE)}, output_type: :numo)[0]
27
- index = target_label.argmax
28
- score = target_label[index]
29
- FileType.new(TARGET_LABELS_SPACE[index], score)
38
+ FileType.from_inference(infer_from_features(features))
30
39
  else
31
40
  FileType.new(label, 1.0)
32
41
  end
@@ -41,75 +50,68 @@ class Magika
41
50
  BLOCK_SIZE = MODEL_CONFIG[:block_size]
42
51
  PADDING_TOKEN = MODEL_CONFIG[:padding_token]
43
52
 
44
- def extract_features_or_label(path)
45
- stat = File.lstat(path)
46
- if stat.directory?
47
- return nil, :directory
48
- end
49
- if stat.symlink?
50
- return nil, :symlink
51
- end
52
- file_size = stat.size
53
- if file_size == 0
53
+ def extract_features_or_label(io)
54
+ io_size = io.size
55
+ if io_size == 0
54
56
  return nil, :empty
55
57
  end
56
- if file_size < MIN_FILE_SIZE_FOR_DL
57
- return small_content_result(path)
58
+ if io_size < MIN_FILE_SIZE_FOR_DL
59
+ return nil, detect_small_content_label(io.read)
58
60
  end
59
- features = extract_features(path)
61
+ features = extract_features_from_io(io)
60
62
  if features[MIN_FILE_SIZE_FOR_DL - 1] == PADDING_TOKEN
61
- return small_content_result(path)
63
+ return nil, detect_small_content_label(features[0...io_size].to_binary)
62
64
  else
63
65
  return features, nil
64
66
  end
65
67
  end
66
68
 
67
- def small_content_result(path)
68
- content = File.read(path, encoding: Encoding::BINARY)
69
- begin
70
- content.encode! Encoding::UTF_8
71
- return nil, :txt
72
- rescue EncodingError
73
- return nil, :unknown
69
+ def detect_small_content_label(content)
70
+ if content.force_encoding(Encoding::UTF_8).valid_encoding?
71
+ return :txt
72
+ else
73
+ return :unknown
74
74
  end
75
75
  end
76
76
 
77
- def extract_features(path)
77
+ def extract_features_from_io(io)
78
78
  features = Numo::Int32.new(FEATURES_SIZE).fill(PADDING_TOKEN)
79
- File.open(path, "rb", encoding: Encoding::BINARY) do |file|
80
- file_size = file.size
81
- return features if file_size == 0
82
-
83
- block_size = [BLOCK_SIZE, file_size].min
84
-
85
- beg_block = file.read(block_size)
86
- # String#lstrip strips null characters, which reduces accuracy
87
- beg_str = beg_block.sub(/\A\s+/, "").freeze
88
- unless beg_str.empty?
89
- beg_content = Numo::Int8.from_binary(beg_str)
90
- beg_size = [BEG_SIZE, beg_content.size].min
91
- features[0...beg_size] = beg_content[0...beg_size]
92
- end
79
+ io_size = io.size
80
+ return features if io_size == 0
81
+
82
+ block_size = [BLOCK_SIZE, io_size].min
83
+
84
+ beg_block = io.read(block_size)
85
+ # String#lstrip strips null characters, which reduces accuracy
86
+ beg_str = beg_block.sub(/\A\s+/, "").freeze
87
+ unless beg_str.empty?
88
+ beg_content = Numo::UInt8.from_binary(beg_str)
89
+ beg_size = [BEG_SIZE, beg_content.size].min
90
+ features[0...beg_size] = beg_content[0...beg_size]
91
+ end
93
92
 
94
- # Skip middle feature
95
-
96
- end_block = if block_size == file_size
97
- beg_block
98
- else
99
- file.seek(-block_size, IO::SEEK_END)
100
- file.read(block_size)
101
- end
102
- # String#rstrip strips null characters, which reduces accuracy
103
- end_str = end_block.reverse.sub(/\A\s+/, "").reverse.freeze
104
- unless end_str.empty?
105
- end_content = Numo::Int8.from_binary(end_str)
106
- end_size = [END_SIZE, end_content.size].min
107
- features[-end_size..] = end_content[-end_size..]
108
- end
93
+ # Skip middle feature
94
+
95
+ end_block = if block_size == io_size
96
+ beg_block
97
+ else
98
+ io.seek(-block_size, IO::SEEK_END)
99
+ io.read(block_size)
100
+ end
101
+ # String#rstrip strips null characters, which reduces accuracy
102
+ end_str = end_block.reverse.sub(/\A\s+/, "").reverse.freeze
103
+ unless end_str.empty?
104
+ end_content = Numo::UInt8.from_binary(end_str)
105
+ end_size = [END_SIZE, end_content.size].min
106
+ features[-end_size..] = end_content[-end_size..]
109
107
  end
110
108
  features
111
109
  end
112
110
 
111
+ def infer_from_features(features)
112
+ @session.run([:target_label], {bytes: OnnxRuntime::OrtValue.from_numo(features.reshape(1, FEATURES_SIZE))}, output_type: :numo)[0]
113
+ end
114
+
113
115
  class FileType
114
116
  CONTENT_TYPES_PATH = File.join(ASSETS_DIR, "content_types_kb.min.json")
115
117
  CONTENT_TYPES = JSON.load_file(CONTENT_TYPES_PATH, symbolize_names: true)
@@ -118,6 +120,14 @@ class Magika
118
120
 
119
121
  attr_reader :score
120
122
 
123
+ class << self
124
+ def from_inference(inference)
125
+ index = inference.argmax
126
+ score = inference[index]
127
+ FileType.new(TARGET_LABELS_SPACE[index], score)
128
+ end
129
+ end
130
+
121
131
  def initialize(label, score)
122
132
  @inferred_label = label
123
133
  @score = score
data/magika.gemspec CHANGED
@@ -1,10 +1,10 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = "magika"
3
- s.version = "0.0.1"
3
+ s.version = "0.0.2"
4
4
  s.authors = ["Kitaiti Makoto"]
5
5
  s.email = ["KitaitiMakoto@gmail.com"]
6
6
  s.summary = "Determines file content types using AI"
7
- s.description = "Determines file content types using original Msgika's ONNX model"
7
+ s.description = "Determines file content types using original Magika's ONNX model"
8
8
  s.license = "Apache-2.0"
9
9
  s.homepage = "https://gitlab.com/KitaitiMakoto/magika-rb"
10
10
 
@@ -25,4 +25,5 @@ Gem::Specification.new do |s|
25
25
  s.add_development_dependency "test-unit"
26
26
  s.add_development_dependency "test-unit-notify"
27
27
  s.add_development_dependency "terminal-notifier" if RUBY_PLATFORM.match?(/darwin/)
28
+ s.add_development_dependency "simplecov"
28
29
  end
@@ -0,0 +1 @@
1
+ hello
data/test/helper.rb CHANGED
@@ -1,3 +1,8 @@
1
+ require "simplecov"
2
+ SimpleCov.start do
3
+ add_filter /test|vendor/
4
+ end
5
+
1
6
  require "test/unit"
2
7
  require "test/unit/notify"
3
8
  require "terminal-notifier" if RUBY_PLATFORM.match?(/darwin/)
data/test/test_magika.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  require_relative "helper"
2
+ require "stringio"
2
3
 
3
4
  class TestMagika < TestBase
4
5
  def test_initialize
@@ -6,16 +7,28 @@ class TestMagika < TestBase
6
7
  assert_instance_of Magika, magika
7
8
  end
8
9
 
9
-
10
10
  data("empty file", [fixture_file_path("empty"), :empty])
11
11
  data("small text file", [fixture_file_path("hello.txt"), :txt])
12
+ data("padded long whitespace text file", [fixture_file_path("padded-long-whitespace.txt"), :txt])
12
13
  data("small binary fall", [fixture_file_path("small.bin"), :unknown])
13
14
  data("ONNX model file", [Magika::MODEL_PATH, :onnx])
14
15
  data("Ruby file", [__FILE__, :ruby])
15
16
  data("directory", [__dir__, :directory])
16
17
  data("symlink", [fixture_file_path("symlink"), :symlink])
17
- def test_identify(data)
18
+ def test_identify_path(data)
19
+ path, label = data
20
+ assert_equal label, magika.identify_path(path).label
21
+ end
22
+
23
+ data("empty file", [fixture_file_path("empty"), :empty])
24
+ data("small text file", [fixture_file_path("hello.txt"), :txt])
25
+ data("small binary fall", [fixture_file_path("small.bin"), :unknown])
26
+ data("ONNX model file", [Magika::MODEL_PATH, :onnx])
27
+ def test_identify_io(data)
18
28
  path, label = data
19
- assert_equal label, magika.identify(path).label
29
+ File.open(path, "rb", encoding: Encoding::BINARY) do |io|
30
+ assert_equal label, magika.identify_io(io).label
31
+ end
32
+ assert_equal label, magika.identify_io(StringIO.new(File.binread(path))).label
20
33
  end
21
34
  end
data/test/test_package.rb CHANGED
@@ -28,7 +28,7 @@ class TestPackage < Test::Unit::TestCase
28
28
 
29
29
  lib_dir = File.join(gem_dir, "lib")
30
30
 
31
- assert_equal "onnx", `ruby -I #{lib_dir.shellescape} -r magika -e 'print Magika.new.identify("#{model_path.shellescape}").label'`
31
+ assert_equal "onnx", `ruby -I #{lib_dir.shellescape} -r magika -e 'print Magika.new.identify_path("#{model_path.shellescape}").label'`
32
32
  end
33
33
  end
34
34
  end
data/test/test_smoke.rb CHANGED
@@ -5,16 +5,16 @@ class TestSmoke < TestBase
5
5
  data do
6
6
  JSON.load(`magika --recursive --json vendor/bundle`).collect {|result|
7
7
  path = result["path"]
8
- label = result["result"]["value"]["output"]["label"]
8
+ label = result["result"]["value"]["output"]["label"].to_sym
9
9
  score = result["result"]["value"]["score"]
10
10
  [path, [path, label, score]]
11
11
  }.to_h
12
12
  end
13
13
  def test_smoke(data)
14
14
  path, label, score = data
15
- result = magika.identify(path)
16
- assert_equal label.to_sym, result.label
17
- # assert_in_delta score, result.score
15
+ result = magika.identify_path(path)
16
+ assert_equal label, result.label
17
+ assert_in_delta score, result.score
18
18
  end
19
19
  end
20
20
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: magika
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kitaiti Makoto
@@ -107,7 +107,21 @@ dependencies:
107
107
  - - ">="
108
108
  - !ruby/object:Gem::Version
109
109
  version: '0'
110
- description: Determines file content types using original Msgika's ONNX model
110
+ - !ruby/object:Gem::Dependency
111
+ name: simplecov
112
+ requirement: !ruby/object:Gem::Requirement
113
+ requirements:
114
+ - - ">="
115
+ - !ruby/object:Gem::Version
116
+ version: '0'
117
+ type: :development
118
+ prerelease: false
119
+ version_requirements: !ruby/object:Gem::Requirement
120
+ requirements:
121
+ - - ">="
122
+ - !ruby/object:Gem::Version
123
+ version: '0'
124
+ description: Determines file content types using original Magika's ONNX model
111
125
  email:
112
126
  - KitaitiMakoto@gmail.com
113
127
  executables: []
@@ -126,6 +140,7 @@ files:
126
140
  - magika.gemspec
127
141
  - test/fixtures/empty
128
142
  - test/fixtures/hello.txt
143
+ - test/fixtures/padded-long-whitespace.txt
129
144
  - test/fixtures/small.bin
130
145
  - test/fixtures/symlink
131
146
  - test/helper.rb