tlsh 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +9 -0
- data/.rubocop.yml +2 -0
- data/.travis.yml +5 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +55 -0
- data/Rakefile +10 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/lib/tlsh.rb +8 -0
- data/lib/tlsh/buckets.rb +97 -0
- data/lib/tlsh/digest_hash/pearson.rb +30 -0
- data/lib/tlsh/distance/distance.rb +61 -0
- data/lib/tlsh/distance/precomputed_bits.rb +4626 -0
- data/lib/tlsh/quartiles.rb +137 -0
- data/lib/tlsh/tlsh.rb +74 -0
- data/lib/tlsh/tlsh_instance.rb +39 -0
- data/lib/tlsh/version.rb +3 -0
- data/tlsh.gemspec +41 -0
- metadata +143 -0
@@ -0,0 +1,137 @@
|
|
1
|
+
# Quartiles provides utility methods for computing the quartile statistics
# of the TLSH bucket counts.
module Quartiles
  class << self
    # Number of leading buckets that take part in the quartile computation.
    EFF_BUCKETS = 128

    # Returns the three quartile values of the first EFF_BUCKETS bucket counts.
    #
    # The values are the order statistics at zero-based positions
    # EFF_BUCKETS / 4 - 1, EFF_BUCKETS / 2 - 1 and
    # EFF_BUCKETS - EFF_BUCKETS / 4 - 1 of the counts in ascending order.
    # Selecting the k-th smallest element with a partition-based search gives
    # exactly the element at index k of the sorted counts, so sorting the
    # 128 considered counts is used here instead of a hand-written selection.
    #
    # The input array is not modified.
    #
    # @param buckets [Array<Integer>] bucket counts; only the first
    #   EFF_BUCKETS entries are considered
    # @return [Array<Integer>] [q1, q2, q3] with q1 <= q2 <= q3
    # @raise [ArgumentError] if fewer than EFF_BUCKETS buckets are given
    def quartile_points(buckets)
      if buckets.size < EFF_BUCKETS
        raise ArgumentError, "expected at least #{EFF_BUCKETS} buckets, got #{buckets.size}"
      end

      # first(n) copies, and sort returns a new array, so the caller's
      # buckets stay untouched.
      sorted = buckets.first(EFF_BUCKETS).sort

      [
        sorted[EFF_BUCKETS / 4 - 1],
        sorted[EFF_BUCKETS / 2 - 1],
        sorted[EFF_BUCKETS - EFF_BUCKETS / 4 - 1]
      ]
    end
  end
end
|
data/lib/tlsh/tlsh.rb
ADDED
@@ -0,0 +1,74 @@
|
|
1
|
+
require 'tlsh/version'
|
2
|
+
require 'tlsh/distance/distance'
|
3
|
+
require 'tlsh/digest_hash/pearson'
|
4
|
+
|
5
|
+
# Tlsh module implements the interface for TLSH (Trend Micro Locality Sensitive Hash) computation.
# TLSH is usable for diff and similarity computations of binary data, because of its locality sensitivity.
module Tlsh
  # Divisors used by the three length ranges of l_value. The names suggest
  # ln(1.5), ln(1.3) and ln(1.1), which the numeric values match.
  LOG1_5 = 0.4054651
  LOG1_3 = 0.26236426
  LOG1_1 = 0.095310180

  class << self
    # Hashes both files and returns the diff between the two hashes.
    #
    # @param filename [String] path of the first file
    # @param other_filename [String] path of the second file
    # @return the value returned by TlshInstance#diff for the two hashes
    def diff_files(filename, other_filename)
      # Read both files up front so a missing file fails before any hashing.
      bytes_a = read_bytes(filename)
      bytes_b = read_bytes(other_filename)

      tlsh_hash(bytes_a).diff(tlsh_hash(bytes_b))
    end

    # hash_file calculates the TLSH for the input file.
    #
    # @param filename [String] path of the file to hash
    # @return [TlshInstance] the computed hash
    def hash_file(filename)
      tlsh_hash(read_bytes(filename))
    end

    # hash_bytes calculates the TLSH for an already loaded blob.
    #
    # @param blob the input handed to Buckets.fill_buckets unchanged;
    #   hash_file passes an Array of byte values, so presumably the same
    #   shape is expected here (verify against Buckets.fill_buckets)
    # @return [TlshInstance] the computed hash
    def hash_bytes(blob)
      tlsh_hash(blob)
    end

    private

    # Reads the file in binary mode so that no newline or encoding
    # conversion is applied to the data, and returns its byte values.
    def read_bytes(filename)
      File.binread(filename).bytes
    end

    # Builds the TlshInstance for the given input.
    def tlsh_hash(input)
      buckets, checksum, filesize = Buckets.fill_buckets(input)

      # get the quartiles and their ratio
      # NOTE(review): an Integer q3 of 0 raises ZeroDivisionError here.
      q1, q2, q3 = Quartiles.quartile_points(buckets)
      q1_ratio = (q1 * 100 / q3) % 16
      q2_ratio = (q2 * 100 / q3) % 16
      # q1_ratio goes into the high nibble, q2_ratio into the low nibble
      q_ratio = ((q1_ratio & 0xF) << 4) | (q2_ratio & 0xF)

      # get the binary buckets representation
      bin_hash = Buckets.buckets_binary(buckets, q1, q2, q3)

      TlshInstance.new(
        checksum: checksum,
        l_value: l_value(filesize),
        q1_ratio: q1_ratio,
        q2_ratio: q2_ratio,
        q_ratio: q_ratio,
        body: bin_hash
      )
    end

    # Maps the input length onto a single byte using a logarithmic scale
    # whose base depends on the length range.
    def l_value(length)
      value = if length <= 656
                l_value_small(length)
              elsif length <= 3199
                l_value_medium(length)
              else
                l_value_large(length)
              end
      value & 255
    end

    # Used for lengths up to 656.
    def l_value_small(length)
      (Math.log(length) / LOG1_5).floor
    end

    # Used for lengths from 657 to 3199.
    def l_value_medium(length)
      (Math.log(length) / LOG1_3 - 8.72777).floor
    end

    # Used for lengths above 3199.
    def l_value_large(length)
      (Math.log(length) / LOG1_1 - 62.5472).floor
    end
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
module Tlsh
  # TlshInstance represents a single TLSH instance: the header values
  # (checksum, length value, quartile ratios) together with the body bytes.
  class TlshInstance
    attr_accessor :checksum, :l_value, :q1_ratio, :q2_ratio, :q_ratio, :body

    # Assigns every entry of params whose key matches a public writer of
    # this class; entries with unknown keys are ignored.
    #
    # @param params [Hash] attribute name => value
    def initialize(params = {})
      params.each do |key, value|
        setter = "#{key}="
        # include_all = false: only public writers may be set this way
        send(setter, value) if respond_to?(setter.to_sym, false)
      end
    end

    # returns diff against another TlshInstance. The closer to 0, the smaller the diff.
    def diff(other)
      Distance.diff_total(self, other, true)
    end

    # returns the binary representation of the hash: the nibble-swapped
    # checksum, the nibble-swapped l_value and q_ratio, followed by the body.
    #
    # @return [Array] header values followed by the elements of body
    def binary
      [swap_byte(checksum), swap_byte(l_value), q_ratio] + body
    end

    # returns the string representation of the hash.
    #
    # Every element of #binary is rendered as lowercase hex, left-padded
    # with a zero to two digits, so values below 0x10 keep their position
    # and the string length is always twice the number of bytes.
    #
    # @return [String]
    def string
      binary.map { |byte| byte.to_i.to_s(16).rjust(2, '0') }.join
    end

    # Truthy only when every attribute is set (neither nil nor false).
    def comparable?
      checksum && l_value && q1_ratio && q2_ratio && q_ratio && body
    end

    private

    # Swaps the high and the low nibble of the lowest byte of input.
    def swap_byte(input)
      ((input & 0xF0) >> 4) | ((input & 0x0F) << 4)
    end
  end
end
|
data/lib/tlsh/version.rb
ADDED
data/tlsh.gemspec
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
# coding: utf-8

# Put lib/ on the load path so the version constant can be required below.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'tlsh/version'

# Gem specification of the tlsh gem.
Gem::Specification.new do |gem|
  gem.name = 'tlsh'
  gem.version = Tlsh::VERSION
  gem.authors = ['adamliesko']
  gem.email = ['adamliesko@gmail.com']

  gem.summary = 'A fuzzy matching library which creates hashes that can be used for similarity comparisons.'
  gem.description = <<DESC
tlsh is a fuzzy matching library, which hashes can be used for similarity comparison.
Given a byte stream with a minimum length of 256 bytes, TLSH generates a hash value
which can be used for similarity comparisons. Similar objects will have similar hash
values which allow for the detection of similar objects by comparing their hash values.

The computed hash is 35 bytes long (output as 70 hexadecimal characters).
The first 3 bytes are used to capture the information about the file as a whole (length, ...),
while the last 32 bytes are used to capture information about incremental parts of the file.
DESC

  gem.homepage = 'https://github.com/adamliesko/tlsh'
  gem.license = 'MIT'

  # Package every file tracked by git except the test directories.
  tracked_files = `git ls-files -z`.split("\x0")
  gem.files = tracked_files.reject { |path| path =~ %r{^(test|spec|features)/} }

  gem.bindir = 'exe'
  gem.executables = gem.files.grep(%r{^exe/}).map { |path| File.basename(path) }
  gem.require_paths = %w[lib]

  # Development-only dependencies, declared in the original order.
  {
    'bundler' => '~> 1.15',
    'rake' => '~> 10.0',
    'minitest' => '~> 5.0',
    'coveralls' => '~> 0',
    'simplecov' => '~> 0'
  }.each do |dependency, requirement|
    gem.add_development_dependency dependency, requirement
  end
end
|
metadata
ADDED
@@ -0,0 +1,143 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: tlsh
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- adamliesko
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2017-08-18 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.15'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.15'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '10.0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '10.0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: minitest
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '5.0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '5.0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: coveralls
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: simplecov
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
description: |
|
84
|
+
tlsh is a fuzzy matching library, which hashes can be used for similarity comparison.
|
85
|
+
Given a byte stream with a minimum length of 256 bytes, TLSH generates a hash value
|
86
|
+
which can be used for similarity comparisons. Similar objects will have similar hash
|
87
|
+
values which allow for the detection of similar objects by comparing their hash values.
|
88
|
+
|
89
|
+
The computed hash is 35 bytes long (output as 70 hexadecimal characters).
|
90
|
+
The first 3 bytes are used to capture the information about the file as a whole (length, ...),
|
91
|
+
while the last 32 bytes are used to capture information about incremental parts of the file.
|
92
|
+
email:
|
93
|
+
- adamliesko@gmail.com
|
94
|
+
executables: []
|
95
|
+
extensions: []
|
96
|
+
extra_rdoc_files: []
|
97
|
+
files:
|
98
|
+
- ".gitignore"
|
99
|
+
- ".rubocop.yml"
|
100
|
+
- ".travis.yml"
|
101
|
+
- CODE_OF_CONDUCT.md
|
102
|
+
- Gemfile
|
103
|
+
- LICENSE.txt
|
104
|
+
- README.md
|
105
|
+
- Rakefile
|
106
|
+
- bin/console
|
107
|
+
- bin/setup
|
108
|
+
- lib/tlsh.rb
|
109
|
+
- lib/tlsh/buckets.rb
|
110
|
+
- lib/tlsh/digest_hash/pearson.rb
|
111
|
+
- lib/tlsh/distance/distance.rb
|
112
|
+
- lib/tlsh/distance/precomputed_bits.rb
|
113
|
+
- lib/tlsh/quartiles.rb
|
114
|
+
- lib/tlsh/tlsh.rb
|
115
|
+
- lib/tlsh/tlsh_instance.rb
|
116
|
+
- lib/tlsh/version.rb
|
117
|
+
- tlsh.gemspec
|
118
|
+
homepage: https://github.com/adamliesko/tlsh
|
119
|
+
licenses:
|
120
|
+
- MIT
|
121
|
+
metadata: {}
|
122
|
+
post_install_message:
|
123
|
+
rdoc_options: []
|
124
|
+
require_paths:
|
125
|
+
- lib
|
126
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
127
|
+
requirements:
|
128
|
+
- - ">="
|
129
|
+
- !ruby/object:Gem::Version
|
130
|
+
version: '0'
|
131
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
132
|
+
requirements:
|
133
|
+
- - ">="
|
134
|
+
- !ruby/object:Gem::Version
|
135
|
+
version: '0'
|
136
|
+
requirements: []
|
137
|
+
rubyforge_project:
|
138
|
+
rubygems_version: 2.6.12
|
139
|
+
signing_key:
|
140
|
+
specification_version: 4
|
141
|
+
summary: A fuzzy matching library which creates hashes that can be used for similarity
|
142
|
+
comparisons.
|
143
|
+
test_files: []
|