tlsh 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +9 -0
- data/.rubocop.yml +2 -0
- data/.travis.yml +5 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +55 -0
- data/Rakefile +10 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/lib/tlsh.rb +8 -0
- data/lib/tlsh/buckets.rb +97 -0
- data/lib/tlsh/digest_hash/pearson.rb +30 -0
- data/lib/tlsh/distance/distance.rb +61 -0
- data/lib/tlsh/distance/precomputed_bits.rb +4626 -0
- data/lib/tlsh/quartiles.rb +137 -0
- data/lib/tlsh/tlsh.rb +74 -0
- data/lib/tlsh/tlsh_instance.rb +39 -0
- data/lib/tlsh/version.rb +3 -0
- data/tlsh.gemspec +41 -0
- metadata +143 -0
@@ -0,0 +1,137 @@
|
|
1
|
+
# Quartiles provides utility methods for computing the quartile statistics of bucket counts.
module Quartiles
  class << self
    # Number of leading buckets that take part in the quartile computation.
    EFF_BUCKETS = 128

    # Computes the three quartile points of the first EFF_BUCKETS entries of +buckets+.
    #
    # The values are found with a quickselect over a private copy of the data, so the
    # caller's array is never modified. Every pivot position that gets settled while
    # searching for the median is remembered, so the searches for the lower and upper
    # quartiles can start from an already narrowed range.
    #
    # @param buckets [Array<Integer>] bucket counts; only the first EFF_BUCKETS entries are used
    # @return [Array<Integer>] +[q1, q2, q3]+: the values found at 0-based positions
    #   31, 63 and 95 of the ascending order of the first 128 buckets
    # @raise [ArgumentError] when fewer than EFF_BUCKETS buckets are given
    def quartile_points(buckets)
      if buckets.size < EFF_BUCKETS
        raise ArgumentError, "expected at least #{EFF_BUCKETS} buckets, got #{buckets.size}"
      end

      p1 = EFF_BUCKETS / 4 - 1
      p2 = EFF_BUCKETS / 2 - 1
      p3 = EFF_BUCKETS - EFF_BUCKETS / 4 - 1
      p_end = EFF_BUCKETS - 1

      # Exclusive range: exactly EFF_BUCKETS entries; slicing already returns a new array.
      work = buckets[0...EFF_BUCKETS]

      cut_left = []  # settled pivot positions left of the median, ascending
      cut_right = [] # settled pivot positions right of the median, descending

      left = 0
      right = p_end
      loop do
        pos = partition(work, left, right)
        if pos > p2
          right = pos - 1
          cut_right << pos
        elsif pos < p2
          left = pos + 1
          cut_left << pos
        else
          break
        end
      end
      q2 = work[p2]

      # The neighbours of the settled median bound the two remaining searches.
      cut_left << p2 - 1
      cut_right << p2 + 1

      q1 = lower_quartile(work, cut_left, p1)
      q3 = upper_quartile(work, cut_right, p3, p_end)

      [q1, q2, q3]
    end

    private

    # Partitions buffer[left..right] in place around the value of its middle element.
    #
    # Afterwards every element left of the returned position is smaller than the pivot
    # value and every element right of it is greater than or equal to it.
    #
    # @return [Integer] the final position of the pivot value
    def partition(buffer, left, right)
      return left if left == right

      if left + 1 == right
        buffer[left], buffer[right] = buffer[right], buffer[left] if buffer[left] > buffer[right]
        return left
      end

      pivot = (left + right) >> 1
      value = buffer[pivot]

      # Park the pivot value at the right edge while the rest of the range is scanned.
      buffer[pivot] = buffer[right]
      buffer[right] = value

      store = left
      # The right edge holds the pivot itself, so it is excluded from the scan.
      (left...right).each do |i|
        next unless buffer[i] < value

        buffer[i], buffer[store] = buffer[store], buffer[i]
        store += 1
      end

      # Move the pivot value into its final position, once, after the scan is complete.
      buffer[right] = buffer[store]
      buffer[store] = value
      store
    end

    # Repeatedly partitions buffer[left..right] until position +target+ is settled.
    #
    # @return [Integer] the value that belongs at +target+ in ascending order
    def select_at(buffer, left, right, target)
      loop do
        pos = partition(buffer, left, right)
        return buffer[target] if pos == target

        if pos > target
          right = pos - 1
        else
          left = pos + 1
        end
      end
    end

    # Finds the value at position +p1+ using the ascending cut positions left of the median.
    # The last cut is always the median's left neighbour, so one branch below always returns.
    def lower_quartile(buffer, cut_left, p1)
      left = 0
      cut_left.each do |right|
        return select_at(buffer, left, right, p1) if right > p1
        return buffer[p1] if right == p1

        left = right
      end
    end

    # Finds the value at position +p3+ using the descending cut positions right of the median.
    # The last cut is always the median's right neighbour, so one branch below always returns.
    def upper_quartile(buffer, cut_right, p3, p_end)
      right = p_end
      cut_right.each do |left|
        return select_at(buffer, left, right, p3) if left < p3
        return buffer[p3] if left == p3

        right = left
      end
    end
  end
end
|
data/lib/tlsh/tlsh.rb
ADDED
@@ -0,0 +1,74 @@
|
|
1
|
+
require 'tlsh/version'
|
2
|
+
require 'tlsh/distance/distance'
|
3
|
+
require 'tlsh/digest_hash/pearson'
|
4
|
+
|
5
|
+
# Tlsh module implements the interface for TLSH (Trend Micro Locality Sensitive Hash) computation.
# TLSH is usable for diff and similarity computations of binary data, because of its locality sensitivity.
module Tlsh
  # Fixed-precision approximations of Math.log(1.5), Math.log(1.3) and Math.log(1.1),
  # used as divisors when the input length is mapped to its l_value.
  LOG1_5 = 0.4054651
  LOG1_3 = 0.26236426
  LOG1_1 = 0.095310180

  class << self
    # Computes the TLSH of both files and returns the diff between the two hashes.
    #
    # @param filename [String] path of the first file
    # @param other_filename [String] path of the second file
    # @return the result of TlshInstance#diff for the two hashes
    def diff_files(filename, other_filename)
      # Both files are read before any hashing starts, so an unreadable file fails early.
      bytes_a = File.binread(filename).bytes
      bytes_b = File.binread(other_filename).bytes

      tlsh_a = tlsh_hash(bytes_a)
      tlsh_b = tlsh_hash(bytes_b)
      tlsh_a.diff(tlsh_b)
    end

    # hash_file calculates the TLSH for the input file.
    # The file is read in binary mode so that its bytes are hashed exactly as stored.
    #
    # @param filename [String] path of the file to hash
    # @return [TlshInstance]
    def hash_file(filename)
      tlsh_hash(File.binread(filename).bytes)
    end

    # hash_bytes calculates the TLSH for in-memory data.
    #
    # @param blob [Array<Integer>, String] the bytes to hash; anything responding to
    #   +bytes+ (such as a String) is converted to its byte array first
    # @return [TlshInstance]
    def hash_bytes(blob)
      tlsh_hash(blob.respond_to?(:bytes) ? blob.bytes : blob)
    end

    private

    # Builds the TlshInstance for an array of bytes.
    def tlsh_hash(input)
      buckets, checksum, filesize = Buckets.fill_buckets(input)

      # get the quartiles and their ratio
      # NOTE(review): if q3 is the Integer 0 the divisions below raise ZeroDivisionError;
      # presumably that happens for very short or very uniform input - confirm and decide
      # whether a dedicated error should be raised instead.
      q1, q2, q3 = Quartiles.quartile_points(buckets)
      q1_ratio = (q1 * 100 / q3) % 16
      q2_ratio = (q2 * 100 / q3) % 16
      # Both ratios are packed into one byte: q1_ratio in the high nibble, q2_ratio in the low.
      q_ratio = ((q1_ratio & 0xF) << 4) | (q2_ratio & 0xF)

      # get the binary buckets representation
      bin_hash = Buckets.buckets_binary(buckets, q1, q2, q3)

      TlshInstance.new(
        checksum: checksum,
        l_value: l_value(filesize),
        q1_ratio: q1_ratio,
        q2_ratio: q2_ratio,
        q_ratio: q_ratio,
        body: bin_hash
      )
    end

    # Maps the input length to a logarithmic length value, reduced to a single byte.
    # The logarithm base and offset depend on which of three length ranges applies.
    def l_value(length)
      l = if length <= 656
            l_value_small(length)
          elsif length <= 3199
            l_value_medium(length)
          else
            l_value_large(length)
          end
      l & 255
    end

    # Length value for inputs of up to 656 bytes.
    def l_value_small(length)
      (Math.log(length) / LOG1_5).floor
    end

    # Length value for inputs of 657 to 3199 bytes.
    def l_value_medium(length)
      (Math.log(length) / LOG1_3 - 8.72777).floor
    end

    # Length value for inputs longer than 3199 bytes.
    def l_value_large(length)
      (Math.log(length) / LOG1_1 - 62.5472).floor
    end
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
module Tlsh
  # TlshInstance represents a single computed TLSH hash: three header fields
  # (checksum, l_value, q_ratio) followed by the body bytes.
  class TlshInstance
    attr_accessor :checksum, :l_value, :q1_ratio, :q2_ratio, :q_ratio, :body

    # Assigns every entry of +params+ whose key matches one of the public attribute
    # writers; entries with any other key are ignored.
    #
    # @param params [Hash] attribute name => value
    def initialize(params = {})
      params.each do |key, value|
        setter = "#{key}="
        send(setter, value) if respond_to?(setter.to_sym, false)
      end
    end

    # returns diff against another TlshInstance. The closer to 0, the smaller the diff.
    def diff(other)
      Distance.diff_total(self, other, true)
    end

    # returns the binary representation of the hash: the nibble-swapped checksum,
    # the nibble-swapped l_value and the q_ratio, followed by the body.
    #
    # @return [Array<Integer>]
    def binary
      [swap_byte(checksum), swap_byte(l_value), q_ratio] + body
    end

    # returns the string representation of the hash.
    # Every byte is rendered as exactly two lowercase hex digits, so bytes below 0x10
    # keep their leading zero and the string has a fixed width per byte.
    #
    # @return [String]
    def string
      binary.map { |byte| format('%02x', byte.to_i) }.join
    end

    # @return [Boolean] true when every field of the hash has been set
    def comparable?
      [checksum, l_value, q1_ratio, q2_ratio, q_ratio, body].all?
    end

    private

    # Swaps the high and the low nibble of the lowest byte of +input+.
    def swap_byte(input)
      ((input & 0xF0) >> 4) | ((input & 0x0F) << 4)
    end
  end
end
|
data/lib/tlsh/version.rb
ADDED
data/tlsh.gemspec
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
lib = File.expand_path('../lib', __FILE__)
|
4
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
5
|
+
require 'tlsh/version'
|
6
|
+
|
7
|
+
# Gem specification of the tlsh gem: identity, packaged files and development dependencies.
Gem::Specification.new do |gem|
  gem.name    = 'tlsh'
  gem.version = Tlsh::VERSION
  gem.authors = ['adamliesko']
  gem.email   = ['adamliesko@gmail.com']

  gem.summary = 'A fuzzy matching library which creates hashes that can be used for similarity comparisons.'
  # Plain heredoc: content and terminator stay flush left so the text is stored verbatim.
  gem.description = <<DESC
tlsh is a fuzzy matching library, which hashes can be used for similarity comparison.
Given a byte stream with a minimum length of 256 bytes, TLSH generates a hash value
which can be used for similarity comparisons. Similar objects will have similar hash
values which allow for the detection of similar objects by comparing their hash values.

The computed hash is 35 bytes long (output as 70 hexadecimal characters).
The first 3 bytes are used to capture the information about the file as a whole (length, ...),
while the last 32 bytes are used to capture information about incremental parts of the file.
DESC

  gem.homepage = 'https://github.com/adamliesko/tlsh'
  gem.license  = 'MIT'

  # Package every file tracked by git, except those under test/, spec/ or features/.
  tracked_files = `git ls-files -z`.split("\x0")
  gem.files = tracked_files.reject { |path| path.match(%r{^(test|spec|features)/}) }

  gem.bindir        = 'exe'
  gem.executables   = gem.files.grep(%r{^exe/}).map { |path| File.basename(path) }
  gem.require_paths = ['lib']

  # Development-only dependencies, registered in declaration order.
  {
    'bundler' => '~> 1.15',
    'rake' => '~> 10.0',
    'minitest' => '~> 5.0',
    'coveralls' => '~> 0',
    'simplecov' => '~> 0'
  }.each do |dependency, requirement|
    gem.add_development_dependency dependency, requirement
  end
end
|
metadata
ADDED
@@ -0,0 +1,143 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: tlsh
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- adamliesko
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2017-08-18 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.15'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.15'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '10.0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '10.0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: minitest
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '5.0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '5.0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: coveralls
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: simplecov
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
description: |
|
84
|
+
tlsh is a fuzzy matching library, which hashes can be used for similarity comparison.
|
85
|
+
Given a byte stream with a minimum length of 256 bytes, TLSH generates a hash value
|
86
|
+
which can be used for similarity comparisons. Similar objects will have similar hash
|
87
|
+
values which allow for the detection of similar objects by comparing their hash values.
|
88
|
+
|
89
|
+
The computed hash is 35 bytes long (output as 70 hexadecimal characters).
|
90
|
+
The first 3 bytes are used to capture the information about the file as a whole (length, ...),
|
91
|
+
while the last 32 bytes are used to capture information about incremental parts of the file.
|
92
|
+
email:
|
93
|
+
- adamliesko@gmail.com
|
94
|
+
executables: []
|
95
|
+
extensions: []
|
96
|
+
extra_rdoc_files: []
|
97
|
+
files:
|
98
|
+
- ".gitignore"
|
99
|
+
- ".rubocop.yml"
|
100
|
+
- ".travis.yml"
|
101
|
+
- CODE_OF_CONDUCT.md
|
102
|
+
- Gemfile
|
103
|
+
- LICENSE.txt
|
104
|
+
- README.md
|
105
|
+
- Rakefile
|
106
|
+
- bin/console
|
107
|
+
- bin/setup
|
108
|
+
- lib/tlsh.rb
|
109
|
+
- lib/tlsh/buckets.rb
|
110
|
+
- lib/tlsh/digest_hash/pearson.rb
|
111
|
+
- lib/tlsh/distance/distance.rb
|
112
|
+
- lib/tlsh/distance/precomputed_bits.rb
|
113
|
+
- lib/tlsh/quartiles.rb
|
114
|
+
- lib/tlsh/tlsh.rb
|
115
|
+
- lib/tlsh/tlsh_instance.rb
|
116
|
+
- lib/tlsh/version.rb
|
117
|
+
- tlsh.gemspec
|
118
|
+
homepage: https://github.com/adamliesko/tlsh
|
119
|
+
licenses:
|
120
|
+
- MIT
|
121
|
+
metadata: {}
|
122
|
+
post_install_message:
|
123
|
+
rdoc_options: []
|
124
|
+
require_paths:
|
125
|
+
- lib
|
126
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
127
|
+
requirements:
|
128
|
+
- - ">="
|
129
|
+
- !ruby/object:Gem::Version
|
130
|
+
version: '0'
|
131
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
132
|
+
requirements:
|
133
|
+
- - ">="
|
134
|
+
- !ruby/object:Gem::Version
|
135
|
+
version: '0'
|
136
|
+
requirements: []
|
137
|
+
rubyforge_project:
|
138
|
+
rubygems_version: 2.6.12
|
139
|
+
signing_key:
|
140
|
+
specification_version: 4
|
141
|
+
summary: A fuzzy matching library which creates hashes that can be used for similarity
|
142
|
+
comparisons.
|
143
|
+
test_files: []
|