tlsh 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,137 @@
1
# Quartiles provides utility methods for computing the quartile points of the
# bucket counts.
module Quartiles
  class << self
    # Number of leading buckets that take part in the quartile computation.
    EFF_BUCKETS = 128

    # Computes the three quartile points of the first EFF_BUCKETS bucket counts.
    #
    # The values are read from a sorted copy, so the caller's array is never
    # modified. Position k of the sorted copy is exactly the k-th smallest
    # count, which is what a selection algorithm over the same data yields;
    # for 128 elements a full sort is cheap and leaves no room for
    # partitioning mistakes.
    #
    # @param buckets [Array<Integer>] bucket counts; only the first
    #   EFF_BUCKETS entries are read
    # @return [Array<Integer>] [q1, q2, q3] in ascending order: the values at
    #   the 25%, 50% and 75% positions of the sorted counts
    # @raise [ArgumentError] when fewer than EFF_BUCKETS buckets are given
    def quartile_points(buckets)
      if buckets.size < EFF_BUCKETS
        raise ArgumentError, "expected at least #{EFF_BUCKETS} buckets, got #{buckets.size}"
      end

      sorted = buckets.first(EFF_BUCKETS).sort

      [
        sorted[EFF_BUCKETS / 4 - 1],
        sorted[EFF_BUCKETS / 2 - 1],
        sorted[EFF_BUCKETS - EFF_BUCKETS / 4 - 1]
      ]
    end
  end
end
data/lib/tlsh/tlsh.rb ADDED
@@ -0,0 +1,74 @@
1
+ require 'tlsh/version'
2
+ require 'tlsh/distance/distance'
3
+ require 'tlsh/digest_hash/pearson'
4
+
5
# Tlsh module implements the interface for TLSH (Trend Micro Locality Sensitive Hash) computation.
# TLSH is usable for diff and similarity computations of binary data, because of its locality sensitivity.
module Tlsh
  # Divisors for the length encoding: natural logarithms of 1.5, 1.3 and 1.1.
  LOG1_5 = 0.4054651
  LOG1_3 = 0.26236426
  LOG1_1 = 0.095310180

  class << self
    # diff_files hashes both files and returns the diff between the two hashes
    # (the result of calling #diff on the first hash with the second).
    def diff_files(filename, other_filename)
      hash_file(filename).diff(hash_file(other_filename))
    end

    # hash_file calculates the TLSH for the input file.
    # The file is read in binary mode so its bytes are hashed exactly as stored,
    # without any text-mode newline or encoding conversion.
    def hash_file(filename)
      tlsh_hash(File.binread(filename).bytes)
    end

    # hash_bytes calculates the TLSH for an array of byte values
    def hash_bytes(blob)
      tlsh_hash(blob)
    end

    private

    # Builds a TlshInstance from the input bytes.
    # Buckets, Quartiles and TlshInstance are defined outside this file;
    # fill_buckets is used here as returning [buckets, checksum, filesize].
    #
    # Raises ArgumentError when the third quartile is zero, because the
    # quartile ratios cannot be computed for such input.
    def tlsh_hash(input)
      buckets, checksum, filesize = Buckets.fill_buckets(input)

      # get the quartiles and their ratio
      q1, q2, q3 = Quartiles.quartile_points(buckets)
      if q3.zero?
        raise ArgumentError,
              'input has too little variation to compute a TLSH hash (third quartile is zero)'
      end

      q1_ratio = (q1 * 100 / q3) % 16
      q2_ratio = (q2 * 100 / q3) % 16
      # both ratios are packed into one byte: q1 in the high nibble, q2 in the low nibble
      q_ratio = ((q1_ratio & 0xF) << 4) | (q2_ratio & 0xF)

      # get the binary buckets representation
      bin_hash = Buckets.buckets_binary(buckets, q1, q2, q3)

      TlshInstance.new(
        checksum: checksum,
        l_value: l_value(filesize),
        q1_ratio: q1_ratio,
        q2_ratio: q2_ratio,
        q_ratio: q_ratio,
        body: bin_hash
      )
    end

    # Encodes the input length on a logarithmic scale and keeps the low 8 bits.
    # Three length ranges use three different logarithm bases.
    def l_value(length)
      encoded = if length <= 656
                  l_value_small(length)
                elsif length <= 3199
                  l_value_medium(length)
                else
                  l_value_large(length)
                end
      encoded & 255
    end

    # Float#floor already returns an Integer, so no further conversion is needed.
    def l_value_small(length)
      (Math.log(length) / LOG1_5).floor
    end

    def l_value_medium(length)
      (Math.log(length) / LOG1_3 - 8.72777).floor
    end

    def l_value_large(length)
      (Math.log(length) / LOG1_1 - 62.5472).floor
    end
  end
end
@@ -0,0 +1,39 @@
1
module Tlsh
  # TlshInstance represents a single TLSH instance: three header fields
  # (checksum, l_value, q_ratio) followed by the body bytes.
  class TlshInstance
    attr_accessor :checksum, :l_value, :q1_ratio, :q2_ratio, :q_ratio, :body

    # Assigns every entry of params through the matching public setter;
    # keys without a public setter are ignored.
    def initialize(params = {})
      params.each do |key, value|
        setter = "#{key}="
        public_send(setter, value) if respond_to?(setter)
      end
    end

    # returns diff against another TlshInstance. The closer to 0, the smaller the diff.
    def diff(other)
      Distance.diff_total(self, other, true)
    end

    # returns the binary representation of the hash as an array of byte values:
    # nibble-swapped checksum, nibble-swapped l_value, q_ratio, then the body
    def binary
      [swap_byte(checksum), swap_byte(l_value), q_ratio] + body
    end

    # returns the string representation of the hash: two lowercase hex digits
    # per byte. Each byte is zero-padded so that values below 0x10 keep their
    # leading digit and the string has a fixed width per byte.
    def string
      binary.map { |byte| format('%02x', byte.to_i) }.join
    end

    # returns true only when every field is set, false otherwise
    def comparable?
      [checksum, l_value, q1_ratio, q2_ratio, q_ratio, body].all?
    end

    private

    # swaps the high and low nibble of a byte
    def swap_byte(input)
      ((input & 0xF0) >> 4) | ((input & 0x0F) << 4)
    end
  end
end
@@ -0,0 +1,3 @@
1
+ module Tlsh
2
+ VERSION = '0.1.1'.freeze
3
+ end
data/tlsh.gemspec ADDED
@@ -0,0 +1,41 @@
1
+ # coding: utf-8
2
+
3
+ lib = File.expand_path('../lib', __FILE__)
4
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
5
+ require 'tlsh/version'
6
+
7
# Gem packaging definition for tlsh.
Gem::Specification.new do |spec|
  spec.name    = 'tlsh'
  spec.version = Tlsh::VERSION
  spec.authors = ['adamliesko']
  spec.email   = ['adamliesko@gmail.com']

  spec.summary = 'A fuzzy matching library which creates hashes that can be used for similarity comparisons.'
  spec.description = <<DESC
tlsh is a fuzzy matching library, which hashes can be used for similarity comparison.
Given a byte stream with a minimum length of 256 bytes, TLSH generates a hash value
which can be used for similarity comparisons. Similar objects will have similar hash
values which allow for the detection of similar objects by comparing their hash values.

The computed hash is 35 bytes long (output as 70 hexadecimal characters).
The first 3 bytes are used to capture the information about the file as a whole (length, ...),
while the last 32 bytes are used to capture information about incremental parts of the file.
DESC

  spec.homepage = 'https://github.com/adamliesko/tlsh'
  spec.license  = 'MIT'

  # Package every git-tracked file except those under test/, spec/ or features/.
  tracked_files = `git ls-files -z`.split("\x0")
  spec.files = tracked_files.reject { |path| path.match(%r{^(test|spec|features)/}) }

  spec.bindir        = 'exe'
  spec.executables   = spec.files.grep(%r{^exe/}).map { |path| File.basename(path) }
  spec.require_paths = ['lib']

  # Development-only dependencies, declared in the same order as before.
  {
    'bundler'   => '~> 1.15',
    'rake'      => '~> 10.0',
    'minitest'  => '~> 5.0',
    'coveralls' => '~> 0',
    'simplecov' => '~> 0'
  }.each do |gem_name, requirement|
    spec.add_development_dependency gem_name, requirement
  end
end
metadata ADDED
@@ -0,0 +1,143 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: tlsh
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ platform: ruby
6
+ authors:
7
+ - adamliesko
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2017-08-18 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.15'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.15'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: minitest
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '5.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '5.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: coveralls
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: simplecov
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ description: |
84
+ tlsh is a fuzzy matching library, which hashes can be used for similarity comparison.
85
+ Given a byte stream with a minimum length of 256 bytes, TLSH generates a hash value
86
+ which can be used for similarity comparisons. Similar objects will have similar hash
87
+ values which allow for the detection of similar objects by comparing their hash values.
88
+
89
+ The computed hash is 35 bytes long (output as 70 hexadecimal characters).
90
+ The first 3 bytes are used to capture the information about the file as a whole (length, ...),
91
+ while the last 32 bytes are used to capture information about incremental parts of the file.
92
+ email:
93
+ - adamliesko@gmail.com
94
+ executables: []
95
+ extensions: []
96
+ extra_rdoc_files: []
97
+ files:
98
+ - ".gitignore"
99
+ - ".rubocop.yml"
100
+ - ".travis.yml"
101
+ - CODE_OF_CONDUCT.md
102
+ - Gemfile
103
+ - LICENSE.txt
104
+ - README.md
105
+ - Rakefile
106
+ - bin/console
107
+ - bin/setup
108
+ - lib/tlsh.rb
109
+ - lib/tlsh/buckets.rb
110
+ - lib/tlsh/digest_hash/pearson.rb
111
+ - lib/tlsh/distance/distance.rb
112
+ - lib/tlsh/distance/precomputed_bits.rb
113
+ - lib/tlsh/quartiles.rb
114
+ - lib/tlsh/tlsh.rb
115
+ - lib/tlsh/tlsh_instance.rb
116
+ - lib/tlsh/version.rb
117
+ - tlsh.gemspec
118
+ homepage: https://github.com/adamliesko/tlsh
119
+ licenses:
120
+ - MIT
121
+ metadata: {}
122
+ post_install_message:
123
+ rdoc_options: []
124
+ require_paths:
125
+ - lib
126
+ required_ruby_version: !ruby/object:Gem::Requirement
127
+ requirements:
128
+ - - ">="
129
+ - !ruby/object:Gem::Version
130
+ version: '0'
131
+ required_rubygems_version: !ruby/object:Gem::Requirement
132
+ requirements:
133
+ - - ">="
134
+ - !ruby/object:Gem::Version
135
+ version: '0'
136
+ requirements: []
137
+ rubyforge_project:
138
+ rubygems_version: 2.6.12
139
+ signing_key:
140
+ specification_version: 4
141
+ summary: A fuzzy matching library which creates hashes that can be used for similarity
142
+ comparisons.
143
+ test_files: []