tlsh 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,137 @@
1
# Quartiles provides utility methods for computing the quartile points of the
# bucket counts with a quickselect-style partial sort (no full sort is done).
module Quartiles
  class << self
    # Number of leading buckets that take part in the quartile computation.
    EFF_BUCKETS = 128

    # Computes the three quartile points of the first EFF_BUCKETS entries of
    # +buckets+.
    #
    # The input is never modified; all partitioning happens on a private copy.
    #
    # @param buckets [Array<Comparable>] bucket counts, at least EFF_BUCKETS long
    # @return [Array] +[q1, q2, q3]+: the values found at sorted positions
    #   EFF_BUCKETS/4 - 1, EFF_BUCKETS/2 - 1 and EFF_BUCKETS - EFF_BUCKETS/4 - 1,
    #   so q1 <= q2 <= q3 always holds
    # @raise [ArgumentError] when fewer than EFF_BUCKETS buckets are supplied
    def quartile_points(buckets)
      working = buckets[0...EFF_BUCKETS]
      unless working.size == EFF_BUCKETS
        raise ArgumentError, "expected at least #{EFF_BUCKETS} buckets, got #{working.size}"
      end

      p1 = EFF_BUCKETS / 4 - 1
      p2 = EFF_BUCKETS / 2 - 1
      p3 = EFF_BUCKETS - EFF_BUCKETS / 4 - 1
      p_end = EFF_BUCKETS - 1

      # Pivot positions seen while searching for the median. Every recorded
      # position already holds its final (sorted) value, so the searches for the
      # outer quartiles can reuse them as range boundaries.
      cut_left = []
      cut_right = []

      left = 0
      right = p_end
      loop do
        pos = partition(working, left, right)
        break if pos == p2

        if pos > p2
          right = pos - 1
          cut_right << pos
        else
          left = pos + 1
          cut_left << pos
        end
      end
      q2 = working[p2]

      # The neighbours of the median always bound the two halves.
      cut_left << (p2 - 1)
      cut_right << (p2 + 1)

      q1 = lower_quartile(working, cut_left, p1)
      q3 = upper_quartile(working, cut_right, p3, p_end)

      [q1, q2, q3]
    end

    private

    # Partitions buffer[left..right] in place around a pivot and returns the
    # pivot's final index. Ranges of one or two elements are handled directly.
    def partition(buffer, left, right)
      return left if left == right

      if left + 1 == right
        buffer[left], buffer[right] = buffer[right], buffer[left] if buffer[left] > buffer[right]
        return left
      end

      partition_buffer(buffer, left, left, right)
    end

    # Moves the middle element of buffer[left..right] to the end, sweeps every
    # smaller element to the front (starting at +ret+), then drops the pivot
    # into the gap. Returns the pivot's final index.
    def partition_buffer(buffer, ret, left, right)
      pivot = (left + right) >> 1
      value = buffer[pivot]

      buffer[pivot] = buffer[right]
      buffer[right] = value

      # The pivot is parked at +right+, so the sweep must stop before it.
      (left...right).each do |i|
        next unless buffer[i] < value

        buffer[i], buffer[ret] = buffer[ret], buffer[i]
        ret += 1
      end

      # Place the pivot exactly once, after the sweep has finished.
      buffer[right] = buffer[ret]
      buffer[ret] = value

      ret
    end

    # Repeatedly partitions buffer[left..right] until position +target+ holds
    # its sorted value, then returns that value.
    def select_at(buffer, left, right, target)
      loop do
        pos = partition(buffer, left, right)
        return buffer[target] if pos == target

        if pos > target
          right = pos - 1
        else
          left = pos + 1
        end
      end
    end

    # Finds the value at sorted position +p1+ using the ascending pivot
    # positions recorded to the left of the median.
    def lower_quartile(buffer, cut_left, p1)
      left = 0
      cut_left.each do |right|
        return select_at(buffer, left, right, p1) if right > p1
        return buffer[p1] if right == p1

        left = right
      end
      nil
    end

    # Finds the value at sorted position +p3+ using the descending pivot
    # positions recorded to the right of the median.
    def upper_quartile(buffer, cut_right, p3, p_end)
      right = p_end
      cut_right.each do |left|
        return select_at(buffer, left, right, p3) if left < p3
        return buffer[p3] if left == p3

        right = left
      end
      nil
    end
  end
end
data/lib/tlsh/tlsh.rb ADDED
@@ -0,0 +1,74 @@
1
+ require 'tlsh/version'
2
+ require 'tlsh/distance/distance'
3
+ require 'tlsh/digest_hash/pearson'
4
+
5
# The Tlsh module implements the interface for TLSH (Trend Micro Locality
# Sensitive Hash) computation. Because the hash is locality sensitive, it is
# usable for diff and similarity computations over binary data.
module Tlsh
  # Natural logarithms of 1.5, 1.3 and 1.1, used as the logarithm bases of the
  # three length ranges handled by l_value.
  LOG1_5 = 0.4054651
  LOG1_3 = 0.26236426
  LOG1_1 = 0.095310180

  class << self
    # Hashes both files and returns the difference between the two hashes, as
    # computed by the hash object's own +diff+ method.
    #
    # @param filename [String] path of the first file
    # @param other_filename [String] path of the second file
    def diff_files(filename, other_filename)
      hash_file(filename).diff(hash_file(other_filename))
    end

    # hash_file calculates the TLSH for the input file.
    #
    # The file is read in binary mode so that the hash is computed over the
    # exact bytes on disk (no newline or encoding translation on any platform).
    #
    # @param filename [String] path of the file to hash
    def hash_file(filename)
      tlsh_hash(File.binread(filename).bytes)
    end

    # hash_bytes calculates the TLSH for an already loaded blob.
    #
    # @param blob [Array<Integer>] presumably an array of byte values, as
    #   produced by String#bytes - it is handed to Buckets.fill_buckets as is
    def hash_bytes(blob)
      tlsh_hash(blob)
    end

    private

    # Builds a TlshInstance from the bucket statistics of +input+.
    # NOTE(review): the ratios divide by q3, so this raises when q3 is zero
    # (assuming the quartiles are Integers) - confirm how callers handle that.
    def tlsh_hash(input)
      buckets, checksum, filesize = Buckets.fill_buckets(input)

      # get the quartiles and their ratio
      q1, q2, q3 = Quartiles.quartile_points(buckets)
      q1_ratio = (q1 * 100 / q3) % 16
      q2_ratio = (q2 * 100 / q3) % 16
      # pack both 4-bit ratios into one byte: q1 in the high nibble, q2 in the low
      q_ratio = ((q1_ratio & 0xF) << 4) | (q2_ratio & 0xF)

      # get the binary buckets representation
      bin_hash = Buckets.buckets_binary(buckets, q1, q2, q3)

      TlshInstance.new(
        checksum: checksum,
        l_value: l_value(filesize),
        q1_ratio: q1_ratio,
        q2_ratio: q2_ratio,
        q_ratio: q_ratio,
        body: bin_hash
      )
    end

    # Maps the input length onto a single byte using a logarithmic scale whose
    # base depends on the length range.
    def l_value(length)
      raw = if length <= 656
              l_value_small(length)
            elsif length <= 3199
              l_value_medium(length)
            else
              l_value_large(length)
            end
      raw & 255
    end

    # Length value for inputs of up to 656 bytes (base 1.5 logarithm).
    def l_value_small(length)
      (Math.log(length) / LOG1_5).floor
    end

    # Length value for inputs of 657..3199 bytes (base 1.3 logarithm, shifted).
    def l_value_medium(length)
      (Math.log(length) / LOG1_3 - 8.72777).floor
    end

    # Length value for inputs of 3200 bytes and more (base 1.1 logarithm, shifted).
    def l_value_large(length)
      (Math.log(length) / LOG1_1 - 62.5472).floor
    end
  end
end
@@ -0,0 +1,39 @@
1
module Tlsh
  # TlshInstance represents a single TLSH instance: the header fields
  # (checksum, length value, quartile ratios) plus the body bytes.
  class TlshInstance
    attr_accessor :checksum, :l_value, :q1_ratio, :q2_ratio, :q_ratio, :body

    # Assigns every entry of +params+ whose key matches one of the public
    # attribute writers; unknown keys are silently ignored.
    #
    # @param params [Hash] attribute name => value
    def initialize(params = {})
      params.each do |key, value|
        setter = "#{key}="
        public_send(setter, value) if respond_to?(setter.to_sym, false)
      end
    end

    # returns diff against another TlshInstance. The closer to 0, the smaller the diff.
    def diff(other)
      Distance.diff_total(self, other, true)
    end

    # returns the binary representation of the hash: the nibble-swapped
    # checksum and length value, the packed quartile ratio, then the body
    def binary
      [swap_byte(checksum), swap_byte(l_value), q_ratio] + body
    end

    # returns the string representation of the hash.
    #
    # Every byte is rendered as exactly two lowercase hex digits, so the
    # string has a fixed length and can be split back into bytes unambiguously.
    def string
      binary.map { |byte| format('%02x', byte.to_i) }.join
    end

    # returns a truthy value only when every field of the hash is set
    def comparable?
      checksum && l_value && q1_ratio && q2_ratio && q_ratio && body
    end

    private

    # Exchanges the high and low nibble of a byte (0x12 becomes 0x21).
    def swap_byte(input)
      high_to_low = (input & 0xF0) >> 4
      low_to_high = (input & 0x0F) << 4
      high_to_low | low_to_high
    end
  end
end
@@ -0,0 +1,3 @@
1
module Tlsh
  # Release version of the gem, kept as an immutable string.
  VERSION = -'0.1.1'
end
data/tlsh.gemspec ADDED
@@ -0,0 +1,41 @@
1
+ # coding: utf-8
2
+
3
# Make the gem's own lib directory loadable so the version constant can be read.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'tlsh/version'

Gem::Specification.new do |spec|
  spec.name = 'tlsh'
  spec.version = Tlsh::VERSION
  spec.authors = ['adamliesko']
  spec.email = ['adamliesko@gmail.com']

  spec.summary = 'A fuzzy matching library which creates hashes that can be used for similarity comparisons.'
  spec.description = <<DESC
tlsh is a fuzzy matching library, which hashes can be used for similarity comparison.
Given a byte stream with a minimum length of 256 bytes, TLSH generates a hash value
which can be used for similarity comparisons. Similar objects will have similar hash
values which allow for the detection of similar objects by comparing their hash values.

The computed hash is 35 bytes long (output as 70 hexadecimal characters).
The first 3 bytes are used to capture the information about the file as a whole (length, ...),
while the last 32 bytes are used to capture information about incremental parts of the file.
DESC

  spec.homepage = 'https://github.com/adamliesko/tlsh'
  spec.license = 'MIT'

  # Package every file tracked by git except the test suites.
  tracked_files = `git ls-files -z`.split("\x0")
  spec.files = tracked_files.reject { |path| path.match(%r{^(test|spec|features)/}) }

  spec.bindir = 'exe'
  spec.executables = spec.files.grep(%r{^exe/}).map { |path| File.basename(path) }
  spec.require_paths = ['lib']

  # Development-only dependencies, declared in the order they are listed here.
  development_dependencies = [
    ['bundler', '~> 1.15'],
    ['rake', '~> 10.0'],
    ['minitest', '~> 5.0'],
    ['coveralls', '~> 0'],
    ['simplecov', '~> 0']
  ]
  development_dependencies.each do |gem_name, requirement|
    spec.add_development_dependency gem_name, requirement
  end
end
metadata ADDED
@@ -0,0 +1,143 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: tlsh
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ platform: ruby
6
+ authors:
7
+ - adamliesko
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2017-08-18 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.15'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.15'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: minitest
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '5.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '5.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: coveralls
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: simplecov
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ description: |
84
+ tlsh is a fuzzy matching library, which hashes can be used for similarity comparison.
85
+ Given a byte stream with a minimum length of 256 bytes, TLSH generates a hash value
86
+ which can be used for similarity comparisons. Similar objects will have similar hash
87
+ values which allow for the detection of similar objects by comparing their hash values.
88
+
89
+ The computed hash is 35 bytes long (output as 70 hexadecimal characters).
90
+ The first 3 bytes are used to capture the information about the file as a whole (length, ...),
91
+ while the last 32 bytes are used to capture information about incremental parts of the file.
92
+ email:
93
+ - adamliesko@gmail.com
94
+ executables: []
95
+ extensions: []
96
+ extra_rdoc_files: []
97
+ files:
98
+ - ".gitignore"
99
+ - ".rubocop.yml"
100
+ - ".travis.yml"
101
+ - CODE_OF_CONDUCT.md
102
+ - Gemfile
103
+ - LICENSE.txt
104
+ - README.md
105
+ - Rakefile
106
+ - bin/console
107
+ - bin/setup
108
+ - lib/tlsh.rb
109
+ - lib/tlsh/buckets.rb
110
+ - lib/tlsh/digest_hash/pearson.rb
111
+ - lib/tlsh/distance/distance.rb
112
+ - lib/tlsh/distance/precomputed_bits.rb
113
+ - lib/tlsh/quartiles.rb
114
+ - lib/tlsh/tlsh.rb
115
+ - lib/tlsh/tlsh_instance.rb
116
+ - lib/tlsh/version.rb
117
+ - tlsh.gemspec
118
+ homepage: https://github.com/adamliesko/tlsh
119
+ licenses:
120
+ - MIT
121
+ metadata: {}
122
+ post_install_message:
123
+ rdoc_options: []
124
+ require_paths:
125
+ - lib
126
+ required_ruby_version: !ruby/object:Gem::Requirement
127
+ requirements:
128
+ - - ">="
129
+ - !ruby/object:Gem::Version
130
+ version: '0'
131
+ required_rubygems_version: !ruby/object:Gem::Requirement
132
+ requirements:
133
+ - - ">="
134
+ - !ruby/object:Gem::Version
135
+ version: '0'
136
+ requirements: []
137
+ rubyforge_project:
138
+ rubygems_version: 2.6.12
139
+ signing_key:
140
+ specification_version: 4
141
+ summary: A fuzzy matching library which creates hashes that can be used for similarity
142
+ comparisons.
143
+ test_files: []