fileshunter 0.1.0.20130725

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,369 @@
1
+ module FilesHunter
2
+
3
+ module Decoders
4
+
5
+ class EBML < BeginPatternDecoder
6
+
7
# ID of the EBML header element (binary string); returned by get_begin_pattern
BEGIN_PATTERN_MKV = "\x1A\x45\xDF\xA3".force_encoding(Encoding::ASCII_8BIT)
# DocType element ID (0x4282) as decode_vint returns it, i.e. with the length
# marker bit stripped: 0x0282 == 642
DOCTYPE_ID_INT = 642
# ID of the Segment element that decode expects right after the EBML header
SEGMENT_MATROSKA_ID = "\x18\x53\x80\x67".force_encoding(Encoding::ASCII_8BIT)
# Accepted DocType values, with the extension reported for each of them.
# Frozen: this is a read-only lookup table.
ACCEPTABLE_DOCTYPES = {
  'matroska' => :mkv,
  'webm' => :webm
}.freeze
14
+
15
# List of possible element IDs, keyed by the size of the ID in bytes (1 to 4).
# Each ID is stored as a frozen binary (ASCII-8BIT) String, so it can be compared
# directly with slices of the raw data. IDs are written below as hexadecimal words
# and packed into binary strings when the constant is built.
# Taken from http://matroska.svn.sourceforge.net/viewvc/matroska/trunk/foundation_src/spectool/specdata.xml?view=markup
VALID_ELEMENT_IDS = {
  1 => %w[
    80 83 85 86 88 89 8e 8f 91 92 96 97 98 9a 9b 9c
    9f a0 a1 a2 a3 a4 a5 a6 a7 aa ab ae af b0 b2 b3
    b5 b6 b7 b9 ba bb bf c0 c1 c4 c6 c7 c8 c9 ca cb
    cc cd ce cf d7 db e0 e1 e2 e3 e4 e5 e6 e7 e8 e9
    ea eb ec ed ee f0 f1 f7 fa fb fd
  ],
  2 => %w[
    4254 4255 4282 4285 4286 4287 42f2 42f3 42f7 437c
    437e 4444 4461 447a 4484 4485 4487 4489 450d 4598
    45a3 45b9 45bc 45bd 45db 45dd 465c 4660 4661 4662
    466e 4675 467e 46ae 47e1 47e2 47e3 47e4 47e5 47e6
    4d80 4dbb 5031 5032 5033 5034 5035 535f 536e 5378
    537f 53ab 53ac 53b8 53b9 54aa 54b0 54b2 54b3 54ba
    54bb 54cc 54dd 55aa 55ee 5654 5741 5854 58d7 61a7
    6240 6264 63a2 63c0 63c3 63c4 63c5 63c6 63c9 63ca
    6532 6624 66a5 66bf 66fc 67c8 68ca 6911 6922 6924
    6933 6944 6955 69a5 69bf 69fc 6d80 6de7 6df8 6e67
    6ebc 6fab 7373 7384 73a4 73c4 73c5 7446 75a1 78b5
    7ba9 7d7b 7e5b 7e7b 7e8a 7e9a 7ea5 7eb5
  ],
  3 => %w[
    22b59c 23314f 2383e3 23e383 258688 26b240 2ad7b1 2eb524
    2fb523 3a9697 3b4040 3c83ab 3cb923 3e83bb 3eb923
  ],
  4 => %w[
    1043a770 114d9b74 1254c367 1549a966 1654ae6b 18538067
    1941a469 1a45dfa3 1b538667 1c53bb6b 1f43b675
  ]
}.each_with_object({}) { |(id_size, hex_ids), ids_by_size|
  # pack('H*') turns each hexadecimal word into the matching ASCII-8BIT byte string
  ids_by_size[id_size] = hex_ids.map { |hex_id| [hex_id].pack('H*').freeze }.freeze
}.freeze
246
+
247
# Give the pattern marking the beginning of data handled by this decoder.
#
# Result::
# * _String_: The begin pattern (the EBML header ID)
# * <em>map<Symbol,Object></em>: Options attached to the pattern. NOTE(review): the
#   meaning of :offset_inc is defined by BeginPatternDecoder, not visible here - confirm.
def get_begin_pattern
  pattern_options = { :offset_inc => 4 }
  [BEGIN_PATTERN_MKV, pattern_options]
end
250
+
251
# Decode an EBML stream (Matroska or WebM) starting at a given offset.
#
# The EBML header is read first to get the DocType, which has to be one of
# ACCEPTABLE_DOCTYPES. The Segment following the header is then skipped using its
# declared size, or walked element by element when its size is unknown.
#
# Parameters::
# * *offset* (_Fixnum_): Offset of the EBML header ID in @data
# Result::
# * _Fixnum_: Offset reached at the end of the decoding
def decode(offset)
  cursor = offset
  # Read the variable int for the header size (it follows the 4 bytes of the header ID)
  header_size, vint_size = decode_vint(@data[cursor+4..cursor+11])
  cursor += 4 + vint_size
  progress(cursor)
  # Here we have header_size bytes for the header data.
  # Get the DocType
  max_header_cursor = cursor + header_size
  doc_type = nil
  while ((cursor < max_header_cursor) && doc_type.nil?)
    log_debug "=== @#{cursor} - Inspecting #{@data[cursor..cursor+20].inspect}"
    # Read next EBML element ID (decoded as a vint, so without its length marker)
    segment_id, vint_size = decode_vint(@data[cursor..cursor+7])
    log_debug "=== @#{cursor} - Found ID #{segment_id}"
    cursor += vint_size
    # Read its size
    segment_size, vint_size = decode_vint(@data[cursor..cursor+7])
    cursor += vint_size
    if (segment_id == DOCTYPE_ID_INT)
      doc_type = @data[cursor..cursor+segment_size-1]
      log_debug "=== @#{cursor} - Found DocType: #{doc_type.inspect}"
    end
    cursor += segment_size
    progress(cursor)
  end
  # NOTE(review): the code below assumes invalid_data interrupts the decoding - confirm
  invalid_data("@#{offset} - Unable to get the DocType from the EBML file") if doc_type.nil?
  extension = ACCEPTABLE_DOCTYPES[doc_type]
  invalid_data("@#{offset} - Unknown DocType: #{doc_type}") if extension.nil?
  # Make sure we consumed the header completely
  cursor = max_header_cursor
  # Now read the segment
  invalid_data("@#{cursor} - Invalid Segment ID") if (@data[cursor..cursor+3] != SEGMENT_MATROSKA_ID)
  found_relevant_data(extension)
  # Read segment size
  segment_size, vint_size = decode_vint(@data[cursor+4..cursor+11])
  log_debug "=== @#{cursor} - Found segment of size #{segment_size}"
  cursor += 4 + vint_size
  # An unknown size is encoded with all the data bits of the vint set to 1, whatever
  # the width of the vint (7 data bits per encoded byte). Comparing with 127 only
  # would miss the wider encodings and misread a real size of 127 stored on 2+ bytes.
  if (segment_size == (1 << (7 * vint_size)) - 1)
    # The size is unknown
    # We have to make a deep decoding
    while (ebml_id_size = decode_ebml_id(cursor))
      element_id = @data[cursor..cursor+ebml_id_size-1]
      # Read element size
      segment_size, vint_size = decode_vint(@data[cursor+ebml_id_size..cursor+ebml_id_size+7])
      log_debug "=== @#{cursor} - Found segment #{element_id.inspect} (size #{ebml_id_size}) of size #{segment_size} (size #{vint_size})"
      cursor += ebml_id_size + vint_size
      # Elements of unknown size are not skipped: their content is decoded by the next iterations
      if (segment_size != (1 << (7 * vint_size)) - 1)
        cursor += segment_size
        break if (cursor == @end_offset)
      end
      progress(cursor)
    end
  else
    cursor += segment_size
  end
  progress(cursor)

  return cursor
end
313
+
314
+ private
315
+
316
# Take the data (as a String) and read it as a variable size integer (return also the size).
#
# The size of the integer (1 to 8 bytes) is given by the position of the first bit
# set to 1 in the first byte. This marker bit is not part of the value.
# Missing, empty, malformed or truncated data is reported using invalid_data.
#
# Parameters::
# * *data* (_String_): The data to decode (bytes following the integer are ignored)
# Result::
# * _Fixnum_: The corresponding value
# * _Fixnum_: The size of the vint
def decode_vint(data)
  # Reading past the end of the data gives nil or an empty String
  invalid_data('Invalid variable int encoded: no data to read') if (data.nil? || data.empty?)
  bytes = data.bytes.to_a
  # Size of the integer is defined in first byte only
  first_byte = bytes.first
  size = (1..8).find { |candidate_size| (first_byte & (1 << (8-candidate_size))) != 0 }
  # A null first byte does not contain any length marker
  invalid_data("Invalid variable int encoded: #{data}") if size.nil?
  invalid_data("Invalid variable int encoded: #{size} bytes expected, got #{bytes.size}") if (bytes.size < size)
  # Replace first byte with its true value
  bytes[0] = first_byte & ((1 << (8-size))-1)
  # Read all
  value = bytes.first(size).inject(0) { |decoded_value, byte| (decoded_value << 8) + byte }

  return value, size
end
344
+
345
# Check whether a known EBML ID starts at a given position of @data.
# ID sizes are tried from the shortest (1 byte) to the longest (4 bytes).
#
# Parameters::
# * *cursor* (_Fixnum_): The cursor
# Result::
# * _Fixnum_: Size of the decoded EBML ID, or false if not a valid EBML ID
def decode_ebml_id(cursor)
  1.upto(4) do |id_size|
    # The 1 byte ID is read using a single index, longer ones using a range
    candidate_id = (id_size == 1) ? @data[cursor] : @data[cursor..cursor+id_size-1]
    return id_size if VALID_ELEMENT_IDS[id_size].include?(candidate_id)
  end
  false
end
364
+
365
+ end
366
+
367
+ end
368
+
369
+ end