fileshunter 0.1.0.20130725

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,369 @@
1
+ module FilesHunter
2
+
3
+ module Decoders
4
+
5
+ class EBML < BeginPatternDecoder
6
+
7
# Bytes found at the beginning of an EBML stream (returned by get_begin_pattern)
BEGIN_PATTERN_MKV = [0x1A, 0x45, 0xDF, 0xA3].pack('C*')
# ID of the DocType element, in the form given back by decode_vint:
# the 2 bytes ID 0x4282 once its length marker bit has been removed (= 642)
DOCTYPE_ID_INT = 0x0282
# ID expected right after the EBML header (checked by decode)
SEGMENT_MATROSKA_ID = [0x18, 0x53, 0x80, 0x67].pack('C*')
# File extension reported for each DocType accepted by this decoder
ACCEPTABLE_DOCTYPES = { 'matroska' => :mkv, 'webm' => :webm }
14
+
15
# Known element IDs, grouped by the number of bytes used by the ID (from 1 to 4).
# Taken from http://matroska.svn.sourceforge.net/viewvc/matroska/trunk/foundation_src/spectool/specdata.xml?view=markup
# IDs are written below as hexadecimal digits, and converted to binary (ASCII-8BIT) Strings.
VALID_ELEMENT_IDS = {
  1 => %w[
    80 83 85 86 88 89 8e 8f
    91 92 96 97 98 9a 9b 9c 9f
    a0 a1 a2 a3 a4 a5 a6 a7 aa ab ae af
    b0 b2 b3 b5 b6 b7 b9 ba bb bf
    c0 c1 c4 c6 c7 c8 c9 ca cb cc cd ce cf
    d7 db
    e0 e1 e2 e3 e4 e5 e6 e7 e8 e9 ea eb ec ed ee
    f0 f1 f7 fa fb fd
  ],
  2 => %w[
    4254 4255 4282 4285 4286 4287 42f2 42f3 42f7
    437c 437e
    4444 4461 447a 4484 4485 4487 4489
    450d 4598 45a3 45b9 45bc 45bd 45db 45dd
    465c 4660 4661 4662 466e 4675 467e 46ae
    47e1 47e2 47e3 47e4 47e5 47e6
    4d80 4dbb
    5031 5032 5033 5034 5035
    535f 536e 5378 537f 53ab 53ac 53b8 53b9
    54aa 54b0 54b2 54b3 54ba 54bb 54cc 54dd
    55aa 55ee
    5654 5741 5854 58d7
    61a7 6240 6264
    63a2 63c0 63c3 63c4 63c5 63c6 63c9 63ca
    6532
    6624 66a5 66bf 66fc
    67c8 68ca
    6911 6922 6924 6933 6944 6955 69a5 69bf 69fc
    6d80 6de7 6df8
    6e67 6ebc
    6fab
    7373 7384 73a4 73c4 73c5
    7446 75a1 78b5 7ba9 7d7b
    7e5b 7e7b 7e8a 7e9a 7ea5 7eb5
  ],
  3 => %w[
    22b59c 23314f 2383e3 23e383 258688
    26b240 2ad7b1 2eb524 2fb523 3a9697
    3b4040 3c83ab 3cb923 3e83bb 3eb923
  ],
  4 => %w[
    1043a770 114d9b74 1254c367 1549a966
    1654ae6b 18538067 1941a469 1a45dfa3
    1b538667 1c53bb6b 1f43b675
  ]
}.inject({}) do |ids_per_size, (id_size, hex_ids)|
  # 'H*' packs the hexadecimal digits (high nibble first) into a binary String
  ids_per_size[id_size] = hex_ids.map { |hex_id| [hex_id].pack('H*') }
  ids_per_size
end
246
+
247
# Give the pattern marking the beginning of the data handled by this decoder.
#
# Result::
# * _String_: The begin pattern (BEGIN_PATTERN_MKV)
# * <em>map<Symbol,Object></em>: Options given along with the pattern. NOTE(review): the meaning of :offset_inc is defined by the caller (presumably BeginPatternDecoder) - to be confirmed there.
def get_begin_pattern
  pattern_options = { :offset_inc => 4 }
  [BEGIN_PATTERN_MKV, pattern_options]
end
250
+
251
# Decode an EBML stream (Matroska or WebM) starting at a given offset.
# The EBML header is read to find the DocType, then the element following the header
# has to be a Segment. The end of the Segment is computed from its declared size, or
# by walking through the known element IDs when its size is declared as unknown.
#
# Parameters::
# * *offset* (_Fixnum_): Offset of the EBML header ID in @data
# Result::
# * _Fixnum_: Offset following the last decoded byte
def decode(offset)
  # A size is unknown when all its value bits are set to 1 (7 value bits per encoded byte).
  # Comparing to 127 only recognizes the 1 byte form (0xFF): unknown sizes encoded on more
  # bytes would be used as (huge) real sizes, and a real size of 127 encoded on 2 bytes or
  # more would be taken for an unknown one.
  unknown_size = lambda { |value, width| value == (1 << (7 * width)) - 1 }

  cursor = offset
  # The 4 bytes of the header ID are followed by the header size (variable int)
  header_size, vint_size = decode_vint(@data[cursor+4..cursor+11])
  cursor += 4 + vint_size
  progress(cursor)
  # Here we have header_size bytes for the header data.
  # Get the DocType
  max_header_cursor = cursor + header_size
  doc_type = nil
  while (cursor < max_header_cursor) && doc_type.nil?
    log_debug "=== @#{cursor} - Inspecting #{@data[cursor..cursor+20].inspect}"
    # Read next EBML element: its ID (decoded as a variable int, so without its marker bit)...
    segment_id, vint_size = decode_vint(@data[cursor..cursor+7])
    log_debug "=== @#{cursor} - Found ID #{segment_id}"
    cursor += vint_size
    # ... then its size
    segment_size, vint_size = decode_vint(@data[cursor..cursor+7])
    cursor += vint_size
    if segment_id == DOCTYPE_ID_INT
      doc_type = @data[cursor..cursor+segment_size-1]
      log_debug "=== @#{cursor} - Found DocType: #{doc_type.inspect}"
    end
    cursor += segment_size
    progress(cursor)
  end
  # invalid_data is expected to stop the decoding (the code below can't work with a nil DocType)
  invalid_data("@#{offset} - Unable to get the DocType from the EBML file") if doc_type.nil?
  extension = ACCEPTABLE_DOCTYPES[doc_type]
  invalid_data("@#{offset} - Unknown DocType: #{doc_type}") if extension.nil?
  # Make sure we consumed the header completely
  cursor = max_header_cursor
  # Now read the segment
  invalid_data("@#{cursor} - Invalid Segment ID") if @data[cursor..cursor+3] != SEGMENT_MATROSKA_ID
  found_relevant_data(extension)
  # Read segment size
  segment_size, vint_size = decode_vint(@data[cursor+4..cursor+11])
  log_debug "=== @#{cursor} - Found segment of size #{segment_size}"
  cursor += 4 + vint_size
  if unknown_size.call(segment_size, vint_size)
    # The size is unknown
    # We have to make a deep decoding: follow elements as long as a known ID is found
    while (ebml_id_size = decode_ebml_id(cursor))
      # Keep the ID of this element for the log (and not the last ID read from the header)
      element_id = @data[cursor..cursor+ebml_id_size-1]
      # Read element size
      element_size, vint_size = decode_vint(@data[cursor+ebml_id_size..cursor+ebml_id_size+7])
      log_debug "=== @#{cursor} - Found segment #{element_id.inspect} (size #{ebml_id_size}) of size #{element_size} (size #{vint_size})"
      cursor += ebml_id_size + vint_size
      # An element of unknown size is not skipped: its content is decoded by next iterations
      unless unknown_size.call(element_size, vint_size)
        cursor += element_size
        break if cursor == @end_offset
      end
      progress(cursor)
    end
  else
    cursor += segment_size
  end
  progress(cursor)

  return cursor
end
313
+
314
+ private
315
+
316
# Take the data (as a String) and read it as a variable size integer (return also the size).
# The number of bytes used by the integer is given by the position of the first bit set
# in the first byte; this marker bit is not part of the value.
# Missing data (nil, empty, or shorter than the encoded size) is reported using invalid_data.
#
# Parameters::
# * *data* (_String_): The data to decode (only the bytes of the variable int are read, it can be longer)
# Result::
# * _Fixnum_: The corresponding value
# * _Fixnum_: The size of the vint (in bytes)
def decode_vint(data)
  # nil (nothing to read) is handled the same way as empty data
  bytes = data.to_s.bytes.to_a
  first_byte = bytes.first
  # Without a first byte there is nothing to decode: report it instead of failing on nil
  invalid_data("Invalid variable int encoded: #{data.inspect}") if first_byte.nil?
  # Size of the integer is defined in first byte only
  size = 1
  while (first_byte & (1 << (8 - size))) == 0
    size += 1
    invalid_data("Invalid variable int encoded: #{data}") if size > 8
  end
  # All the bytes announced by the first one have to be present
  invalid_data("Invalid variable int encoded (#{size} bytes expected): #{data.inspect}") if bytes.size < size
  # Replace first byte with its true value (marker bit removed)
  bytes[0] = first_byte & ((1 << (8 - size)) - 1)
  # Read all bytes, most significant first
  value = bytes.first(size).inject(0) { |result, byte| (result << 8) + byte }

  return value, size
end
344
+
345
# Look for a known EBML ID at a given position of @data.
# Sizes are tried from 1 to 4 bytes, and the first size giving an ID listed in VALID_ELEMENT_IDS wins.
#
# Parameters::
# * *cursor* (_Fixnum_): The cursor
# Result::
# * _Fixnum_: Size of the decoded EBML ID, or false if not a valid EBML ID
def decode_ebml_id(cursor)
  # A 1 byte ID is read using a single index
  return 1 if VALID_ELEMENT_IDS[1].include?(@data[cursor])
  # Longer IDs are read using ranges
  (2..4).each do |id_size|
    return id_size if VALID_ELEMENT_IDS[id_size].include?(@data[cursor..cursor+id_size-1])
  end
  false
end
364
+
365
+ end
366
+
367
+ end
368
+
369
+ end