fileshunter 0.1.0.20130725

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,350 @@
module FilesHunter

  module Decoders

    # Decoder for TIFF data.
    # It walks the chain of IFDs (Image File Directories) starting at the offset stored in the TIFF header,
    # records a selection of tags as metadata, and computes the end offset of the data from the IFDs,
    # the out-of-line tag values and the strips/tiles referenced by the tags.
    class TIFF < BeginPatternDecoder

      # TIFF Reference: http://partners.adobe.com/public/developer/en/tiff/TIFF6.pdf
      # Exif Reference: http://www.exif.org/Exif2-2.PDF

      BEGIN_PATTERN_TIFF_LE = "II*\x00".force_encoding(Encoding::ASCII_8BIT)
      BEGIN_PATTERN_TIFF_BE = "MM\x00*".force_encoding(Encoding::ASCII_8BIT)
      # Regexp::NOENCODING replaces the former 3-arguments form Regexp.new(str, nil, 'n'), which recent Ruby versions reject.
      BEGIN_PATTERN_TIFF = Regexp.new("(#{Regexp.escape(BEGIN_PATTERN_TIFF_LE)}|#{Regexp.escape(BEGIN_PATTERN_TIFF_BE)})", Regexp::NOENCODING)

      # Size in bytes of 1 value, per TIFF field type
      TYPE_SIZES = {
        1 => 1,
        2 => 1,
        3 => 2,
        4 => 4,
        5 => 8,
        6 => 1,
        7 => 1,
        8 => 2,
        9 => 4,
        10 => 8,
        11 => 4,
        12 => 8
      }

      VALID_COMPRESSION_VALUES = [ 1, 2, 3, 4, 5, 6, 32773 ]
      VALID_PHOTOMETRIC_INTERPRETATIONS = [ 0, 1, 2, 3, 4, 5, 6, 8 ]

      TRAILING_00_REGEXP = Regexp.new("\x00*$".force_encoding(Encoding::ASCII_8BIT), Regexp::NOENCODING)
      NULL_TERMINATING_CHAR = "\x00".force_encoding(Encoding::ASCII_8BIT)

      def initialize
        super
        @accept_no_image_data = false
      end

      # Get the pattern marking the beginning of TIFF data, along with its scanning options
      #
      # Result::
      # * _Regexp_: The begin pattern
      # * <em>map<Symbol,Object></em>: Options (:offset_inc and :max_regexp_size)
      def get_begin_pattern
        return BEGIN_PATTERN_TIFF, { :offset_inc => 4, :max_regexp_size => 4 }
      end

      # Set this decoder to accept no image data.
      # This is particularly useful for other decoders using it (for example with JPEG and its Exif info)
      def accept_no_image_data
        @accept_no_image_data = true
      end

      # Decode TIFF data starting at a given offset
      #
      # Parameters::
      # * *offset* (_Fixnum_): Offset of the TIFF header in @data
      # Result::
      # * _Fixnum_: Offset of the end of the TIFF data
      def decode(offset)
        @file_offset = offset
        @bindata_reader_16 = nil
        @bindata_reader_32 = nil
        # The header gives the endianness of every following value
        if (@data[offset..offset+3] == BEGIN_PATTERN_TIFF_LE)
          @bindata_reader_16 = BinData::Uint16le
          @bindata_reader_32 = BinData::Uint32le
        else
          @bindata_reader_16 = BinData::Uint16be
          @bindata_reader_32 = BinData::Uint32be
        end
        ifd_offset = read_long(offset+4)
        extensions = [:tif, :tiff] # By default
        # All offsets below are relative to @file_offset
        @max_end_offset = ifd_offset
        @strip_offsets = []
        @strip_byte_counts = []
        @tile_offsets = []
        @tile_byte_counts = []
        @compression = 1
        @lst_bits_per_sample = [1]
        @image_width = nil
        @image_length = nil
        # IFD offsets already parsed: used to reject cyclic IFD chains
        @visited_ifd_offsets = []
        @tag_parser = Proc.new do |tag, type, nbr, size, cursor|
          case tag.to_i
          when 2
            metadata( :gps_latitude => @data[cursor..cursor+size-1] )
          when 4
            metadata( :gps_longitude => @data[cursor..cursor+size-1] )
          when 6
            metadata( :gps_altitude => @data[cursor..cursor+size-1] )
          when 7
            metadata( :gps_timestamp => [
              read_ratio(cursor),
              read_ratio(cursor+8),
              read_ratio(cursor+16),
            ] )
          when 256
            @image_width = read_short_or_long(type, cursor)
            invalid_data("@#{cursor} - Invalid image width #{@image_width}") if (@image_width == 0)
            metadata( :image_width => @image_width )
          when 257
            @image_length = read_short_or_long(type, cursor)
            invalid_data("@#{cursor} - Invalid image length #{@image_length}") if (@image_length == 0)
            metadata( :image_length => @image_length )
          when 258
            @lst_bits_per_sample = []
            nbr.times do |idx_sample|
              @lst_bits_per_sample << read_short(cursor+2*idx_sample)
            end
            metadata( :lst_bits_per_sample => @lst_bits_per_sample )
          when 259
            @compression = read_short(cursor)
            invalid_data("@#{cursor} - Invalid compression #{@compression}") if (!VALID_COMPRESSION_VALUES.include?(@compression))
            metadata( :compression => @compression )
          when 262
            photometric_interpretation = read_short(cursor)
            invalid_data("@#{cursor} - Invalid photometric interpretation #{photometric_interpretation}") if (!VALID_PHOTOMETRIC_INTERPRETATIONS.include?(photometric_interpretation))
            metadata( :photometric_interpretation => photometric_interpretation )
          when 264
            cell_width = read_short(cursor)
            invalid_data("@#{cursor} - Invalid cell width #{cell_width}") if (cell_width == 0)
            metadata( :cell_width => cell_width )
          when 265
            cell_length = read_short(cursor)
            invalid_data("@#{cursor} - Invalid cell length #{cell_length}") if (cell_length == 0)
            metadata( :cell_length => cell_length )
          when 266
            fill_order = read_short(cursor)
            invalid_data("@#{cursor} - Invalid fill order #{fill_order}") if ((fill_order == 0) or (fill_order > 2))
            metadata( :fill_order => fill_order )
          when 269
            metadata( :document_name => read_ascii(cursor, size) )
          when 270
            metadata( :image_description => read_ascii(cursor, size) )
          when 271
            metadata( :make => read_ascii(cursor, size) )
          when 272
            metadata( :model => read_ascii(cursor, size) )
          when 273
            value_size = ((type == 3) ? 2 : 4)
            nbr.times do |idx|
              @strip_offsets << read_short_or_long(type, cursor+idx*value_size)
            end
            found_relevant_data(extensions)
          when 274
            orientation = read_short(cursor)
            invalid_data("@#{cursor} - Invalid orientation #{orientation}") if ((orientation == 0) or (orientation > 8))
            metadata( :orientation => orientation )
          when 277
            samples_per_pixel = read_short(cursor)
            invalid_data("@#{cursor} - Invalid samples per pixel #{samples_per_pixel}") if (samples_per_pixel == 0)
            metadata( :samples_per_pixel => samples_per_pixel )
          when 278
            rows_per_strip = read_short_or_long(type, cursor)
            invalid_data("@#{cursor} - Invalid rows per strip #{rows_per_strip}") if (rows_per_strip == 0)
            metadata( :rows_per_strip => rows_per_strip )
          when 279
            value_size = ((type == 3) ? 2 : 4)
            nbr.times do |idx|
              @strip_byte_counts << read_short_or_long(type, cursor+idx*value_size)
            end
          when 282
            ratio = read_ratio(cursor)
            invalid_data("@#{cursor} - Invalid x resolution #{ratio}") if (ratio == 0)
            metadata( :x_resolution => ratio )
          when 283
            ratio = read_ratio(cursor)
            invalid_data("@#{cursor} - Invalid y resolution #{ratio}") if (ratio == 0)
            metadata( :y_resolution => ratio )
          when 285
            metadata( :page_name => read_ascii(cursor, size) )
          when 296
            resolution_unit = read_short(cursor)
            invalid_data("@#{cursor} - Invalid resolution unit #{resolution_unit}") if ((resolution_unit == 0) or (resolution_unit > 3))
            metadata( :resolution_unit => resolution_unit )
          when 297
            page_number = read_short(cursor)
            page_total = read_short(cursor+2)
            invalid_data("@#{cursor} - Invalid page total #{page_total}") if (page_total == 0)
            metadata( :page_number => page_number, :page_total => page_total )
          when 305
            metadata( :software => read_ascii(cursor, size) )
          when 306
            metadata( :date_time => read_ascii(cursor, size) )
          when 315
            metadata( :artist => read_ascii(cursor, size) )
          when 316
            metadata( :host_computer => read_ascii(cursor, size) )
          when 324
            nbr.times do |idx|
              @tile_offsets << read_long(cursor+idx*4)
            end
            found_relevant_data(extensions)
          when 325
            nbr.times do |idx|
              @tile_byte_counts << read_long(cursor+idx*4)
            end
          when 337
            metadata( :target_printer => read_ascii(cursor, size) )
          when 33432
            metadata( :copyright => read_ascii(cursor, size) )
          when 33434
            metadata( :exposure_time => read_ratio(cursor) )
          when 33437
            metadata( :f_number => read_ratio(cursor) )
          when 34665
            # This tag points to another IFD, parsed with the same tag parser
            exif_ifd_offset = read_long(cursor)
            metadata( :exif_ifd => true )
            parse_ifd(exif_ifd_offset, &@tag_parser)
          when 34853
            # This tag points to another IFD, parsed with the same tag parser
            gps_ifd_offset = read_long(cursor)
            metadata( :gps_ifd => true )
            parse_ifd(gps_ifd_offset, &@tag_parser)
          when 36864
            metadata( :exif_version => read_ascii(cursor, size) )
          when 36867
            metadata( :date_time_original => read_ascii(cursor, size) )
          when 36868
            metadata( :date_time_digitized => read_ascii(cursor, size) )
          when 37386
            metadata( :focal_length => read_ratio(cursor) )
          when 37510
            metadata( :user_comment => read_ascii(cursor, size) )
          when 37520
            metadata( :subsec_time => read_ascii(cursor, size) )
          when 37521
            metadata( :subsec_time_original => read_ascii(cursor, size) )
          when 37522
            metadata( :subsec_time_digitized => read_ascii(cursor, size) )
          when 40960
            metadata( :flashpix_version => read_ascii(cursor, size) )
          when 40962
            metadata( :pixel_x_dimension => read_short_or_long(type, cursor) )
          when 40963
            metadata( :pixel_y_dimension => read_short_or_long(type, cursor) )
          when 40965
            # This tag points to another IFD, parsed with the same tag parser
            interoperability_ifd_offset = read_long(cursor)
            metadata( :interoperability_ifd => true )
            parse_ifd(interoperability_ifd_offset, &@tag_parser)
          end
        end
        parse_ifd(ifd_offset, &@tag_parser)
        log_debug "@#{@file_offset + @max_end_offset} - Found #{@strip_offsets.size} strips and #{@tile_offsets.size} tiles."
        found_relevant_data(extensions)
        invalid_data("@#{@file_offset + @max_end_offset} - No strips nor tiles defined.") if ((!@accept_no_image_data) and (@strip_offsets.empty?) and (@tile_offsets.empty?))
        # Special case:
        if ((@strip_offsets.size == 1) and
            (@strip_byte_counts.empty?))
          # Compute the strip size: this is the total image size
          invalid_data("@#{@file_offset + @max_end_offset} - Missing strip byte counts and image is compressed") if (@compression != 1)
          invalid_data("@#{@file_offset + @max_end_offset} - Missing image width") if (@image_width == nil)
          invalid_data("@#{@file_offset + @max_end_offset} - Missing image length") if (@image_length == nil)
          # Compute a single row size
          nbr_bits_per_pixel = 0
          all_samples_16 = true
          all_samples_32 = true
          @lst_bits_per_sample.each do |nbr_bits|
            nbr_bits_per_pixel += nbr_bits
            all_samples_16 = false if (nbr_bits != 16)
            all_samples_32 = false if (nbr_bits != 32)
          end
          row_size_bits = @image_width * nbr_bits_per_pixel
          # Compute the padding in bits
          bits_padding = (all_samples_16 ? 16 : (all_samples_32 ? 32 : 8))
          bits_rest = row_size_bits % bits_padding
          row_size_bits += bits_padding - bits_rest if (bits_rest != 0)
          # We have the real row size
          image_end_offset = @strip_offsets[0] + @image_length * (row_size_bits / 8)
          @max_end_offset = image_end_offset if (@max_end_offset < image_end_offset)
        else
          invalid_data("@#{@file_offset + @max_end_offset} - Found #{@strip_offsets.size} strip offsets but #{@strip_byte_counts.size} strip bytes count") if (@strip_offsets.size != @strip_byte_counts.size)
          invalid_data("@#{@file_offset + @max_end_offset} - Found #{@tile_offsets.size} tile offsets but #{@tile_byte_counts.size} tile bytes count") if (@tile_offsets.size != @tile_byte_counts.size)
          # Read all strips
          @strip_offsets.each_with_index do |strip_offset, idx_strip|
            strip_end_offset = strip_offset + @strip_byte_counts[idx_strip]
            @max_end_offset = strip_end_offset if (@max_end_offset < strip_end_offset)
          end
          # Read all tiles
          @tile_offsets.each_with_index do |tile_offset, idx_tile|
            tile_end_offset = tile_offset + @tile_byte_counts[idx_tile]
            @max_end_offset = tile_end_offset if (@max_end_offset < tile_end_offset)
          end
        end

        return @file_offset + @max_end_offset
      end

      private

      # Parse an IFD, and all the IFDs chained after it
      #
      # Parameters::
      # * *ifd_offset* (_Fixnum_): IFD offset to parse
      # * *&proc* (_Proc_): Code called each time a tag is being parsed:
      #   * Parameters::
      #     * *tag* (_Fixnum_): Tag read
      #     * *type* (_Fixnum_): Type of this tag
      #     * *nbr* (_Fixnum_): Number of values in this tag
      #     * *size* (_Fixnum_): Complete size of this tag
      #     * *cursor* (_Fixnum_): Cursor to read the values from
      def parse_ifd(ifd_offset, &proc)
        log_debug "@#{@file_offset + ifd_offset} - Parse IFD"
        visited_ifd_offsets = (@visited_ifd_offsets ||= [])
        while (ifd_offset != 0)
          # Corrupted data can chain an IFD back to one already parsed: refuse it instead of looping forever
          invalid_data("@#{@file_offset + ifd_offset} - IFD already parsed (cyclic IFD reference)") if (visited_ifd_offsets.include?(ifd_offset.to_i))
          visited_ifd_offsets << ifd_offset.to_i
          cursor = @file_offset + ifd_offset
          nbr_entries = read_short(cursor)
          cursor += 2
          nbr_entries.times do |idx_entry|
            # Each entry is 12 bytes long: tag (2), type (2), number of values (4), value or value offset (4)
            tag = read_short(cursor)
            type = read_short(cursor+2)
            nbr = read_long(cursor+4)
            # Compute the size
            invalid_data("@#{cursor} - Invalid type: #{type}") if (!TYPE_SIZES.include?(type))
            size = TYPE_SIZES[type]*nbr
            # Read the offset of the value
            value_offset = read_long(cursor+8)
            log_debug "@#{cursor} - Found tag #{tag} (type #{type}) with #{nbr} values (size #{size}): #{value_offset}"
            if (size > 4)
              # The values do not fit in the entry: they are stored at value_offset
              yield(tag, type, nbr, size, @file_offset + value_offset)
              value_end_offset = value_offset + size
              @max_end_offset = value_end_offset if (@max_end_offset < value_end_offset)
            else
              # The values are stored directly in the last 4 bytes of the entry
              yield(tag, type, nbr, size, cursor + 8)
            end
            cursor += 12
            progress(cursor)
          end
          # 2 bytes for the number of entries + 4 bytes for the next IFD offset
          ifd_end_offset = ifd_offset + 6 + nbr_entries*12
          @max_end_offset = ifd_end_offset if (@max_end_offset < ifd_end_offset)
          # Read the next ifd offset
          ifd_offset = read_long(cursor)
        end
      end

      # Read a 16 bits unsigned value using the endianness of the TIFF being decoded
      #
      # Parameters::
      # * *cursor* (_Fixnum_): The cursor to read from
      # Result::
      # * _Fixnum_: The value, as returned by the 16 bits BinData reader
      def read_short(cursor)
        return @bindata_reader_16.read(@data[cursor..cursor+1])
      end

      # Read a 32 bits unsigned value using the endianness of the TIFF being decoded
      #
      # Parameters::
      # * *cursor* (_Fixnum_): The cursor to read from
      # Result::
      # * _Fixnum_: The value, as returned by the 32 bits BinData reader
      def read_long(cursor)
        return @bindata_reader_32.read(@data[cursor..cursor+3])
      end

      # Read a value stored on 16 bits if its type is 3, or on 32 bits otherwise
      #
      # Parameters::
      # * *type* (_Fixnum_): Type of the tag the value belongs to
      # * *cursor* (_Fixnum_): The cursor to read from
      # Result::
      # * _Fixnum_: The value
      def read_short_or_long(type, cursor)
        return (type == 3) ? read_short(cursor) : read_long(cursor)
      end

      # Read an ASCII value
      #
      # Parameters::
      # * *cursor* (_Fixnum_): The cursor to read from
      # * *size* (_Fixnum_): Size of the string
      # Result::
      # * _String_ or <em>list<String></em>: Resulting string or list of strings if several.
      def read_ascii(cursor, size)
        lst_strings = @data[cursor..cursor+size-1].gsub(TRAILING_00_REGEXP, '').strip.split(NULL_TERMINATING_CHAR)
        return (lst_strings.size == 1) ? lst_strings[0] : lst_strings
      end

      # Read a Rational value
      #
      # Parameters::
      # * *cursor* (_Fixnum_): The cursor to read from
      # Result::
      # * _Float_: The rational (0 if the numerator is 0)
      def read_ratio(cursor)
        num = read_long(cursor)
        denom = read_long(cursor+4)
        invalid_data("@#{cursor} - Invalid rational #{num}/#{denom}") if ((denom == 0) and (num != 0))
        return (num == 0) ? 0 : num.to_f / denom.to_f
      end

    end

  end

end
@@ -0,0 +1,240 @@
module FilesHunter

  module Decoders

    # Decoder looking for text segments (8 bits text, UTF-16 little endian or UTF-16 big endian).
    # Candidates are located from new line characters, then extended backward and forward
    # as long as the surrounding bytes look like text.
    class Text < Decoder

      UTF_16BE_BOM = "\xFE\xFF".force_encoding(Encoding::ASCII_8BIT)
      UTF_16LE_BOM = "\xFF\xFE".force_encoding(Encoding::ASCII_8BIT)
      NULL_CHAR = "\x00".force_encoding(Encoding::ASCII_8BIT)
      NL_CHAR = "\n".force_encoding(Encoding::ASCII_8BIT)

      # Minimal number of characters a text must have to be reported (each character takes 2 bytes in UTF-16)
      MIN_TEXT_CHARS = 512

      # Find segments from a given data
      def find_segments
        current_offset = @begin_offset
        while (current_offset < @end_offset)
          # First find a new line character from current_offset
          newline_offset = @data.index(NL_CHAR, current_offset)
          if ((newline_offset == nil) or
              (newline_offset >= @end_offset))
            # No text
            current_offset = @end_offset
            log_debug "Contains no more Text."
          else
            # We have a candidate
            # Get back to see the beginning of Text
            text_begin_offset, text_header_size, encoding = find_text_begin(newline_offset)
            # Now find forward
            keep_alive
            text_end_offset, truncated = find_text_end(newline_offset, encoding)
            # Consider text files longer than a certain size only
            if (text_end_offset - text_begin_offset >= MIN_TEXT_CHARS*((encoding == Encoding::ASCII_8BIT) ? 1 : 2))
              log_debug "@#{text_begin_offset} - Found text up to #{text_end_offset} with encoding #{encoding} and header of size #{text_header_size}"
              # Now check some formats
              text = @data[text_begin_offset+text_header_size..text_end_offset-1].clone.force_encoding(encoding)
              extension = guess_text_extension(text, encoding)
              # The truncated flag computed while looking for the end of the text is reported with the segment
              found_segment(text_begin_offset, text_end_offset, extension, truncated, false, :encoding => encoding)
            end
            current_offset = text_end_offset + 1
          end
        end
      end

      private

      # Is a byte value considered as part of a text?
      # Accepted values are tab (9), carriage return (13), optionally line feed (10),
      # and every value from 32 up except 127 (values above 127 are accepted).
      #
      # Parameters::
      # * *byte* (_Fixnum_): The byte value
      # * *newline_allowed* (_Boolean_): Is line feed (10) accepted?
      # Result::
      # * _Boolean_: Is it a text byte?
      def text_byte?(byte, newline_allowed)
        return (((byte >= 32) and (byte != 127)) or
                (byte == 9) or
                (byte == 13) or
                (newline_allowed and (byte == 10)))
      end

      # Find the beginning of the text containing a given new line character, and guess its encoding
      #
      # Parameters::
      # * *newline_offset* (_Fixnum_): Offset of the new line character
      # Result::
      # * _Fixnum_: Offset of the beginning of the text (including its header if any)
      # * _Fixnum_: Size of the header (2 if a BOM has been found, 0 otherwise)
      # * _Encoding_: Encoding guessed
      def find_text_begin(newline_offset)
        # Detect if it might be UTF-16 encoded: a \x00 is just before or just after the new line
        utf_16_candidate = (((newline_offset > @begin_offset) and
                             (@data[newline_offset-1] == NULL_CHAR)) or
                            ((newline_offset < @end_offset-1) and
                             (@data[newline_offset+1] == NULL_CHAR)))
        if (!utf_16_candidate)
          # 8 bits text: go back while bytes are text ones (new line excluded)
          cursor = newline_offset - 1
          while ((cursor >= @begin_offset) and
                 (text_byte?(@data[cursor].ord, false)))
            cursor -= 1
          end
          return cursor + 1, 0, Encoding::ASCII_8BIT
        end
        # Cursor should always be on a \x00 unless it arrived at the end
        cursor = newline_offset - 1
        while ((cursor >= @begin_offset+1) and
               (@data[cursor] == NULL_CHAR) and
               (text_byte?(@data[cursor-1].ord, false)))
          cursor -= 2
        end
        # Here we several possibilities:
        # * cursor is on @begin_offset-1 and data begins with \x00: UTF-16 string starts at @begin_offset and is big endian,
        # * else cursor is on @begin_offset and @data[@begin_offset+1] is \x00 but we did not check @data[@begin_offset] (if @data[@begin_offset] is ASCII then it means UTF-16 begins at @begin_offset and is little endian ; otherwise it starts at @begin_offset+1 and is big endian),
        # * else cursor is at least on @begin_offset+1 (we have at least 2 bytes before it), and
        #   * @data[cursor] is not \x00 and @data[cursor+2] is \x00: we could be on the endianness marker, or out of the string already ; if not endianness marker, if @data[cursor+1] is valid ASCI then UTF-16 string starts at cursor+1 and is little endian, otherwise it starts at cursor+2 and is big endian,
        #   * else @data[cursor] is \x00 but preceding character is not ASCII (meaning it can't be the endianness marker either): UTF-16 string starts at cursor end is big endian
        # UTF_16BE = "\xFE\xFF\x00\x??"
        # UTF_16LE = "\xFF\xFE\x??\x00"
        # In following comments, here are the conventions:
        # * \xAA means valid ASCII character
        # * \xBB means not a valid ASCII character
        # * \x11 means a non zero character
        # * \x?? means unknown character
        # * other values represent their corresponding character
        if (cursor == @begin_offset-1)
          # @data[@begin_offset..@begin_offset+1] == \xAA\x00
          return @begin_offset, 0, Encoding::UTF_16LE
        elsif (cursor == @begin_offset)
          # @data[@begin_offset..@begin_offset+2] == \x??\xAA\x00
          first_byte = @data[@begin_offset].ord
          if (first_byte == 0)
            # @data[@begin_offset..@begin_offset+2] == \x00\xAA\x00
            return @begin_offset, 0, Encoding::UTF_16BE
          elsif ((text_byte?(first_byte, false)) and
                 (@data[@begin_offset..@begin_offset+1] == UTF_16BE_BOM))
            # @data[@begin_offset..@begin_offset+2] == \xFE\xFF\x00
            return @begin_offset, 2, Encoding::UTF_16BE
          else
            # @data[@begin_offset..@begin_offset+2] == \xAA\xAA\x00 (without BOM) or \xBB\xAA\x00
            return @begin_offset + 1, 0, Encoding::UTF_16LE
          end
        elsif (@data[cursor] == NULL_CHAR)
          # @data[cursor-1..cursor+2] == \xBB\x00\xAA\x00
          return cursor, 0, Encoding::UTF_16BE
        elsif (@data[cursor-1..cursor] == UTF_16LE_BOM)
          # @data[cursor-1..cursor+2] == \xFF\xFE\xAA\x00
          return cursor - 1, 2, Encoding::UTF_16LE
        elsif (@data[cursor..cursor+1] == UTF_16BE_BOM)
          # @data[cursor-1..cursor+2] == \x??\xFE\xFF\x00
          return cursor, 2, Encoding::UTF_16BE
        else
          # @data[cursor-1..cursor+2] == \x??\x11\xAA\x00
          return cursor + 1, 0, Encoding::UTF_16LE
        end
      end

      # Find the end of the text containing a given new line character
      #
      # Parameters::
      # * *newline_offset* (_Fixnum_): Offset of the new line character
      # * *encoding* (_Encoding_): Encoding guessed for this text
      # Result::
      # * _Fixnum_: Offset of the end of the text
      # * _Boolean_: Is the text truncated (a UTF-16 character is cut by the end of the data)?
      def find_text_end(newline_offset, encoding)
        text_end_offset = nil
        truncated = false
        case encoding
        when Encoding::ASCII_8BIT
          cursor = newline_offset + 1
          while ((cursor < @end_offset) and
                 (text_byte?(@data[cursor].ord, true)))
            cursor += 1
          end
          text_end_offset = cursor
        when Encoding::UTF_16BE
          # cursor points on \x00
          cursor = newline_offset + 1
          while ((cursor < @end_offset-1) and
                 (@data[cursor] == NULL_CHAR) and
                 (text_byte?(@data[cursor+1].ord, true)))
            cursor += 2
          end
          # Several possibilities:
          # * cursor is at @end_offset, meaning the string ends at @end_offset,
          # * else cursor is at @end_offset-1, meaning the string ends at @end_offset-1 or at @end_offset and is truncated if @data[@end_offset-1] is "\x00",
          # * else the string ends at cursor
          if (cursor == @end_offset-1)
            if (@data[cursor] == NULL_CHAR)
              truncated = true
              text_end_offset = @end_offset
            else
              text_end_offset = @end_offset - 1
            end
          else
            text_end_offset = cursor
          end
        when Encoding::UTF_16LE
          # cursor points on the ASCII value
          cursor = newline_offset
          while ((cursor < @end_offset-1) and
                 (@data[cursor+1] == NULL_CHAR) and
                 (text_byte?(@data[cursor].ord, true)))
            cursor += 2
          end
          # Several possibilities:
          # * cursor is at @end_offset, meaning the string ends at @end_offset,
          # * else cursor is at @end_offset-1, meaning the string ends at @end_offset-1 or at @end_offset and is truncated if @data[@end_offset-1] is a valid ASCII,
          # * else the string ends at cursor
          if (cursor == @end_offset-1)
            if (text_byte?(@data[cursor].ord, true))
              truncated = true
              text_end_offset = @end_offset
            else
              text_end_offset = @end_offset - 1
            end
          else
            text_end_offset = cursor
          end
        end
        return text_end_offset, truncated
      end

      # Guess the extension of a text from its first lines
      #
      # Parameters::
      # * *text* (_String_): The text, tagged with its encoding
      # * *encoding* (_Encoding_): The text encoding
      # Result::
      # * _Symbol_ or <em>list<Symbol></em>: The extension(s) guessed
      def guess_text_extension(text, encoding)
        lines = text.split("\r\n".encode(encoding))
        lines = text.split("\n".encode(encoding)) if (lines.size == 1)
        if is_text_srt?(lines, encoding)
          return :srt
        elsif is_text_rtf?(lines, encoding)
          return :rtf
        elsif is_text_html?(lines, encoding)
          return :html
        elsif is_text_xml?(lines, encoding)
          return :xml
        end
        return [ :txt, :log ] # By default
      end

      # Do lines look like an SRT subtitles file (a number, then a time range)?
      def is_text_srt?(lines, encoding)
        # TODO (Ruby bug): Replace [0-9] with \d when it will work in UTF_16LE encoding
        return ((lines[0] =~ Regexp.new('^[0-9]+$'.encode(encoding))) and
                (lines[1] =~ Regexp.new('^[0-9][0-9]:[0-9][0-9]:[0-9][0-9],[0-9][0-9][0-9] --> [0-9][0-9]:[0-9][0-9]:[0-9][0-9],[0-9][0-9][0-9]$'.encode(encoding))))
      end

      # Does the first line contain an RTF header?
      def is_text_rtf?(lines, encoding)
        return (lines[0] =~ Regexp.new('{\\\\rtf'.encode(encoding)))
      end

      # Does the first line contain an HTML doctype?
      def is_text_html?(lines, encoding)
        return lines[0] =~ Regexp.new('<!DOCTYPE html'.encode(encoding))
      end

      # Does the first line contain an XML declaration?
      def is_text_xml?(lines, encoding)
        # The ? has to be escaped: otherwise it makes the < optional and any line containing "xml " matches
        return lines[0] =~ Regexp.new('<\?xml '.encode(encoding))
      end

    end

  end

end
@@ -0,0 +1,50 @@
module FilesHunter

  # A segment represents a chunk of data
  class Segment

    # Begin offset of the segment
    #   Fixnum
    attr_reader :begin_offset

    # End offset of the segment (equals the begin offset of the next segment)
    #   Fixnum
    attr_reader :end_offset

    # List of extensions guessed (sort by descending probability) (:mkv, :dll ...). :unknown used to unknown data.
    #   list<Symbol>
    attr_reader :extensions

    # Is this segment truncated? This means that for the given extension, data should have continued beyond this segment.
    #   Boolean
    attr_reader :truncated

    # Is this segment missing previous data? This means that for the given extension, data should have been present already before this segment.
    #   Boolean
    attr_reader :missing_previous_data

    # Metadata associated to this Segment (Decoder dependent)
    #   map< Symbol, Object >
    attr_reader :metadata

    # Constructor
    #
    # Parameters::
    # * *begin_offset* (_Fixnum_): Specify begin offset
    # * *end_offset* (_Fixnum_): Specify end offset
    # * *extension* (_Symbol_ or <em>list<Symbol></em>): Specify extension
    # * *truncated* (_Boolean_): Specify truncated flag
    # * *missing_previous_data* (_Boolean_): Do we lack data before this segment?
    # * *metadata* (<em>map<Symbol,Object></em>): Metadata (Decoder dependent)
    def initialize(begin_offset, end_offset, extension, truncated, missing_previous_data, metadata)
      @begin_offset = begin_offset
      @end_offset = end_offset
      # A single extension is stored as a list of 1 element, so that extensions is always a list
      @extensions = (extension.is_a?(Symbol)) ? [ extension ] : extension
      @truncated = truncated
      @missing_previous_data = missing_previous_data
      @metadata = metadata
    end

  end

end