fileshunter 0.1.0.20130725

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,350 @@
1
+ module FilesHunter
2
+
3
+ module Decoders
4
+
5
+ class TIFF < BeginPatternDecoder
6
+
7
+ # TIFF Reference: http://partners.adobe.com/public/developer/en/tiff/TIFF6.pdf
8
+ # Exif Reference: http://www.exif.org/Exif2-2.PDF
9
+
10
# Headers of little endian ("II") and big endian ("MM") TIFF data
BEGIN_PATTERN_TIFF_LE = "II*\x00".force_encoding(Encoding::ASCII_8BIT).freeze
BEGIN_PATTERN_TIFF_BE = "MM\x00*".force_encoding(Encoding::ASCII_8BIT).freeze
# Binary pattern matching any of the 2 headers.
# Regexp::NOENCODING is the option equivalent of the 'n' language code: the 3 arguments form of Regexp.new is not accepted anymore by recent Ruby versions.
BEGIN_PATTERN_TIFF = Regexp.new(
  "(#{Regexp.escape(BEGIN_PATTERN_TIFF_LE)}|#{Regexp.escape(BEGIN_PATTERN_TIFF_BE)})".force_encoding(Encoding::ASCII_8BIT),
  Regexp::NOENCODING
)

# Size in bytes of 1 value, per IFD entry type
TYPE_SIZES = {
  1 => 1,
  2 => 1,
  3 => 2,
  4 => 4,
  5 => 8,
  6 => 1,
  7 => 1,
  8 => 2,
  9 => 4,
  10 => 8,
  11 => 4,
  12 => 8
}.freeze

VALID_COMPRESSION_VALUES = [ 1, 2, 3, 4, 5, 6, 32773 ].freeze
VALID_PHOTOMETRIC_INTERPRETATIONS = [ 0, 1, 2, 3, 4, 5, 6, 8 ].freeze

# Null characters ending a value.
# Anchored with \z (end of string): $ would also match before each new line character of the value.
TRAILING_00_REGEXP = Regexp.new("\x00*\\z".force_encoding(Encoding::ASCII_8BIT), Regexp::NOENCODING)
NULL_TERMINATING_CHAR = "\x00".force_encoding(Encoding::ASCII_8BIT).freeze
34
+
35
# Constructor.
# Once the parent class is initialized, the decoder is set to refuse TIFF data defining no image data (see accept_no_image_data).
def initialize
  super
  @accept_no_image_data = false
end
39
+
40
# Get the pattern identifying the beginning of TIFF data
#
# Result::
# * _Regexp_: The pattern matching both little and big endian TIFF headers
# * <em>map<Symbol,Object></em>: Options given along with the pattern (:offset_inc and :max_regexp_size, both equal to 4)
def get_begin_pattern
  pattern_options = {
    :offset_inc => 4,
    :max_regexp_size => 4
  }
  return BEGIN_PATTERN_TIFF, pattern_options
end
43
+
44
# Make this decoder accept TIFF data that defines neither strips nor tiles.
# Decoders reusing this one need it (for example JPEG, for its Exif info).
def accept_no_image_data
  @accept_no_image_data = true
end
49
+
50
# Decode the TIFF data beginning at a given offset.
# The IFDs are parsed to collect metadata and locate strips and tiles: the furthest data they reference gives the end of the TIFF data.
#
# Parameters::
# * *offset* (_Fixnum_): Offset of the TIFF header
# Result::
# * _Fixnum_: Offset of the end of the TIFF data
def decode(offset)
  @file_offset = offset
  # The header gives the byte order of every value that follows
  little_endian = (@data[offset..offset+3] == BEGIN_PATTERN_TIFF_LE)
  @bindata_reader_16 = (little_endian ? BinData::Uint16le : BinData::Uint16be)
  @bindata_reader_32 = (little_endian ? BinData::Uint32le : BinData::Uint32be)
  ifd_offset = @bindata_reader_32.read(@data[offset+4..offset+7])
  extensions = [:tif, :tiff] # By default
  @max_end_offset = ifd_offset
  @strip_offsets = []
  @strip_byte_counts = []
  @tile_offsets = []
  @tile_byte_counts = []
  @compression = 1
  @lst_bits_per_sample = [1]
  @image_width = nil
  @image_length = nil
  # Readers of 1 value located at a given position
  read_short = lambda { |position| @bindata_reader_16.read(@data[position..position+1]) }
  read_long = lambda { |position| @bindata_reader_32.read(@data[position..position+3]) }
  # Values of type 3 are read on 2 bytes, values of other types on 4 bytes
  read_short_or_long = lambda { |type, position| (type == 3) ? read_short.call(position) : read_long.call(position) }
  # Tags sharing the same processing, along with the metadata key they set:
  # * Tags whose raw bytes are kept as they are
  raw_tags = { 2 => :gps_latitude, 4 => :gps_longitude, 6 => :gps_altitude }
  # * Tags read as ASCII values
  ascii_tags = {
    269 => :document_name,
    270 => :image_description,
    271 => :make,
    272 => :model,
    285 => :page_name,
    305 => :software,
    306 => :date_time,
    315 => :artist,
    316 => :host_computer,
    337 => :target_printer,
    33432 => :copyright,
    36864 => :exif_version,
    36867 => :date_time_original,
    36868 => :date_time_digitized,
    37510 => :user_comment,
    37520 => :subsec_time,
    37521 => :subsec_time_original,
    37522 => :subsec_time_digitized,
    40960 => :flashpix_version
  }
  # * Tags read as rationals without any validation
  ratio_tags = { 33434 => :exposure_time, 33437 => :f_number, 37386 => :focal_length }
  # * Tags read as 1 short or long value without any validation
  dimension_tags = { 40962 => :pixel_x_dimension, 40963 => :pixel_y_dimension }
  # * Tags giving the offset of another IFD, parsed with the same tag parser
  sub_ifd_tags = { 34665 => :exif_ifd, 34853 => :gps_ifd, 40965 => :interoperability_ifd }
  @tag_parser = Proc.new do |tag, type, nbr, size, cursor|
    tag_id = tag.to_i
    if raw_tags.key?(tag_id)
      metadata( raw_tags[tag_id] => @data[cursor..cursor+size-1] )
    elsif ascii_tags.key?(tag_id)
      metadata( ascii_tags[tag_id] => read_ascii(cursor, size) )
    elsif ratio_tags.key?(tag_id)
      metadata( ratio_tags[tag_id] => read_ratio(cursor) )
    elsif dimension_tags.key?(tag_id)
      metadata( dimension_tags[tag_id] => read_short_or_long.call(type, cursor) )
    elsif sub_ifd_tags.key?(tag_id)
      sub_ifd_offset = read_long.call(cursor)
      metadata( sub_ifd_tags[tag_id] => true )
      parse_ifd(sub_ifd_offset, &@tag_parser)
    else
      case tag_id
      when 7
        # 3 consecutive rationals
        metadata( :gps_timestamp => [
          read_ratio(cursor),
          read_ratio(cursor+8),
          read_ratio(cursor+16),
        ] )
      when 256
        @image_width = read_short_or_long.call(type, cursor)
        invalid_data("@#{cursor} - Invalid image width #{@image_width}") if (@image_width == 0)
        metadata( :image_width => @image_width )
      when 257
        @image_length = read_short_or_long.call(type, cursor)
        invalid_data("@#{cursor} - Invalid image length #{@image_length}") if (@image_length == 0)
        metadata( :image_length => @image_length )
      when 258
        @lst_bits_per_sample = []
        nbr.times do |idx_sample|
          @lst_bits_per_sample << read_short.call(cursor + 2*idx_sample)
        end
        metadata( :lst_bits_per_sample => @lst_bits_per_sample )
      when 259
        @compression = read_short.call(cursor)
        invalid_data("@#{cursor} - Invalid compression #{@compression}") unless VALID_COMPRESSION_VALUES.include?(@compression)
        metadata( :compression => @compression )
      when 262
        photometric_interpretation = read_short.call(cursor)
        invalid_data("@#{cursor} - Invalid photometric interpretation #{photometric_interpretation}") unless VALID_PHOTOMETRIC_INTERPRETATIONS.include?(photometric_interpretation)
        metadata( :photometric_interpretation => photometric_interpretation )
      when 264
        cell_width = read_short.call(cursor)
        invalid_data("@#{cursor} - Invalid cell width #{cell_width}") if (cell_width == 0)
        metadata( :cell_width => cell_width )
      when 265
        cell_length = read_short.call(cursor)
        invalid_data("@#{cursor} - Invalid cell length #{cell_length}") if (cell_length == 0)
        metadata( :cell_length => cell_length )
      when 266
        fill_order = read_short.call(cursor)
        invalid_data("@#{cursor} - Invalid fill order #{fill_order}") if ((fill_order == 0) || (fill_order > 2))
        metadata( :fill_order => fill_order )
      when 273
        value_size = ((type == 3) ? 2 : 4)
        nbr.times do |idx|
          @strip_offsets << read_short_or_long.call(type, cursor + idx*value_size)
        end
        found_relevant_data(extensions)
      when 274
        orientation = read_short.call(cursor)
        invalid_data("@#{cursor} - Invalid orientation #{orientation}") if ((orientation == 0) || (orientation > 8))
        metadata( :orientation => orientation )
      when 277
        samples_per_pixel = read_short.call(cursor)
        invalid_data("@#{cursor} - Invalid samples per pixel #{samples_per_pixel}") if (samples_per_pixel == 0)
        metadata( :samples_per_pixel => samples_per_pixel )
      when 278
        rows_per_strip = read_short_or_long.call(type, cursor)
        invalid_data("@#{cursor} - Invalid rows per strip #{rows_per_strip}") if (rows_per_strip == 0)
        metadata( :rows_per_strip => rows_per_strip )
      when 279
        value_size = ((type == 3) ? 2 : 4)
        nbr.times do |idx|
          @strip_byte_counts << read_short_or_long.call(type, cursor + idx*value_size)
        end
      when 282
        ratio = read_ratio(cursor)
        invalid_data("@#{cursor} - Invalid x resolution #{ratio}") if (ratio == 0)
        metadata( :x_resolution => ratio )
      when 283
        ratio = read_ratio(cursor)
        invalid_data("@#{cursor} - Invalid y resolution #{ratio}") if (ratio == 0)
        metadata( :y_resolution => ratio )
      when 296
        resolution_unit = read_short.call(cursor)
        invalid_data("@#{cursor} - Invalid resolution unit #{resolution_unit}") if ((resolution_unit == 0) || (resolution_unit > 3))
        metadata( :resolution_unit => resolution_unit )
      when 297
        page_number = read_short.call(cursor)
        page_total = read_short.call(cursor+2)
        invalid_data("@#{cursor} - Invalid page total #{page_total}") if (page_total == 0)
        metadata( :page_number => page_number, :page_total => page_total )
      when 324
        nbr.times do |idx|
          @tile_offsets << read_long.call(cursor + idx*4)
        end
        found_relevant_data(extensions)
      when 325
        nbr.times do |idx|
          @tile_byte_counts << read_long.call(cursor + idx*4)
        end
      end
    end
  end
  parse_ifd(ifd_offset, &@tag_parser)
  log_debug "@#{@file_offset + @max_end_offset} - Found #{@strip_offsets.size} strips and #{@tile_offsets.size} tiles."
  found_relevant_data(extensions)
  no_image_data = (@strip_offsets.empty? && @tile_offsets.empty?)
  invalid_data("@#{@file_offset + @max_end_offset} - No strips nor tiles defined.") if ((!@accept_no_image_data) && no_image_data)
  if ((@strip_offsets.size == 1) && @strip_byte_counts.empty?)
    # Special case: the size of the single strip is not given, so it is the size of the complete image
    invalid_data("@#{@file_offset + @max_end_offset} - Missing strip byte counts and image is compressed") if (@compression != 1)
    invalid_data("@#{@file_offset + @max_end_offset} - Missing image width") if @image_width.nil?
    invalid_data("@#{@file_offset + @max_end_offset} - Missing image length") if @image_length.nil?
    # Size of a single row, in bits
    nbr_bits_per_pixel = @lst_bits_per_sample.inject(0) { |nbr_bits_sum, nbr_bits| nbr_bits_sum + nbr_bits }
    row_size_bits = @image_width * nbr_bits_per_pixel
    # Rows are padded on 16 or 32 bits when all samples have this size, on 8 bits otherwise
    bits_padding =
      if @lst_bits_per_sample.all? { |nbr_bits| nbr_bits == 16 }
        16
      elsif @lst_bits_per_sample.all? { |nbr_bits| nbr_bits == 32 }
        32
      else
        8
      end
    bits_rest = row_size_bits % bits_padding
    row_size_bits += bits_padding - bits_rest if (bits_rest != 0)
    # We have the real row size
    image_end_offset = @strip_offsets[0] + @image_length * (row_size_bits / 8)
    @max_end_offset = image_end_offset if (@max_end_offset < image_end_offset)
  else
    invalid_data("@#{@file_offset + @max_end_offset} - Found #{@strip_offsets.size} strip offsets but #{@strip_byte_counts.size} strip bytes count") if (@strip_offsets.size != @strip_byte_counts.size)
    invalid_data("@#{@file_offset + @max_end_offset} - Found #{@tile_offsets.size} tile offsets but #{@tile_byte_counts.size} tile bytes count") if (@tile_offsets.size != @tile_byte_counts.size)
    # The data ends after the furthest strip or tile
    @strip_offsets.zip(@strip_byte_counts).each do |strip_offset, strip_byte_count|
      strip_end_offset = strip_offset + strip_byte_count
      @max_end_offset = strip_end_offset if (@max_end_offset < strip_end_offset)
    end
    @tile_offsets.zip(@tile_byte_counts).each do |tile_offset, tile_byte_count|
      tile_end_offset = tile_offset + tile_byte_count
      @max_end_offset = tile_end_offset if (@max_end_offset < tile_end_offset)
    end
  end

  return @file_offset + @max_end_offset
end
274
+
275
+ private
276
+
277
# Parse an IFD, and the ones chained after it.
# An IFD referenced again while it is still being parsed (through the chain of next IFDs, or through a nested
# parsing triggered by the code called on tags) is reported as invalid data instead of being parsed endlessly.
#
# Parameters::
# * *ifd_offset* (_Fixnum_): IFD offset to parse
# * *&proc* (_Proc_): Code called each time a tag is being parsed:
#   * Parameters::
#   * *tag* (_Fixnum_): Tag read
#   * *type* (_Fixnum_): Type of this tag
#   * *nbr* (_Fixnum_): Number of values in this tag
#   * *size* (_Fixnum_): Complete size of this tag
#   * *cursor* (_Fixnum_): Cursor to read the values from
def parse_ifd(ifd_offset, &proc)
  log_debug "@#{@file_offset + ifd_offset} - Parse IFD"
  # Positions of the IFDs being parsed, including the ones of the enclosing parse_ifd calls
  @ifds_being_parsed ||= []
  nbr_enclosing_ifds = @ifds_being_parsed.size
  begin
    while (ifd_offset != 0)
      cursor = @file_offset + ifd_offset
      invalid_data("@#{cursor} - IFD is referenced cyclically") if @ifds_being_parsed.include?(cursor)
      @ifds_being_parsed << cursor
      nbr_entries = @bindata_reader_16.read(@data[cursor..cursor+1])
      cursor += 2
      nbr_entries.times do |idx_entry|
        # Each entry takes 12 bytes: tag (2), type (2), number of values (4), value or offset of the value (4)
        tag = @bindata_reader_16.read(@data[cursor..cursor+1])
        type = @bindata_reader_16.read(@data[cursor+2..cursor+3])
        nbr = @bindata_reader_32.read(@data[cursor+4..cursor+7])
        # Compute the size
        invalid_data("@#{cursor} - Invalid type: #{type}") if (!TYPE_SIZES.include?(type))
        size = TYPE_SIZES[type]*nbr
        # Read the offset of the value
        value_offset = @bindata_reader_32.read(@data[cursor+8..cursor+11])
        log_debug "@#{cursor} - Found tag #{tag} (type #{type}) with #{nbr} values (size #{size}): #{value_offset}"
        if (size > 4)
          # The value does not fit in the entry: it is stored at value_offset
          yield(tag, type, nbr, size, @file_offset + value_offset)
          value_end_offset = value_offset + size
          @max_end_offset = value_end_offset if (@max_end_offset < value_end_offset)
        else
          # The value is stored in the entry itself
          yield(tag, type, nbr, size, cursor + 8)
        end
        cursor += 12
        progress(cursor)
      end
      # An IFD is made of the number of entries (2 bytes), the entries, and the offset of the next IFD (4 bytes)
      ifd_end_offset = ifd_offset + 6 + nbr_entries*12
      @max_end_offset = ifd_end_offset if (@max_end_offset < ifd_end_offset)
      # Read the next ifd offset
      ifd_offset = @bindata_reader_32.read(@data[cursor..cursor+3])
    end
  ensure
    # Forget the IFDs of this chain, even if the parsing has been interrupted
    @ifds_being_parsed.slice!(nbr_enclosing_ifds..-1)
  end
end
320
+
321
# Read an ASCII value.
# Trailing null characters and surrounding white spaces are removed, then the value is split around the remaining null characters.
#
# Parameters::
# * *cursor* (_Fixnum_): The cursor to read from
# * *size* (_Fixnum_): Size of the string
# Result::
# * _String_ or <em>list<String></em>: The string if the value contains exactly 1 string, or the list of strings otherwise.
def read_ascii(cursor, size)
  raw_value = @data[cursor..cursor+size-1]
  trimmed_value = raw_value.gsub(TRAILING_00_REGEXP, '').strip
  lst_strings = trimmed_value.split(NULL_TERMINATING_CHAR)
  if (lst_strings.size == 1)
    return lst_strings[0]
  else
    return lst_strings
  end
end
332
+
333
# Read a Rational value, stored as 2 consecutive 32 bits values: numerator then denominator.
# A null denominator with a non null numerator is reported as invalid data.
#
# Parameters::
# * *cursor* (_Fixnum_): The cursor to read from
# Result::
# * _Float_: The rational (0.0 when the numerator is null, whatever the denominator)
def read_ratio(cursor)
  num = @bindata_reader_32.read(@data[cursor..cursor+3])
  denom = @bindata_reader_32.read(@data[cursor+4..cursor+7])
  invalid_data("@#{cursor} - Invalid rational #{num}/#{denom}") if ((denom == 0) and (num != 0))
  # Always return a Float, as documented, even for a null numerator
  return 0.0 if (num == 0)
  return num.to_f / denom.to_f
end
345
+
346
+ end
347
+
348
+ end
349
+
350
+ end
@@ -0,0 +1,240 @@
1
+ module FilesHunter
2
+
3
+ module Decoders
4
+
5
+ class Text < Decoder
6
+
7
# Byte order marks of UTF-16 texts (binary strings)
UTF_16BE_BOM = [0xFE, 0xFF].pack('C*')
UTF_16LE_BOM = [0xFF, 0xFE].pack('C*')
# Null and new line characters (binary strings)
NULL_CHAR = [0x00].pack('C')
NL_CHAR = [0x0A].pack('C')
11
+
12
# Find segments from a given data.
# Text is looked for around each new line character found between @begin_offset and @end_offset: its beginning
# is searched backward, its end forward. Text can be encoded in ASCII (8 bits), UTF-16BE or UTF-16LE.
# Only text of at least 512 characters is reported as a segment.
def find_segments
  # Bytes accepted as text: values from 32 (except 127), tabs and carriage returns
  text_byte = lambda { |byte| ((byte >= 32) && (byte != 127)) || (byte == 9) || (byte == 13) }
  # Same bytes, plus line feeds
  text_or_newline_byte = lambda { |byte| (byte == 10) || text_byte.call(byte) }
  current_offset = @begin_offset
  while (current_offset < @end_offset)
    # First find a new line character from current_offset
    newline_offset = @data.index(NL_CHAR, current_offset)
    if (newline_offset.nil? || (newline_offset >= @end_offset))
      # No text
      current_offset = @end_offset
      log_debug "Contains no more Text."
    else
      # We have a candidate
      # Get back to see the beginning of Text
      text_begin_offset = nil
      text_header_size = 0
      # Detect if it might be UTF-16 encoded
      if (((newline_offset > @begin_offset) && (@data[newline_offset-1] == NULL_CHAR)) ||
          ((newline_offset < @end_offset-1) && (@data[newline_offset+1] == NULL_CHAR)))
        # Cursor should always be on a \x00 unless it arrived at the end
        cursor = newline_offset - 1
        while ((cursor >= @begin_offset+1) &&
               (@data[cursor] == NULL_CHAR) &&
               text_byte.call(@data[cursor-1].ord))
          cursor -= 2
        end
        # Here we have several possibilities:
        # * cursor is on @begin_offset-1: UTF-16 string starts at @begin_offset and is little endian,
        # * else cursor is on @begin_offset and @data[@begin_offset+1] is a text byte followed by \x00, but @data[@begin_offset] was not checked,
        # * else cursor is at least on @begin_offset+1 (we have at least 2 bytes before it), and
        #   * @data[cursor] is \x00 but preceding byte is not a text byte: UTF-16 string starts at cursor and is big endian,
        #   * else @data[cursor] is not \x00: we could be on the endianness marker, or out of the string already.
        # UTF_16BE = "\xFE\xFF\x00\x??"
        # UTF_16LE = "\xFF\xFE\x??\x00"
        # In following comments, here are the conventions:
        # * \xAA means a text byte
        # * \xBB means not a text byte
        # * \x11 means a non zero character
        # * \x?? means unknown character
        # * other values represent their corresponding character
        if (cursor == @begin_offset-1)
          # @data[@begin_offset..@begin_offset+1] == \xAA\x00
          text_begin_offset = @begin_offset
          encoding = Encoding::UTF_16LE
        elsif (cursor == @begin_offset)
          # @data[@begin_offset..@begin_offset+2] == \x??\xAA\x00
          first_byte = @data[@begin_offset].ord
          if (first_byte == 0)
            # @data[@begin_offset..@begin_offset+2] == \x00\xAA\x00
            text_begin_offset = @begin_offset
            encoding = Encoding::UTF_16BE
          elsif (text_byte.call(first_byte) && (@data[@begin_offset..@begin_offset+1] == UTF_16BE_BOM))
            # @data[@begin_offset..@begin_offset+2] == \xFE\xFF\x00
            text_begin_offset = @begin_offset
            encoding = Encoding::UTF_16BE
            text_header_size = 2
          else
            # @data[@begin_offset..@begin_offset+2] == \x11\xAA\x00, without endianness marker
            text_begin_offset = @begin_offset + 1
            encoding = Encoding::UTF_16LE
          end
        elsif (@data[cursor] == NULL_CHAR)
          # @data[cursor-1..cursor+2] == \xBB\x00\xAA\x00
          text_begin_offset = cursor
          encoding = Encoding::UTF_16BE
        elsif (@data[cursor-1..cursor] == UTF_16LE_BOM)
          # @data[cursor-1..cursor+2] == \xFF\xFE\xAA\x00
          text_begin_offset = cursor - 1
          encoding = Encoding::UTF_16LE
          text_header_size = 2
        elsif (@data[cursor..cursor+1] == UTF_16BE_BOM)
          # @data[cursor-1..cursor+2] == \x??\xFE\xFF\x00
          text_begin_offset = cursor
          encoding = Encoding::UTF_16BE
          text_header_size = 2
        else
          # @data[cursor-1..cursor+2] == \x??\x11\xAA\x00
          text_begin_offset = cursor + 1
          encoding = Encoding::UTF_16LE
        end
      else
        encoding = Encoding::ASCII_8BIT
        cursor = newline_offset - 1
        while ((cursor >= @begin_offset) && text_byte.call(@data[cursor].ord))
          cursor -= 1
        end
        text_begin_offset = cursor + 1
      end
      # Now find forward
      keep_alive
      text_end_offset = nil
      # Set when the data ends in the middle of a UTF-16 character
      truncated = false
      case encoding
      when Encoding::ASCII_8BIT
        cursor = newline_offset + 1
        while ((cursor < @end_offset) && text_or_newline_byte.call(@data[cursor].ord))
          cursor += 1
        end
        text_end_offset = cursor
      when Encoding::UTF_16BE
        # cursor points on \x00
        cursor = newline_offset + 1
        while ((cursor < @end_offset-1) &&
               (@data[cursor] == NULL_CHAR) &&
               text_or_newline_byte.call(@data[cursor+1].ord))
          cursor += 2
        end
        # Several possibilities:
        # * cursor is at @end_offset, meaning the string ends at @end_offset,
        # * else cursor is at @end_offset-1, meaning the string ends at @end_offset-1 or at @end_offset and is truncated if @data[@end_offset-1] is "\x00",
        # * else the string ends at cursor
        if (cursor == @end_offset-1)
          if (@data[cursor] == NULL_CHAR)
            truncated = true
            text_end_offset = @end_offset
          else
            text_end_offset = @end_offset - 1
          end
        else
          text_end_offset = cursor
        end
      when Encoding::UTF_16LE
        # cursor points on the ASCII value
        cursor = newline_offset
        while ((cursor < @end_offset-1) &&
               (@data[cursor+1] == NULL_CHAR) &&
               text_or_newline_byte.call(@data[cursor].ord))
          cursor += 2
        end
        # Several possibilities:
        # * cursor is at @end_offset, meaning the string ends at @end_offset,
        # * else cursor is at @end_offset-1, meaning the string ends at @end_offset-1 or at @end_offset and is truncated if @data[@end_offset-1] is a text byte,
        # * else the string ends at cursor
        if (cursor == @end_offset-1)
          if text_or_newline_byte.call(@data[cursor].ord)
            truncated = true
            text_end_offset = @end_offset
          else
            text_end_offset = @end_offset - 1
          end
        else
          text_end_offset = cursor
        end
      end
      # Consider text files longer than a certain size only
      min_text_size = 512*((encoding == Encoding::ASCII_8BIT) ? 1 : 2)
      if (text_end_offset - text_begin_offset >= min_text_size)
        log_debug "@#{text_begin_offset} - Found text up to #{text_end_offset} with encoding #{encoding} and header of size #{text_header_size}"
        # Now check some formats
        # The last byte of a truncated text is half a character: leave it out of the decoded text, that would be an invalid string otherwise
        text_last_offset = text_end_offset - 1
        text_last_offset -= 1 if truncated
        text = @data[text_begin_offset+text_header_size..text_last_offset].clone.force_encoding(encoding)
        lines = text.split("\r\n".encode(encoding))
        lines = text.split("\n".encode(encoding)) if (lines.size == 1)
        extension = [ :txt, :log ] # By default
        if is_text_srt?(lines, encoding)
          extension = :srt
        elsif is_text_rtf?(lines, encoding)
          extension = :rtf
        elsif is_text_html?(lines, encoding)
          extension = :html
        elsif is_text_xml?(lines, encoding)
          extension = :xml
        end
        # NOTE(review): 4th and 5th arguments are presumably the truncated and missing previous data flags, in the same order as Segment's constructor - confirm against found_segment
        found_segment(text_begin_offset, text_end_offset, extension, truncated, false, :encoding => encoding)
      end
      current_offset = text_end_offset + 1
    end
  end
end
215
+
216
+ private
217
+
218
# Do lines look like SRT subtitles?
# The first line has to be a number, and the second one a timing ("00:00:01,000 --> 00:00:04,000").
#
# Parameters::
# * *lines* (<em>list<String></em>): The lines of text
# * *encoding* (_Encoding_): Encoding of the lines
# Result::
# * _Object_: Evaluates to true if the lines look like SRT subtitles
def is_text_srt?(lines, encoding)
  # TODO (Ruby bug): Replace [0-9] with \d when it will work in UTF_16LE encoding
  return ((lines[0] =~ Regexp.new('^[0-9]+$'.encode(encoding))) and
    (lines[1] =~ Regexp.new('^[0-9][0-9]:[0-9][0-9]:[0-9][0-9],[0-9][0-9][0-9] --> [0-9][0-9]:[0-9][0-9]:[0-9][0-9],[0-9][0-9][0-9]$'.encode(encoding))))
end
223
+
224
# Do lines look like RTF? The first line has to contain "{\rtf".
#
# Parameters::
# * *lines* (<em>list<String></em>): The lines of text
# * *encoding* (_Encoding_): Encoding of the lines
# Result::
# * _Fixnum_ or _nil_: Position of the RTF header in the first line, or nil if none
def is_text_rtf?(lines, encoding)
  rtf_header_pattern = Regexp.escape('{\rtf').encode(encoding)
  return (lines[0] =~ Regexp.new(rtf_header_pattern))
end
227
+
228
# Do lines look like HTML? The first line has to contain an HTML doctype.
# The doctype is matched whatever its case, as "<!doctype html>" is as valid as "<!DOCTYPE html>".
#
# Parameters::
# * *lines* (<em>list<String></em>): The lines of text
# * *encoding* (_Encoding_): Encoding of the lines
# Result::
# * _Fixnum_ or _nil_: Position of the doctype in the first line, or nil if none
def is_text_html?(lines, encoding)
  return lines[0] =~ Regexp.new('<!DOCTYPE html'.encode(encoding), Regexp::IGNORECASE)
end
231
+
232
# Do lines look like XML? The first line has to contain the beginning of an XML declaration ("<?xml ").
#
# Parameters::
# * *lines* (<em>list<String></em>): The lines of text
# * *encoding* (_Encoding_): Encoding of the lines
# Result::
# * _Fixnum_ or _nil_: Position of the XML declaration in the first line, or nil if none
def is_text_xml?(lines, encoding)
  # The declaration has to be escaped: a raw "?" would be a quantifier making the "<" optional
  return lines[0] =~ Regexp.new(Regexp.escape('<?xml ').encode(encoding))
end
235
+
236
+ end
237
+
238
+ end
239
+
240
+ end
@@ -0,0 +1,50 @@
1
+ module FilesHunter
2
+
3
# A segment represents a chunk of data
class Segment

  # Attributes, all read-only:
  # * *begin_offset* (_Fixnum_): Begin offset of the segment
  # * *end_offset* (_Fixnum_): End offset of the segment (equals the begin offset of the next segment)
  # * *extensions* (<em>list<Symbol></em>): List of extensions guessed (sort by descending probability) (:mkv, :dll ...). :unknown used to unknown data.
  # * *truncated* (_Boolean_): Is this segment truncated? This means that for the given extension, data should have continued beyond this segment.
  # * *missing_previous_data* (_Boolean_): Is this segment missing previous data? This means that for the given extension, data should have been present already before this segment.
  # * *metadata* (<em>map<Symbol,Object></em>): Metadata associated to this Segment (Decoder dependent)
  attr_reader :begin_offset, :end_offset, :extensions, :truncated, :missing_previous_data, :metadata

  # Constructor
  #
  # Parameters::
  # * *begin_offset* (_Fixnum_): Specify begin offset
  # * *end_offset* (_Fixnum_): Specify end offset
  # * *extension* (_Symbol_ or <em>list<Symbol></em>): Specify extension. A single Symbol is stored as a list of 1 extension.
  # * *truncated* (_Boolean_): Specify truncated flag
  # * *missing_previous_data* (_Boolean_): Do we lack data before this segment?
  # * *metadata* (<em>map<Symbol,Object></em>): Metadata (Decoder dependent)
  def initialize(begin_offset, end_offset, extension, truncated, missing_previous_data, metadata)
    @begin_offset, @end_offset = begin_offset, end_offset
    @extensions =
      case extension
      when Symbol
        [ extension ]
      else
        extension
      end
    @truncated, @missing_previous_data = truncated, missing_previous_data
    @metadata = metadata
  end

end
49
+
50
+ end