fileshunter 0.1.0.20130725
Sign up to get free protection for your applications and to get access to all the features.
- data/AUTHORS +3 -0
- data/ChangeLog +5 -0
- data/Credits +21 -0
- data/LICENSE +31 -0
- data/README +15 -0
- data/README.md +11 -0
- data/Rakefile +7 -0
- data/ReleaseInfo +8 -0
- data/bin/fileshunt +216 -0
- data/ext/fileshunter/Decoders/_FLAC.c +233 -0
- data/ext/fileshunter/Decoders/extconf.rb +3 -0
- data/lib/fileshunter/BeginPatternDecoder.rb +218 -0
- data/lib/fileshunter/Decoder.rb +66 -0
- data/lib/fileshunter/Decoders/ASF.rb +50 -0
- data/lib/fileshunter/Decoders/BMP.rb +118 -0
- data/lib/fileshunter/Decoders/CAB.rb +140 -0
- data/lib/fileshunter/Decoders/CFBF.rb +92 -0
- data/lib/fileshunter/Decoders/EBML.rb +369 -0
- data/lib/fileshunter/Decoders/EXE.rb +505 -0
- data/lib/fileshunter/Decoders/FLAC.rb +387 -0
- data/lib/fileshunter/Decoders/ICO.rb +71 -0
- data/lib/fileshunter/Decoders/JPEG.rb +247 -0
- data/lib/fileshunter/Decoders/M2V.rb +30 -0
- data/lib/fileshunter/Decoders/MP3.rb +341 -0
- data/lib/fileshunter/Decoders/MP4.rb +620 -0
- data/lib/fileshunter/Decoders/MPG_Video.rb +30 -0
- data/lib/fileshunter/Decoders/OGG.rb +74 -0
- data/lib/fileshunter/Decoders/RIFF.rb +437 -0
- data/lib/fileshunter/Decoders/TIFF.rb +350 -0
- data/lib/fileshunter/Decoders/Text.rb +240 -0
- data/lib/fileshunter/Segment.rb +50 -0
- data/lib/fileshunter/SegmentsAnalyzer.rb +251 -0
- data/lib/fileshunter.rb +15 -0
- metadata +130 -0
@@ -0,0 +1,350 @@
|
|
1
|
+
module FilesHunter
|
2
|
+
|
3
|
+
module Decoders
|
4
|
+
|
5
|
+
class TIFF < BeginPatternDecoder
|
6
|
+
|
7
|
+
# TIFF Reference: http://partners.adobe.com/public/developer/en/tiff/TIFF6.pdf
|
8
|
+
# Exif Reference: http://www.exif.org/Exif2-2.PDF
|
9
|
+
|
10
|
+
# Header magic of a little endian TIFF ("II", then 42 on 2 bytes)
BEGIN_PATTERN_TIFF_LE = "II*\x00".force_encoding(Encoding::ASCII_8BIT).freeze
# Header magic of a big endian TIFF ("MM", then 42 on 2 bytes)
BEGIN_PATTERN_TIFF_BE = "MM\x00*".force_encoding(Encoding::ASCII_8BIT).freeze
# Binary regexp matching any of the 2 header magics.
# The encoding is given through the options flag: the form of Regexp.new taking a third
# (language) argument is not accepted anymore by recent Ruby versions.
BEGIN_PATTERN_TIFF = Regexp.new("(#{Regexp.escape(BEGIN_PATTERN_TIFF_LE)}|#{Regexp.escape(BEGIN_PATTERN_TIFF_BE)})".force_encoding(Encoding::ASCII_8BIT), Regexp::NOENCODING)

# Size in bytes of 1 value, per IFD entry type
TYPE_SIZES = {
  1 => 1,
  2 => 1,
  3 => 2,
  4 => 4,
  5 => 8,
  6 => 1,
  7 => 1,
  8 => 2,
  9 => 4,
  10 => 8,
  11 => 4,
  12 => 8
}.freeze

# Values accepted for the compression tag (259)
VALID_COMPRESSION_VALUES = [ 1, 2, 3, 4, 5, 6, 32773 ].freeze
# Values accepted for the photometric interpretation tag (262)
VALID_PHOTOMETRIC_INTERPRETATIONS = [ 0, 1, 2, 3, 4, 5, 6, 8 ].freeze

# Binary regexp matching the \x00 characters ending a line
TRAILING_00_REGEXP = Regexp.new("\x00*$".force_encoding(Encoding::ASCII_8BIT), Regexp::NOENCODING)
# Separator of the strings stored in an ASCII value
NULL_TERMINATING_CHAR = "\x00".force_encoding(Encoding::ASCII_8BIT).freeze
|
34
|
+
|
35
|
+
def initialize
  super()
  # Image data (strips or tiles) is mandatory, unless #accept_no_image_data gets called
  @accept_no_image_data = false
end
|
39
|
+
|
40
|
+
def get_begin_pattern
  # The header magic is 4 bytes long, for both byte orders
  pattern_options = { :offset_inc => 4, :max_regexp_size => 4 }
  [ BEGIN_PATTERN_TIFF, pattern_options ]
end
|
43
|
+
|
44
|
+
# Set this decoder to accept no image data.
|
45
|
+
# This is particularly useful for other decoders using it (for example with JPEG and its Exif info)
|
46
|
+
def accept_no_image_data
  # From now on, decode does not reject data defining neither strips nor tiles
  @accept_no_image_data = true
end
|
49
|
+
|
50
|
+
# Decode the TIFF structure whose header sits at a given offset of @data.
# All IFDs are parsed (Exif, GPS and interoperability sub-IFDs included), metadata is reported
# for the known tags, and the furthest offset referenced by IFDs, values, strips and tiles is computed.
#
# Parameters::
# * *offset* (_Fixnum_): Offset of the TIFF header in @data
# Result::
# * _Fixnum_: Offset in @data of the end of the TIFF data
def decode(offset)
  @file_offset = offset
  @bindata_reader_16 = nil
  @bindata_reader_32 = nil
  # The header magic gives the byte order: anything but the little endian magic is read as big endian
  if (@data[offset..offset+3] == BEGIN_PATTERN_TIFF_LE)
    @bindata_reader_16, @bindata_reader_32 = BinData::Uint16le, BinData::Uint32le
  else
    @bindata_reader_16, @bindata_reader_32 = BinData::Uint16be, BinData::Uint32be
  end
  # Readers of 2 and 4 bytes values, from a position in @data
  read_short = lambda { |pos| @bindata_reader_16.read(@data[pos..pos+1]) }
  read_long = lambda { |pos| @bindata_reader_32.read(@data[pos..pos+3]) }
  # Some tags are stored on 2 bytes when their type is 3, and on 4 bytes otherwise
  read_short_or_long = lambda { |type, pos| (type == 3) ? read_short.call(pos) : read_long.call(pos) }
  # Tags whose value is reported as an ASCII metadata
  ascii_tags = {
    269 => :document_name,
    270 => :image_description,
    271 => :make,
    272 => :model,
    285 => :page_name,
    305 => :software,
    306 => :date_time,
    315 => :artist,
    316 => :host_computer,
    337 => :target_printer,
    33432 => :copyright,
    36864 => :exif_version,
    36867 => :date_time_original,
    36868 => :date_time_digitized,
    37510 => :user_comment,
    37520 => :subsec_time,
    37521 => :subsec_time_original,
    37522 => :subsec_time_digitized,
    40960 => :flashpix_version
  }
  # Tags whose value is reported as a rational metadata, without any validation
  ratio_tags = {
    33434 => :exposure_time,
    33437 => :f_number,
    37386 => :focal_length
  }
  # Tags whose value is the offset of another IFD, parsed with the same tag parser
  sub_ifd_tags = {
    34665 => :exif_ifd,
    34853 => :gps_ifd,
    40965 => :interoperability_ifd
  }
  ifd_offset = read_long.call(offset+4)
  extensions = [:tif, :tiff] # By default
  @max_end_offset = ifd_offset
  @strip_offsets = []
  @strip_byte_counts = []
  @tile_offsets = []
  @tile_byte_counts = []
  @compression = 1
  @lst_bits_per_sample = [1]
  @image_width = nil
  @image_length = nil
  @tag_parser = Proc.new do |tag, type, nbr, size, cursor|
    tag_id = tag.to_i
    if ascii_tags.has_key?(tag_id)
      metadata( ascii_tags[tag_id] => read_ascii(cursor, size) )
    elsif ratio_tags.has_key?(tag_id)
      metadata( ratio_tags[tag_id] => read_ratio(cursor) )
    elsif sub_ifd_tags.has_key?(tag_id)
      sub_ifd_offset = read_long.call(cursor)
      metadata( sub_ifd_tags[tag_id] => true )
      parse_ifd(sub_ifd_offset, &@tag_parser)
    else
      case tag_id
      when 2
        metadata( :gps_latitude => @data[cursor..cursor+size-1] )
      when 4
        metadata( :gps_longitude => @data[cursor..cursor+size-1] )
      when 6
        metadata( :gps_altitude => @data[cursor..cursor+size-1] )
      when 7
        # 3 consecutive rationals of 8 bytes each
        metadata( :gps_timestamp => [0, 8, 16].map { |shift| read_ratio(cursor+shift) } )
      when 256
        @image_width = read_short_or_long.call(type, cursor)
        invalid_data("@#{cursor} - Invalid image width #{@image_width}") if (@image_width == 0)
        metadata( :image_width => @image_width )
      when 257
        @image_length = read_short_or_long.call(type, cursor)
        invalid_data("@#{cursor} - Invalid image length #{@image_length}") if (@image_length == 0)
        metadata( :image_length => @image_length )
      when 258
        @lst_bits_per_sample = []
        nbr.times { |idx_sample| @lst_bits_per_sample << read_short.call(cursor+2*idx_sample) }
        metadata( :lst_bits_per_sample => @lst_bits_per_sample )
      when 259
        @compression = read_short.call(cursor)
        invalid_data("@#{cursor} - Invalid compression #{@compression}") if (!VALID_COMPRESSION_VALUES.include?(@compression))
        metadata( :compression => @compression )
      when 262
        photometric_interpretation = read_short.call(cursor)
        invalid_data("@#{cursor} - Invalid photometric interpretation #{photometric_interpretation}") if (!VALID_PHOTOMETRIC_INTERPRETATIONS.include?(photometric_interpretation))
        metadata( :photometric_interpretation => photometric_interpretation )
      when 264
        cell_width = read_short.call(cursor)
        invalid_data("@#{cursor} - Invalid cell width #{cell_width}") if (cell_width == 0)
        metadata( :cell_width => cell_width )
      when 265
        cell_length = read_short.call(cursor)
        invalid_data("@#{cursor} - Invalid cell length #{cell_length}") if (cell_length == 0)
        metadata( :cell_length => cell_length )
      when 266
        fill_order = read_short.call(cursor)
        invalid_data("@#{cursor} - Invalid fill order #{fill_order}") if ((fill_order == 0) or (fill_order > 2))
        metadata( :fill_order => fill_order )
      when 273
        value_size = ((type == 3) ? 2 : 4)
        nbr.times { |idx| @strip_offsets << read_short_or_long.call(type, cursor+idx*value_size) }
        found_relevant_data(extensions)
      when 274
        orientation = read_short.call(cursor)
        invalid_data("@#{cursor} - Invalid orientation #{orientation}") if ((orientation == 0) or (orientation > 8))
        metadata( :orientation => orientation )
      when 277
        samples_per_pixel = read_short.call(cursor)
        invalid_data("@#{cursor} - Invalid samples per pixel #{samples_per_pixel}") if (samples_per_pixel == 0)
        metadata( :samples_per_pixel => samples_per_pixel )
      when 278
        rows_per_strip = read_short_or_long.call(type, cursor)
        invalid_data("@#{cursor} - Invalid rows per strip #{rows_per_strip}") if (rows_per_strip == 0)
        metadata( :rows_per_strip => rows_per_strip )
      when 279
        value_size = ((type == 3) ? 2 : 4)
        nbr.times { |idx| @strip_byte_counts << read_short_or_long.call(type, cursor+idx*value_size) }
      when 282
        x_resolution = read_ratio(cursor)
        invalid_data("@#{cursor} - Invalid x resolution #{x_resolution}") if (x_resolution == 0)
        metadata( :x_resolution => x_resolution )
      when 283
        y_resolution = read_ratio(cursor)
        invalid_data("@#{cursor} - Invalid y resolution #{y_resolution}") if (y_resolution == 0)
        metadata( :y_resolution => y_resolution )
      when 296
        resolution_unit = read_short.call(cursor)
        invalid_data("@#{cursor} - Invalid resolution unit #{resolution_unit}") if ((resolution_unit == 0) or (resolution_unit > 3))
        metadata( :resolution_unit => resolution_unit )
      when 297
        page_number = read_short.call(cursor)
        page_total = read_short.call(cursor+2)
        invalid_data("@#{cursor} - Invalid page total #{page_total}") if (page_total == 0)
        metadata( :page_number => page_number, :page_total => page_total )
      when 324
        nbr.times { |idx| @tile_offsets << read_long.call(cursor+idx*4) }
        found_relevant_data(extensions)
      when 325
        nbr.times { |idx| @tile_byte_counts << read_long.call(cursor+idx*4) }
      when 40962
        metadata( :pixel_x_dimension => read_short_or_long.call(type, cursor) )
      when 40963
        metadata( :pixel_y_dimension => read_short_or_long.call(type, cursor) )
      end
    end
  end
  parse_ifd(ifd_offset, &@tag_parser)
  log_debug "@#{@file_offset + @max_end_offset} - Found #{@strip_offsets.size} strips and #{@tile_offsets.size} tiles."
  found_relevant_data(extensions)
  invalid_data("@#{@file_offset + @max_end_offset} - No strips nor tiles defined.") if ((!@accept_no_image_data) and (@strip_offsets.empty?) and (@tile_offsets.empty?))
  if ((@strip_offsets.size == 1) and (@strip_byte_counts.empty?))
    # Special case: a single strip without any byte count.
    # The strip size is then the size of the whole image, computable for uncompressed data only.
    invalid_data("@#{@file_offset + @max_end_offset} - Missing strip byte counts and image is compressed") if (@compression != 1)
    invalid_data("@#{@file_offset + @max_end_offset} - Missing image width") if (@image_width == nil)
    invalid_data("@#{@file_offset + @max_end_offset} - Missing image length") if (@image_length == nil)
    # Size of a single row, in bits
    nbr_bits_per_pixel = @lst_bits_per_sample.inject(0) { |total_bits, nbr_bits| total_bits + nbr_bits }
    all_samples_16 = @lst_bits_per_sample.all? { |nbr_bits| nbr_bits == 16 }
    all_samples_32 = @lst_bits_per_sample.all? { |nbr_bits| nbr_bits == 32 }
    row_size_bits = @image_width * nbr_bits_per_pixel
    # Rows are padded on 16 (or 32) bits when all samples use 16 (or 32) bits, and on 8 bits otherwise
    bits_padding = (all_samples_16 ? 16 : (all_samples_32 ? 32 : 8))
    bits_rest = row_size_bits % bits_padding
    row_size_bits += bits_padding - bits_rest if (bits_rest != 0)
    # We have the real row size
    image_end_offset = @strip_offsets[0] + @image_length * (row_size_bits / 8)
    @max_end_offset = image_end_offset if (@max_end_offset < image_end_offset)
  else
    invalid_data("@#{@file_offset + @max_end_offset} - Found #{@strip_offsets.size} strip offsets but #{@strip_byte_counts.size} strip bytes count") if (@strip_offsets.size != @strip_byte_counts.size)
    invalid_data("@#{@file_offset + @max_end_offset} - Found #{@tile_offsets.size} tile offsets but #{@tile_byte_counts.size} tile bytes count") if (@tile_offsets.size != @tile_byte_counts.size)
    # The data ends after the furthest strip...
    @strip_offsets.each_with_index do |strip_offset, idx_strip|
      strip_end_offset = strip_offset + @strip_byte_counts[idx_strip]
      @max_end_offset = strip_end_offset if (@max_end_offset < strip_end_offset)
    end
    # ... or after the furthest tile
    @tile_offsets.each_with_index do |tile_offset, idx_tile|
      tile_end_offset = tile_offset + @tile_byte_counts[idx_tile]
      @max_end_offset = tile_end_offset if (@max_end_offset < tile_end_offset)
    end
  end

  @file_offset + @max_end_offset
end
|
274
|
+
|
275
|
+
private
|
276
|
+
|
277
|
+
# Parse an IFD
|
278
|
+
#
|
279
|
+
# Parameters::
|
280
|
+
# * *ifd_offset* (_Fixnum_): IFD offset to parse
|
281
|
+
# * *&proc* (_Proc_): Code called each time a tag is being parsed:
|
282
|
+
# * Parameters::
|
283
|
+
# * *tag* (_Fixnum_): Tag read
|
284
|
+
# * *type* (_Fixnum_): Type of this tag
|
285
|
+
# * *nbr* (_Fixnum_): Number of values in this tag
|
286
|
+
# * *size* (_Fixnum_): Complete size of this tag
|
287
|
+
# * *cursor* (_Fixnum_): Cursor to read the values from
|
288
|
+
def parse_ifd(ifd_offset, &proc)
  log_debug "@#{@file_offset + ifd_offset} - Parse IFD"
  # Offsets of the IFDs already parsed in this chain.
  # Corrupted data can make a "next IFD" offset reference an IFD that was already parsed:
  # following it would loop forever, so it is reported as invalid data instead.
  parsed_ifd_offsets = []
  while (ifd_offset != 0)
    invalid_data("@#{@file_offset + ifd_offset} - IFD chain loops back on offset #{ifd_offset}") if (parsed_ifd_offsets.include?(ifd_offset.to_i))
    parsed_ifd_offsets << ifd_offset.to_i
    # An IFD is made of a number of entries (2 bytes), the entries (12 bytes each) and the offset of the next IFD (4 bytes)
    cursor = @file_offset + ifd_offset
    nbr_entries = @bindata_reader_16.read(@data[cursor..cursor+1])
    cursor += 2
    nbr_entries.times do
      # An entry is made of a tag (2 bytes), a type (2 bytes), a number of values (4 bytes) and a value or offset (4 bytes)
      tag = @bindata_reader_16.read(@data[cursor..cursor+1])
      type = @bindata_reader_16.read(@data[cursor+2..cursor+3])
      nbr = @bindata_reader_32.read(@data[cursor+4..cursor+7])
      # Compute the size
      invalid_data("@#{cursor} - Invalid type: #{type}") if (!TYPE_SIZES.include?(type))
      size = TYPE_SIZES[type]*nbr
      # Read the offset of the value
      value_offset = @bindata_reader_32.read(@data[cursor+8..cursor+11])
      log_debug "@#{cursor} - Found tag #{tag} (type #{type}) with #{nbr} values (size #{size}): #{value_offset}"
      if (size > 4)
        # The values do not fit in the entry: they are stored at value_offset
        yield(tag, type, nbr, size, @file_offset + value_offset)
        value_end_offset = value_offset + size
        @max_end_offset = value_end_offset if (@max_end_offset < value_end_offset)
      else
        # The values are stored in the entry itself
        yield(tag, type, nbr, size, cursor + 8)
      end
      cursor += 12
      progress(cursor)
    end
    ifd_end_offset = ifd_offset + 6 + nbr_entries*12
    @max_end_offset = ifd_end_offset if (@max_end_offset < ifd_end_offset)
    # Read the next ifd offset
    ifd_offset = @bindata_reader_32.read(@data[cursor..cursor+3])
  end
end
|
320
|
+
|
321
|
+
# Read an ASCII value
|
322
|
+
#
|
323
|
+
# Parameters::
|
324
|
+
# * *cursor* (_Fixnum_): The cursor to read from
|
325
|
+
# * *size* (_Fixnum_): Size of the string
|
326
|
+
# Result::
|
327
|
+
# * _String_ or <em>list<String></em>: Resulting string or list of strings if several.
|
328
|
+
def read_ascii(cursor, size)
  raw_value = @data[cursor..cursor+size-1]
  # Drop the \x00 padding and surrounding blanks before splitting on the \x00 separators
  cleaned_value = raw_value.gsub(TRAILING_00_REGEXP, '').strip
  strings = cleaned_value.split(NULL_TERMINATING_CHAR)
  # A lone string is returned as is, anything else as a list
  if (strings.size == 1)
    strings[0]
  else
    strings
  end
end
|
332
|
+
|
333
|
+
# Read a Rational value
|
334
|
+
#
|
335
|
+
# Parameters::
|
336
|
+
# * *cursor* (_Fixnum_): The cursor to read from
|
337
|
+
# Result::
|
338
|
+
# * _Float_: The rational
|
339
|
+
def read_ratio(cursor)
  # A rational is stored as 2 consecutive 4 bytes values: numerator then denominator
  numerator, denominator = [cursor, cursor+4].map { |position| @bindata_reader_32.read(@data[position..position+3]) }
  if (numerator == 0)
    # Null numerator: the result is 0 whatever the denominator is (0 included)
    0
  else
    invalid_data("@#{cursor} - Invalid rational #{numerator}/#{denominator}") if (denominator == 0)
    numerator.to_f / denominator.to_f
  end
end
|
345
|
+
|
346
|
+
end
|
347
|
+
|
348
|
+
end
|
349
|
+
|
350
|
+
end
|
@@ -0,0 +1,240 @@
|
|
1
|
+
module FilesHunter
|
2
|
+
|
3
|
+
module Decoders
|
4
|
+
|
5
|
+
class Text < Decoder
|
6
|
+
|
7
|
+
# Byte order marks that can start a UTF-16 text
UTF_16BE_BOM = "\xFE\xFF".force_encoding(Encoding::BINARY)
UTF_16LE_BOM = "\xFF\xFE".force_encoding(Encoding::BINARY)
# Single bytes the text detection looks for in the binary data
NULL_CHAR = "\x00".force_encoding(Encoding::BINARY)
NL_CHAR = "\n".force_encoding(Encoding::BINARY)
|
11
|
+
|
12
|
+
# Find segments from a given data
|
13
|
+
def find_segments
  # Scan @data between @begin_offset and @end_offset for text of at least 512 characters
  # (ASCII, UTF-16BE or UTF-16LE), and report each one through found_segment.

  # Bytes accepted inside a line of text: any byte from 32 up, except 127, plus tab (9) and carriage return (13)
  text_byte = lambda { |byte| ((byte >= 32) and (byte != 127)) or (byte == 9) or (byte == 13) }
  # Same bytes, line feed (10) included
  text_or_newline_byte = lambda { |byte| (text_byte.call(byte)) or (byte == 10) }

  current_offset = @begin_offset
  while (current_offset < @end_offset)
    # Every candidate is looked for around a new line character
    newline_offset = @data.index(NL_CHAR, current_offset)
    if ((newline_offset == nil) or
        (newline_offset >= @end_offset))
      # No text
      current_offset = @end_offset
      log_debug "Contains no more Text."
    else
      # We have a candidate: get back to see the beginning of Text
      text_begin_offset = nil
      text_header_size = 0
      # Detect if it might be UTF-16 encoded: a \x00 sits next to the new line
      if (((newline_offset > @begin_offset) and
           (@data[newline_offset-1] == NULL_CHAR)) or
          ((newline_offset < @end_offset-1) and
           (@data[newline_offset+1] == NULL_CHAR)))
        # Cursor should always be on a \x00 unless it arrived at the end
        cursor = newline_offset - 1
        while ((cursor >= @begin_offset+1) and
               (@data[cursor] == NULL_CHAR) and
               (text_byte.call(@data[cursor-1].ord)))
          cursor -= 2
        end
        # In following comments, here are the conventions:
        # * \xAA means a byte accepted as text
        # * \xBB means a byte not accepted as text
        # * \x11 means a non zero character
        # * \x?? means unknown character
        # * other values represent their corresponding character
        if (cursor == @begin_offset-1)
          # @data[@begin_offset..@begin_offset+1] == \xAA\x00
          text_begin_offset = @begin_offset
          encoding = Encoding::UTF_16LE
        elsif (cursor == @begin_offset)
          # @data[@begin_offset..@begin_offset+2] == \x??\xAA\x00
          if (@data[@begin_offset].ord == 0)
            # @data[@begin_offset..@begin_offset+2] == \x00\xAA\x00
            text_begin_offset = @begin_offset
            encoding = Encoding::UTF_16BE
          elsif (@data[@begin_offset..@begin_offset+1] == UTF_16BE_BOM)
            # @data[@begin_offset..@begin_offset+2] == \xFE\xFF\x00
            text_begin_offset = @begin_offset
            encoding = Encoding::UTF_16BE
            text_header_size = 2
          else
            # @data[@begin_offset..@begin_offset+2] == \x11\xAA\x00: the first byte is not part of the text
            text_begin_offset = @begin_offset + 1
            encoding = Encoding::UTF_16LE
          end
        elsif (@data[cursor] == NULL_CHAR)
          # @data[cursor-1..cursor+2] == \xBB\x00\xAA\x00
          text_begin_offset = cursor
          encoding = Encoding::UTF_16BE
        elsif (@data[cursor-1..cursor] == UTF_16LE_BOM)
          # @data[cursor-1..cursor+2] == \xFF\xFE\xAA\x00
          text_begin_offset = cursor - 1
          encoding = Encoding::UTF_16LE
          text_header_size = 2
        elsif (@data[cursor..cursor+1] == UTF_16BE_BOM)
          # @data[cursor-1..cursor+2] == \x??\xFE\xFF\x00
          text_begin_offset = cursor
          encoding = Encoding::UTF_16BE
          text_header_size = 2
        else
          # @data[cursor-1..cursor+2] == \x??\x11\xAA\x00
          text_begin_offset = cursor + 1
          encoding = Encoding::UTF_16LE
        end
      else
        encoding = Encoding::ASCII_8BIT
        cursor = newline_offset - 1
        while ((cursor >= @begin_offset) and
               (text_byte.call(@data[cursor].ord)))
          cursor -= 1
        end
        text_begin_offset = cursor + 1
      end
      # Now find forward
      keep_alive
      text_end_offset = nil
      # Set when a UTF-16 text is cut by @end_offset in the middle of a character
      truncated = false
      case encoding
      when Encoding::ASCII_8BIT
        cursor = newline_offset + 1
        while ((cursor < @end_offset) and
               (text_or_newline_byte.call(@data[cursor].ord)))
          cursor += 1
        end
        text_end_offset = cursor
      when Encoding::UTF_16BE
        # cursor points on \x00
        cursor = newline_offset + 1
        while ((cursor < @end_offset-1) and
               (@data[cursor] == NULL_CHAR) and
               (text_or_newline_byte.call(@data[cursor+1].ord)))
          cursor += 2
        end
        # Several possibilities:
        # * cursor is at @end_offset, meaning the string ends at @end_offset,
        # * else cursor is at @end_offset-1, meaning the string ends at @end_offset-1 or at @end_offset and is truncated if @data[@end_offset-1] is "\x00",
        # * else the string ends at cursor
        if (cursor == @end_offset-1)
          if (@data[cursor] == NULL_CHAR)
            truncated = true
            text_end_offset = @end_offset
          else
            text_end_offset = @end_offset - 1
          end
        else
          text_end_offset = cursor
        end
      when Encoding::UTF_16LE
        # cursor points on the ASCII value
        cursor = newline_offset
        while ((cursor < @end_offset-1) and
               (@data[cursor+1] == NULL_CHAR) and
               (text_or_newline_byte.call(@data[cursor].ord)))
          cursor += 2
        end
        # Several possibilities:
        # * cursor is at @end_offset, meaning the string ends at @end_offset,
        # * else cursor is at @end_offset-1, meaning the string ends at @end_offset-1 or at @end_offset and is truncated if @data[@end_offset-1] is accepted as text,
        # * else the string ends at cursor
        if (cursor == @end_offset-1)
          if (text_or_newline_byte.call(@data[cursor].ord))
            truncated = true
            text_end_offset = @end_offset
          else
            text_end_offset = @end_offset - 1
          end
        else
          text_end_offset = cursor
        end
      end
      # Consider text files longer than a certain size only
      min_text_size = 512*((encoding == Encoding::ASCII_8BIT) ? 1 : 2)
      if (text_end_offset - text_begin_offset >= min_text_size)
        log_debug "@#{text_begin_offset} - Found text up to #{text_end_offset} with encoding #{encoding} and header of size #{text_header_size}"
        # Now check some formats.
        # A truncated text ends with the first byte of a 2 bytes character: leave this byte out
        # of the decoded text, so that the format checks get complete characters only.
        text_last_offset = text_end_offset - 1
        text_last_offset -= 1 if truncated
        text = @data[text_begin_offset+text_header_size..text_last_offset].clone.force_encoding(encoding)
        lines = text.split("\r\n".encode(encoding))
        lines = text.split("\n".encode(encoding)) if (lines.size == 1)
        extension = [ :txt, :log ] # By default
        if is_text_srt?(lines, encoding)
          extension = :srt
        elsif is_text_rtf?(lines, encoding)
          extension = :rtf
        elsif is_text_html?(lines, encoding)
          extension = :html
        elsif is_text_xml?(lines, encoding)
          extension = :xml
        end
        # Report the truncated flag computed above (it used to be always reported as false)
        found_segment(text_begin_offset, text_end_offset, extension, truncated, false, :encoding => encoding)
      end
      current_offset = text_end_offset + 1
    end
  end
end
|
215
|
+
|
216
|
+
private
|
217
|
+
|
218
|
+
def is_text_srt?(lines, encoding)
  # Check that the first 2 lines are a subtitle index followed by its timing line.
  # TODO (Ruby bug): Replace [0-9] with \d when it will work in UTF_16LE encoding
  # Both patterns use [0-9] for that reason (the index pattern used to rely on \d).
  index_regexp = Regexp.new('^[0-9]+$'.encode(encoding))
  timing_regexp = Regexp.new('^[0-9][0-9]:[0-9][0-9]:[0-9][0-9],[0-9][0-9][0-9] --> [0-9][0-9]:[0-9][0-9]:[0-9][0-9],[0-9][0-9][0-9]$'.encode(encoding))
  return ((lines[0] =~ index_regexp) and
          (lines[1] =~ timing_regexp))
end
|
223
|
+
|
224
|
+
def is_text_rtf?(lines, encoding)
  # RTF is recognized by the {\rtf marker, looked for in the first line only
  rtf_regexp = Regexp.new('{\\\\rtf'.encode(encoding))
  lines[0] =~ rtf_regexp
end
|
227
|
+
|
228
|
+
def is_text_html?(lines, encoding)
  # HTML is recognized by its doctype declaration, looked for in the first line only
  doctype_regexp = Regexp.new('<!DOCTYPE html'.encode(encoding))
  lines[0] =~ doctype_regexp
end
|
231
|
+
|
232
|
+
def is_text_xml?(lines, encoding)
  # XML is recognized by its <?xml declaration, looked for in the first line only.
  # The marker is escaped: left as is, its "?" makes the "<" optional, and any line containing "xml " matches.
  xml_regexp = Regexp.new(Regexp.escape('<?xml ').encode(encoding))
  return lines[0] =~ xml_regexp
end
|
235
|
+
|
236
|
+
end
|
237
|
+
|
238
|
+
end
|
239
|
+
|
240
|
+
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
module FilesHunter
|
2
|
+
|
3
|
+
# A segment represents a chunk of data
|
4
|
+
class Segment

  # Offset at which the segment begins
  #   Fixnum
  attr_reader :begin_offset

  # Offset at which the segment ends (this is the begin offset of the next segment)
  #   Fixnum
  attr_reader :end_offset

  # Extensions guessed for this segment, the most probable first (:mkv, :dll ...). :unknown is used for unknown data.
  #   list<Symbol>
  attr_reader :extensions

  # True when, for the given extension, data should have continued beyond this segment.
  #   Boolean
  attr_reader :truncated

  # True when, for the given extension, data should already have been present before this segment.
  #   Boolean
  attr_reader :missing_previous_data

  # Metadata attached to this segment (its content depends on the Decoder)
  #   map< Symbol, Object >
  attr_reader :metadata

  # Constructor
  #
  # Parameters::
  # * *begin_offset* (_Fixnum_): Begin offset of the segment
  # * *end_offset* (_Fixnum_): End offset of the segment
  # * *extension* (_Symbol_ or <em>list<Symbol></em>): Extension, or list of extensions
  # * *truncated* (_Boolean_): Is the segment truncated?
  # * *missing_previous_data* (_Boolean_): Is the segment missing data before it?
  # * *metadata* (<em>map<Symbol,Object></em>): Metadata (Decoder dependent)
  def initialize(begin_offset, end_offset, extension, truncated, missing_previous_data, metadata)
    @begin_offset, @end_offset = begin_offset, end_offset
    # A single extension is stored as a list of 1 element
    @extensions =
      case extension
      when Symbol
        [ extension ]
      else
        extension
      end
    @truncated, @missing_previous_data = truncated, missing_previous_data
    @metadata = metadata
  end

end
|
49
|
+
|
50
|
+
end
|