fileshunter 0.1.0.20130725
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/AUTHORS +3 -0
- data/ChangeLog +5 -0
- data/Credits +21 -0
- data/LICENSE +31 -0
- data/README +15 -0
- data/README.md +11 -0
- data/Rakefile +7 -0
- data/ReleaseInfo +8 -0
- data/bin/fileshunt +216 -0
- data/ext/fileshunter/Decoders/_FLAC.c +233 -0
- data/ext/fileshunter/Decoders/extconf.rb +3 -0
- data/lib/fileshunter/BeginPatternDecoder.rb +218 -0
- data/lib/fileshunter/Decoder.rb +66 -0
- data/lib/fileshunter/Decoders/ASF.rb +50 -0
- data/lib/fileshunter/Decoders/BMP.rb +118 -0
- data/lib/fileshunter/Decoders/CAB.rb +140 -0
- data/lib/fileshunter/Decoders/CFBF.rb +92 -0
- data/lib/fileshunter/Decoders/EBML.rb +369 -0
- data/lib/fileshunter/Decoders/EXE.rb +505 -0
- data/lib/fileshunter/Decoders/FLAC.rb +387 -0
- data/lib/fileshunter/Decoders/ICO.rb +71 -0
- data/lib/fileshunter/Decoders/JPEG.rb +247 -0
- data/lib/fileshunter/Decoders/M2V.rb +30 -0
- data/lib/fileshunter/Decoders/MP3.rb +341 -0
- data/lib/fileshunter/Decoders/MP4.rb +620 -0
- data/lib/fileshunter/Decoders/MPG_Video.rb +30 -0
- data/lib/fileshunter/Decoders/OGG.rb +74 -0
- data/lib/fileshunter/Decoders/RIFF.rb +437 -0
- data/lib/fileshunter/Decoders/TIFF.rb +350 -0
- data/lib/fileshunter/Decoders/Text.rb +240 -0
- data/lib/fileshunter/Segment.rb +50 -0
- data/lib/fileshunter/SegmentsAnalyzer.rb +251 -0
- data/lib/fileshunter.rb +15 -0
- metadata +130 -0
@@ -0,0 +1,350 @@
|
|
1
|
+
module FilesHunter
|
2
|
+
|
3
|
+
module Decoders
|
4
|
+
|
5
|
+
class TIFF < BeginPatternDecoder
|
6
|
+
|
7
|
+
# TIFF Reference: http://partners.adobe.com/public/developer/en/tiff/TIFF6.pdf
|
8
|
+
# Exif Reference: http://www.exif.org/Exif2-2.PDF
|
9
|
+
|
10
|
+
# Magic bytes found at the beginning of a little endian ("II") TIFF
BEGIN_PATTERN_TIFF_LE = "II*\x00".force_encoding(Encoding::ASCII_8BIT)
# Magic bytes found at the beginning of a big endian ("MM") TIFF
BEGIN_PATTERN_TIFF_BE = "MM\x00*".force_encoding(Encoding::ASCII_8BIT)
# Regexp matching any of the 2 magics.
# The binary flag is given as Regexp::NOENCODING (second argument): the legacy third argument ('n') of Regexp.new is rejected by recent Ruby versions.
BEGIN_PATTERN_TIFF = Regexp.new("(#{Regexp.escape(BEGIN_PATTERN_TIFF_LE)}|#{Regexp.escape(BEGIN_PATTERN_TIFF_BE)})", Regexp::NOENCODING)

# Size in bytes of a single value, per field type (an entry's total size is this size multiplied by its number of values).
# Types not listed here are reported as invalid by parse_ifd.
TYPE_SIZES = {
  1 => 1,
  2 => 1,
  3 => 2,
  4 => 4,
  5 => 8,
  6 => 1,
  7 => 1,
  8 => 2,
  9 => 4,
  10 => 8,
  11 => 4,
  12 => 8
}

# Values accepted for the compression tag (259)
VALID_COMPRESSION_VALUES = [ 1, 2, 3, 4, 5, 6, 32773 ]
# Values accepted for the photometric interpretation tag (262)
VALID_PHOTOMETRIC_INTERPRETATIONS = [ 0, 1, 2, 3, 4, 5, 6, 8 ]

# Regexp matching the \x00 bytes padding the end of ASCII values (binary regexp, see BEGIN_PATTERN_TIFF about the flag)
TRAILING_00_REGEXP = Regexp.new("\x00*$".force_encoding(Encoding::ASCII_8BIT), Regexp::NOENCODING)
# Separator of the strings stored in a single ASCII value
NULL_TERMINATING_CHAR = "\x00".force_encoding(Encoding::ASCII_8BIT)
|
34
|
+
|
35
|
+
# Constructor.
# After the parent class initialization, the decoder starts in its strict mode: TIFF data defining no strip and no tile is reported as invalid (see accept_no_image_data to change that).
def initialize
  super
  @accept_no_image_data = false
end
|
39
|
+
|
40
|
+
# Get the pattern identifying the beginning of TIFF data
#
# Result::
# * _Regexp_: The regexp matching the TIFF magic bytes
# * <em>map<Symbol,Object></em>: Options given along with the pattern (:offset_inc and :max_regexp_size, both set to 4)
def get_begin_pattern
  pattern_options = { :offset_inc => 4, :max_regexp_size => 4 }
  return BEGIN_PATTERN_TIFF, pattern_options
end
|
43
|
+
|
44
|
+
# Switch this decoder to a mode where TIFF data defining neither strips nor tiles is accepted.
# Meant for decoders embedding this one on structures that carry tags only (for example JPEG and its Exif info).
#
# Result::
# * _Boolean_: true
def accept_no_image_data
  @accept_no_image_data = true
end
|
49
|
+
|
50
|
+
# Decode the TIFF data beginning at a given offset of @data.
# All the IFDs reachable from the header are parsed (see parse_ifd): known tags are validated and reported as metadata, and the furthest offset used by IFDs, values, strips and tiles is tracked in @max_end_offset.
#
# Parameters::
# * *offset* (_Fixnum_): Offset of the TIFF header in @data
# Result::
# * _Fixnum_: Offset of @data following the last byte used by this TIFF data
def decode(offset)
  @file_offset = offset
  @bindata_reader_16 = nil
  @bindata_reader_32 = nil
  # The magic gives the byte order: little endian for "II", big endian otherwise
  little_endian = (@data[offset..offset+3] == BEGIN_PATTERN_TIFF_LE)
  @bindata_reader_16 = (little_endian ? BinData::Uint16le : BinData::Uint16be)
  @bindata_reader_32 = (little_endian ? BinData::Uint32le : BinData::Uint32be)
  # Readers of 16 bits, 32 bits, and "16 or 32 bits depending on the field type" (type 3 being the 16 bits one) values
  read_short = lambda { |pos| @bindata_reader_16.read(@data[pos..pos+1]) }
  read_long = lambda { |pos| @bindata_reader_32.read(@data[pos..pos+3]) }
  read_short_or_long = lambda { |type, pos| (type == 3) ? read_short.call(pos) : read_long.call(pos) }
  ifd_offset = read_long.call(offset+4)
  extensions = [:tif, :tiff] # By default
  @max_end_offset = ifd_offset
  @strip_offsets = []
  @strip_byte_counts = []
  @tile_offsets = []
  @tile_byte_counts = []
  @compression = 1
  @lst_bits_per_sample = [1]
  @image_width = nil
  @image_length = nil
  # Tags reported as metadata without any validation, grouped by the way their value is read
  raw_tags = {
    2 => :gps_latitude,
    4 => :gps_longitude,
    6 => :gps_altitude
  }
  ascii_tags = {
    269 => :document_name,
    270 => :image_description,
    271 => :make,
    272 => :model,
    285 => :page_name,
    305 => :software,
    306 => :date_time,
    315 => :artist,
    316 => :host_computer,
    337 => :target_printer,
    33432 => :copyright,
    36864 => :exif_version,
    36867 => :date_time_original,
    36868 => :date_time_digitized,
    37510 => :user_comment,
    37520 => :subsec_time,
    37521 => :subsec_time_original,
    37522 => :subsec_time_digitized,
    40960 => :flashpix_version
  }
  ratio_tags = {
    33434 => :exposure_time,
    33437 => :f_number,
    37386 => :focal_length
  }
  # Tags whose value is the offset of another IFD, parsed with the very same tag parser
  sub_ifd_tags = {
    34665 => :exif_ifd,
    34853 => :gps_ifd,
    40965 => :interoperability_ifd
  }
  @tag_parser = Proc.new do |tag, type, nbr, size, cursor|
    tag_id = tag.to_i
    if raw_tags.key?(tag_id)
      metadata( raw_tags[tag_id] => @data[cursor..cursor+size-1] )
    elsif ascii_tags.key?(tag_id)
      metadata( ascii_tags[tag_id] => read_ascii(cursor, size) )
    elsif ratio_tags.key?(tag_id)
      metadata( ratio_tags[tag_id] => read_ratio(cursor) )
    elsif sub_ifd_tags.key?(tag_id)
      sub_ifd_offset = read_long.call(cursor)
      metadata( sub_ifd_tags[tag_id] => true )
      parse_ifd(sub_ifd_offset, &@tag_parser)
    else
      case tag_id
      when 7
        metadata( :gps_timestamp => [ cursor, cursor+8, cursor+16 ].map { |ratio_cursor| read_ratio(ratio_cursor) } )
      when 256
        @image_width = read_short_or_long.call(type, cursor)
        invalid_data("@#{cursor} - Invalid image width #{@image_width}") if (@image_width == 0)
        metadata( :image_width => @image_width )
      when 257
        @image_length = read_short_or_long.call(type, cursor)
        invalid_data("@#{cursor} - Invalid image length #{@image_length}") if (@image_length == 0)
        metadata( :image_length => @image_length )
      when 258
        @lst_bits_per_sample = []
        nbr.times do |idx_sample|
          @lst_bits_per_sample << read_short.call(cursor+2*idx_sample)
        end
        metadata( :lst_bits_per_sample => @lst_bits_per_sample )
      when 259
        @compression = read_short.call(cursor)
        invalid_data("@#{cursor} - Invalid compression #{@compression}") unless VALID_COMPRESSION_VALUES.include?(@compression)
        metadata( :compression => @compression )
      when 262
        photometric_interpretation = read_short.call(cursor)
        invalid_data("@#{cursor} - Invalid photometric interpretation #{photometric_interpretation}") unless VALID_PHOTOMETRIC_INTERPRETATIONS.include?(photometric_interpretation)
        metadata( :photometric_interpretation => photometric_interpretation )
      when 264
        cell_width = read_short.call(cursor)
        invalid_data("@#{cursor} - Invalid cell width #{cell_width}") if (cell_width == 0)
        metadata( :cell_width => cell_width )
      when 265
        cell_length = read_short.call(cursor)
        invalid_data("@#{cursor} - Invalid cell length #{cell_length}") if (cell_length == 0)
        metadata( :cell_length => cell_length )
      when 266
        fill_order = read_short.call(cursor)
        invalid_data("@#{cursor} - Invalid fill order #{fill_order}") if ((fill_order == 0) || (fill_order > 2))
        metadata( :fill_order => fill_order )
      when 273
        value_size = ((type == 3) ? 2 : 4)
        nbr.times do |idx|
          @strip_offsets << read_short_or_long.call(type, cursor+idx*value_size)
        end
        found_relevant_data(extensions)
      when 274
        orientation = read_short.call(cursor)
        invalid_data("@#{cursor} - Invalid orientation #{orientation}") if ((orientation == 0) || (orientation > 8))
        metadata( :orientation => orientation )
      when 277
        samples_per_pixel = read_short.call(cursor)
        invalid_data("@#{cursor} - Invalid samples per pixel #{samples_per_pixel}") if (samples_per_pixel == 0)
        metadata( :samples_per_pixel => samples_per_pixel )
      when 278
        rows_per_strip = read_short_or_long.call(type, cursor)
        invalid_data("@#{cursor} - Invalid rows per strip #{rows_per_strip}") if (rows_per_strip == 0)
        metadata( :rows_per_strip => rows_per_strip )
      when 279
        value_size = ((type == 3) ? 2 : 4)
        nbr.times do |idx|
          @strip_byte_counts << read_short_or_long.call(type, cursor+idx*value_size)
        end
      when 282
        ratio = read_ratio(cursor)
        invalid_data("@#{cursor} - Invalid x resolution #{ratio}") if (ratio == 0)
        metadata( :x_resolution => ratio )
      when 283
        ratio = read_ratio(cursor)
        invalid_data("@#{cursor} - Invalid y resolution #{ratio}") if (ratio == 0)
        metadata( :y_resolution => ratio )
      when 296
        resolution_unit = read_short.call(cursor)
        invalid_data("@#{cursor} - Invalid resolution unit #{resolution_unit}") if ((resolution_unit == 0) || (resolution_unit > 3))
        metadata( :resolution_unit => resolution_unit )
      when 297
        page_number = read_short.call(cursor)
        page_total = read_short.call(cursor+2)
        invalid_data("@#{cursor} - Invalid page total #{page_total}") if (page_total == 0)
        metadata( :page_number => page_number, :page_total => page_total )
      when 324
        nbr.times do |idx|
          @tile_offsets << read_long.call(cursor+idx*4)
        end
        found_relevant_data(extensions)
      when 325
        nbr.times do |idx|
          @tile_byte_counts << read_long.call(cursor+idx*4)
        end
      when 40962
        metadata( :pixel_x_dimension => read_short_or_long.call(type, cursor) )
      when 40963
        metadata( :pixel_y_dimension => read_short_or_long.call(type, cursor) )
      end
    end
  end
  parse_ifd(ifd_offset, &@tag_parser)
  log_debug "@#{@file_offset + @max_end_offset} - Found #{@strip_offsets.size} strips and #{@tile_offsets.size} tiles."
  found_relevant_data(extensions)
  no_image_data = (@strip_offsets.empty? && @tile_offsets.empty?)
  invalid_data("@#{@file_offset + @max_end_offset} - No strips nor tiles defined.") if ((!@accept_no_image_data) && no_image_data)
  if ((@strip_offsets.size == 1) && @strip_byte_counts.empty?)
    # Special case: a single strip without its byte count. Its size is the one of the whole (necessarily uncompressed) image.
    invalid_data("@#{@file_offset + @max_end_offset} - Missing strip byte counts and image is compressed") if (@compression != 1)
    invalid_data("@#{@file_offset + @max_end_offset} - Missing image width") if (@image_width == nil)
    invalid_data("@#{@file_offset + @max_end_offset} - Missing image length") if (@image_length == nil)
    # Size of 1 row, in bits
    nbr_bits_per_pixel = @lst_bits_per_sample.inject(0) { |total_bits, nbr_bits| total_bits + nbr_bits }
    row_size_bits = @image_width * nbr_bits_per_pixel
    # Rows are padded to 16 (or 32) bits when every sample is 16 (or 32) bits wide, and to 8 bits otherwise
    bits_padding =
      if @lst_bits_per_sample.all? { |nbr_bits| nbr_bits == 16 }
        16
      elsif @lst_bits_per_sample.all? { |nbr_bits| nbr_bits == 32 }
        32
      else
        8
      end
    bits_rest = row_size_bits % bits_padding
    row_size_bits += bits_padding - bits_rest if (bits_rest != 0)
    # The image ends after all its padded rows
    image_end_offset = @strip_offsets[0] + @image_length * (row_size_bits / 8)
    @max_end_offset = image_end_offset if (@max_end_offset < image_end_offset)
  else
    invalid_data("@#{@file_offset + @max_end_offset} - Found #{@strip_offsets.size} strip offsets but #{@strip_byte_counts.size} strip bytes count") if (@strip_offsets.size != @strip_byte_counts.size)
    invalid_data("@#{@file_offset + @max_end_offset} - Found #{@tile_offsets.size} tile offsets but #{@tile_byte_counts.size} tile bytes count") if (@tile_offsets.size != @tile_byte_counts.size)
    # The data ends at least where the furthest strip ends...
    @strip_offsets.each_with_index do |strip_offset, idx_strip|
      strip_end_offset = strip_offset + @strip_byte_counts[idx_strip]
      @max_end_offset = strip_end_offset if (@max_end_offset < strip_end_offset)
    end
    # ... and where the furthest tile ends
    @tile_offsets.each_with_index do |tile_offset, idx_tile|
      tile_end_offset = tile_offset + @tile_byte_counts[idx_tile]
      @max_end_offset = tile_end_offset if (@max_end_offset < tile_end_offset)
    end
  end

  return @file_offset + @max_end_offset
end
|
274
|
+
|
275
|
+
private
|
276
|
+
|
277
|
+
# Parse an IFD, and all the IFDs chained after it through their "next IFD" offset.
# @max_end_offset is pushed forward to cover each parsed IFD and each value stored outside of its entry.
# An IFD chain coming back to an already parsed IFD is reported as invalid data instead of being parsed endlessly.
#
# Parameters::
# * *ifd_offset* (_Fixnum_): IFD offset to parse (relative to @file_offset)
# * *&proc* (_Proc_): Code called each time a tag is being parsed:
#   * Parameters::
#     * *tag* (_Fixnum_): Tag read
#     * *type* (_Fixnum_): Type of this tag
#     * *nbr* (_Fixnum_): Number of values in this tag
#     * *size* (_Fixnum_): Complete size of this tag
#     * *cursor* (_Fixnum_): Cursor to read the values from
def parse_ifd(ifd_offset, &proc)
  log_debug "@#{@file_offset + ifd_offset} - Parse IFD"
  # Offsets of the IFDs already parsed in this chain
  parsed_ifd_offsets = {}
  while (ifd_offset != 0)
    ifd_key = ifd_offset.to_i
    if parsed_ifd_offsets.key?(ifd_key)
      invalid_data("@#{@file_offset + ifd_offset} - IFD already parsed: the IFD chain is cyclic")
      # NOTE(review): invalid_data presumably aborts the decoding ; the break makes sure the loop ends even if it returns
      break
    end
    parsed_ifd_offsets[ifd_key] = true
    cursor = @file_offset + ifd_offset
    nbr_entries = @bindata_reader_16.read(@data[cursor..cursor+1])
    cursor += 2
    nbr_entries.times do |idx_entry|
      # Each entry is 12 bytes long: tag (2), type (2), number of values (4), value or offset of the value (4)
      tag = @bindata_reader_16.read(@data[cursor..cursor+1])
      type = @bindata_reader_16.read(@data[cursor+2..cursor+3])
      nbr = @bindata_reader_32.read(@data[cursor+4..cursor+7])
      # Compute the size
      invalid_data("@#{cursor} - Invalid type: #{type}") if (!TYPE_SIZES.include?(type))
      size = TYPE_SIZES[type]*nbr
      # Read the offset of the value
      value_offset = @bindata_reader_32.read(@data[cursor+8..cursor+11])
      log_debug "@#{cursor} - Found tag #{tag} (type #{type}) with #{nbr} values (size #{size}): #{value_offset}"
      if (size > 4)
        # The value does not fit in the entry: it is stored at value_offset
        yield(tag, type, nbr, size, @file_offset + value_offset)
        value_end_offset = value_offset + size
        @max_end_offset = value_end_offset if (@max_end_offset < value_end_offset)
      else
        # The value is stored in the entry itself
        yield(tag, type, nbr, size, cursor + 8)
      end
      cursor += 12
      progress(cursor)
    end
    # An IFD is made of the number of entries (2 bytes), the entries and the next IFD offset (4 bytes)
    ifd_end_offset = ifd_offset + 6 + nbr_entries*12
    @max_end_offset = ifd_end_offset if (@max_end_offset < ifd_end_offset)
    # Read the next ifd offset
    ifd_offset = @bindata_reader_32.read(@data[cursor..cursor+3])
  end
end
|
320
|
+
|
321
|
+
# Read an ASCII value.
# The bytes matched by TRAILING_00_REGEXP and the surrounding whitespaces are removed, then the value is split on NULL_TERMINATING_CHAR.
#
# Parameters::
# * *cursor* (_Fixnum_): The cursor to read from
# * *size* (_Fixnum_): Size of the string
# Result::
# * _String_ or <em>list<String></em>: The string when the value contains exactly 1 string, the list of strings otherwise (empty list for an empty value).
def read_ascii(cursor, size)
  raw_value = @data[cursor..cursor+size-1]
  stripped_value = raw_value.gsub(TRAILING_00_REGEXP, '').strip
  lst_strings = stripped_value.split(NULL_TERMINATING_CHAR)
  return lst_strings[0] if (lst_strings.size == 1)
  return lst_strings
end
|
332
|
+
|
333
|
+
# Read a Rational value: 2 consecutive 32 bits values, the numerator then the denominator.
# A null denominator with a non null numerator is reported as invalid data.
#
# Parameters::
# * *cursor* (_Fixnum_): The cursor to read from
# Result::
# * _Float_: The rational (the integer 0 when the numerator is null)
def read_ratio(cursor)
  numerator = @bindata_reader_32.read(@data[cursor..cursor+3])
  denominator = @bindata_reader_32.read(@data[cursor+4..cursor+7])
  if ((denominator == 0) && (numerator != 0))
    invalid_data("@#{cursor} - Invalid rational #{numerator}/#{denominator}")
  end
  if (numerator == 0)
    return 0
  else
    return numerator.to_f / denominator.to_f
  end
end
|
345
|
+
|
346
|
+
end
|
347
|
+
|
348
|
+
end
|
349
|
+
|
350
|
+
end
|
@@ -0,0 +1,240 @@
|
|
1
|
+
module FilesHunter
|
2
|
+
|
3
|
+
module Decoders
|
4
|
+
|
5
|
+
class Text < Decoder
|
6
|
+
|
7
|
+
# Byte order mark opening a big endian UTF-16 text (binary string)
UTF_16BE_BOM = [ 0xFE, 0xFF ].pack('C*')
# Byte order mark opening a little endian UTF-16 text (binary string)
UTF_16LE_BOM = [ 0xFF, 0xFE ].pack('C*')
# Null byte: the other half of each UTF-16 code unit accepted as text (binary string)
NULL_CHAR = [ 0x00 ].pack('C*')
# New line byte, used to look for text candidates (binary string)
NL_CHAR = [ 0x0A ].pack('C*')
|
11
|
+
|
12
|
+
# Find segments from a given data.
# Looks for new line characters between @begin_offset and @end_offset. Around each of them, the run of text bytes is delimited (backward then forward), as plain bytes or as UTF-16 (little or big endian) when a \x00 surrounds the new line.
# Runs of at least 512 characters are reported through found_segment with their encoding, flagged as truncated when the last UTF-16 character is cut by @end_offset.
def find_segments
  # Tells whether the byte at a given position is a text one: printable (>= 32, DEL excluded), tab or carriage return, and also line feed when asked for
  text_byte = lambda do |pos, accept_newline|
    c = @data[pos].ord
    ((c >= 32) && (c != 127)) || (c == 9) || (c == 13) || (accept_newline && (c == 10))
  end
  current_offset = @begin_offset
  while (current_offset < @end_offset)
    # First find a new line character from current_offset
    newline_offset = @data.index(NL_CHAR, current_offset)
    if ((newline_offset == nil) ||
        (newline_offset >= @end_offset))
      # No text
      current_offset = @end_offset
      log_debug "Contains no more Text."
    else
      # We have a candidate
      # Get back to see the beginning of Text
      text_begin_offset = nil
      text_header_size = 0
      # Detect if it might be UTF-16 encoded
      if (((newline_offset > @begin_offset) &&
           (@data[newline_offset-1] == NULL_CHAR)) ||
          ((newline_offset < @end_offset-1) &&
           (@data[newline_offset+1] == NULL_CHAR)))
        # Go backward 2 bytes at a time, as long as cursor is on a \x00 preceded by a text byte
        cursor = newline_offset - 1
        while ((cursor >= @begin_offset+1) &&
               (@data[cursor] == NULL_CHAR) &&
               text_byte.call(cursor-1, false))
          cursor -= 2
        end
        # Here we have several possibilities:
        # * cursor is on @begin_offset-1: the last pair checked starts at @begin_offset,
        # * else cursor is on @begin_offset: @data[@begin_offset] has not been checked yet,
        # * else cursor is at least on @begin_offset+1: it is on a \x00 preceded by a non text byte, or on a non \x00 byte (possibly belonging to a byte order mark).
        # UTF_16BE = "\xFE\xFF\x00\x??"
        # UTF_16LE = "\xFF\xFE\x??\x00"
        # In following comments, here are the conventions:
        # * \xAA means text byte
        # * \xBB means not a text byte
        # * \x11 means a non zero byte
        # * \x?? means unknown byte
        # * other values represent their corresponding byte
        if (cursor == @begin_offset-1)
          # @data[@begin_offset..@begin_offset+1] == \xAA\x00
          text_begin_offset = @begin_offset
          encoding = Encoding::UTF_16LE
        elsif (cursor == @begin_offset)
          # @data[@begin_offset..@begin_offset+2] == \x??\xAA\x00
          if (@data[@begin_offset].ord == 0)
            # @data[@begin_offset..@begin_offset+2] == \x00\xAA\x00
            text_begin_offset = @begin_offset
            encoding = Encoding::UTF_16BE
          elsif (text_byte.call(@begin_offset, false) &&
                 (@data[@begin_offset..@begin_offset+1] == UTF_16BE_BOM))
            # @data[@begin_offset..@begin_offset+2] == \xFE\xFF\x00
            text_begin_offset = @begin_offset
            encoding = Encoding::UTF_16BE
            text_header_size = 2
          else
            # @data[@begin_offset..@begin_offset+2] == \xAA\xAA\x00 (without byte order mark) or \xBB\xAA\x00
            text_begin_offset = @begin_offset + 1
            encoding = Encoding::UTF_16LE
          end
        elsif (@data[cursor] == NULL_CHAR)
          # @data[cursor-1..cursor+2] == \xBB\x00\xAA\x00
          text_begin_offset = cursor
          encoding = Encoding::UTF_16BE
        elsif (@data[cursor-1..cursor] == UTF_16LE_BOM)
          # @data[cursor-1..cursor+2] == \xFF\xFE\xAA\x00
          text_begin_offset = cursor - 1
          encoding = Encoding::UTF_16LE
          text_header_size = 2
        elsif (@data[cursor..cursor+1] == UTF_16BE_BOM)
          # @data[cursor-1..cursor+2] == \x??\xFE\xFF\x00
          text_begin_offset = cursor
          encoding = Encoding::UTF_16BE
          text_header_size = 2
        else
          # @data[cursor-1..cursor+2] == \x??\x11\xAA\x00
          text_begin_offset = cursor + 1
          encoding = Encoding::UTF_16LE
        end
      else
        # Plain bytes: go backward as long as we find text bytes
        encoding = Encoding::ASCII_8BIT
        cursor = newline_offset - 1
        while ((cursor >= @begin_offset) &&
               text_byte.call(cursor, false))
          cursor -= 1
        end
        text_begin_offset = cursor + 1
      end
      # Now find forward
      keep_alive
      text_end_offset = nil
      truncated = false
      if (encoding == Encoding::ASCII_8BIT)
        cursor = newline_offset + 1
        while ((cursor < @end_offset) &&
               text_byte.call(cursor, true))
          cursor += 1
        end
        text_end_offset = cursor
      elsif (encoding == Encoding::UTF_16BE)
        # cursor points on \x00
        cursor = newline_offset + 1
        while ((cursor < @end_offset-1) &&
               (@data[cursor] == NULL_CHAR) &&
               text_byte.call(cursor+1, true))
          cursor += 2
        end
        # Several possibilities:
        # * cursor is at @end_offset, meaning the string ends at @end_offset,
        # * else cursor is at @end_offset-1, meaning the string ends at @end_offset-1 or at @end_offset and is truncated if @data[@end_offset-1] is "\x00",
        # * else the string ends at cursor
        if (cursor == @end_offset-1)
          if (@data[cursor] == NULL_CHAR)
            truncated = true
            text_end_offset = @end_offset
          else
            text_end_offset = @end_offset - 1
          end
        else
          text_end_offset = cursor
        end
      elsif (encoding == Encoding::UTF_16LE)
        # cursor points on the text byte
        cursor = newline_offset
        while ((cursor < @end_offset-1) &&
               (@data[cursor+1] == NULL_CHAR) &&
               text_byte.call(cursor, true))
          cursor += 2
        end
        # Several possibilities:
        # * cursor is at @end_offset, meaning the string ends at @end_offset,
        # * else cursor is at @end_offset-1, meaning the string ends at @end_offset-1 or at @end_offset and is truncated if @data[@end_offset-1] is a text byte,
        # * else the string ends at cursor
        if (cursor == @end_offset-1)
          if text_byte.call(cursor, true)
            truncated = true
            text_end_offset = @end_offset
          else
            text_end_offset = @end_offset - 1
          end
        else
          text_end_offset = cursor
        end
      end
      # Consider text files longer than a certain size only
      if (text_end_offset - text_begin_offset >= 512*((encoding == Encoding::ASCII_8BIT) ? 1 : 2))
        log_debug "@#{text_begin_offset} - Found text up to #{text_end_offset} with encoding #{encoding} and header of size #{text_header_size}"
        # Now check some formats
        # A truncated text ends with half a UTF-16 character: this byte belongs to the segment, but is left out of the decoded text to keep it a valid string
        last_text_offset = text_end_offset - 1 - (truncated ? 1 : 0)
        text = @data[text_begin_offset+text_header_size..last_text_offset].clone.force_encoding(encoding)
        lines = text.split("\r\n".encode(encoding))
        lines = text.split("\n".encode(encoding)) if (lines.size == 1)
        extension = [ :txt, :log ] # By default
        if is_text_srt?(lines, encoding)
          extension = :srt
        elsif is_text_rtf?(lines, encoding)
          extension = :rtf
        elsif is_text_html?(lines, encoding)
          extension = :html
        elsif is_text_xml?(lines, encoding)
          extension = :xml
        end
        # NOTE(review): the 4th argument is assumed to be the truncated flag, as in Segment's constructor - confirm against found_segment
        found_segment(text_begin_offset, text_end_offset, extension, truncated, false, :encoding => encoding)
      end
      current_offset = text_end_offset + 1
    end
  end
end
|
215
|
+
|
216
|
+
private
|
217
|
+
|
218
|
+
# Do the lines look like SRT subtitles: a numeric index on the first line, then a timing ("00:00:00,000 --> 00:00:00,000") on the second one?
#
# Parameters::
# * *lines* (<em>list<String></em>): The text lines
# * *encoding* (_Encoding_): Encoding of the lines
# Result::
# * _Object_: nil or false when the lines are not SRT, a non nil value (position of the match in the second line) otherwise
def is_text_srt?(lines, encoding)
  # TODO (Ruby bug): Replace [0-9] with \d when it will work in UTF_16LE encoding
  # Both regexps stick to [0-9] for this reason
  index_regexp = Regexp.new('^[0-9]+$'.encode(encoding))
  timing_regexp = Regexp.new('^[0-9][0-9]:[0-9][0-9]:[0-9][0-9],[0-9][0-9][0-9] --> [0-9][0-9]:[0-9][0-9]:[0-9][0-9],[0-9][0-9][0-9]$'.encode(encoding))
  return ((lines[0] =~ index_regexp) and
          (lines[1] =~ timing_regexp))
end
|
223
|
+
|
224
|
+
# Does the first line contain an RTF header ("{\rtf")?
#
# Parameters::
# * *lines* (<em>list<String></em>): The text lines
# * *encoding* (_Encoding_): Encoding of the lines
# Result::
# * _Fixnum_: Position of the header in the first line, or nil if none
def is_text_rtf?(lines, encoding)
  rtf_header_regexp = Regexp.new('{\\\\rtf'.encode(encoding))
  first_line = lines[0]
  return (first_line =~ rtf_header_regexp)
end
|
227
|
+
|
228
|
+
# Does the first line contain an HTML doctype declaration ("<!DOCTYPE html", case sensitive)?
#
# Parameters::
# * *lines* (<em>list<String></em>): The text lines
# * *encoding* (_Encoding_): Encoding of the lines
# Result::
# * _Fixnum_: Position of the declaration in the first line, or nil if none
def is_text_html?(lines, encoding)
  doctype_regexp = Regexp.new('<!DOCTYPE html'.encode(encoding))
  first_line = lines[0]
  return first_line =~ doctype_regexp
end
|
231
|
+
|
232
|
+
# Does the first line contain an XML declaration ("<?xml ")?
#
# Parameters::
# * *lines* (<em>list<String></em>): The text lines
# * *encoding* (_Encoding_): Encoding of the lines
# Result::
# * _Fixnum_: Position of the declaration in the first line, or nil if none
def is_text_xml?(lines, encoding)
  # The question mark is escaped to be matched literally: left as is, it would make the "<" optional and any line containing "xml " would match
  return lines[0] =~ Regexp.new('<\?xml '.encode(encoding))
end
|
235
|
+
|
236
|
+
end
|
237
|
+
|
238
|
+
end
|
239
|
+
|
240
|
+
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
module FilesHunter
|
2
|
+
|
3
|
+
# A segment represents a chunk of data: a range of offsets, with the extensions guessed for it and a few flags.
class Segment

  # Begin offset of the segment
  #   Fixnum
  attr_reader :begin_offset

  # End offset of the segment (equals the begin offset of the next segment)
  #   Fixnum
  attr_reader :end_offset

  # List of extensions guessed (sort by descending probability) (:mkv, :dll ...). :unknown used to unknown data.
  #   list<Symbol>
  attr_reader :extensions

  # Is this segment truncated? This means that for the given extension, data should have continued beyond this segment.
  #   Boolean
  attr_reader :truncated

  # Is this segment missing previous data? This means that for the given extension, data should have been present already before this segment.
  #   Boolean
  attr_reader :missing_previous_data

  # Metadata associated to this Segment (Decoder dependent)
  #   map< Symbol, Object >
  attr_reader :metadata

  # Constructor
  #
  # Parameters::
  # * *begin_offset* (_Fixnum_): Specify begin offset
  # * *end_offset* (_Fixnum_): Specify end offset
  # * *extension* (_Symbol_ or <em>list<Symbol></em>): Specify extension (a single Symbol is stored as a list of 1 extension)
  # * *truncated* (_Boolean_): Specify truncated flag
  # * *missing_previous_data* (_Boolean_): Do we lack data before this segment?
  # * *metadata* (<em>map<Symbol,Object></em>): Metadata (Decoder dependent)
  def initialize(begin_offset, end_offset, extension, truncated, missing_previous_data, metadata)
    @begin_offset = begin_offset
    @end_offset = end_offset
    @truncated = truncated
    @missing_previous_data = missing_previous_data
    @metadata = metadata
    # Extensions are always exposed as a list
    if extension.is_a?(Symbol)
      @extensions = [ extension ]
    else
      @extensions = extension
    end
  end

end
|
49
|
+
|
50
|
+
end
|