fileshunter 0.1.0.20130725
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/AUTHORS +3 -0
- data/ChangeLog +5 -0
- data/Credits +21 -0
- data/LICENSE +31 -0
- data/README +15 -0
- data/README.md +11 -0
- data/Rakefile +7 -0
- data/ReleaseInfo +8 -0
- data/bin/fileshunt +216 -0
- data/ext/fileshunter/Decoders/_FLAC.c +233 -0
- data/ext/fileshunter/Decoders/extconf.rb +3 -0
- data/lib/fileshunter/BeginPatternDecoder.rb +218 -0
- data/lib/fileshunter/Decoder.rb +66 -0
- data/lib/fileshunter/Decoders/ASF.rb +50 -0
- data/lib/fileshunter/Decoders/BMP.rb +118 -0
- data/lib/fileshunter/Decoders/CAB.rb +140 -0
- data/lib/fileshunter/Decoders/CFBF.rb +92 -0
- data/lib/fileshunter/Decoders/EBML.rb +369 -0
- data/lib/fileshunter/Decoders/EXE.rb +505 -0
- data/lib/fileshunter/Decoders/FLAC.rb +387 -0
- data/lib/fileshunter/Decoders/ICO.rb +71 -0
- data/lib/fileshunter/Decoders/JPEG.rb +247 -0
- data/lib/fileshunter/Decoders/M2V.rb +30 -0
- data/lib/fileshunter/Decoders/MP3.rb +341 -0
- data/lib/fileshunter/Decoders/MP4.rb +620 -0
- data/lib/fileshunter/Decoders/MPG_Video.rb +30 -0
- data/lib/fileshunter/Decoders/OGG.rb +74 -0
- data/lib/fileshunter/Decoders/RIFF.rb +437 -0
- data/lib/fileshunter/Decoders/TIFF.rb +350 -0
- data/lib/fileshunter/Decoders/Text.rb +240 -0
- data/lib/fileshunter/Segment.rb +50 -0
- data/lib/fileshunter/SegmentsAnalyzer.rb +251 -0
- data/lib/fileshunter.rb +15 -0
- metadata +130 -0
@@ -0,0 +1,251 @@
|
|
1
|
+
require 'bindata'
|
2
|
+
require 'ioblockreader'
|
3
|
+
require 'rUtilAnts/Plugins'
|
4
|
+
require 'fileshunter/Segment'
|
5
|
+
require 'fileshunter/Decoder'
|
6
|
+
require 'fileshunter/BeginPatternDecoder'
|
7
|
+
|
8
|
+
module IOBlockReader
|
9
|
+
|
10
|
+
# Extend class IOBlockReader to raise exceptions when data is accessed outside the configured limits
|
11
|
+
class IOBlockReader
|
12
|
+
|
13
|
+
# Define the window of offsets that [] is allowed to access.
# Accesses outside this window make [] raise an exception.
#
# Parameters::
# * *begin_offset* (_Fixnum_): First offset that may be accessed
# * *end_offset* (_Fixnum_): First offset that may NOT be accessed anymore
# Result::
# * _Fixnum_: The end offset that has just been set
def set_limits(begin_offset, end_offset)
  @begin_offset, @end_offset = begin_offset, end_offset
  end_offset
end
|
18
|
+
|
19
|
+
alias_method :old_squares, :[]
|
20
|
+
# Read data through the original [] implementation (kept as old_squares),
# after having checked that the access stays inside the limits given to set_limits.
#
# Parameters::
# * *range* (_Fixnum_ or _Range_): Single offset, or range of offsets, to access
# Result::
# * _Object_: Whatever the original [] implementation returns for this access
# Raises::
# * FilesHunter::AccessAfterDataError: The last accessed offset is >= the end limit
# * FilesHunter::AccessBeforeDataError: The first accessed offset is < the begin limit
def [](range)
  # Normalize both kinds of access to a first and a last offset, so limits are checked once
  if range.is_a?(Range)
    first_offset = range.first
    last_offset = range.last
  else
    first_offset = last_offset = range
  end
  # A limit still set to nil (set_limits never called on this reader) is not enforced:
  # comparing an offset with nil would otherwise raise an unrelated error.
  if @end_offset && (last_offset >= @end_offset)
    raise FilesHunter::AccessAfterDataError.new("Index out of range: #{range} (>= #{@end_offset})", @end_offset)
  end
  if @begin_offset && (first_offset < @begin_offset)
    raise FilesHunter::AccessBeforeDataError.new("Index out of range: #{range} (< #{@begin_offset})")
  end
  return old_squares(range)
end
|
32
|
+
|
33
|
+
end
|
34
|
+
|
35
|
+
end
|
36
|
+
|
37
|
+
module FilesHunter
|
38
|
+
|
39
|
+
# Common ancestor of the errors raised when data is accessed outside the limits set on the reader
class AccessDataError < RuntimeError; end
|
41
|
+
|
42
|
+
# Error raised when data is accessed at or after the end limit
class AccessAfterDataError < AccessDataError

  # The offset that has been exceeded
  #   Fixnum
  attr_reader :exceeding_offset

  # Constructor
  #
  # Parameters::
  # * *message* (_String_): The error message
  # * *exceeding_offset* (_Fixnum_): The exceeding offset
  def initialize(message, exceeding_offset)
    @exceeding_offset = exceeding_offset
    super(message)
  end

end
|
57
|
+
|
58
|
+
# Error raised when data is accessed before the begin limit
class AccessBeforeDataError < AccessDataError; end
|
60
|
+
|
61
|
+
# Error used to interrupt a parsing in progress: get_segments rescues it and logs the cancellation.
# NOTE(review): the code raising it is not visible in this file - presumably the decoders, once cancel_parsing has been called.
class CancelParsingError < RuntimeError; end
|
63
|
+
|
64
|
+
class SegmentsAnalyzer
|
65
|
+
|
66
|
+
# Is the parsing being cancelled?
|
67
|
+
# Boolean
|
68
|
+
attr_reader :parsing_cancelled
|
69
|
+
|
70
|
+
# Constructor.
# Stores the options and registers the decoder plugins found in the Decoders directory next to this file.
#
# Parameters::
# * *options* (<em>map<Symbol,Object></em>): Options [default = {}]
#   * *:block_size* (_Fixnum_): Block size in bytes to read from the file at once [default = 134217728]
def initialize(options = {})
  requested_block_size = options[:block_size]
  @block_size = requested_block_size ? requested_block_size : 134217728
  decoders_dir = "#{File.dirname(__FILE__)}/Decoders"
  @plugins = RUtilAnts::Plugins::PluginsManager.new
  @plugins.parse_plugins_from_dir(:Decoders, decoders_dir, 'FilesHunter::Decoders')
  # Following are variables that may be accessed in a multithreaded environment
  @parsing_cancelled = false
  @nbr_bytes = nil
  @nbr_bytes_decoded = nil
end
|
84
|
+
|
85
|
+
# Get segments by analyzing a given file.
# The file starts as a single :unknown segment; each decoder is then given every segment
# that is still :unknown, and the segments it finds replace the corresponding parts.
# A CancelParsingError stops the analysis: segments found so far are returned.
#
# Parameters::
# * *file_name* (_String_): File to analyze
# Result::
# * <em>list<Segment></em>: List of segments for this file
def get_segments(file_name)
  @parsing_cancelled = false

  # Decoders are tried in this exact order.
  # This is important as some containers can include segments of other containers.
  # A given format MUST NOT be able to include a format specified BEFORE it in the list.
  # A given format CAN be able to include a format specified AFTER it in the list.
  decoder_names = [
    'CFBF', # includes Thumbs.db, DOC, XLS, PPT, MSI
    'ASF', # includes WMV
    'CAB', # includes CAB, MSU, MZZ
    'EXE', # includes DLL, EXE, OCX, OBJ, DRV, SYS, FON. Cannot detect data concatenated after some EXE files.
    'MPG_Video', # not generic enough
    'M2V', # not generic enough
    'EBML', # includes MKV, WEBM
    'MP4', # includes 3GP, MOV, M4A and many others
    'OGG',
    'RIFF', # includes AVI, WAV, ANI
    'FLAC',
    'BMP',
    'MP3',
    'Text', # includes TXT, LOG, SRT, RTF, HTML, XML (both ASCII-8BIT and UTF-16)
    'JPEG', # includes JPG, THM
    'TIFF',
    'ICO' # includes ICO, CUR
  ]

  segments = []
  File.open(file_name, 'rb') do |file|
    content = IOBlockReader.init(file, :block_size => @block_size)

    @nbr_bytes = File.size(file_name)
    @nbr_bytes_decoded = 0
    log_debug "File size: #{@nbr_bytes}"
    # Before any decoding, the whole file is 1 unknown segment
    segments << Segment.new(0, @nbr_bytes, :unknown, false, false, {})

    begin
      decoder_names.each do |decoder_name|
        @plugins.access_plugin(:Decoders, decoder_name) do |decoder|
          log_debug "[#{file_name}] - Try #{decoder_name}"
          # require 'ruby-prof'
          # RubyProf.start
          segments = foreach_unknown_segment(segments) do |begin_offset, end_offset|
            log_debug "[#{file_name}] - Try #{decoder_name} for segment [#{begin_offset}, #{end_offset}]"
            # Restrict the reader to the segment being decoded
            content.set_limits(begin_offset, end_offset)
            decoder.setup(self, content, begin_offset, end_offset)
            begin
              decoder.find_segments
            rescue AccessDataError => access_error
              # The decoder read outside its segment: log it and keep what it found so far
              log_err "Decoder #{decoder_name} exceeded data ranges: #{access_error}.\n#{access_error.backtrace.join("\n")}"
            end
            decoder.segments_found
          end
          # result = RubyProf.stop
          # RubyProf::FlatPrinter.new(result).print(STDOUT)
        end
      end
    rescue CancelParsingError
      log_info "[#{file_name}] - Parsing cancelled"
    end
  end

  segments
end
|
154
|
+
|
155
|
+
# Ask for the parsing to be cancelled, by raising the parsing_cancelled flag.
# This method has to be called from a different thread than the one currently calling get_segments.
#
# Result::
# * _Boolean_: true
def cancel_parsing
  @parsing_cancelled = true
end
|
160
|
+
|
161
|
+
# Add some bytes as being decoded.
# The given amount is accumulated into the total reported by progression.
#
# Parameters::
# * *nbr_bytes* (_Fixnum_): Number of bytes just being decoded
# Result::
# * _Fixnum_: The new total number of bytes decoded
def add_bytes_decoded(nbr_bytes)
  # The counter is nil until get_segments initializes it: count from 0 in this case
  @nbr_bytes_decoded = (@nbr_bytes_decoded || 0) + nbr_bytes
  #puts "Progression: #{@nbr_bytes_decoded} / #{@nbr_bytes}"
  return @nbr_bytes_decoded
end
|
169
|
+
|
170
|
+
# Get the current progression.
# Both values are nil as long as get_segments has not been called.
#
# Result::
# * _Fixnum_: Total number of bytes
# * _Fixnum_: Total number of bytes decoded
def progression
  [@nbr_bytes, @nbr_bytes_decoded]
end
|
178
|
+
|
179
|
+
private
|
180
|
+
|
181
|
+
# Call the block for each unknown segment.
# Blocks have to return the list of segments they managed to decode.
# Return the list of segments, split around the decoded ones, with consecutive unknown segments merged.
#
# Parameters::
# * *lst_segments* (<em>list<Segment></em>): The list of current segments to loop on
# * _Block_: Code called for each unknown segment found. This code is responsible for splitting each segment with decoded segments if possible.
#   * Parameters::
#     * *begin_offset* (_Fixnum_): Begin offset of the unknown segment
#     * *end_offset* (_Fixnum_): End offset of the unknown segment
#   * Result::
#     * <em>list<Segment></em>: List of decoded segments (can be empty if none have been decoded)
# Result::
# * <em>list<Segment></em>: The resulting list of segments
def foreach_unknown_segment(lst_segments)
  # Split segments that can be decoded
  splitted_segments = []
  lst_segments.each do |segment|
    decoded_segments = []
    if (segment.extensions == [:unknown])
      log_debug "Try to find segments in #{segment.begin_offset}..#{segment.end_offset}"
      decoded_segments = yield(segment.begin_offset, segment.end_offset)
      log_debug "Decoded #{decoded_segments.size} new segments: #{decoded_segments.map { |decoded_segment| "[#{decoded_segment.extensions.join(',')}#{decoded_segment.truncated ? '(truncated)' : ''}#{decoded_segment.missing_previous_data ? '(missing previous data)' : ''}:#{decoded_segment.begin_offset}..#{decoded_segment.end_offset}]" }}"
    end
    if (decoded_segments.empty?)
      # Known segment, or unknown segment in which nothing has been decoded: keep it untouched
      splitted_segments << segment
    else
      # Fill the gaps before, between and after the decoded segments with unknown segments
      last_written_offset = segment.begin_offset
      decoded_segments.each do |decoded_segment|
        splitted_segments << Segment.new(last_written_offset, decoded_segment.begin_offset, :unknown, false, false, {}) if (decoded_segment.begin_offset > last_written_offset)
        splitted_segments << decoded_segment
        last_written_offset = decoded_segment.end_offset
      end
      splitted_segments << Segment.new(last_written_offset, segment.end_offset, :unknown, false, false, {}) if (segment.end_offset > last_written_offset)
    end
  end

  # Merge consecutive :unknown segments
  result_segments = []
  unknown_run = []
  splitted_segments.each do |segment|
    if (segment.extensions == [:unknown])
      unknown_run << segment
    else
      append_unknown_run(result_segments, unknown_run)
      unknown_run = []
      result_segments << segment
    end
  end
  # Don't forget the unknown segments ending the list
  append_unknown_run(result_segments, unknown_run)

  return result_segments
end

# Append a run of consecutive unknown segments to a list of segments.
# A single segment is appended as is; several segments are merged into 1 new unknown segment.
# Both merge sites go through this method, so the merged segment is always built with the complete list of Segment parameters.
#
# Parameters::
# * *result_segments* (<em>list<Segment></em>): The list to complete
# * *unknown_run* (<em>list<Segment></em>): The consecutive unknown segments (can be empty)
def append_unknown_run(result_segments, unknown_run)
  if (unknown_run.size == 1)
    result_segments << unknown_run.first
  elsif (unknown_run.size > 1)
    result_segments << Segment.new(unknown_run.first.begin_offset, unknown_run.last.end_offset, :unknown, false, false, {})
  end
end
|
248
|
+
|
249
|
+
end
|
250
|
+
|
251
|
+
end
|
data/lib/fileshunter.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'rUtilAnts/Logging'
|
2
|
+
RUtilAnts::Logging::install_logger_on_object
|
3
|
+
require 'fileshunter/SegmentsAnalyzer'
|
4
|
+
|
5
|
+
module FilesHunter

  class << self

    # Get a SegmentsAnalyzer that can be used to decode files.
    #
    # Parameters::
    # * *options* (<em>map<Symbol,Object></em>): Options given to the SegmentsAnalyzer. See its documentation to know possible options. [default = {}]
    # Result::
    # * _SegmentsAnalyzer_: A new analyzer created with those options
    def get_segments_analyzer(options = {})
      ::FilesHunter::SegmentsAnalyzer.new(options)
    end

  end

end
|
metadata
ADDED
@@ -0,0 +1,130 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: fileshunter
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0.20130725
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Muriel Salvan
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-07-25 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: rUtilAnts
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '1.0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '1.0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: ioblockreader
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: 1.0.3
|
38
|
+
type: :runtime
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: 1.0.3
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: bindata
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :runtime
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
description: Analyze files and guess their true content's format. Extract hidden files
|
63
|
+
from corrupted ones. Easily extensible by adding new plug-ins for new formats. Handles
|
64
|
+
documents, videos, images, music, executables...
|
65
|
+
email: muriel@x-aeon.com
|
66
|
+
executables: []
|
67
|
+
extensions:
|
68
|
+
- ext/fileshunter/Decoders/extconf.rb
|
69
|
+
extra_rdoc_files: []
|
70
|
+
files:
|
71
|
+
- AUTHORS
|
72
|
+
- bin/fileshunt
|
73
|
+
- ChangeLog
|
74
|
+
- Credits
|
75
|
+
- ext/fileshunter/Decoders/extconf.rb
|
76
|
+
- ext/fileshunter/Decoders/_FLAC.c
|
77
|
+
- lib/fileshunter/BeginPatternDecoder.rb
|
78
|
+
- lib/fileshunter/Decoder.rb
|
79
|
+
- lib/fileshunter/Decoders/ASF.rb
|
80
|
+
- lib/fileshunter/Decoders/BMP.rb
|
81
|
+
- lib/fileshunter/Decoders/CAB.rb
|
82
|
+
- lib/fileshunter/Decoders/CFBF.rb
|
83
|
+
- lib/fileshunter/Decoders/EBML.rb
|
84
|
+
- lib/fileshunter/Decoders/EXE.rb
|
85
|
+
- lib/fileshunter/Decoders/FLAC.rb
|
86
|
+
- lib/fileshunter/Decoders/ICO.rb
|
87
|
+
- lib/fileshunter/Decoders/JPEG.rb
|
88
|
+
- lib/fileshunter/Decoders/M2V.rb
|
89
|
+
- lib/fileshunter/Decoders/MP3.rb
|
90
|
+
- lib/fileshunter/Decoders/MP4.rb
|
91
|
+
- lib/fileshunter/Decoders/MPG_Video.rb
|
92
|
+
- lib/fileshunter/Decoders/OGG.rb
|
93
|
+
- lib/fileshunter/Decoders/RIFF.rb
|
94
|
+
- lib/fileshunter/Decoders/Text.rb
|
95
|
+
- lib/fileshunter/Decoders/TIFF.rb
|
96
|
+
- lib/fileshunter/Segment.rb
|
97
|
+
- lib/fileshunter/SegmentsAnalyzer.rb
|
98
|
+
- lib/fileshunter.rb
|
99
|
+
- LICENSE
|
100
|
+
- Rakefile
|
101
|
+
- README
|
102
|
+
- README.md
|
103
|
+
- ReleaseInfo
|
104
|
+
homepage: https://github.com/Muriel-Salvan/files-hunter
|
105
|
+
licenses: []
|
106
|
+
post_install_message:
|
107
|
+
rdoc_options: []
|
108
|
+
require_paths:
|
109
|
+
- lib
|
110
|
+
- ext
|
111
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
112
|
+
none: false
|
113
|
+
requirements:
|
114
|
+
- - ! '>='
|
115
|
+
- !ruby/object:Gem::Version
|
116
|
+
version: '0'
|
117
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
118
|
+
none: false
|
119
|
+
requirements:
|
120
|
+
- - ! '>='
|
121
|
+
- !ruby/object:Gem::Version
|
122
|
+
version: '0'
|
123
|
+
requirements: []
|
124
|
+
rubyforge_project: files-hunter
|
125
|
+
rubygems_version: 1.8.24
|
126
|
+
signing_key:
|
127
|
+
specification_version: 3
|
128
|
+
summary: Analyze files to get their real format. Ideal to retrieve hidden and corrupted
|
129
|
+
files.
|
130
|
+
test_files: []
|