fileshunter 0.1.0.20130725

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,251 @@
1
+ require 'bindata'
2
+ require 'ioblockreader'
3
+ require 'rUtilAnts/Plugins'
4
+ require 'fileshunter/Segment'
5
+ require 'fileshunter/Decoder'
6
+ require 'fileshunter/BeginPatternDecoder'
7
+
8
+ module IOBlockReader
9
+
10
+ # Extend class IOBlockReader to raise exceptions when accessing off limits
11
+ class IOBlockReader
12
+
13
+ # Set limits that will trigger exceptions
14
def set_limits(begin_offset, end_offset)
  # Lower bound: offsets strictly below it are rejected by []
  @begin_offset = begin_offset
  # Upper bound: offsets greater than or equal to it are rejected by []
  @end_offset = end_offset
end
18
+
19
+ alias_method :old_squares, :[]
20
# Read data at a single offset or over a range of offsets, enforcing the limits given to set_limits.
# When no limits have been set, the access is simply delegated to the original [] method.
#
# Parameters::
# * *range* (_Fixnum_ or _Range_): Offset, or range of offsets, to read
# Result::
# * _Object_: The value returned by the original [] method for this offset or range
# Raises::
# * FilesHunter::AccessAfterDataError: When the last accessed offset is >= the end limit
# * FilesHunter::AccessBeforeDataError: When the first accessed offset is < the begin limit
def [](range)
  if (range.is_a?(Range))
    first_offset = range.first
    # Range#last returns the end value even when it is excluded (a...b): the last offset really read is then b-1
    last_offset = (range.exclude_end? ? range.last - 1 : range.last)
  else
    first_offset = range
    last_offset = range
  end
  # Limits are nil as long as set_limits has not been called: nothing to enforce in this case
  raise FilesHunter::AccessAfterDataError.new("Index out of range: #{range} (>= #{@end_offset})", @end_offset) if ((!@end_offset.nil?) and (last_offset >= @end_offset))
  raise FilesHunter::AccessBeforeDataError.new("Index out of range: #{range} (< #{@begin_offset})") if ((!@begin_offset.nil?) and (first_offset < @begin_offset))
  return old_squares(range)
end
32
+
33
+ end
34
+
35
+ end
36
+
37
+ module FilesHunter
38
+
39
# Base class of the errors raised when data is accessed outside of the limits set on the reader.
# Rescuing it catches both AccessAfterDataError and AccessBeforeDataError.
class AccessDataError < RuntimeError; end
41
+
42
# Error raised when data is accessed at or after the end limit
class AccessAfterDataError < AccessDataError

  # The offset that has been exceeded
  #   Fixnum
  attr_reader :exceeding_offset

  # Constructor
  #
  # Parameters::
  # * *message* (_String_): The error message
  # * *exceeding_offset* (_Fixnum_): The exceeding offset
  def initialize(message, exceeding_offset)
    @exceeding_offset = exceeding_offset
    super(message)
  end

end
57
+
58
# Error raised when data is accessed before the begin limit
class AccessBeforeDataError < AccessDataError; end
60
+
61
# Error used to interrupt a parsing in progress.
# It is rescued by SegmentsAnalyzer#get_segments, which then logs that the parsing was cancelled.
class CancelParsingError < RuntimeError; end
63
+
64
+ class SegmentsAnalyzer
65
+
66
+ # Is the parsing being cancelled?
67
+ # Boolean
68
+ attr_reader :parsing_cancelled
69
+
70
+ # Constructor
71
+ #
72
+ # Parameters::
73
+ # * *options* (<em>map<Symbol,Object></em>): Options [default = {}]
74
+ # * *:block_size* (_Fixnum_): Block size in bytes to read from the file at once [default = 134217728]
75
def initialize(options = {})
  # Number of bytes read at once from the file (128 MB unless specified otherwise)
  @block_size = (options[:block_size] || 134_217_728)
  # Register the decoders plugins found in the Decoders directory sitting next to this file
  decoders_dir = "#{File.dirname(__FILE__)}/Decoders"
  @plugins = RUtilAnts::Plugins::PluginsManager.new
  @plugins.parse_plugins_from_dir(:Decoders, decoders_dir, 'FilesHunter::Decoders')
  # The following variables may be accessed in a multithreaded environment
  @parsing_cancelled = false
  @nbr_bytes = nil
  @nbr_bytes_decoded = nil
end
84
+
85
+ # Get segments by analyzing a given file
86
+ #
87
+ # Parameters::
88
+ # * *file_name* (_String_): File to analyze
89
+ # Result::
90
+ # * <em>list<Segment></em>: List of segments for this file
91
def get_segments(file_name)
  # Decoders are tried in this exact order.
  # This is important as some containers can include segments of other containers:
  # * a given format MUST NOT be able to include a format listed BEFORE it,
  # * a given format CAN include a format listed AFTER it.
  ordered_decoder_names = [
    'CFBF', # includes Thumbs.db, DOC, XLS, PPT, MSI
    'ASF', # includes WMV
    'CAB', # includes CAB, MSU, MZZ
    'EXE', # includes DLL, EXE, OCX, OBJ, DRV, SYS, FON. Cannot detect data concatenated after some EXE files.
    'MPG_Video', # not generic enough
    'M2V', # not generic enough
    'EBML', # includes MKV, WEBM
    'MP4', # includes 3GP, MOV, M4A and many others
    'OGG',
    'RIFF', # includes AVI, WAV, ANI
    'FLAC',
    'BMP',
    'MP3',
    'Text', # includes TXT, LOG, SRT, RTF, HTML, XML (both ASCII-8BIT and UTF-16)
    'JPEG', # includes JPG, THM
    'TIFF',
    'ICO' # includes ICO, CUR
  ]

  segments = []
  @parsing_cancelled = false

  File.open(file_name, 'rb') do |file|
    content = IOBlockReader.init(file, :block_size => @block_size)

    @nbr_bytes = File.size(file_name)
    @nbr_bytes_decoded = 0
    log_debug "File size: #{@nbr_bytes}"
    # Start with a single unknown segment covering the whole file: decoders will split it
    segments << Segment.new(0, @nbr_bytes, :unknown, false, false, {})

    begin
      ordered_decoder_names.each do |decoder_name|
        @plugins.access_plugin(:Decoders, decoder_name) do |decoder|
          log_debug "[#{file_name}] - Try #{decoder_name}"
          # To profile a decoder, surround the following call with RubyProf.start / RubyProf.stop (require 'ruby-prof')
          segments = foreach_unknown_segment(segments) do |begin_offset, end_offset|
            log_debug "[#{file_name}] - Try #{decoder_name} for segment [#{begin_offset}, #{end_offset}]"
            # Restrict the reader to the current unknown segment before handing it to the decoder
            content.set_limits(begin_offset, end_offset)
            decoder.setup(self, content, begin_offset, end_offset)
            begin
              decoder.find_segments
            rescue AccessDataError => error
              # A decoder reading outside of its segment is logged, but what it found so far is still used
              log_err "Decoder #{decoder_name} exceeded data ranges: #{error}.\n#{error.backtrace.join("\n")}"
            end
            decoder.segments_found
          end
        end
      end
    rescue CancelParsingError
      # Cancellation is not an error: the segments computed so far are returned
      log_info "[#{file_name}] - Parsing cancelled"
    end
  end

  segments
end
154
+
155
+ # Cancel the parsing.
156
+ # This method has to be called from a different thread than the one currently running get_segments.
157
def cancel_parsing
  # Only raise the flag here: it is exposed through the parsing_cancelled reader
  @parsing_cancelled = true
end
160
+
161
+ # Add some bytes as being decoded
162
+ #
163
+ # Parameters::
164
+ # * *nbr_bytes* (_Fixnum_): Number of bytes just being decoded
165
def add_bytes_decoded(nbr_bytes)
  # Accumulate the newly decoded bytes instead of overwriting the counter with the last increment.
  # The counter is nil until get_segments resets it to 0: treat nil as 0.
  # NOTE(review): assumes callers pass an increment, as the method name and its documentation state - confirm against decoders.
  @nbr_bytes_decoded = (@nbr_bytes_decoded || 0) + nbr_bytes
  #puts "Progression: #{@nbr_bytes_decoded} / #{@nbr_bytes}"
end
169
+
170
+ # Get the current progression
171
+ #
172
+ # Result::
173
+ # * _Fixnum_: Total number of bytes
174
+ # * _Fixnum_: Total number of bytes decoded
175
def progression
  # Both values are nil as long as no parsing has been started
  [@nbr_bytes, @nbr_bytes_decoded]
end
178
+
179
+ private
180
+
181
+ # Call the block for each unknown segments.
182
+ # Blocks have to return a list of segments they managed to decode.
183
+ # Return the resulting list of segments, once split.
184
+ #
185
+ # Parameters::
186
+ # * *lst_segments* (<em>list<Segment></em>): The list of current segments to loop on
187
+ # * _Block_: Code called for each unknown segment found. This code is responsible for splitting each segment with decoded segments if possible.
188
+ # * Parameters::
189
+ # * *begin_offset* (_Fixnum_): Begin offset of the unknown segment
190
+ # * *end_offset* (_Fixnum_): End offset of the unknown segment
191
+ # * Result::
192
+ # * <em>list<Segment></em>: List of decoded segments (can be empty if none have been decoded)
193
+ # Result::
194
+ # * <em>list<Segment></em>: The resulting list of segments
195
def foreach_unknown_segment(lst_segments)
  # Step 1: split the unknown segments around the segments decoded by the block
  splitted_segments = []
  lst_segments.each do |segment|
    if (segment.extensions != [:unknown])
      # Already identified: keep it untouched
      splitted_segments << segment
      next
    end
    log_debug "Try to find segments in #{segment.begin_offset}..#{segment.end_offset}"
    decoded_segments = yield(segment.begin_offset, segment.end_offset)
    log_debug "Decoded #{decoded_segments.size} new segments: #{decoded_segments.map { |decoded_segment| "[#{decoded_segment.extensions.join(',')}#{decoded_segment.truncated ? '(truncated)' : ''}#{decoded_segment.missing_previous_data ? '(missing previous data)' : ''}:#{decoded_segment.begin_offset}..#{decoded_segment.end_offset}]" }}"
    if (decoded_segments.empty?)
      splitted_segments << segment
      next
    end
    last_written_offset = segment.begin_offset
    decoded_segments.each do |decoded_segment|
      # Data lying between 2 decoded segments stays unknown
      splitted_segments << Segment.new(last_written_offset, decoded_segment.begin_offset, :unknown, false, false, {}) if (decoded_segment.begin_offset > last_written_offset)
      splitted_segments << decoded_segment
      last_written_offset = decoded_segment.end_offset
    end
    # Data remaining after the last decoded segment stays unknown too
    splitted_segments << Segment.new(last_written_offset, segment.end_offset, :unknown, false, false, {}) if (segment.end_offset > last_written_offset)
  end

  # Step 2: merge consecutive :unknown segments
  result_segments = []
  pending_unknown = []
  # Write the run of consecutive unknown segments accumulated so far.
  # A single segment is reused as is; several ones are replaced by 1 segment covering them all.
  # Having a single merge point guarantees Segment.new always receives its 6 arguments
  # (the previous mid-list merge omitted the missing_previous_data flag).
  flush_pending_unknown = lambda do
    if (pending_unknown.size == 1)
      result_segments << pending_unknown.first
    elsif (pending_unknown.size > 1)
      result_segments << Segment.new(pending_unknown.first.begin_offset, pending_unknown.last.end_offset, :unknown, false, false, {})
    end
    pending_unknown.clear
  end
  splitted_segments.each do |segment|
    if (segment.extensions == [:unknown])
      pending_unknown << segment
    else
      flush_pending_unknown.call
      result_segments << segment
    end
  end
  # Do not forget the unknown segments ending the list
  flush_pending_unknown.call

  return result_segments
end
248
+
249
+ end
250
+
251
+ end
@@ -0,0 +1,15 @@
1
+ require 'rUtilAnts/Logging'
2
+ RUtilAnts::Logging::install_logger_on_object
3
+ require 'fileshunter/SegmentsAnalyzer'
4
+
5
module FilesHunter

  # Build a SegmentsAnalyzer, the object used to decode files.
  #
  # Parameters::
  # * *options* (<em>map<Symbol,Object></em>): Options forwarded as is to the SegmentsAnalyzer constructor. See its documentation to know possible options. [default = {}]
  # Result::
  # * _SegmentsAnalyzer_: The new analyzer
  def self.get_segments_analyzer(options = {})
    ::FilesHunter::SegmentsAnalyzer.new(options)
  end

end
metadata ADDED
@@ -0,0 +1,130 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: fileshunter
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0.20130725
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Muriel Salvan
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-07-25 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rUtilAnts
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '1.0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '1.0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: ioblockreader
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: 1.0.3
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: 1.0.3
46
+ - !ruby/object:Gem::Dependency
47
+ name: bindata
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :runtime
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ description: Analyze files and guess their true content's format. Extract hidden files
63
+ from corrupted ones. Easily extensible by adding new plug-ins for new formats. Handles
64
+ documents, videos, images, music, executables...
65
+ email: muriel@x-aeon.com
66
+ executables: []
67
+ extensions:
68
+ - ext/fileshunter/Decoders/extconf.rb
69
+ extra_rdoc_files: []
70
+ files:
71
+ - AUTHORS
72
+ - bin/fileshunt
73
+ - ChangeLog
74
+ - Credits
75
+ - ext/fileshunter/Decoders/extconf.rb
76
+ - ext/fileshunter/Decoders/_FLAC.c
77
+ - lib/fileshunter/BeginPatternDecoder.rb
78
+ - lib/fileshunter/Decoder.rb
79
+ - lib/fileshunter/Decoders/ASF.rb
80
+ - lib/fileshunter/Decoders/BMP.rb
81
+ - lib/fileshunter/Decoders/CAB.rb
82
+ - lib/fileshunter/Decoders/CFBF.rb
83
+ - lib/fileshunter/Decoders/EBML.rb
84
+ - lib/fileshunter/Decoders/EXE.rb
85
+ - lib/fileshunter/Decoders/FLAC.rb
86
+ - lib/fileshunter/Decoders/ICO.rb
87
+ - lib/fileshunter/Decoders/JPEG.rb
88
+ - lib/fileshunter/Decoders/M2V.rb
89
+ - lib/fileshunter/Decoders/MP3.rb
90
+ - lib/fileshunter/Decoders/MP4.rb
91
+ - lib/fileshunter/Decoders/MPG_Video.rb
92
+ - lib/fileshunter/Decoders/OGG.rb
93
+ - lib/fileshunter/Decoders/RIFF.rb
94
+ - lib/fileshunter/Decoders/Text.rb
95
+ - lib/fileshunter/Decoders/TIFF.rb
96
+ - lib/fileshunter/Segment.rb
97
+ - lib/fileshunter/SegmentsAnalyzer.rb
98
+ - lib/fileshunter.rb
99
+ - LICENSE
100
+ - Rakefile
101
+ - README
102
+ - README.md
103
+ - ReleaseInfo
104
+ homepage: https://github.com/Muriel-Salvan/files-hunter
105
+ licenses: []
106
+ post_install_message:
107
+ rdoc_options: []
108
+ require_paths:
109
+ - lib
110
+ - ext
111
+ required_ruby_version: !ruby/object:Gem::Requirement
112
+ none: false
113
+ requirements:
114
+ - - ! '>='
115
+ - !ruby/object:Gem::Version
116
+ version: '0'
117
+ required_rubygems_version: !ruby/object:Gem::Requirement
118
+ none: false
119
+ requirements:
120
+ - - ! '>='
121
+ - !ruby/object:Gem::Version
122
+ version: '0'
123
+ requirements: []
124
+ rubyforge_project: files-hunter
125
+ rubygems_version: 1.8.24
126
+ signing_key:
127
+ specification_version: 3
128
+ summary: Analyze files to get their real format. Ideal to retrieve hidden and corrupted
129
+ files.
130
+ test_files: []