fileshunter 0.1.0.20130725

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,251 @@
1
+ require 'bindata'
2
+ require 'ioblockreader'
3
+ require 'rUtilAnts/Plugins'
4
+ require 'fileshunter/Segment'
5
+ require 'fileshunter/Decoder'
6
+ require 'fileshunter/BeginPatternDecoder'
7
+
8
module IOBlockReader

  # Extend class IOBlockReader to raise exceptions when accessing off limits
  class IOBlockReader

    # Set limits that will trigger exceptions
    #
    # Parameters::
    # * *begin_offset* (_Fixnum_): Lowest offset that can be accessed without raising AccessBeforeDataError
    # * *end_offset* (_Fixnum_): Offset from which (included) any access raises AccessAfterDataError
    def set_limits(begin_offset, end_offset)
      @begin_offset = begin_offset
      @end_offset = end_offset
    end

    # Keep a handle on the original accessor.
    # Guarded: if this file gets loaded a second time, :[] is already the checking version below,
    # and aliasing it again would make old_squares call itself endlessly.
    alias_method :old_squares, :[] unless method_defined?(:old_squares)

    # Access data the same way the original [] does, but check the limits first.
    # Limits that have not been set (nil) are not checked, so readers on which set_limits
    # was never called keep working as they did before this extension was loaded.
    #
    # Parameters::
    # * *range* (_Fixnum_ or _Range_): Offset or range of offsets to access
    # Result::
    # * _Object_: Whatever the original [] returns for this range
    # Raises::
    # * FilesHunter::AccessAfterDataError: The last offset requested is >= the end limit
    # * FilesHunter::AccessBeforeDataError: The first offset requested is < the begin limit
    def [](range)
      if range.is_a?(Range)
        # range.last is used as is, even for exclusive ranges (same as before)
        first_offset = range.first
        last_offset = range.last
      else
        first_offset = last_offset = range
      end
      # The end limit is checked before the begin limit, as callers may rely on which error comes first
      if !@end_offset.nil? && (last_offset >= @end_offset)
        raise FilesHunter::AccessAfterDataError.new("Index out of range: #{range} (>= #{@end_offset})", @end_offset)
      end
      if !@begin_offset.nil? && (first_offset < @begin_offset)
        raise FilesHunter::AccessBeforeDataError.new("Index out of range: #{range} (< #{@begin_offset})")
      end
      old_squares(range)
    end

  end

end
36
+
37
+ module FilesHunter
38
+
39
# Common ancestor of the errors raised when data is accessed outside of the limits
# (AccessAfterDataError and AccessBeforeDataError).
class AccessDataError < RuntimeError; end
41
+
42
# Error raised when data is accessed at or after the end limit.
class AccessAfterDataError < AccessDataError

  # The offset given along with the error
  #   Fixnum
  attr_reader :exceeding_offset

  # Constructor
  #
  # Parameters::
  # * *message* (_String_): The error message
  # * *exceeding_offset* (_Fixnum_): The exceeding offset
  def initialize(message, exceeding_offset)
    @exceeding_offset = exceeding_offset
    super(message)
  end

end
57
+
58
# Error raised when data is accessed before the begin limit.
class AccessBeforeDataError < AccessDataError; end
60
+
61
# Error used to interrupt a parsing; SegmentsAnalyzer#get_segments rescues it and logs the cancellation.
class CancelParsingError < RuntimeError; end
63
+
64
class SegmentsAnalyzer

  # Names of the decoders, in the order they are tried.
  # This is important as some containers can include segments of other containers.
  # A given format MUST NOT be able to include a format specified BEFORE it in the list.
  # A given format CAN be able to include a format specified AFTER it in the list.
  DECODERS_ORDER = [
    'CFBF', # includes Thumbs.db, DOC, XLS, PPT, MSI
    'ASF', # includes WMV
    'CAB', # includes CAB, MSU, MZZ
    'EXE', # includes DLL, EXE, OCX, OBJ, DRV, SYS, FON. Cannot detect data concatenated after some EXE files.
    'MPG_Video', # not generic enough
    'M2V', # not generic enough
    'EBML', # includes MKV, WEBM
    'MP4', # includes 3GP, MOV, M4A and many others
    'OGG',
    'RIFF', # includes AVI, WAV, ANI
    'FLAC',
    'BMP',
    'MP3',
    'Text', # includes TXT, LOG, SRT, RTF, HTML, XML (both ASCII-8BIT and UTF-16)
    'JPEG', # includes JPG, THM
    'TIFF',
    'ICO' # includes ICO, CUR
  ].freeze

  # Is the parsing being cancelled?
  #   Boolean
  attr_reader :parsing_cancelled

  # Constructor
  #
  # Parameters::
  # * *options* (<em>map<Symbol,Object></em>): Options [default = {}]
  #   * *:block_size* (_Fixnum_): Block size in bytes to read from the file at once [default = 134217728]
  def initialize(options = {})
    @block_size = (options[:block_size] || 134217728)
    @plugins = RUtilAnts::Plugins::PluginsManager.new
    @plugins.parse_plugins_from_dir(:Decoders, "#{File.dirname(__FILE__)}/Decoders", 'FilesHunter::Decoders')
    # Following are variables that may be accessed in a multithreaded environment
    @parsing_cancelled = false
    @nbr_bytes = nil
    @nbr_bytes_decoded = nil
  end

  # Get segments by analyzing a given file.
  # The file starts as a single :unknown segment; each decoder of DECODERS_ORDER is then given
  # every segment that is still :unknown and may split it with the segments it recognizes.
  # A CancelParsingError raised while decoding stops the analysis: segments found so far are returned.
  #
  # Parameters::
  # * *file_name* (_String_): File to analyze
  # Result::
  # * <em>list<Segment></em>: List of segments for this file
  def get_segments(file_name)
    segments = []

    @parsing_cancelled = false

    File.open(file_name, 'rb') do |file|
      content = IOBlockReader.init(file, :block_size => @block_size)

      @nbr_bytes = File.size(file_name)
      @nbr_bytes_decoded = 0
      log_debug "File size: #{@nbr_bytes}"
      segments << Segment.new(0, @nbr_bytes, :unknown, false, false, {})

      begin
        DECODERS_ORDER.each do |decoder_name|
          @plugins.access_plugin(:Decoders, decoder_name) do |decoder|
            log_debug "[#{file_name}] - Try #{decoder_name}"
            # require 'ruby-prof'
            # RubyProf.start
            segments = foreach_unknown_segment(segments) do |begin_offset, end_offset|
              log_debug "[#{file_name}] - Try #{decoder_name} for segment [#{begin_offset}, #{end_offset}]"
              # Make the reader raise if the decoder reads outside of the segment it was given
              content.set_limits(begin_offset, end_offset)
              decoder.setup(self, content, begin_offset, end_offset)
              begin
                decoder.find_segments
              rescue AccessDataError => error
                # Not fatal: keep whatever the decoder found before going off limits
                log_err "Decoder #{decoder_name} exceeded data ranges: #{error}.\n#{error.backtrace.join("\n")}"
              end
              next decoder.segments_found
            end
            # result = RubyProf.stop
            # RubyProf::FlatPrinter.new(result).print(STDOUT)
          end
        end
      rescue CancelParsingError
        log_info "[#{file_name}] - Parsing cancelled"
      end
    end

    segments
  end

  # Cancel the parsing.
  # This method has to be called from a different thread than the one who is currently calling get_segments.
  def cancel_parsing
    @parsing_cancelled = true
  end

  # Record the number of bytes decoded so far.
  # NOTE(review): despite its name, this replaces the counter instead of adding to it.
  # Presumably callers give an absolute amount - to be confirmed against the decoders.
  #
  # Parameters::
  # * *nbr_bytes* (_Fixnum_): Number of bytes just being decoded
  def add_bytes_decoded(nbr_bytes)
    @nbr_bytes_decoded = nbr_bytes
    #puts "Progression: #{@nbr_bytes_decoded} / #{@nbr_bytes}"
  end

  # Get the current progression
  #
  # Result::
  # * _Fixnum_: Total number of bytes
  # * _Fixnum_: Total number of bytes decoded
  def progression
    return @nbr_bytes, @nbr_bytes_decoded
  end

  private

  # Call the block for each unknown segments.
  # Blocks have to return a list of segments they managed to decode.
  # Return the list of segments splitted.
  #
  # Parameters::
  # * *lst_segments* (<em>list<Segment></em>): The list of current segments to loop on
  # * _Block_: Code called for each unknown segment found. This code is responsible for splitting each segment with decoded segments if possible.
  #   * Parameters::
  #     * *begin_offset* (_Fixnum_): Begin offset of the unknown segment
  #     * *end_offset* (_Fixnum_): End offset of the unknown segment
  #   * Result::
  #     * <em>list<Segment></em>: List of decoded segments (can be empty if none have been decoded)
  # Result::
  # * <em>list<Segment></em>: The resulting list of segments
  def foreach_unknown_segment(lst_segments)
    # Split segments that can be decoded
    splitted_segments = []
    lst_segments.each do |segment|
      unless segment.extensions == [:unknown]
        splitted_segments << segment
        next
      end
      log_debug "Try to find segments in #{segment.begin_offset}..#{segment.end_offset}"
      decoded_segments = yield(segment.begin_offset, segment.end_offset)
      log_debug "Decoded #{decoded_segments.size} new segments: #{decoded_segments.map { |decoded_segment| "[#{decoded_segment.extensions.join(',')}#{decoded_segment.truncated ? '(truncated)' : ''}#{decoded_segment.missing_previous_data ? '(missing previous data)' : ''}:#{decoded_segment.begin_offset}..#{decoded_segment.end_offset}]" }}"
      if decoded_segments.empty?
        splitted_segments << segment
        next
      end
      # Fill the gaps before, between and after decoded segments with :unknown segments
      last_written_offset = segment.begin_offset
      decoded_segments.each do |decoded_segment|
        if decoded_segment.begin_offset > last_written_offset
          splitted_segments << Segment.new(last_written_offset, decoded_segment.begin_offset, :unknown, false, false, {})
        end
        splitted_segments << decoded_segment
        last_written_offset = decoded_segment.end_offset
      end
      if segment.end_offset > last_written_offset
        splitted_segments << Segment.new(last_written_offset, segment.end_offset, :unknown, false, false, {})
      end
    end

    # Merge consecutives :unknown segments
    result_segments = []
    unknown_run = []
    splitted_segments.each do |segment|
      if segment.extensions == [:unknown]
        unknown_run << segment
      else
        flush_unknown_run(unknown_run, result_segments)
        result_segments << segment
      end
    end
    flush_unknown_run(unknown_run, result_segments)

    result_segments
  end

  # Append a run of consecutive :unknown segments to the result, then empty the run.
  # A single segment is appended as is; several ones are replaced by 1 :unknown segment covering them all.
  # Both the middle and the trailing merges go through here, so that they build the merged
  # Segment with the very same (complete) list of arguments.
  #
  # Parameters::
  # * *unknown_run* (<em>list<Segment></em>): Consecutive :unknown segments (can be empty). Emptied by this call.
  # * *result_segments* (<em>list<Segment></em>): List receiving the resulting segment
  def flush_unknown_run(unknown_run, result_segments)
    if unknown_run.size == 1
      # Just 1 unknown segment previously encountered
      result_segments << unknown_run.first
    elsif unknown_run.size > 1
      # Several consecutive segments encountered: merge them
      result_segments << Segment.new(unknown_run.first.begin_offset, unknown_run.last.end_offset, :unknown, false, false, {})
    end
    unknown_run.clear
  end

end
250
+
251
+ end
@@ -0,0 +1,15 @@
1
+ require 'rUtilAnts/Logging'
2
+ RUtilAnts::Logging::install_logger_on_object
3
+ require 'fileshunter/SegmentsAnalyzer'
4
+
5
module FilesHunter

  class << self

    # Get a SegmentsAnalyzer that can be used to decode files.
    #
    # Parameters::
    # * *options* (<em>map<Symbol,Object></em>): Options given to the SegmentsAnalyzer. See its documentation to know possible options. [default = {}]
    # Result::
    # * _SegmentsAnalyzer_: A new analyzer, built with those options
    def get_segments_analyzer(options = {})
      ::FilesHunter::SegmentsAnalyzer.new(options)
    end

  end

end
metadata ADDED
@@ -0,0 +1,130 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: fileshunter
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0.20130725
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Muriel Salvan
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-07-25 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rUtilAnts
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '1.0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '1.0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: ioblockreader
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: 1.0.3
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: 1.0.3
46
+ - !ruby/object:Gem::Dependency
47
+ name: bindata
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :runtime
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ description: Analyze files and guess their true content's format. Extract hidden files
63
+ from corrupted ones. Easily extensible by adding new plug-ins for new formats. Handles
64
+ documents, videos, images, music, executables...
65
+ email: muriel@x-aeon.com
66
+ executables: []
67
+ extensions:
68
+ - ext/fileshunter/Decoders/extconf.rb
69
+ extra_rdoc_files: []
70
+ files:
71
+ - AUTHORS
72
+ - bin/fileshunt
73
+ - ChangeLog
74
+ - Credits
75
+ - ext/fileshunter/Decoders/extconf.rb
76
+ - ext/fileshunter/Decoders/_FLAC.c
77
+ - lib/fileshunter/BeginPatternDecoder.rb
78
+ - lib/fileshunter/Decoder.rb
79
+ - lib/fileshunter/Decoders/ASF.rb
80
+ - lib/fileshunter/Decoders/BMP.rb
81
+ - lib/fileshunter/Decoders/CAB.rb
82
+ - lib/fileshunter/Decoders/CFBF.rb
83
+ - lib/fileshunter/Decoders/EBML.rb
84
+ - lib/fileshunter/Decoders/EXE.rb
85
+ - lib/fileshunter/Decoders/FLAC.rb
86
+ - lib/fileshunter/Decoders/ICO.rb
87
+ - lib/fileshunter/Decoders/JPEG.rb
88
+ - lib/fileshunter/Decoders/M2V.rb
89
+ - lib/fileshunter/Decoders/MP3.rb
90
+ - lib/fileshunter/Decoders/MP4.rb
91
+ - lib/fileshunter/Decoders/MPG_Video.rb
92
+ - lib/fileshunter/Decoders/OGG.rb
93
+ - lib/fileshunter/Decoders/RIFF.rb
94
+ - lib/fileshunter/Decoders/Text.rb
95
+ - lib/fileshunter/Decoders/TIFF.rb
96
+ - lib/fileshunter/Segment.rb
97
+ - lib/fileshunter/SegmentsAnalyzer.rb
98
+ - lib/fileshunter.rb
99
+ - LICENSE
100
+ - Rakefile
101
+ - README
102
+ - README.md
103
+ - ReleaseInfo
104
+ homepage: https://github.com/Muriel-Salvan/files-hunter
105
+ licenses: []
106
+ post_install_message:
107
+ rdoc_options: []
108
+ require_paths:
109
+ - lib
110
+ - ext
111
+ required_ruby_version: !ruby/object:Gem::Requirement
112
+ none: false
113
+ requirements:
114
+ - - ! '>='
115
+ - !ruby/object:Gem::Version
116
+ version: '0'
117
+ required_rubygems_version: !ruby/object:Gem::Requirement
118
+ none: false
119
+ requirements:
120
+ - - ! '>='
121
+ - !ruby/object:Gem::Version
122
+ version: '0'
123
+ requirements: []
124
+ rubyforge_project: files-hunter
125
+ rubygems_version: 1.8.24
126
+ signing_key:
127
+ specification_version: 3
128
+ summary: Analyze files to get their real format. Ideal to retrieve hidden and corrupted
129
+ files.
130
+ test_files: []