cabriolet 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. checksums.yaml +4 -4
  2. data/README.adoc +3 -0
  3. data/lib/cabriolet/binary/bitstream.rb +32 -21
  4. data/lib/cabriolet/binary/bitstream_writer.rb +21 -4
  5. data/lib/cabriolet/cab/compressor.rb +85 -53
  6. data/lib/cabriolet/cab/decompressor.rb +2 -1
  7. data/lib/cabriolet/cab/extractor.rb +2 -35
  8. data/lib/cabriolet/cab/file_compression_work.rb +52 -0
  9. data/lib/cabriolet/cab/file_compression_worker.rb +89 -0
  10. data/lib/cabriolet/checksum.rb +49 -0
  11. data/lib/cabriolet/collections/file_collection.rb +175 -0
  12. data/lib/cabriolet/compressors/quantum.rb +3 -51
  13. data/lib/cabriolet/decompressors/quantum.rb +81 -52
  14. data/lib/cabriolet/extraction/base_extractor.rb +88 -0
  15. data/lib/cabriolet/extraction/extractor.rb +171 -0
  16. data/lib/cabriolet/extraction/file_extraction_work.rb +60 -0
  17. data/lib/cabriolet/extraction/file_extraction_worker.rb +106 -0
  18. data/lib/cabriolet/format_base.rb +79 -0
  19. data/lib/cabriolet/hlp/quickhelp/compressor.rb +28 -503
  20. data/lib/cabriolet/hlp/quickhelp/file_writer.rb +125 -0
  21. data/lib/cabriolet/hlp/quickhelp/offset_calculator.rb +61 -0
  22. data/lib/cabriolet/hlp/quickhelp/structure_builder.rb +93 -0
  23. data/lib/cabriolet/hlp/quickhelp/topic_builder.rb +52 -0
  24. data/lib/cabriolet/hlp/quickhelp/topic_compressor.rb +83 -0
  25. data/lib/cabriolet/huffman/encoder.rb +15 -12
  26. data/lib/cabriolet/lit/compressor.rb +45 -689
  27. data/lib/cabriolet/lit/content_encoder.rb +76 -0
  28. data/lib/cabriolet/lit/content_type_detector.rb +50 -0
  29. data/lib/cabriolet/lit/directory_builder.rb +153 -0
  30. data/lib/cabriolet/lit/guid_generator.rb +16 -0
  31. data/lib/cabriolet/lit/header_writer.rb +124 -0
  32. data/lib/cabriolet/lit/piece_builder.rb +74 -0
  33. data/lib/cabriolet/lit/structure_builder.rb +252 -0
  34. data/lib/cabriolet/quantum_shared.rb +105 -0
  35. data/lib/cabriolet/version.rb +1 -1
  36. data/lib/cabriolet.rb +114 -3
  37. metadata +38 -4
  38. data/lib/cabriolet/auto.rb +0 -173
  39. data/lib/cabriolet/parallel.rb +0 -333
@@ -1,173 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require_relative "format_detector"
4
-
5
- module Cabriolet
6
- # Auto-detection and extraction module
7
- module Auto
8
- class << self
9
- # Open and parse an archive with automatic format detection
10
- #
11
- # @param path [String] Path to the archive file
12
- # @param options [Hash] Options to pass to the parser
13
- # @return [Object] Parsed archive object
14
- # @raise [UnsupportedFormatError] if format cannot be detected or is unsupported
15
- #
16
- # @example
17
- # archive = Cabriolet::Auto.open('unknown.archive')
18
- # archive.files.each { |f| puts f.name }
19
- def open(path, **options)
20
- format = FormatDetector.detect(path)
21
- unless format
22
- raise UnsupportedFormatError,
23
- "Unable to detect format for: #{path}"
24
- end
25
-
26
- parser_class = FormatDetector.format_to_parser(format)
27
- unless parser_class
28
- raise UnsupportedFormatError,
29
- "No parser available for format: #{format}"
30
- end
31
-
32
- parser_class.new(**options).parse(path)
33
- end
34
-
35
- # Detect format and extract all files automatically
36
- #
37
- # @param archive_path [String] Path to the archive
38
- # @param output_dir [String] Directory to extract to
39
- # @param options [Hash] Extraction options
40
- # @option options [Boolean] :preserve_paths (true) Preserve directory structure
41
- # @option options [Boolean] :overwrite (false) Overwrite existing files
42
- # @option options [Boolean] :parallel (false) Use parallel extraction
43
- # @option options [Integer] :workers (4) Number of parallel workers
44
- # @return [Hash] Extraction statistics
45
- #
46
- # @example
47
- # Cabriolet::Auto.extract('archive.cab', 'output/')
48
- # Cabriolet::Auto.extract('file.chm', 'docs/', parallel: true, workers: 8)
49
- def extract(archive_path, output_dir, **options)
50
- archive = open(archive_path)
51
-
52
- extractor = if options[:parallel]
53
- ParallelExtractor.new(archive, output_dir, **options)
54
- else
55
- SimpleExtractor.new(archive, output_dir, **options)
56
- end
57
-
58
- extractor.extract_all
59
- end
60
-
61
- # Detect format only without parsing
62
- #
63
- # @param path [String] Path to the file
64
- # @return [Symbol, nil] Detected format symbol or nil
65
- #
66
- # @example
67
- # format = Cabriolet::Auto.detect_format('file.cab')
68
- # # => :cab
69
- def detect_format(path)
70
- FormatDetector.detect(path)
71
- end
72
-
73
- # Get information about an archive without full extraction
74
- #
75
- # @param path [String] Path to the archive
76
- # @return [Hash] Archive information
77
- #
78
- # @example
79
- # info = Cabriolet::Auto.info('archive.cab')
80
- # # => { format: :cab, file_count: 145, total_size: 52428800, ... }
81
- def info(path)
82
- archive = open(path)
83
- format = detect_format(path)
84
-
85
- {
86
- format: format,
87
- path: path,
88
- file_count: archive.files.count,
89
- total_size: archive.files.sum { |f| f.size || 0 },
90
- compressed_size: File.size(path),
91
- compression_ratio: calculate_compression_ratio(archive, path),
92
- files: archive.files.map { |f| file_info(f) },
93
- }
94
- end
95
-
96
- private
97
-
98
- def calculate_compression_ratio(archive, path)
99
- total_uncompressed = archive.files.sum { |f| f.size || 0 }
100
- compressed = File.size(path)
101
-
102
- return 0 if total_uncompressed.zero?
103
-
104
- ((compressed.to_f / total_uncompressed) * 100).round(2)
105
- end
106
-
107
- def file_info(file)
108
- {
109
- name: file.name,
110
- size: file.size,
111
- compressed_size: file.respond_to?(:compressed_size) ? file.compressed_size : nil,
112
- attributes: file.respond_to?(:attributes) ? file.attributes : nil,
113
- date: file.respond_to?(:date) ? file.date : nil,
114
- time: file.respond_to?(:time) ? file.time : nil,
115
- }
116
- end
117
- end
118
-
119
- # Simple sequential extractor
120
- class SimpleExtractor
121
- def initialize(archive, output_dir, **options)
122
- @archive = archive
123
- @output_dir = output_dir
124
- @options = options
125
- @preserve_paths = options.fetch(:preserve_paths, true)
126
- @overwrite = options.fetch(:overwrite, false)
127
- @stats = { extracted: 0, skipped: 0, failed: 0, bytes: 0 }
128
- end
129
-
130
- def extract_all
131
- FileUtils.mkdir_p(@output_dir)
132
-
133
- @archive.files.each do |file|
134
- extract_file(file)
135
- end
136
-
137
- @stats
138
- end
139
-
140
- private
141
-
142
- def extract_file(file)
143
- output_path = build_output_path(file.name)
144
-
145
- if File.exist?(output_path) && !@overwrite
146
- @stats[:skipped] += 1
147
- return
148
- end
149
-
150
- FileUtils.mkdir_p(File.dirname(output_path))
151
- File.write(output_path, file.data, mode: "wb")
152
-
153
- @stats[:extracted] += 1
154
- @stats[:bytes] += file.data.bytesize
155
- rescue StandardError => e
156
- @stats[:failed] += 1
157
- warn "Failed to extract #{file.name}: #{e.message}"
158
- end
159
-
160
- def build_output_path(filename)
161
- if @preserve_paths
162
- # Keep directory structure
163
- clean_name = filename.gsub("\\", "/")
164
- File.join(@output_dir, clean_name)
165
- else
166
- # Flatten to output directory
167
- base_name = File.basename(filename.gsub("\\", "/"))
168
- File.join(@output_dir, base_name)
169
- end
170
- end
171
- end
172
- end
173
- end
@@ -1,333 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Cabriolet
4
- # Parallel extraction for multi-core performance
5
- module Parallel
6
- # Parallel extractor for archives
7
- class Extractor
8
- DEFAULT_WORKERS = 4
9
-
10
- def initialize(archive, output_dir, workers: DEFAULT_WORKERS, **options)
11
- @archive = archive
12
- @output_dir = output_dir
13
- @workers = [workers, 1].max # At least 1 worker
14
- @options = options
15
- @preserve_paths = options.fetch(:preserve_paths, true)
16
- @overwrite = options.fetch(:overwrite, false)
17
- @queue = Queue.new
18
- @stats = { extracted: 0, skipped: 0, failed: 0, bytes: 0 }
19
- @stats_mutex = Mutex.new
20
- end
21
-
22
- # Extract all files using parallel workers
23
- #
24
- # @return [Hash] Extraction statistics
25
- #
26
- # @example
27
- # extractor = Cabriolet::Parallel::Extractor.new(cab, 'output/', workers: 8)
28
- # stats = extractor.extract_all
29
- def extract_all
30
- FileUtils.mkdir_p(@output_dir)
31
-
32
- # Queue all files
33
- @archive.files.each { |file| @queue << file }
34
-
35
- # Add termination signals
36
- @workers.times { @queue << :done }
37
-
38
- # Start worker threads
39
- threads = Array.new(@workers) do |worker_id|
40
- Thread.new { worker_loop(worker_id) }
41
- end
42
-
43
- # Wait for all workers to complete
44
- threads.each(&:join)
45
-
46
- @stats
47
- end
48
-
49
- # Extract files with progress callback
50
- #
51
- # @yield [current, total, file] Progress callback
52
- # @return [Hash] Extraction statistics
53
- #
54
- # @example
55
- # extractor.extract_with_progress do |current, total, file|
56
- # puts "#{current}/#{total}: #{file.name}"
57
- # end
58
- def extract_with_progress(&block)
59
- return extract_all unless block
60
-
61
- total = @archive.files.count
62
- current = 0
63
- current_mutex = Mutex.new
64
-
65
- FileUtils.mkdir_p(@output_dir)
66
-
67
- # Queue all files
68
- @archive.files.each { |file| @queue << file }
69
- @workers.times { @queue << :done }
70
-
71
- # Start worker threads with progress
72
- threads = Array.new(@workers) do |_worker_id|
73
- Thread.new do
74
- loop do
75
- file = @queue.pop
76
- break if file == :done
77
-
78
- extract_file(file)
79
-
80
- current_mutex.synchronize do
81
- current += 1
82
- yield(current, total, file)
83
- end
84
- end
85
- end
86
- end
87
-
88
- threads.each(&:join)
89
- @stats
90
- end
91
-
92
- private
93
-
94
- def worker_loop(_worker_id)
95
- loop do
96
- file = @queue.pop
97
- break if file == :done
98
-
99
- extract_file(file)
100
- end
101
- end
102
-
103
- def extract_file(file)
104
- output_path = build_output_path(file.name)
105
-
106
- if File.exist?(output_path) && !@overwrite
107
- update_stats(:skipped)
108
- return
109
- end
110
-
111
- begin
112
- # Create directory (thread-safe)
113
- FileUtils.mkdir_p(File.dirname(output_path))
114
-
115
- # Extract file data
116
- data = file.data
117
-
118
- # Write file (one at a time per file)
119
- File.write(output_path, data, mode: "wb")
120
-
121
- # Preserve timestamps if available
122
- if file.respond_to?(:datetime) && file.datetime
123
- File.utime(File.atime(output_path), file.datetime, output_path)
124
- end
125
-
126
- update_stats(:extracted, data.bytesize)
127
- rescue StandardError => e
128
- update_stats(:failed)
129
- warn "Worker error extracting #{file.name}: #{e.message}"
130
- end
131
- end
132
-
133
- def build_output_path(filename)
134
- if @preserve_paths
135
- clean_name = filename.gsub("\\", "/")
136
- File.join(@output_dir, clean_name)
137
- else
138
- base_name = File.basename(filename.gsub("\\", "/"))
139
- File.join(@output_dir, base_name)
140
- end
141
- end
142
-
143
- def update_stats(stat_type, bytes = 0)
144
- @stats_mutex.synchronize do
145
- @stats[stat_type] += 1
146
- @stats[:bytes] += bytes if bytes.positive?
147
- end
148
- end
149
- end
150
-
151
- # Parallel batch processor
152
- class BatchProcessor
153
- def initialize(workers: Extractor::DEFAULT_WORKERS)
154
- @workers = workers
155
- @stats = { total: 0, successful: 0, failed: 0 }
156
- @stats_mutex = Mutex.new
157
- end
158
-
159
- # Process multiple archives in parallel
160
- #
161
- # @param archive_paths [Array<String>] Paths to archives
162
- # @param output_base [String] Base output directory
163
- # @yield [archive_path, stats] Optional callback per archive
164
- # @return [Hash] Overall statistics
165
- #
166
- # @example
167
- # processor = Cabriolet::Parallel::BatchProcessor.new(workers: 8)
168
- # stats = processor.process_all(Dir.glob('*.cab'), 'output/')
169
- def process_all(archive_paths, output_base, &block)
170
- queue = Queue.new
171
- archive_paths.each { |path| queue << path }
172
- @workers.times { queue << :done }
173
-
174
- threads = Array.new(@workers) do
175
- Thread.new { process_loop(queue, output_base, &block) }
176
- end
177
-
178
- threads.each(&:join)
179
- @stats
180
- end
181
-
182
- private
183
-
184
- def process_loop(queue, output_base, &block)
185
- loop do
186
- archive_path = queue.pop
187
- break if archive_path == :done
188
-
189
- process_one(archive_path, output_base, &block)
190
- end
191
- end
192
-
193
- def process_one(archive_path, output_base)
194
- update_stats(:total)
195
-
196
- begin
197
- archive = Cabriolet::Auto.open(archive_path)
198
- output_dir = File.join(output_base, File.basename(archive_path, ".*"))
199
-
200
- extractor = Extractor.new(archive, output_dir, workers: 2)
201
- stats = extractor.extract_all
202
-
203
- update_stats(:successful)
204
-
205
- yield(archive_path, stats) if block_given?
206
- rescue StandardError => e
207
- update_stats(:failed)
208
- warn "Failed to process #{archive_path}: #{e.message}"
209
- end
210
- end
211
-
212
- def update_stats(stat_type)
213
- @stats_mutex.synchronize do
214
- @stats[stat_type] += 1
215
- end
216
- end
217
-
218
- attr_reader :stats
219
- end
220
-
221
- # Thread pool for custom parallel operations
222
- class ThreadPool
223
- def initialize(size: Extractor::DEFAULT_WORKERS)
224
- @size = size
225
- @queue = Queue.new
226
- @threads = []
227
- @running = false
228
- end
229
-
230
- # Start the thread pool
231
- def start
232
- return if @running
233
-
234
- @running = true
235
- @threads = Array.new(@size) do
236
- Thread.new { worker_loop }
237
- end
238
- end
239
-
240
- # Submit a task to the pool
241
- #
242
- # @yield Task to execute
243
- def submit(&block)
244
- start unless @running
245
- @queue << block
246
- end
247
-
248
- # Shutdown the thread pool
249
- #
250
- # @param wait [Boolean] Wait for pending tasks to complete
251
- def shutdown(wait: true)
252
- return unless @running
253
-
254
- if wait
255
- # Wait for queue to empty
256
- sleep 0.01 until @queue.empty?
257
- end
258
-
259
- # Send termination signals
260
- @size.times { @queue << :shutdown }
261
-
262
- # Wait for threads to finish
263
- @threads.each(&:join)
264
- @threads.clear
265
- @running = false
266
- end
267
-
268
- # Execute tasks in parallel with automatic cleanup
269
- #
270
- # @param items [Array] Items to process
271
- # @yield [item] Process each item
272
- # @return [Array] Results from each task
273
- def map(items)
274
- start
275
- results = []
276
- results_mutex = Mutex.new
277
-
278
- items.each_with_index do |item, index|
279
- submit do
280
- result = yield(item)
281
- results_mutex.synchronize do
282
- results[index] = result
283
- end
284
- end
285
- end
286
-
287
- shutdown(wait: true)
288
- results
289
- end
290
-
291
- private
292
-
293
- def worker_loop
294
- loop do
295
- task = @queue.pop
296
- break if task == :shutdown
297
-
298
- begin
299
- task.call
300
- rescue StandardError => e
301
- warn "Thread pool worker error: #{e.message}"
302
- end
303
- end
304
- end
305
- end
306
-
307
- class << self
308
- # Extract archive using parallel workers
309
- #
310
- # @param archive [Object] Archive object
311
- # @param output_dir [String] Output directory
312
- # @param workers [Integer] Number of parallel workers
313
- # @return [Hash] Extraction statistics
314
- def extract(archive, output_dir, workers: Extractor::DEFAULT_WORKERS,
315
- **options)
316
- extractor = Extractor.new(archive, output_dir, workers: workers,
317
- **options)
318
- extractor.extract_all
319
- end
320
-
321
- # Process multiple archives in parallel
322
- #
323
- # @param paths [Array<String>] Archive paths
324
- # @param output_base [String] Base output directory
325
- # @param workers [Integer] Number of parallel workers
326
- # @return [Hash] Processing statistics
327
- def process_batch(paths, output_base, workers: Extractor::DEFAULT_WORKERS)
328
- processor = BatchProcessor.new(workers: workers)
329
- processor.process_all(paths, output_base)
330
- end
331
- end
332
- end
333
- end