wp2txt 1.1.3 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. checksums.yaml +4 -4
  2. data/.dockerignore +12 -0
  3. data/.github/workflows/ci.yml +13 -13
  4. data/.gitignore +14 -0
  5. data/CHANGELOG.md +284 -0
  6. data/DEVELOPMENT.md +415 -0
  7. data/DEVELOPMENT_ja.md +415 -0
  8. data/Dockerfile +19 -10
  9. data/Gemfile +2 -8
  10. data/README.md +259 -123
  11. data/README_ja.md +375 -0
  12. data/Rakefile +4 -0
  13. data/bin/wp2txt +863 -161
  14. data/lib/wp2txt/article.rb +98 -13
  15. data/lib/wp2txt/bz2_validator.rb +239 -0
  16. data/lib/wp2txt/category_cache.rb +313 -0
  17. data/lib/wp2txt/cli.rb +319 -0
  18. data/lib/wp2txt/cli_ui.rb +428 -0
  19. data/lib/wp2txt/config.rb +158 -0
  20. data/lib/wp2txt/constants.rb +134 -0
  21. data/lib/wp2txt/data/html_entities.json +2135 -0
  22. data/lib/wp2txt/data/language_metadata.json +4769 -0
  23. data/lib/wp2txt/data/language_tiers.json +59 -0
  24. data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
  25. data/lib/wp2txt/data/template_aliases.json +193 -0
  26. data/lib/wp2txt/data/wikipedia_entities.json +12 -0
  27. data/lib/wp2txt/extractor.rb +545 -0
  28. data/lib/wp2txt/file_utils.rb +91 -0
  29. data/lib/wp2txt/formatter.rb +352 -0
  30. data/lib/wp2txt/global_data_cache.rb +353 -0
  31. data/lib/wp2txt/index_cache.rb +258 -0
  32. data/lib/wp2txt/magic_words.rb +353 -0
  33. data/lib/wp2txt/memory_monitor.rb +236 -0
  34. data/lib/wp2txt/multistream.rb +1383 -0
  35. data/lib/wp2txt/output_writer.rb +182 -0
  36. data/lib/wp2txt/parser_functions.rb +606 -0
  37. data/lib/wp2txt/ractor_worker.rb +215 -0
  38. data/lib/wp2txt/regex.rb +396 -12
  39. data/lib/wp2txt/section_extractor.rb +354 -0
  40. data/lib/wp2txt/stream_processor.rb +271 -0
  41. data/lib/wp2txt/template_expander.rb +830 -0
  42. data/lib/wp2txt/text_processing.rb +337 -0
  43. data/lib/wp2txt/utils.rb +629 -270
  44. data/lib/wp2txt/version.rb +1 -1
  45. data/lib/wp2txt.rb +53 -26
  46. data/scripts/benchmark_regex.rb +161 -0
  47. data/scripts/fetch_html_entities.rb +94 -0
  48. data/scripts/fetch_language_metadata.rb +180 -0
  49. data/scripts/fetch_mediawiki_data.rb +334 -0
  50. data/scripts/fetch_template_data.rb +186 -0
  51. data/scripts/profile_memory.rb +139 -0
  52. data/spec/article_spec.rb +402 -0
  53. data/spec/auto_download_spec.rb +314 -0
  54. data/spec/bz2_validator_spec.rb +193 -0
  55. data/spec/category_cache_spec.rb +226 -0
  56. data/spec/category_fetcher_spec.rb +504 -0
  57. data/spec/cleanup_spec.rb +197 -0
  58. data/spec/cli_options_spec.rb +678 -0
  59. data/spec/cli_spec.rb +876 -0
  60. data/spec/config_spec.rb +194 -0
  61. data/spec/constants_spec.rb +138 -0
  62. data/spec/file_utils_spec.rb +170 -0
  63. data/spec/fixtures/samples.rb +181 -0
  64. data/spec/formatter_sections_spec.rb +382 -0
  65. data/spec/global_data_cache_spec.rb +186 -0
  66. data/spec/index_cache_spec.rb +210 -0
  67. data/spec/integration_spec.rb +543 -0
  68. data/spec/magic_words_spec.rb +261 -0
  69. data/spec/markers_spec.rb +476 -0
  70. data/spec/memory_monitor_spec.rb +192 -0
  71. data/spec/multistream_spec.rb +690 -0
  72. data/spec/output_writer_spec.rb +400 -0
  73. data/spec/parser_functions_spec.rb +455 -0
  74. data/spec/ractor_worker_spec.rb +197 -0
  75. data/spec/regex_spec.rb +281 -0
  76. data/spec/section_extractor_spec.rb +397 -0
  77. data/spec/spec_helper.rb +63 -0
  78. data/spec/stream_processor_spec.rb +579 -0
  79. data/spec/template_data_spec.rb +246 -0
  80. data/spec/template_expander_spec.rb +472 -0
  81. data/spec/template_processing_spec.rb +217 -0
  82. data/spec/text_processing_spec.rb +312 -0
  83. data/spec/utils_spec.rb +195 -16
  84. data/spec/wp2txt_spec.rb +510 -0
  85. data/wp2txt.gemspec +5 -3
  86. metadata +146 -18
  87. data/.rubocop.yml +0 -80
  88. data/data/output_samples/testdata_en.txt +0 -23002
  89. data/data/output_samples/testdata_en_category.txt +0 -132
  90. data/data/output_samples/testdata_en_summary.txt +0 -1376
  91. data/data/output_samples/testdata_ja.txt +0 -22774
  92. data/data/output_samples/testdata_ja_category.txt +0 -206
  93. data/data/output_samples/testdata_ja_summary.txt +0 -1560
  94. data/data/testdata_en.bz2 +0 -0
  95. data/data/testdata_ja.bz2 +0 -0
  96. data/image/screenshot.png +0 -0
@@ -0,0 +1,428 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "pastel"
4
+ require "tty-spinner"
5
+ require "tty-progressbar"
6
+ require_relative "constants"
7
+
8
+ module Wp2txt
9
# CLI UI helper module for consistent styling.
#
# Intended to be mixed into an object; it keeps its settings in the
# instance variables @no_color, @quiet and @pastel of the including object.
module CliUI
  # Exit codes for CLI
  EXIT_SUCCESS = 0
  EXIT_ERROR = 1
  EXIT_PARTIAL = 2 # Partial success (e.g., some articles not found)

  # Icons for status indicators
  ICONS = {
    success: "✔",
    error: "✖",
    warning: "!",
    info: "ℹ",
    arrow: "→",
    bullet: "•",
    check: "✓",
    cross: "✗",
    star: "★"
  }.freeze

  # Border colour of the summary box for each status
  SUMMARY_COLORS = { success: :green, warning: :yellow, error: :red }.freeze

  # Minimum inner width of the summary box (it grows for longer content)
  SUMMARY_MIN_WIDTH = 40

  # Articles per MB of compressed dump used by the file-size fallback
  ARTICLES_PER_MB = 200

  # Configure UI settings
  # @param no_color [Boolean] Disable colors
  # @param quiet [Boolean] Suppress progress output
  def configure_ui(no_color: false, quiet: false)
    # Drop any memoized Pastel so the new colour setting takes effect even
    # when something was already printed before configuration.
    @pastel = nil
    @no_color = no_color || color_disabled_by_env?
    @quiet = quiet
  end

  # Check if color is disabled by environment
  # @return [Boolean]
  def color_disabled_by_env?
    # NO_COLOR is a standard: https://no-color.org/
    # It only applies when the variable is present AND not an empty string.
    !ENV.fetch("NO_COLOR", "").empty? || ENV["TERM"] == "dumb"
  end

  # Check if quiet mode is enabled
  # @return [Boolean]
  def quiet?
    @quiet || false
  end

  # Check if color is disabled
  # @return [Boolean]
  def no_color?
    @no_color || false
  end

  # Lazily build (and memoize) the Pastel instance used for colors
  def pastel
    @pastel ||= Pastel.new(enabled: !no_color?)
  end

  # Reset pastel instance so the next call to #pastel rebuilds it
  def reset_pastel!
    @pastel = nil
  end

  # Print a section header with optional step indicator
  # @param title [String] Section title
  # @param step [Integer, nil] Current step number
  # @param total_steps [Integer, nil] Total number of steps
  def print_header(title, step: nil, total_steps: nil)
    return if quiet?

    puts
    if step && total_steps
      step_indicator = pastel.dim("[#{step}/#{total_steps}]")
      puts "#{step_indicator} #{pastel.cyan.bold(title)}"
    else
      puts pastel.cyan.bold("═══ #{title} ═══")
    end
  end

  # Print a sub-header
  # @param title [String] Sub-header title
  def print_subheader(title)
    return if quiet?

    puts
    puts pastel.bold("─── #{title} ───")
  end

  # Print key-value info line
  # @param key [String, Symbol] Label
  # @param value [String] Value
  # @param indent [Integer] Indentation level
  def print_info(key, value, indent: 0)
    return if quiet?

    prefix = " " * indent
    # Interpolation (rather than key + ":") also accepts Symbol labels
    puts "#{prefix}#{pastel.dim("#{key}:")} #{value}"
  end

  # Print a success message
  # @param message [String] Message
  def print_success(message)
    return if quiet?

    puts "#{pastel.green(ICONS[:success])} #{message}"
  end

  # Print an error message (always shown, even in quiet mode)
  # @param message [String] Message
  def print_error(message)
    $stderr.puts "#{pastel.red(ICONS[:error])} #{message}"
  end

  # Print a warning message (always shown, even in quiet mode)
  # @param message [String] Message
  def print_warning(message)
    $stderr.puts "#{pastel.yellow(ICONS[:warning])} #{message}"
  end

  # Print an info message
  # @param message [String] Message
  def print_info_message(message)
    return if quiet?

    puts "#{pastel.blue(ICONS[:info])} #{message}"
  end

  # Print a list item with status
  # @param text [String] Item text
  # @param status [Symbol] :success, :error, :warning, :pending
  # @param indent [Integer] Indentation level
  def print_list_item(text, status: :pending, indent: 1)
    return if quiet?

    prefix = " " * indent
    icon = case status
           when :success then pastel.green(ICONS[:check])
           when :error then pastel.red(ICONS[:cross])
           when :warning then pastel.yellow(ICONS[:warning])
           else pastel.dim(ICONS[:bullet])
           end
    puts "#{prefix}#{icon} #{text}"
  end

  # Print a completion summary box (always shown, even in quiet mode)
  # @param title [String] Summary title
  # @param stats [Hash] Statistics to display
  # @param status [Symbol] :success, :warning, :error
  def print_summary(title, stats, status: :success)
    # Summary is always shown (it's the final result)
    puts
    color = SUMMARY_COLORS.fetch(status, :white)

    title_line = " #{title}"
    content_lines = stats.map { |k, v| " #{k}: #{v}" }

    # Size the box to its content (plus one column of right padding) so a
    # long title or value cannot push the right border out of alignment.
    longest = ([title_line] + content_lines).map(&:length).max
    width = [SUMMARY_MIN_WIDTH, longest + 1].max

    paint = ->(text) { pastel.public_send(color, text) }

    puts paint.call("┌#{"─" * width}┐")
    puts paint.call("│") + pastel.bold(title_line.ljust(width)) + paint.call("│")
    puts paint.call("├#{"─" * width}┤")

    content_lines.each do |line|
      puts paint.call("│") + line.ljust(width) + paint.call("│")
    end

    puts paint.call("└#{"─" * width}┘")
  end

  # Print elapsed time
  # @param seconds [Float] Elapsed seconds
  def print_elapsed_time(seconds)
    return if quiet?

    formatted = format_duration(seconds)
    puts pastel.dim("Completed in #{formatted}")
  end

  # Format duration in human-readable form
  # @param seconds [Numeric] Duration in seconds
  # @return [String] "12.3s", "4m 5s" or "1h 2m"
  def format_duration(seconds)
    tenths = seconds.round(1)
    return "#{tenths}s" if tenths < 60

    # Round to whole seconds BEFORE splitting into units; rounding the
    # remainder afterwards could produce outputs such as "1m 60s".
    total = seconds.round
    if total < 3600
      mins, secs = total.divmod(60)
      "#{mins}m #{secs}s"
    else
      hours, rest = total.divmod(3600)
      "#{hours}h #{rest / 60}m"
    end
  end

  # Format file size in human-readable form
  # @param bytes [Integer] Size in bytes
  # @return [String] Formatted size
  def format_size(bytes)
    Wp2txt.format_file_size(bytes)
  end

  # Format ETA (estimated time remaining)
  # @param seconds [Numeric, nil] Remaining seconds
  # @return [String] "MM:SS" below one hour, "HH:MM:SS" up to 99 hours,
  #   ">99h" beyond that, or "--:--:--" if nil/negative/non-finite
  def format_eta(seconds)
    return "--:--:--" if seconds.nil? || seconds.negative? || !seconds.finite?

    hours, rest = seconds.to_i.divmod(3600)
    mins, secs = rest.divmod(60)

    if hours > 99
      ">99h"
    elsif hours.positive?
      format("%02d:%02d:%02d", hours, mins, secs)
    else
      format("%02d:%02d", mins, secs)
    end
  end

  # Calculate ETA based on current progress
  # @param processed [Integer] Items processed so far
  # @param total [Integer] Total items to process
  # @param elapsed_seconds [Numeric] Time elapsed in seconds
  # @return [Float, nil] Estimated seconds remaining, or nil if cannot calculate
  def calculate_eta(processed, total, elapsed_seconds)
    return nil if processed.nil? || processed.zero? || total.nil? || total.zero?
    return nil if processed > total
    # Without a positive elapsed time the rate is undefined (dividing by 0
    # would yield Infinity and a misleading ETA of 0.0).
    return nil if elapsed_seconds.nil? || !elapsed_seconds.positive?

    rate = processed.to_f / elapsed_seconds
    return nil if rate.zero?

    (total - processed) / rate
  end

  # Estimate total article count from multistream index or file size
  # @param input_path [String] Path to input file
  # @return [Integer, nil] Estimated total articles, or nil if cannot estimate
  def estimate_total_articles(input_path)
    return nil unless input_path

    # Prefer an exact line count from the multistream index when available
    index_path = find_multistream_index(input_path)
    if index_path && File.exist?(index_path)
      count = count_articles_from_index(index_path)
      return count if count && count > 0
    end

    # Fallback: rough estimate from the compressed file size
    estimate_from_file_size(input_path)
  end

  # Find multistream index file for a given input file
  # @param input_path [String] Path to multistream file
  # @return [String, nil] Path to index file, or nil if not found
  def find_multistream_index(input_path)
    return nil unless input_path.include?("multistream")

    # Pattern: *-multistream.xml.bz2 → *-multistream-index.txt.bz2
    candidate = input_path.sub(/multistream\.xml\.bz2\z/, "multistream-index.txt.bz2")
    # When the suffix does not match, sub is a no-op; never report the dump
    # itself as its own index.
    return candidate if candidate != input_path && File.exist?(candidate)

    # Alternate pattern: rebuild the index name from language and date
    match = File.basename(input_path).match(/\A(\w+wiki)-(\d+)-/)
    return nil unless match

    alt_path = File.join(
      File.dirname(input_path),
      "#{match[1]}-#{match[2]}-pages-articles-multistream-index.txt.bz2"
    )
    alt_path if File.exist?(alt_path)
  end

  # Count articles from multistream index file (one line per entry)
  # @param index_path [String] Path to index file (.bz2 is read through bzcat)
  # @return [Integer, nil] Line count, or nil if the file cannot be read
  def count_articles_from_index(index_path)
    count = 0

    if index_path.end_with?(".bz2")
      IO.popen(["bzcat", index_path], "r") do |io|
        io.each_line { count += 1 }
      end
      # A failing bzcat (corrupt or truncated archive) yields a partial count
      return nil unless Process.last_status&.success?
    else
      File.foreach(index_path) { count += 1 }
    end

    count
  rescue StandardError
    # Best effort: the caller falls back to a size-based estimate
    nil
  end

  # Estimate article count from file size
  # @param input_path [String] Path to input file
  # @return [Integer, nil] Estimated article count, nil if the file is missing
  def estimate_from_file_size(input_path)
    return nil unless File.exist?(input_path)

    size_mb = File.size(input_path) / (1024.0 * 1024)

    # Empirical estimates (articles per MB of compressed bz2):
    # - English Wikipedia: ~280 articles/MB
    # - Japanese Wikipedia: ~100 articles/MB (longer articles on average)
    # - Other languages: ~200 articles/MB (rough average)
    (size_mb * ARTICLES_PER_MB).to_i
  end

  # Create a spinner with consistent styling
  # @param message [String] Spinner message
  # @return [TTY::Spinner, NullSpinner] Configured spinner or null spinner in quiet mode
  def create_spinner(message)
    return NullSpinner.new if quiet?

    TTY::Spinner.new(
      "[:spinner] #{message}",
      format: :dots,
      hide_cursor: true
    )
  end

  # Create a progress bar with consistent styling
  # @param message [String] Progress message
  # @param total [Integer] Total count
  # @return [TTY::ProgressBar, NullProgressBar] Configured progress bar or null in quiet mode
  def create_progress_bar(message, total)
    return NullProgressBar.new if quiet?

    TTY::ProgressBar.new(
      "#{message} [:bar] :percent (:current/:total) :eta",
      total: total,
      bar_format: :block,
      width: 30,
      hide_cursor: true
    )
  end

  # Create a download progress bar
  # @param filename [String] File being downloaded
  # @param total_bytes [Integer] Total size in bytes
  # @return [TTY::ProgressBar, NullProgressBar] Configured progress bar or null in quiet mode
  def create_download_bar(filename, total_bytes)
    return NullProgressBar.new if quiet?

    size_str = format_size(total_bytes)
    TTY::ProgressBar.new(
      " #{filename} [:bar] :percent (:eta)",
      total: total_bytes,
      bar_format: :block,
      width: 25,
      hide_cursor: true,
      unknown: "#{size_str} (size unknown)"
    )
  end

  # Prompt for confirmation
  # @param message [String] Prompt message
  # @param default [Boolean] Answer used for empty input, EOF, or non-interactive stdin
  # @return [Boolean] User response
  def confirm?(message, default: false)
    return default unless $stdin.tty?

    suffix = default ? "[Y/n]" : "[y/N]"
    print "#{message} #{suffix}: "
    # The prompt has no trailing newline; make sure it is visible before reading
    $stdout.flush

    response = $stdin.gets&.strip&.downcase
    return default if response.nil? || response.empty?

    %w[y yes].include?(response)
  end

  # Print a mode banner
  # @param mode [String] Mode name
  # @param details [Hash] Mode details
  def print_mode_banner(mode, details = {})
    return if quiet?

    puts
    puts pastel.cyan.bold("═" * 50)
    puts pastel.cyan.bold(" #{mode}")
    puts pastel.cyan.bold("═" * 50)
    puts

    details.each do |key, value|
      print_info(key.to_s, value.to_s)
    end
    puts
  end
end
410
+
411
# Stand-in for a spinner when output is suppressed (quiet mode).
# Every method accepts the same kind of arguments as the calls made on a
# real spinner, does nothing, and returns nil.
class NullSpinner
  def auto_spin
    nil
  end

  def success(_msg = nil)
    nil
  end

  def error(_msg = nil)
    nil
  end

  def stop
    nil
  end

  def update(**_options)
    nil
  end
end
419
+
420
# Stand-in for a progress bar when output is suppressed (quiet mode).
# All operations are accepted and silently ignored.
class NullProgressBar
  def advance(_count = 1)
    nil
  end

  def finish
    nil
  end

  def current=(_value)
    nil
  end

  def start
    nil
  end

  def stop
    nil
  end
end
428
+ end
@@ -0,0 +1,158 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "yaml"
4
+ require "fileutils"
5
+
6
+ module Wp2txt
7
# Configuration management for wp2txt
# Loads and saves settings from ~/.wp2txt/config.yml
#
# Out-of-range or invalid values are never rejected: numbers are clamped
# into their valid range and unknown formats fall back to the default.
class Config
  # Default configuration file path
  DEFAULT_CONFIG_PATH = File.expand_path("~/.wp2txt/config.yml")
  # Default cache directory
  DEFAULT_CACHE_DIR = File.expand_path("~/.wp2txt/cache")

  # Validation ranges
  DUMP_EXPIRY_RANGE = (1..365).freeze
  CATEGORY_EXPIRY_RANGE = (1..90).freeze
  DEPTH_RANGE = (0..10).freeze
  VALID_FORMATS = %w[text json].freeze

  # Default values
  DEFAULTS = {
    dump_expiry_days: 30,
    category_expiry_days: 7,
    cache_directory: DEFAULT_CACHE_DIR,
    default_format: "text",
    default_depth: 0
  }.freeze

  attr_reader :dump_expiry_days, :category_expiry_days, :cache_directory,
              :default_format, :default_depth

  # @param dump_expiry_days [#to_i] Days before dumps are stale (clamped to 1-365)
  # @param category_expiry_days [#to_i] Days before category cache expires (clamped to 1-90)
  # @param cache_directory [#to_s] Cache directory; empty means the default
  # @param default_format [#to_s] "text" or "json"; anything else becomes "text"
  # @param default_depth [#to_i] Subcategory recursion depth (clamped to 0-10)
  def initialize(
    dump_expiry_days: DEFAULTS[:dump_expiry_days],
    category_expiry_days: DEFAULTS[:category_expiry_days],
    cache_directory: DEFAULTS[:cache_directory],
    default_format: DEFAULTS[:default_format],
    default_depth: DEFAULTS[:default_depth]
  )
    @dump_expiry_days = clamp(dump_expiry_days.to_i, DUMP_EXPIRY_RANGE)
    @category_expiry_days = clamp(category_expiry_days.to_i, CATEGORY_EXPIRY_RANGE)
    @cache_directory = cache_directory.to_s.empty? ? DEFAULT_CACHE_DIR : cache_directory.to_s
    @default_format = validate_format(default_format.to_s)
    @default_depth = clamp(default_depth.to_i, DEPTH_RANGE)
  end

  # Load configuration from file
  # @param path [String] Path to config file (default: ~/.wp2txt/config.yml)
  # @return [Config] Configuration object; all defaults when the file is
  #   missing, unreadable or not valid YAML
  def self.load(path = default_path)
    return new unless File.exist?(path)

    data = YAML.safe_load(File.read(path), symbolize_names: true) || {}
    from_hash(data)
  rescue StandardError
    # Deliberately best-effort: a broken config file must not stop the CLI.
    # (Psych::SyntaxError is a StandardError, so it is covered here.)
    new
  end

  # Create Config from hash
  # @param data [Hash] Configuration hash with optional :cache and :defaults
  #   sections (symbol keys); anything that is not a Hash is ignored
  # @return [Config] Configuration object
  def self.from_hash(data)
    data = {} unless data.is_a?(Hash)
    cache = hash_section(data, :cache)
    defaults = hash_section(data, :defaults)

    new(
      dump_expiry_days: cache[:dump_expiry_days] || DEFAULTS[:dump_expiry_days],
      category_expiry_days: cache[:category_expiry_days] || DEFAULTS[:category_expiry_days],
      cache_directory: cache[:directory] || DEFAULTS[:cache_directory],
      default_format: defaults[:format] || DEFAULTS[:default_format],
      default_depth: defaults[:depth] || DEFAULTS[:default_depth]
    )
  end

  # Fetch a nested section, tolerating scalars/arrays where a mapping is expected
  def self.hash_section(data, key)
    section = data[key]
    section.is_a?(Hash) ? section : {}
  end
  private_class_method :hash_section

  # Default configuration file path
  # @return [String] Path to default config file
  def self.default_path
    DEFAULT_CONFIG_PATH
  end

  # Create default configuration file
  # @param path [String] Path to config file
  # @param force [Boolean] Overwrite existing file
  # @return [Boolean] True if file was created
  def self.create_default(path = default_path, force: false)
    return false if File.exist?(path) && !force

    new.save(path)
    true
  end

  # Save configuration to file, creating the parent directory if needed
  # @param path [String] Path to config file
  def save(path = self.class.default_path)
    FileUtils.mkdir_p(File.dirname(path))
    File.write(path, generate_yaml)
  end

  # Convert to hash representation
  # @return [Hash] Configuration as hash (same layout that from_hash accepts)
  def to_h
    {
      cache: {
        dump_expiry_days: @dump_expiry_days,
        category_expiry_days: @category_expiry_days,
        directory: @cache_directory
      },
      defaults: {
        format: @default_format,
        depth: @default_depth
      }
    }
  end

  private

  # Clamp value to range
  def clamp(value, range)
    value.clamp(range.min, range.max)
  end

  # Validate format string
  def validate_format(format)
    VALID_FORMATS.include?(format) ? format : DEFAULTS[:default_format]
  end

  # Render a string as a YAML single-quoted scalar. Inside single quotes the
  # only escape is a doubled quote, so characters such as " #" or ": " in a
  # path can no longer be misread as a comment or a nested mapping.
  def yaml_quote(text)
    "'#{text.to_s.gsub("'", "''")}'"
  end

  # Generate YAML content with comments
  def generate_yaml
    <<~YAML
      # WP2TXT Configuration File
      # Location: ~/.wp2txt/config.yml

      cache:
        # Number of days before dump files are considered stale (1-365)
        dump_expiry_days: #{@dump_expiry_days}

        # Number of days before category cache expires (1-90)
        category_expiry_days: #{@category_expiry_days}

        # Cache directory for downloaded dumps
        directory: #{yaml_quote(@cache_directory)}

      defaults:
        # Default output format: text or json
        format: #{@default_format}

        # Default subcategory recursion depth (0-10)
        depth: #{@default_depth}
    YAML
  end
end
158
+ end