wp2txt 1.1.3 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. checksums.yaml +4 -4
  2. data/.dockerignore +12 -0
  3. data/.github/workflows/ci.yml +13 -13
  4. data/.gitignore +14 -0
  5. data/CHANGELOG.md +284 -0
  6. data/DEVELOPMENT.md +415 -0
  7. data/DEVELOPMENT_ja.md +415 -0
  8. data/Dockerfile +19 -10
  9. data/Gemfile +2 -8
  10. data/README.md +259 -123
  11. data/README_ja.md +375 -0
  12. data/Rakefile +4 -0
  13. data/bin/wp2txt +863 -161
  14. data/lib/wp2txt/article.rb +98 -13
  15. data/lib/wp2txt/bz2_validator.rb +239 -0
  16. data/lib/wp2txt/category_cache.rb +313 -0
  17. data/lib/wp2txt/cli.rb +319 -0
  18. data/lib/wp2txt/cli_ui.rb +428 -0
  19. data/lib/wp2txt/config.rb +158 -0
  20. data/lib/wp2txt/constants.rb +134 -0
  21. data/lib/wp2txt/data/html_entities.json +2135 -0
  22. data/lib/wp2txt/data/language_metadata.json +4769 -0
  23. data/lib/wp2txt/data/language_tiers.json +59 -0
  24. data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
  25. data/lib/wp2txt/data/template_aliases.json +193 -0
  26. data/lib/wp2txt/data/wikipedia_entities.json +12 -0
  27. data/lib/wp2txt/extractor.rb +545 -0
  28. data/lib/wp2txt/file_utils.rb +91 -0
  29. data/lib/wp2txt/formatter.rb +352 -0
  30. data/lib/wp2txt/global_data_cache.rb +353 -0
  31. data/lib/wp2txt/index_cache.rb +258 -0
  32. data/lib/wp2txt/magic_words.rb +353 -0
  33. data/lib/wp2txt/memory_monitor.rb +236 -0
  34. data/lib/wp2txt/multistream.rb +1383 -0
  35. data/lib/wp2txt/output_writer.rb +182 -0
  36. data/lib/wp2txt/parser_functions.rb +606 -0
  37. data/lib/wp2txt/ractor_worker.rb +215 -0
  38. data/lib/wp2txt/regex.rb +396 -12
  39. data/lib/wp2txt/section_extractor.rb +354 -0
  40. data/lib/wp2txt/stream_processor.rb +271 -0
  41. data/lib/wp2txt/template_expander.rb +830 -0
  42. data/lib/wp2txt/text_processing.rb +337 -0
  43. data/lib/wp2txt/utils.rb +629 -270
  44. data/lib/wp2txt/version.rb +1 -1
  45. data/lib/wp2txt.rb +53 -26
  46. data/scripts/benchmark_regex.rb +161 -0
  47. data/scripts/fetch_html_entities.rb +94 -0
  48. data/scripts/fetch_language_metadata.rb +180 -0
  49. data/scripts/fetch_mediawiki_data.rb +334 -0
  50. data/scripts/fetch_template_data.rb +186 -0
  51. data/scripts/profile_memory.rb +139 -0
  52. data/spec/article_spec.rb +402 -0
  53. data/spec/auto_download_spec.rb +314 -0
  54. data/spec/bz2_validator_spec.rb +193 -0
  55. data/spec/category_cache_spec.rb +226 -0
  56. data/spec/category_fetcher_spec.rb +504 -0
  57. data/spec/cleanup_spec.rb +197 -0
  58. data/spec/cli_options_spec.rb +678 -0
  59. data/spec/cli_spec.rb +876 -0
  60. data/spec/config_spec.rb +194 -0
  61. data/spec/constants_spec.rb +138 -0
  62. data/spec/file_utils_spec.rb +170 -0
  63. data/spec/fixtures/samples.rb +181 -0
  64. data/spec/formatter_sections_spec.rb +382 -0
  65. data/spec/global_data_cache_spec.rb +186 -0
  66. data/spec/index_cache_spec.rb +210 -0
  67. data/spec/integration_spec.rb +543 -0
  68. data/spec/magic_words_spec.rb +261 -0
  69. data/spec/markers_spec.rb +476 -0
  70. data/spec/memory_monitor_spec.rb +192 -0
  71. data/spec/multistream_spec.rb +690 -0
  72. data/spec/output_writer_spec.rb +400 -0
  73. data/spec/parser_functions_spec.rb +455 -0
  74. data/spec/ractor_worker_spec.rb +197 -0
  75. data/spec/regex_spec.rb +281 -0
  76. data/spec/section_extractor_spec.rb +397 -0
  77. data/spec/spec_helper.rb +63 -0
  78. data/spec/stream_processor_spec.rb +579 -0
  79. data/spec/template_data_spec.rb +246 -0
  80. data/spec/template_expander_spec.rb +472 -0
  81. data/spec/template_processing_spec.rb +217 -0
  82. data/spec/text_processing_spec.rb +312 -0
  83. data/spec/utils_spec.rb +195 -16
  84. data/spec/wp2txt_spec.rb +510 -0
  85. data/wp2txt.gemspec +5 -3
  86. metadata +146 -18
  87. data/.rubocop.yml +0 -80
  88. data/data/output_samples/testdata_en.txt +0 -23002
  89. data/data/output_samples/testdata_en_category.txt +0 -132
  90. data/data/output_samples/testdata_en_summary.txt +0 -1376
  91. data/data/output_samples/testdata_ja.txt +0 -22774
  92. data/data/output_samples/testdata_ja_category.txt +0 -206
  93. data/data/output_samples/testdata_ja_summary.txt +0 -1560
  94. data/data/testdata_en.bz2 +0 -0
  95. data/data/testdata_ja.bz2 +0 -0
  96. data/image/screenshot.png +0 -0
@@ -0,0 +1,182 @@
# frozen_string_literal: true

require "json"
require "fileutils"

module Wp2txt
  # OutputWriter manages the output files of a run, rotating to a new file
  # once the current one reaches a target size. It supports plain text and
  # JSONL output.
  #
  # All write operations are serialized through a single Mutex, so one
  # instance can be shared between threads.
  class OutputWriter
    # Number of bytes in one megabyte as used for the rotation threshold.
    BYTES_PER_MB = 1024 * 1024
    private_constant :BYTES_PER_MB

    # @return [Array<String>] paths of the output files created so far
    attr_reader :output_files

    # Creates the writer and makes sure the output directory exists.
    # No output file is opened until the first write.
    #
    # @param output_dir [String] Output directory path (created if missing)
    # @param base_name [String] Base name for output files
    # @param format [Symbol] Output format (:text or :json)
    # @param file_size_mb [Integer] Target file size in MB for rotation (0 = single file)
    def initialize(output_dir:, base_name:, format: :text, file_size_mb: 10)
      @output_dir = output_dir
      @base_name = base_name
      @format = format
      @file_size_mb = file_size_mb
      @file_size_bytes = file_size_mb * BYTES_PER_MB

      @current_file = nil
      @current_size = 0
      @file_index = 1
      @mutex = Mutex.new
      @output_files = []

      # mkdir_p is a no-op when the directory already exists.
      FileUtils.mkdir_p(@output_dir)
    end

    # Formats one article and appends it to the current output file,
    # rotating afterwards if the size threshold has been reached.
    # nil and whitespace-only strings are ignored.
    #
    # @param content [String, Hash] Content to write (String for text, Hash for JSON)
    # @raise [Wp2txt::FileIOError] on disk full or other I/O errors
    def write(content)
      return if content.nil? || (content.is_a?(String) && content.strip.empty?)

      synchronize_io do
        append(format_output(content))
        rotate_file_if_needed
      end
    end

    # Appends already-formatted content without passing it through the
    # formatter. Used for merging pre-formatted temp files.
    #
    # @param content [String] Raw content to append
    # @raise [Wp2txt::FileIOError] on disk full or other I/O errors
    def write_raw(content)
      return if content.nil? || content.empty?

      synchronize_io do
        append(content)
        rotate_file_if_needed
      end
    end

    # Streams a file into the output line by line. Rotation is only
    # considered right after a blank line (treated as an article boundary),
    # so an article is never split across two output files.
    # Does nothing when the source file does not exist.
    #
    # @param source_path [String] Path to source file
    # @raise [Wp2txt::FileIOError] on disk full or other I/O errors
    def write_from_file(source_path)
      return unless File.exist?(source_path)

      synchronize_io do
        File.open(source_path, "r:UTF-8") do |src|
          src.each_line do |line|
            append(line)
            rotate_file_if_needed if blank_line?(line)
          end
        end
      end
    end

    # Closes the current output file, deleting it if nothing was written.
    #
    # @return [Array<String>] paths of the output files created
    def close
      @mutex.synchronize { close_current_file }
      @output_files
    end

    # @return [Integer] Number of output files created so far
    def file_count
      @output_files.size
    end

    private

    # Runs the block while holding the writer lock and translates low-level
    # I/O failures into Wp2txt::FileIOError. The current file is closed
    # before the error is re-raised, still under the lock, so the cleanup
    # cannot interleave with another thread's write.
    def synchronize_io
      @mutex.synchronize do
        begin
          yield
        rescue Errno::ENOSPC
          # Errno::ENOSPC is a SystemCallError, so it must be matched first.
          close_on_error
          raise Wp2txt::FileIOError, "Disk full: cannot write to output directory '#{@output_dir}'"
        rescue IOError, SystemCallError => e
          close_on_error
          raise Wp2txt::FileIOError, "Write failed: #{e.message}"
        end
      end
    end

    # Writes data to the current file (opening one if necessary) and
    # accounts for its size. Caller must hold the lock.
    def append(data)
      ensure_file_open
      @current_file.write(data)
      @current_size += data.bytesize
    end

    # True when the line contains only whitespace. Lines read from disk are
    # merely tagged as UTF-8 and may contain invalid byte sequences; those
    # are inspected through a binary view so the check cannot raise.
    def blank_line?(line)
      (line.valid_encoding? ? line : line.b).strip.empty?
    end

    # Makes sure @current_file is an open handle for the current file index.
    def ensure_file_open
      return if @current_file && !@current_file.closed?

      filename = generate_filename
      if @output_files.include?(filename)
        # This writer already produced the file (it was closed via #close or
        # after an I/O error). Append so earlier output is not truncated and
        # the path is not listed twice; resume size accounting from disk.
        @current_file = File.open(filename, "ab")
        @current_size = @current_file.size
      else
        # Binary mode: bytes are written as-is, without any encoding
        # conversion or newline translation.
        @current_file = File.open(filename, "wb")
        @output_files << filename
        @current_size = 0
      end
    end

    # Best-effort close used on error paths; a failure to close must not
    # mask the original error, so I/O errors raised here are discarded.
    def close_on_error
      @current_file&.close
    rescue IOError, SystemCallError
      nil
    end

    # Closes the open file, if any, and drops it from the result list when
    # it ended up empty.
    def close_current_file
      return unless @current_file && !@current_file.closed?

      @current_file.close

      # Remove empty files
      last_file = @output_files.last
      return unless last_file && File.zero?(last_file)

      File.delete(last_file)
      @output_files.pop
    end

    # Closes the current file and advances the index once the size
    # threshold is reached. A threshold of 0 disables rotation.
    def rotate_file_if_needed
      return if @file_size_bytes.zero?
      return if @current_size < @file_size_bytes

      close_current_file
      @file_index += 1
    end

    # Builds the path for the current file index: "<base>.<ext>" in
    # single-file mode, "<base>-<index>.<ext>" when rotation is enabled.
    def generate_filename
      extension = @format == :json ? "jsonl" : "txt"
      basename = @file_size_bytes.zero? ? @base_name.to_s : "#{@base_name}-#{@file_index}"
      File.join(@output_dir, "#{basename}.#{extension}")
    end

    # Serializes content for the configured format. In :json mode a Hash
    # becomes one JSON line; everything else is written via #to_s.
    def format_output(content)
      return content.to_s unless @format == :json && content.is_a?(Hash)

      "#{JSON.generate(content)}\n"
    end
  end
end