wp2txt 1.1.3 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.dockerignore +12 -0
- data/.github/workflows/ci.yml +13 -13
- data/.gitignore +14 -0
- data/CHANGELOG.md +284 -0
- data/DEVELOPMENT.md +415 -0
- data/DEVELOPMENT_ja.md +415 -0
- data/Dockerfile +19 -10
- data/Gemfile +2 -8
- data/README.md +259 -123
- data/README_ja.md +375 -0
- data/Rakefile +4 -0
- data/bin/wp2txt +863 -161
- data/lib/wp2txt/article.rb +98 -13
- data/lib/wp2txt/bz2_validator.rb +239 -0
- data/lib/wp2txt/category_cache.rb +313 -0
- data/lib/wp2txt/cli.rb +319 -0
- data/lib/wp2txt/cli_ui.rb +428 -0
- data/lib/wp2txt/config.rb +158 -0
- data/lib/wp2txt/constants.rb +134 -0
- data/lib/wp2txt/data/html_entities.json +2135 -0
- data/lib/wp2txt/data/language_metadata.json +4769 -0
- data/lib/wp2txt/data/language_tiers.json +59 -0
- data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
- data/lib/wp2txt/data/template_aliases.json +193 -0
- data/lib/wp2txt/data/wikipedia_entities.json +12 -0
- data/lib/wp2txt/extractor.rb +545 -0
- data/lib/wp2txt/file_utils.rb +91 -0
- data/lib/wp2txt/formatter.rb +352 -0
- data/lib/wp2txt/global_data_cache.rb +353 -0
- data/lib/wp2txt/index_cache.rb +258 -0
- data/lib/wp2txt/magic_words.rb +353 -0
- data/lib/wp2txt/memory_monitor.rb +236 -0
- data/lib/wp2txt/multistream.rb +1383 -0
- data/lib/wp2txt/output_writer.rb +182 -0
- data/lib/wp2txt/parser_functions.rb +606 -0
- data/lib/wp2txt/ractor_worker.rb +215 -0
- data/lib/wp2txt/regex.rb +396 -12
- data/lib/wp2txt/section_extractor.rb +354 -0
- data/lib/wp2txt/stream_processor.rb +271 -0
- data/lib/wp2txt/template_expander.rb +830 -0
- data/lib/wp2txt/text_processing.rb +337 -0
- data/lib/wp2txt/utils.rb +629 -270
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +53 -26
- data/scripts/benchmark_regex.rb +161 -0
- data/scripts/fetch_html_entities.rb +94 -0
- data/scripts/fetch_language_metadata.rb +180 -0
- data/scripts/fetch_mediawiki_data.rb +334 -0
- data/scripts/fetch_template_data.rb +186 -0
- data/scripts/profile_memory.rb +139 -0
- data/spec/article_spec.rb +402 -0
- data/spec/auto_download_spec.rb +314 -0
- data/spec/bz2_validator_spec.rb +193 -0
- data/spec/category_cache_spec.rb +226 -0
- data/spec/category_fetcher_spec.rb +504 -0
- data/spec/cleanup_spec.rb +197 -0
- data/spec/cli_options_spec.rb +678 -0
- data/spec/cli_spec.rb +876 -0
- data/spec/config_spec.rb +194 -0
- data/spec/constants_spec.rb +138 -0
- data/spec/file_utils_spec.rb +170 -0
- data/spec/fixtures/samples.rb +181 -0
- data/spec/formatter_sections_spec.rb +382 -0
- data/spec/global_data_cache_spec.rb +186 -0
- data/spec/index_cache_spec.rb +210 -0
- data/spec/integration_spec.rb +543 -0
- data/spec/magic_words_spec.rb +261 -0
- data/spec/markers_spec.rb +476 -0
- data/spec/memory_monitor_spec.rb +192 -0
- data/spec/multistream_spec.rb +690 -0
- data/spec/output_writer_spec.rb +400 -0
- data/spec/parser_functions_spec.rb +455 -0
- data/spec/ractor_worker_spec.rb +197 -0
- data/spec/regex_spec.rb +281 -0
- data/spec/section_extractor_spec.rb +397 -0
- data/spec/spec_helper.rb +63 -0
- data/spec/stream_processor_spec.rb +579 -0
- data/spec/template_data_spec.rb +246 -0
- data/spec/template_expander_spec.rb +472 -0
- data/spec/template_processing_spec.rb +217 -0
- data/spec/text_processing_spec.rb +312 -0
- data/spec/utils_spec.rb +195 -16
- data/spec/wp2txt_spec.rb +510 -0
- data/wp2txt.gemspec +5 -3
- metadata +146 -18
- data/.rubocop.yml +0 -80
- data/data/output_samples/testdata_en.txt +0 -23002
- data/data/output_samples/testdata_en_category.txt +0 -132
- data/data/output_samples/testdata_en_summary.txt +0 -1376
- data/data/output_samples/testdata_ja.txt +0 -22774
- data/data/output_samples/testdata_ja_category.txt +0 -206
- data/data/output_samples/testdata_ja_summary.txt +0 -1560
- data/data/testdata_en.bz2 +0 -0
- data/data/testdata_ja.bz2 +0 -0
- data/image/screenshot.png +0 -0
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "json"
|
|
4
|
+
require "fileutils"
|
|
5
|
+
|
|
6
|
+
module Wp2txt
  # OutputWriter manages the output files produced by a conversion run.
  #
  # Content is appended to a "current" file. Once that file reaches the
  # configured size the writer closes it and continues in the next numbered
  # file (+base-1.txt+, +base-2.txt+, ...). A size of 0 disables rotation and
  # everything goes to a single +base.txt+ / +base.jsonl+ file.
  #
  # Supports both text and JSONL formats. All public write methods serialize
  # access through an internal mutex.
  class OutputWriter
    # @return [Array<String>] paths of the output files created so far
    attr_reader :output_files

    # @param output_dir [String] Output directory path (created if missing)
    # @param base_name [String] Base name for output files
    # @param format [Symbol] Output format (:text or :json)
    # @param file_size_mb [Integer] Target file size in MB for rotation (0 = single file)
    def initialize(output_dir:, base_name:, format: :text, file_size_mb: 10)
      @output_dir = output_dir
      @base_name = base_name
      @format = format
      @file_size_mb = file_size_mb
      @file_size_bytes = file_size_mb * 1024 * 1024

      @current_file = nil
      @current_size = 0
      @file_index = 1
      @mutex = Mutex.new
      @output_files = []

      # mkdir_p is a no-op when the directory already exists.
      FileUtils.mkdir_p(@output_dir)
    end

    # Write one formatted article to the output.
    # nil and whitespace-only strings are ignored.
    #
    # @param content [String, Hash] Content to write (String for text, Hash for JSON)
    # @return [void]
    # @raise [Wp2txt::FileIOError] on disk full or other I/O errors
    def write(content)
      return if content.nil? || (content.is_a?(String) && content.strip.empty?)

      with_output_lock do
        append(format_output(content))
        rotate_file_if_needed
      end
    end

    # Write raw content directly without formatting.
    # Used for merging pre-formatted temp files.
    #
    # @param content [String] Raw content to append
    # @return [void]
    # @raise [Wp2txt::FileIOError] on disk full or other I/O errors
    def write_raw(content)
      return if content.nil? || content.empty?

      with_output_lock do
        append(content)
        rotate_file_if_needed
      end
    end

    # Stream the content of a file into the output, rotating only at record
    # boundaries so that no article is split across output files.
    #
    # In text mode a boundary is a blank line. In JSON mode every line is a
    # complete record, so rotation may happen after any line.
    #
    # @param source_path [String] Path to source file (silently skipped if missing)
    # @return [void]
    # @raise [Wp2txt::FileIOError] on disk full or other I/O errors
    def write_from_file(source_path)
      return unless File.exist?(source_path)

      with_output_lock do
        File.open(source_path, "r:UTF-8") do |src|
          src.each_line do |line|
            append(line)
            rotate_file_if_needed if record_boundary?(line)
          end
        end
      end
    end

    # Close the current file and finalize. An empty trailing file is removed.
    #
    # @return [Array<String>] paths of the output files created
    def close
      @mutex.synchronize { close_current_file }
      @output_files
    end

    # Get count of output files created so far
    # @return [Integer] Number of output files
    def file_count
      @output_files.size
    end

    private

    # Run the block while holding the writer lock and translate low-level I/O
    # failures into Wp2txt::FileIOError. The failing handle is closed while the
    # lock is still held, so no other thread can be writing to it at that time.
    def with_output_lock
      @mutex.synchronize do
        begin
          yield
        rescue Errno::ENOSPC
          close_on_error
          raise Wp2txt::FileIOError, "Disk full: cannot write to output directory '#{@output_dir}'"
        rescue IOError, SystemCallError => e
          close_on_error
          raise Wp2txt::FileIOError, "Write failed: #{e.message}"
        end
      end
    end

    # Append data to the current output file and track the bytes written.
    def append(data)
      ensure_file_open
      @current_file.write(data)
      @current_size += data.bytesize
    end

    # True when the output may be rotated after this line without splitting a record.
    def record_boundary?(line)
      @format == :json || line.strip.empty?
    end

    # Make sure @current_file is an open handle for the current file index.
    def ensure_file_open
      return if @current_file && !@current_file.closed?

      filename = generate_filename
      if @output_files.last == filename
        # This writer already created the file (it was closed by #close or
        # after an I/O error). Reopen for append so earlier output is not
        # truncated and the path is not recorded twice.
        @current_file = File.open(filename, "ab")
        @current_size = @current_file.size
      else
        # Use binary mode to avoid Ruby's encoding conversion on write;
        # input is read as UTF-8 via each_line, which yields valid UTF-8 strings
        @current_file = File.open(filename, "wb")
        @output_files << filename
        @current_size = 0
      end
    end

    # Best-effort close after a failed write; a failure to close must not
    # mask the original error.
    def close_on_error
      @current_file&.close
    rescue StandardError
      nil
    end

    # Close the current file (if open) and drop it when nothing was written.
    def close_current_file
      return unless @current_file && !@current_file.closed?

      @current_file.close

      # Remove empty files
      last_file = @output_files.last
      return unless last_file && File.exist?(last_file) && File.size(last_file).zero?

      File.delete(last_file)
      @output_files.pop
    end

    # Advance to the next numbered file once the size limit has been reached.
    def rotate_file_if_needed
      return if @file_size_bytes.zero? # No rotation if file_size is 0
      return if @current_size < @file_size_bytes

      close_current_file
      @file_index += 1
    end

    # Build the path of the file for the current index.
    def generate_filename
      extension = @format == :json ? "jsonl" : "txt"
      # Single file mode has no index suffix.
      stem = @file_size_bytes.zero? ? @base_name.to_s : "#{@base_name}-#{@file_index}"
      File.join(@output_dir, "#{stem}.#{extension}")
    end

    # Serialize content for the configured format. Hashes become one JSON
    # line in :json mode; anything else is written as its string form.
    def format_output(content)
      return "#{JSON.generate(content)}\n" if @format == :json && content.is_a?(Hash)

      content.to_s
    end
  end
end
|