wp2txt 1.1.3 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.dockerignore +12 -0
- data/.github/workflows/ci.yml +13 -13
- data/.gitignore +14 -0
- data/CHANGELOG.md +284 -0
- data/DEVELOPMENT.md +415 -0
- data/DEVELOPMENT_ja.md +415 -0
- data/Dockerfile +19 -10
- data/Gemfile +2 -8
- data/README.md +259 -123
- data/README_ja.md +375 -0
- data/Rakefile +4 -0
- data/bin/wp2txt +863 -161
- data/lib/wp2txt/article.rb +98 -13
- data/lib/wp2txt/bz2_validator.rb +239 -0
- data/lib/wp2txt/category_cache.rb +313 -0
- data/lib/wp2txt/cli.rb +319 -0
- data/lib/wp2txt/cli_ui.rb +428 -0
- data/lib/wp2txt/config.rb +158 -0
- data/lib/wp2txt/constants.rb +134 -0
- data/lib/wp2txt/data/html_entities.json +2135 -0
- data/lib/wp2txt/data/language_metadata.json +4769 -0
- data/lib/wp2txt/data/language_tiers.json +59 -0
- data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
- data/lib/wp2txt/data/template_aliases.json +193 -0
- data/lib/wp2txt/data/wikipedia_entities.json +12 -0
- data/lib/wp2txt/extractor.rb +545 -0
- data/lib/wp2txt/file_utils.rb +91 -0
- data/lib/wp2txt/formatter.rb +352 -0
- data/lib/wp2txt/global_data_cache.rb +353 -0
- data/lib/wp2txt/index_cache.rb +258 -0
- data/lib/wp2txt/magic_words.rb +353 -0
- data/lib/wp2txt/memory_monitor.rb +236 -0
- data/lib/wp2txt/multistream.rb +1383 -0
- data/lib/wp2txt/output_writer.rb +182 -0
- data/lib/wp2txt/parser_functions.rb +606 -0
- data/lib/wp2txt/ractor_worker.rb +215 -0
- data/lib/wp2txt/regex.rb +396 -12
- data/lib/wp2txt/section_extractor.rb +354 -0
- data/lib/wp2txt/stream_processor.rb +271 -0
- data/lib/wp2txt/template_expander.rb +830 -0
- data/lib/wp2txt/text_processing.rb +337 -0
- data/lib/wp2txt/utils.rb +629 -270
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +53 -26
- data/scripts/benchmark_regex.rb +161 -0
- data/scripts/fetch_html_entities.rb +94 -0
- data/scripts/fetch_language_metadata.rb +180 -0
- data/scripts/fetch_mediawiki_data.rb +334 -0
- data/scripts/fetch_template_data.rb +186 -0
- data/scripts/profile_memory.rb +139 -0
- data/spec/article_spec.rb +402 -0
- data/spec/auto_download_spec.rb +314 -0
- data/spec/bz2_validator_spec.rb +193 -0
- data/spec/category_cache_spec.rb +226 -0
- data/spec/category_fetcher_spec.rb +504 -0
- data/spec/cleanup_spec.rb +197 -0
- data/spec/cli_options_spec.rb +678 -0
- data/spec/cli_spec.rb +876 -0
- data/spec/config_spec.rb +194 -0
- data/spec/constants_spec.rb +138 -0
- data/spec/file_utils_spec.rb +170 -0
- data/spec/fixtures/samples.rb +181 -0
- data/spec/formatter_sections_spec.rb +382 -0
- data/spec/global_data_cache_spec.rb +186 -0
- data/spec/index_cache_spec.rb +210 -0
- data/spec/integration_spec.rb +543 -0
- data/spec/magic_words_spec.rb +261 -0
- data/spec/markers_spec.rb +476 -0
- data/spec/memory_monitor_spec.rb +192 -0
- data/spec/multistream_spec.rb +690 -0
- data/spec/output_writer_spec.rb +400 -0
- data/spec/parser_functions_spec.rb +455 -0
- data/spec/ractor_worker_spec.rb +197 -0
- data/spec/regex_spec.rb +281 -0
- data/spec/section_extractor_spec.rb +397 -0
- data/spec/spec_helper.rb +63 -0
- data/spec/stream_processor_spec.rb +579 -0
- data/spec/template_data_spec.rb +246 -0
- data/spec/template_expander_spec.rb +472 -0
- data/spec/template_processing_spec.rb +217 -0
- data/spec/text_processing_spec.rb +312 -0
- data/spec/utils_spec.rb +195 -16
- data/spec/wp2txt_spec.rb +510 -0
- data/wp2txt.gemspec +5 -3
- metadata +146 -18
- data/.rubocop.yml +0 -80
- data/data/output_samples/testdata_en.txt +0 -23002
- data/data/output_samples/testdata_en_category.txt +0 -132
- data/data/output_samples/testdata_en_summary.txt +0 -1376
- data/data/output_samples/testdata_ja.txt +0 -22774
- data/data/output_samples/testdata_ja_category.txt +0 -206
- data/data/output_samples/testdata_ja_summary.txt +0 -1560
- data/data/testdata_en.bz2 +0 -0
- data/data/testdata_ja.bz2 +0 -0
- data/image/screenshot.png +0 -0
|
@@ -0,0 +1,400 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "spec_helper"
|
|
4
|
+
require "tempfile"
|
|
5
|
+
require "fileutils"
|
|
6
|
+
require "json"
|
|
7
|
+
|
|
8
|
+
RSpec.describe Wp2txt::OutputWriter do
|
|
9
|
+
# Scratch directory for the current example; every writer below is pointed at it.
let(:temp_dir) { Dir.mktmpdir }

# Remove the scratch directory (and anything the example wrote into it) afterwards.
after { FileUtils.rm_rf(temp_dir) }
|
|
14
|
+
|
|
15
|
+
describe "#write" do
|
|
16
|
+
context "with text format" do
  # Builds a :text writer targeting the example's temp directory.
  # NOTE(review): file_size_mb: 0 presumably disables size-based rotation — confirm
  # against OutputWriter; the examples here only rely on a single output file.
  def text_writer
    described_class.new(output_dir: temp_dir, base_name: "output", format: :text, file_size_mb: 0)
  end

  it "writes text content to file" do
    out = text_writer
    ["Article 1 content\n", "Article 2 content\n"].each { |chunk| out.write(chunk) }
    paths = out.close

    # Both writes must land in one and the same file.
    expect(paths.size).to eq(1)
    written = File.read(paths.first)
    expect(written).to include("Article 1 content")
    expect(written).to include("Article 2 content")
  end

  it "creates .txt extension" do
    out = text_writer
    out.write("content")
    paths = out.close

    expect(paths.first).to end_with(".txt")
  end
end
|
|
49
|
+
|
|
50
|
+
context "with JSON format" do
  # Builds a :json writer targeting the example's temp directory.
  def json_writer
    described_class.new(output_dir: temp_dir, base_name: "output", format: :json, file_size_mb: 0)
  end

  it "writes JSON content to file" do
    out = json_writer
    out.write({ title: "Article 1", text: "Content 1" })
    out.write({ title: "Article 2", text: "Content 2" })
    paths = out.close

    expect(paths.size).to eq(1)

    # One JSON document per line, in the order the records were written.
    records = File.readlines(paths.first)
    expect(records.size).to eq(2)

    records.zip(["Article 1", "Article 2"]).each do |raw, expected_title|
      parsed = JSON.parse(raw)
      expect(parsed["title"]).to eq(expected_title)
    end
  end

  it "creates .jsonl extension" do
    out = json_writer
    out.write({ title: "Test" })
    paths = out.close

    expect(paths.first).to end_with(".jsonl")
  end
end
|
|
88
|
+
|
|
89
|
+
context "with file rotation" do
  # Builds a :text writer with a 1 MB per-file threshold.
  def rotating_writer
    described_class.new(output_dir: temp_dir, base_name: "output", format: :text, file_size_mb: 1)
  end

  it "rotates files based on size" do
    out = rotating_writer

    # Three writes of 512 KB each: about 1.5 MB in total, so the 1 MB threshold
    # is crossed and the last write should trigger rotation.
    payload = "x" * (512 * 1024)
    3.times { out.write(payload + "\n") }
    paths = out.close

    expect(paths.size).to be >= 2
  end

  it "uses indexed filenames when rotating" do
    out = rotating_writer

    payload = "x" * (600 * 1024)
    3.times { out.write(payload + "\n") }
    paths = out.close

    # Rotated outputs are expected to carry "-1." / "-2." index markers in their paths.
    %w[-1. -2.].each do |marker|
      expect(paths.any? { |path| path.include?(marker) }).to be true
    end
  end
end
|
|
126
|
+
|
|
127
|
+
context "with empty content" do
  # Builds a :text writer targeting the example's temp directory.
  def text_writer
    described_class.new(output_dir: temp_dir, base_name: "output", format: :text, file_size_mb: 0)
  end

  it "ignores nil content" do
    out = text_writer
    [nil, "valid content\n"].each { |item| out.write(item) }
    paths = out.close

    # Only the real payload may end up in the file.
    expect(File.read(paths.first)).to eq("valid content\n")
  end

  it "ignores empty string content" do
    out = text_writer
    ["", " ", "valid content\n"].each { |item| out.write(item) }
    paths = out.close

    # Empty and whitespace-only writes must leave no trace in the output.
    expect(File.read(paths.first)).to eq("valid content\n")
  end
end
|
|
161
|
+
|
|
162
|
+
context "thread safety" do
  it "handles concurrent writes" do
    writer = described_class.new(
      output_dir: temp_dir,
      base_name: "output",
      format: :text,
      file_size_mb: 0
    )

    # Precompute every line each worker will emit (10 workers x 10 lines) so the
    # output can be verified line by line rather than only by count.
    expected_lines = Array.new(10) do |i|
      Array.new(10) { |j| "Thread #{i} Line #{j}\n" }
    end

    threads = expected_lines.map do |thread_lines|
      Thread.new { thread_lines.each { |line| writer.write(line) } }
    end

    threads.each(&:join)
    files = writer.close

    lines = File.read(files.first).lines
    expect(lines.size).to eq(100)

    # A bare line count still passes when two writes are interleaved mid-line
    # (the number of newlines is unchanged). Comparing the complete set of lines,
    # ignoring order because thread scheduling is not deterministic, also catches
    # torn or duplicated lines.
    expect(lines).to match_array(expected_lines.flatten)
  end
end
|
|
187
|
+
end
|
|
188
|
+
|
|
189
|
+
describe "#write_raw" do
  # Builds a :text writer targeting the example's temp directory.
  def text_writer
    described_class.new(output_dir: temp_dir, base_name: "output", format: :text, file_size_mb: 0)
  end

  it "writes raw content directly" do
    out = text_writer
    ["raw line 1\n", "raw line 2\n"].each { |line| out.write_raw(line) }
    paths = out.close

    # The file must contain exactly the raw input, in write order.
    expect(File.read(paths.first)).to eq("raw line 1\nraw line 2\n")
  end

  it "ignores nil and empty content" do
    out = text_writer
    [nil, "", "valid\n"].each { |item| out.write_raw(item) }
    paths = out.close

    expect(File.read(paths.first)).to eq("valid\n")
  end
end
|
|
223
|
+
|
|
224
|
+
describe "#write_from_file" do
  # Builds a :text writer; the output directory and size threshold can be overridden
  # by the rotation example.
  def text_writer(output_dir: temp_dir, file_size_mb: 0)
    described_class.new(
      output_dir: output_dir,
      base_name: "output",
      format: :text,
      file_size_mb: file_size_mb
    )
  end

  it "streams content from source file" do
    source = File.join(temp_dir, "source.txt")
    File.write(source, "source content\n")

    writer = text_writer
    writer.write_from_file(source)
    files = writer.close

    expect(File.read(files.first)).to eq("source content\n")
  end

  it "ignores non-existent source file" do
    writer = text_writer
    writer.write_from_file("/nonexistent/file.txt")
    files = writer.close

    # Nothing was copied, so no output file should be reported.
    expect(files).to be_empty
  end

  it "rotates files only at article boundaries (blank lines)" do
    # Build a source file with two articles separated by a blank line,
    # where each article is larger than the rotation threshold
    article1 = "Title: Article 1\n" + ("A" * 600 + "\n") * 10
    article2 = "Title: Article 2\n" + ("B" * 600 + "\n") * 10

    source = File.join(temp_dir, "source.txt")
    File.write(source, article1 + "\n" + article2 + "\n")

    # ~5KB threshold to force rotation
    writer = text_writer(output_dir: File.join(temp_dir, "out"), file_size_mb: 0.005)
    writer.write_from_file(source)
    files = writer.close

    expect(files.size).to be >= 2

    # Article 1 should be entirely in the first file (not split mid-article)
    first_content = File.read(files.first)
    expect(first_content).to include("Title: Article 1")
    expect(first_content.lines.last.strip).to eq("") # ends at blank line

    # Article 2 should start in a subsequent file
    remaining = files[1..].map { |f| File.read(f) }.join
    expect(remaining).to include("Title: Article 2")
  end

  it "handles UTF-8 content without encoding errors" do
    source = File.join(temp_dir, "source_utf8.txt")
    utf8_content = "タイトル: 日本語記事\n" \
                   "本文テキスト。漢字、ひらがな、カタカナ。\n" \
                   "\n" \
                   "Title: English Article\n" \
                   "Body text with accents: café, naïve, résumé.\n"
    # Be explicit about the encoding so the fixture does not depend on the locale.
    File.write(source, utf8_content, encoding: "UTF-8")

    writer = text_writer

    expect { writer.write_from_file(source) }.not_to raise_error
    files = writer.close

    # Read back as UTF-8 explicitly. With the default external encoding of a
    # C/POSIX locale (US-ASCII) the content would be tagged US-ASCII and comparing
    # it against the UTF-8 literals below raises Encoding::CompatibilityError,
    # failing the example for reasons unrelated to the writer.
    content = File.read(files.first, encoding: "UTF-8")
    expect(content).to include("日本語記事")
    expect(content).to include("café")
  end
end
|
|
312
|
+
|
|
313
|
+
describe "error handling" do
  # Builds a :text writer targeting the example's temp directory.
  def text_writer
    described_class.new(output_dir: temp_dir, base_name: "output", format: :text, file_size_mb: 0)
  end

  # Makes every File#write call raise the given error (a class, or a class plus message).
  # Call this only after the writer under test has been constructed, so that
  # construction itself runs against the real File.
  def fail_file_writes_with(*error)
    allow_any_instance_of(File).to receive(:write).and_raise(*error)
  end

  it "raises FileIOError on disk full (ENOSPC) in write" do
    out = text_writer
    # Force ENOSPC by stubbing File#write
    fail_file_writes_with(Errno::ENOSPC)

    expect { out.write("content") }.to raise_error(Wp2txt::FileIOError, /Disk full/)
  end

  it "raises FileIOError on disk full (ENOSPC) in write_raw" do
    out = text_writer
    fail_file_writes_with(Errno::ENOSPC)

    expect { out.write_raw("content") }.to raise_error(Wp2txt::FileIOError, /Disk full/)
  end

  it "raises FileIOError on I/O error in write" do
    out = text_writer
    fail_file_writes_with(IOError, "stream closed")

    expect { out.write("content") }.to raise_error(Wp2txt::FileIOError, /Write failed/)
  end

  it "includes output directory in error message" do
    out = text_writer
    fail_file_writes_with(Errno::ENOSPC)

    # The message has to mention the directory the writer was configured with.
    dir_pattern = /#{Regexp.escape(temp_dir)}/
    expect { out.write("content") }.to raise_error(Wp2txt::FileIOError, dir_pattern)
  end
end
|
|
367
|
+
|
|
368
|
+
describe "#close" do
  # Builds a :text writer targeting the example's temp directory.
  def text_writer
    described_class.new(output_dir: temp_dir, base_name: "output", format: :text, file_size_mb: 0)
  end

  it "removes empty files" do
    # Close straight away without writing anything.
    paths = text_writer.close

    # With no content written, no output file may be reported.
    expect(paths).to be_empty
  end

  it "returns list of created files" do
    out = text_writer
    out.write("content\n")
    paths = out.close

    expect(paths).to be_an(Array)
    expect(paths.size).to eq(1)
    # The reported path must point at a file that really exists on disk.
    expect(File.exist?(paths.first)).to be true
  end
end
|
|
400
|
+
end
|