wp2txt 1.1.3 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. checksums.yaml +4 -4
  2. data/.dockerignore +12 -0
  3. data/.github/workflows/ci.yml +13 -13
  4. data/.gitignore +14 -0
  5. data/CHANGELOG.md +284 -0
  6. data/DEVELOPMENT.md +415 -0
  7. data/DEVELOPMENT_ja.md +415 -0
  8. data/Dockerfile +19 -10
  9. data/Gemfile +2 -8
  10. data/README.md +259 -123
  11. data/README_ja.md +375 -0
  12. data/Rakefile +4 -0
  13. data/bin/wp2txt +863 -161
  14. data/lib/wp2txt/article.rb +98 -13
  15. data/lib/wp2txt/bz2_validator.rb +239 -0
  16. data/lib/wp2txt/category_cache.rb +313 -0
  17. data/lib/wp2txt/cli.rb +319 -0
  18. data/lib/wp2txt/cli_ui.rb +428 -0
  19. data/lib/wp2txt/config.rb +158 -0
  20. data/lib/wp2txt/constants.rb +134 -0
  21. data/lib/wp2txt/data/html_entities.json +2135 -0
  22. data/lib/wp2txt/data/language_metadata.json +4769 -0
  23. data/lib/wp2txt/data/language_tiers.json +59 -0
  24. data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
  25. data/lib/wp2txt/data/template_aliases.json +193 -0
  26. data/lib/wp2txt/data/wikipedia_entities.json +12 -0
  27. data/lib/wp2txt/extractor.rb +545 -0
  28. data/lib/wp2txt/file_utils.rb +91 -0
  29. data/lib/wp2txt/formatter.rb +352 -0
  30. data/lib/wp2txt/global_data_cache.rb +353 -0
  31. data/lib/wp2txt/index_cache.rb +258 -0
  32. data/lib/wp2txt/magic_words.rb +353 -0
  33. data/lib/wp2txt/memory_monitor.rb +236 -0
  34. data/lib/wp2txt/multistream.rb +1383 -0
  35. data/lib/wp2txt/output_writer.rb +182 -0
  36. data/lib/wp2txt/parser_functions.rb +606 -0
  37. data/lib/wp2txt/ractor_worker.rb +215 -0
  38. data/lib/wp2txt/regex.rb +396 -12
  39. data/lib/wp2txt/section_extractor.rb +354 -0
  40. data/lib/wp2txt/stream_processor.rb +271 -0
  41. data/lib/wp2txt/template_expander.rb +830 -0
  42. data/lib/wp2txt/text_processing.rb +337 -0
  43. data/lib/wp2txt/utils.rb +629 -270
  44. data/lib/wp2txt/version.rb +1 -1
  45. data/lib/wp2txt.rb +53 -26
  46. data/scripts/benchmark_regex.rb +161 -0
  47. data/scripts/fetch_html_entities.rb +94 -0
  48. data/scripts/fetch_language_metadata.rb +180 -0
  49. data/scripts/fetch_mediawiki_data.rb +334 -0
  50. data/scripts/fetch_template_data.rb +186 -0
  51. data/scripts/profile_memory.rb +139 -0
  52. data/spec/article_spec.rb +402 -0
  53. data/spec/auto_download_spec.rb +314 -0
  54. data/spec/bz2_validator_spec.rb +193 -0
  55. data/spec/category_cache_spec.rb +226 -0
  56. data/spec/category_fetcher_spec.rb +504 -0
  57. data/spec/cleanup_spec.rb +197 -0
  58. data/spec/cli_options_spec.rb +678 -0
  59. data/spec/cli_spec.rb +876 -0
  60. data/spec/config_spec.rb +194 -0
  61. data/spec/constants_spec.rb +138 -0
  62. data/spec/file_utils_spec.rb +170 -0
  63. data/spec/fixtures/samples.rb +181 -0
  64. data/spec/formatter_sections_spec.rb +382 -0
  65. data/spec/global_data_cache_spec.rb +186 -0
  66. data/spec/index_cache_spec.rb +210 -0
  67. data/spec/integration_spec.rb +543 -0
  68. data/spec/magic_words_spec.rb +261 -0
  69. data/spec/markers_spec.rb +476 -0
  70. data/spec/memory_monitor_spec.rb +192 -0
  71. data/spec/multistream_spec.rb +690 -0
  72. data/spec/output_writer_spec.rb +400 -0
  73. data/spec/parser_functions_spec.rb +455 -0
  74. data/spec/ractor_worker_spec.rb +197 -0
  75. data/spec/regex_spec.rb +281 -0
  76. data/spec/section_extractor_spec.rb +397 -0
  77. data/spec/spec_helper.rb +63 -0
  78. data/spec/stream_processor_spec.rb +579 -0
  79. data/spec/template_data_spec.rb +246 -0
  80. data/spec/template_expander_spec.rb +472 -0
  81. data/spec/template_processing_spec.rb +217 -0
  82. data/spec/text_processing_spec.rb +312 -0
  83. data/spec/utils_spec.rb +195 -16
  84. data/spec/wp2txt_spec.rb +510 -0
  85. data/wp2txt.gemspec +5 -3
  86. metadata +146 -18
  87. data/.rubocop.yml +0 -80
  88. data/data/output_samples/testdata_en.txt +0 -23002
  89. data/data/output_samples/testdata_en_category.txt +0 -132
  90. data/data/output_samples/testdata_en_summary.txt +0 -1376
  91. data/data/output_samples/testdata_ja.txt +0 -22774
  92. data/data/output_samples/testdata_ja_category.txt +0 -206
  93. data/data/output_samples/testdata_ja_summary.txt +0 -1560
  94. data/data/testdata_en.bz2 +0 -0
  95. data/data/testdata_ja.bz2 +0 -0
  96. data/image/screenshot.png +0 -0
@@ -0,0 +1,400 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "spec_helper"
4
+ require "tempfile"
5
+ require "fileutils"
6
+ require "json"
7
+
8
+ RSpec.describe Wp2txt::OutputWriter do
9
+ let(:temp_dir) { Dir.mktmpdir } # scratch directory for output files; `let` builds it lazily and memoizes it per example
10
+
11
+ after do # runs after every example: delete the scratch directory and anything written into it
12
+ FileUtils.rm_rf(temp_dir)
13
+ end
14
+
15
+ describe "#write" do # examples for #write: text and JSON output, size-based rotation, blank input, concurrent callers
16
+ context "with text format" do
17
+ it "writes text content to file" do
18
+ writer = described_class.new(
19
+ output_dir: temp_dir,
20
+ base_name: "output",
21
+ format: :text,
22
+ file_size_mb: 0 # with 0 this example expects exactly one output file (presumably 0 disables rotation — confirm in OutputWriter)
23
+ )
24
+
25
+ writer.write("Article 1 content\n")
26
+ writer.write("Article 2 content\n")
27
+ files = writer.close # the return value is used below as a list of output file paths
28
+
29
+ expect(files.size).to eq(1)
30
+ content = File.read(files.first)
31
+ expect(content).to include("Article 1 content")
32
+ expect(content).to include("Article 2 content")
33
+ end
34
+
35
+ it "creates .txt extension" do
36
+ writer = described_class.new(
37
+ output_dir: temp_dir,
38
+ base_name: "output",
39
+ format: :text,
40
+ file_size_mb: 0
41
+ )
42
+
43
+ writer.write("content")
44
+ files = writer.close
45
+
46
+ expect(files.first).to end_with(".txt")
47
+ end
48
+ end
49
+
50
+ context "with JSON format" do
51
+ it "writes JSON content to file" do
52
+ writer = described_class.new(
53
+ output_dir: temp_dir,
54
+ base_name: "output",
55
+ format: :json,
56
+ file_size_mb: 0
57
+ )
58
+
59
+ writer.write({ title: "Article 1", text: "Content 1" }) # for :json a Hash is passed instead of a String
60
+ writer.write({ title: "Article 2", text: "Content 2" })
61
+ files = writer.close
62
+
63
+ expect(files.size).to eq(1)
64
+ lines = File.readlines(files.first)
65
+ expect(lines.size).to eq(2) # one line per written Hash
66
+
67
+ json1 = JSON.parse(lines[0]) # each line must parse on its own as a JSON document
68
+ expect(json1["title"]).to eq("Article 1")
69
+
70
+ json2 = JSON.parse(lines[1])
71
+ expect(json2["title"]).to eq("Article 2")
72
+ end
73
+
74
+ it "creates .jsonl extension" do
75
+ writer = described_class.new(
76
+ output_dir: temp_dir,
77
+ base_name: "output",
78
+ format: :json,
79
+ file_size_mb: 0
80
+ )
81
+
82
+ writer.write({ title: "Test" })
83
+ files = writer.close
84
+
85
+ expect(files.first).to end_with(".jsonl")
86
+ end
87
+ end
88
+
89
+ context "with file rotation" do
90
+ it "rotates files based on size" do
91
+ writer = described_class.new(
92
+ output_dir: temp_dir,
93
+ base_name: "output",
94
+ format: :text,
95
+ file_size_mb: 1 # 1 MB threshold
96
+ )
97
+
98
+ # Write three ~512 KB chunks (~1.5 MB in total) so the 1 MB threshold is crossed
99
+ large_content = "x" * (512 * 1024) # 512 KB each
100
+ writer.write(large_content + "\n")
101
+ writer.write(large_content + "\n")
102
+ writer.write(large_content + "\n") # This should trigger rotation
103
+ files = writer.close
104
+
105
+ expect(files.size).to be >= 2 # only a lower bound; the exact split point is not pinned here
106
+ end
107
+
108
+ it "uses indexed filenames when rotating" do
109
+ writer = described_class.new(
110
+ output_dir: temp_dir,
111
+ base_name: "output",
112
+ format: :text,
113
+ file_size_mb: 1
114
+ )
115
+
116
+ large_content = "x" * (600 * 1024) # 600 KB each; three writes total ~1.8 MB against the 1 MB threshold
117
+ writer.write(large_content + "\n")
118
+ writer.write(large_content + "\n")
119
+ writer.write(large_content + "\n")
120
+ files = writer.close
121
+
122
+ expect(files.any? { |f| f.include?("-1.") }).to be true # NOTE(review): matched against the full path, so a directory name containing "-1." would also pass
123
+ expect(files.any? { |f| f.include?("-2.") }).to be true # same full-path caveat as the "-1." check
124
+ end
125
+ end
126
+
127
+ context "with empty content" do
128
+ it "ignores nil content" do
129
+ writer = described_class.new(
130
+ output_dir: temp_dir,
131
+ base_name: "output",
132
+ format: :text,
133
+ file_size_mb: 0
134
+ )
135
+
136
+ writer.write(nil)
137
+ writer.write("valid content\n")
138
+ files = writer.close
139
+
140
+ content = File.read(files.first)
141
+ expect(content).to eq("valid content\n") # exact match: the nil write must add nothing to the file
142
+ end
143
+
144
+ it "ignores empty string content" do
145
+ writer = described_class.new(
146
+ output_dir: temp_dir,
147
+ base_name: "output",
148
+ format: :text,
149
+ file_size_mb: 0
150
+ )
151
+
152
+ writer.write("")
153
+ writer.write(" ") # whitespace-only input is expected to be dropped as well
154
+ writer.write("valid content\n")
155
+ files = writer.close
156
+
157
+ content = File.read(files.first)
158
+ expect(content).to eq("valid content\n")
159
+ end
160
+ end
161
+
162
+ context "thread safety" do
163
+ it "handles concurrent writes" do
164
+ writer = described_class.new(
165
+ output_dir: temp_dir,
166
+ base_name: "output",
167
+ format: :text,
168
+ file_size_mb: 0
169
+ )
170
+
171
+ threads = 10.times.map do |i| # 10 threads x 10 writes each = 100 lines expected
172
+ Thread.new do
173
+ 10.times do |j|
174
+ writer.write("Thread #{i} Line #{j}\n")
175
+ end
176
+ end
177
+ end
178
+
179
+ threads.each(&:join) # wait for every writer thread before closing
180
+ files = writer.close
181
+
182
+ content = File.read(files.first)
183
+ lines = content.lines
184
+ expect(lines.size).to eq(100) # NOTE(review): checks the line count only, not that each line arrived intact
185
+ end
186
+ end
187
+ end
188
+
189
+ describe "#write_raw" do # examples for #write_raw: content is expected in the file exactly as passed
190
+ it "writes raw content directly" do
191
+ writer = described_class.new(
192
+ output_dir: temp_dir,
193
+ base_name: "output",
194
+ format: :text,
195
+ file_size_mb: 0
196
+ )
197
+
198
+ writer.write_raw("raw line 1\n")
199
+ writer.write_raw("raw line 2\n")
200
+ files = writer.close
201
+
202
+ content = File.read(files.first)
203
+ expect(content).to eq("raw line 1\nraw line 2\n") # both chunks, in call order, with nothing added
204
+ end
205
+
206
+ it "ignores nil and empty content" do
207
+ writer = described_class.new(
208
+ output_dir: temp_dir,
209
+ base_name: "output",
210
+ format: :text,
211
+ file_size_mb: 0
212
+ )
213
+
214
+ writer.write_raw(nil)
215
+ writer.write_raw("")
216
+ writer.write_raw("valid\n")
217
+ files = writer.close
218
+
219
+ content = File.read(files.first)
220
+ expect(content).to eq("valid\n") # exact match: the nil and "" calls must contribute nothing
221
+ end
222
+ end
223
+
224
+ describe "#write_from_file" do # examples for #write_from_file: copying, missing sources, rotation boundaries, UTF-8
225
+ it "streams content from source file" do
226
+ source = File.join(temp_dir, "source.txt")
227
+ File.write(source, "source content\n")
228
+
229
+ writer = described_class.new(
230
+ output_dir: temp_dir,
231
+ base_name: "output",
232
+ format: :text,
233
+ file_size_mb: 0
234
+ )
235
+
236
+ writer.write_from_file(source)
237
+ files = writer.close
238
+
239
+ content = File.read(files.first)
240
+ expect(content).to eq("source content\n") # output must equal the source byte-for-byte
241
+ end
242
+
243
+ it "ignores non-existent source file" do
244
+ writer = described_class.new(
245
+ output_dir: temp_dir,
246
+ base_name: "output",
247
+ format: :text,
248
+ file_size_mb: 0
249
+ )
250
+
251
+ writer.write_from_file("/nonexistent/file.txt") # must not raise, otherwise this example fails here
252
+ files = writer.close
253
+
254
+ expect(files).to be_empty # nothing was written, so no output file is reported
255
+ end
256
+
257
+ it "rotates files only at article boundaries (blank lines)" do
258
+ # Build a source file with two articles separated by a blank line,
259
+ # where each article is larger than the rotation threshold
260
+ article1 = "Title: Article 1\n" + ("A" * 600 + "\n") * 10 # ~6 KB: a title line plus 10 lines of 600 chars
261
+ article2 = "Title: Article 2\n" + ("B" * 600 + "\n") * 10
262
+
263
+ source = File.join(temp_dir, "source.txt")
264
+ File.write(source, article1 + "\n" + article2 + "\n") # the extra "\n" after each article is the blank separator line
265
+
266
+ writer = described_class.new(
267
+ output_dir: File.join(temp_dir, "out"), # not created by this spec; presumably OutputWriter creates it — confirm
268
+ base_name: "output",
269
+ format: :text,
270
+ file_size_mb: 0.005 # ~5KB threshold to force rotation
271
+ )
272
+
273
+ writer.write_from_file(source)
274
+ files = writer.close
275
+
276
+ expect(files.size).to be >= 2
277
+
278
+ # Article 1 should be entirely in the first file (not split mid-article)
279
+ first_content = File.read(files.first)
280
+ expect(first_content).to include("Title: Article 1")
281
+ expect(first_content.lines.last.strip).to eq("") # ends at blank line
282
+
283
+ # Article 2 should start in a subsequent file
284
+ remaining = files[1..].map { |f| File.read(f) }.join # everything after the first output file, concatenated
285
+ expect(remaining).to include("Title: Article 2")
286
+ end
287
+
288
+ it "handles UTF-8 content without encoding errors" do
289
+ source = File.join(temp_dir, "source_utf8.txt")
290
+ utf8_content = "タイトル: 日本語記事\n" \
291
+ "本文テキスト。漢字、ひらがな、カタカナ。\n" \
292
+ "\n" \
293
+ "Title: English Article\n" \
294
+ "Body text with accents: café, naïve, résumé.\n"
295
+ File.write(source, utf8_content)
296
+
297
+ writer = described_class.new(
298
+ output_dir: temp_dir,
299
+ base_name: "output",
300
+ format: :text,
301
+ file_size_mb: 0
302
+ )
303
+
304
+ expect { writer.write_from_file(source) }.not_to raise_error # any exception while copying multi-byte text fails the example
305
+ files = writer.close
306
+
307
+ content = File.read(files.first)
308
+ expect(content).to include("日本語記事") # multi-byte characters must come through unchanged
309
+ expect(content).to include("café")
310
+ end
311
+ end
312
+
313
+ describe "error handling" do # low-level write failures must surface as Wp2txt::FileIOError; NOTE(review): writers in this group are never closed
314
+ it "raises FileIOError on disk full (ENOSPC) in write" do
315
+ writer = described_class.new(
316
+ output_dir: temp_dir,
317
+ base_name: "output",
318
+ format: :text,
319
+ file_size_mb: 0
320
+ )
321
+
322
+ # Force ENOSPC by stubbing File#write on every File instance
323
+ allow_any_instance_of(File).to receive(:write).and_raise(Errno::ENOSPC) # assumes OutputWriter writes via File#write — confirm
324
+
325
+ expect { writer.write("content") }.to raise_error(Wp2txt::FileIOError, /Disk full/)
326
+ end
327
+
328
+ it "raises FileIOError on disk full (ENOSPC) in write_raw" do
329
+ writer = described_class.new(
330
+ output_dir: temp_dir,
331
+ base_name: "output",
332
+ format: :text,
333
+ file_size_mb: 0
334
+ )
335
+
336
+ allow_any_instance_of(File).to receive(:write).and_raise(Errno::ENOSPC) # same stub as the #write example, exercised through #write_raw
337
+
338
+ expect { writer.write_raw("content") }.to raise_error(Wp2txt::FileIOError, /Disk full/)
339
+ end
340
+
341
+ it "raises FileIOError on I/O error in write" do
342
+ writer = described_class.new(
343
+ output_dir: temp_dir,
344
+ base_name: "output",
345
+ format: :text,
346
+ file_size_mb: 0
347
+ )
348
+
349
+ allow_any_instance_of(File).to receive(:write).and_raise(IOError, "stream closed") # a non-ENOSPC failure
350
+
351
+ expect { writer.write("content") }.to raise_error(Wp2txt::FileIOError, /Write failed/) # different message than the disk-full case
352
+ end
353
+
354
+ it "includes output directory in error message" do
355
+ writer = described_class.new(
356
+ output_dir: temp_dir,
357
+ base_name: "output",
358
+ format: :text,
359
+ file_size_mb: 0
360
+ )
361
+
362
+ allow_any_instance_of(File).to receive(:write).and_raise(Errno::ENOSPC)
363
+
364
+ expect { writer.write("content") }.to raise_error(Wp2txt::FileIOError, /#{Regexp.escape(temp_dir)}/) # escape so the path is matched literally
365
+ end
366
+ end
367
+
368
+ describe "#close" do # examples for the return value of #close
369
+ it "removes empty files" do
370
+ writer = described_class.new(
371
+ output_dir: temp_dir,
372
+ base_name: "output",
373
+ format: :text,
374
+ file_size_mb: 0
375
+ )
376
+
377
+ # Don't write anything before closing
378
+ files = writer.close
379
+
380
+ # No file may be reported (presumably an empty file that was opened gets removed — confirm in OutputWriter)
381
+ expect(files).to be_empty
382
+ end
383
+
384
+ it "returns list of created files" do
385
+ writer = described_class.new(
386
+ output_dir: temp_dir,
387
+ base_name: "output",
388
+ format: :text,
389
+ file_size_mb: 0
390
+ )
391
+
392
+ writer.write("content\n")
393
+ files = writer.close
394
+
395
+ expect(files).to be_an(Array)
396
+ expect(files.size).to eq(1)
397
+ expect(File.exist?(files.first)).to be true # the returned entry must be a path that exists on disk
398
+ end
399
+ end
400
+ end