wp2txt 1.1.3 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. checksums.yaml +4 -4
  2. data/.dockerignore +12 -0
  3. data/.github/workflows/ci.yml +13 -13
  4. data/.gitignore +14 -0
  5. data/CHANGELOG.md +284 -0
  6. data/DEVELOPMENT.md +415 -0
  7. data/DEVELOPMENT_ja.md +415 -0
  8. data/Dockerfile +19 -10
  9. data/Gemfile +2 -8
  10. data/README.md +259 -123
  11. data/README_ja.md +375 -0
  12. data/Rakefile +4 -0
  13. data/bin/wp2txt +863 -161
  14. data/lib/wp2txt/article.rb +98 -13
  15. data/lib/wp2txt/bz2_validator.rb +239 -0
  16. data/lib/wp2txt/category_cache.rb +313 -0
  17. data/lib/wp2txt/cli.rb +319 -0
  18. data/lib/wp2txt/cli_ui.rb +428 -0
  19. data/lib/wp2txt/config.rb +158 -0
  20. data/lib/wp2txt/constants.rb +134 -0
  21. data/lib/wp2txt/data/html_entities.json +2135 -0
  22. data/lib/wp2txt/data/language_metadata.json +4769 -0
  23. data/lib/wp2txt/data/language_tiers.json +59 -0
  24. data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
  25. data/lib/wp2txt/data/template_aliases.json +193 -0
  26. data/lib/wp2txt/data/wikipedia_entities.json +12 -0
  27. data/lib/wp2txt/extractor.rb +545 -0
  28. data/lib/wp2txt/file_utils.rb +91 -0
  29. data/lib/wp2txt/formatter.rb +352 -0
  30. data/lib/wp2txt/global_data_cache.rb +353 -0
  31. data/lib/wp2txt/index_cache.rb +258 -0
  32. data/lib/wp2txt/magic_words.rb +353 -0
  33. data/lib/wp2txt/memory_monitor.rb +236 -0
  34. data/lib/wp2txt/multistream.rb +1383 -0
  35. data/lib/wp2txt/output_writer.rb +182 -0
  36. data/lib/wp2txt/parser_functions.rb +606 -0
  37. data/lib/wp2txt/ractor_worker.rb +215 -0
  38. data/lib/wp2txt/regex.rb +396 -12
  39. data/lib/wp2txt/section_extractor.rb +354 -0
  40. data/lib/wp2txt/stream_processor.rb +271 -0
  41. data/lib/wp2txt/template_expander.rb +830 -0
  42. data/lib/wp2txt/text_processing.rb +337 -0
  43. data/lib/wp2txt/utils.rb +629 -270
  44. data/lib/wp2txt/version.rb +1 -1
  45. data/lib/wp2txt.rb +53 -26
  46. data/scripts/benchmark_regex.rb +161 -0
  47. data/scripts/fetch_html_entities.rb +94 -0
  48. data/scripts/fetch_language_metadata.rb +180 -0
  49. data/scripts/fetch_mediawiki_data.rb +334 -0
  50. data/scripts/fetch_template_data.rb +186 -0
  51. data/scripts/profile_memory.rb +139 -0
  52. data/spec/article_spec.rb +402 -0
  53. data/spec/auto_download_spec.rb +314 -0
  54. data/spec/bz2_validator_spec.rb +193 -0
  55. data/spec/category_cache_spec.rb +226 -0
  56. data/spec/category_fetcher_spec.rb +504 -0
  57. data/spec/cleanup_spec.rb +197 -0
  58. data/spec/cli_options_spec.rb +678 -0
  59. data/spec/cli_spec.rb +876 -0
  60. data/spec/config_spec.rb +194 -0
  61. data/spec/constants_spec.rb +138 -0
  62. data/spec/file_utils_spec.rb +170 -0
  63. data/spec/fixtures/samples.rb +181 -0
  64. data/spec/formatter_sections_spec.rb +382 -0
  65. data/spec/global_data_cache_spec.rb +186 -0
  66. data/spec/index_cache_spec.rb +210 -0
  67. data/spec/integration_spec.rb +543 -0
  68. data/spec/magic_words_spec.rb +261 -0
  69. data/spec/markers_spec.rb +476 -0
  70. data/spec/memory_monitor_spec.rb +192 -0
  71. data/spec/multistream_spec.rb +690 -0
  72. data/spec/output_writer_spec.rb +400 -0
  73. data/spec/parser_functions_spec.rb +455 -0
  74. data/spec/ractor_worker_spec.rb +197 -0
  75. data/spec/regex_spec.rb +281 -0
  76. data/spec/section_extractor_spec.rb +397 -0
  77. data/spec/spec_helper.rb +63 -0
  78. data/spec/stream_processor_spec.rb +579 -0
  79. data/spec/template_data_spec.rb +246 -0
  80. data/spec/template_expander_spec.rb +472 -0
  81. data/spec/template_processing_spec.rb +217 -0
  82. data/spec/text_processing_spec.rb +312 -0
  83. data/spec/utils_spec.rb +195 -16
  84. data/spec/wp2txt_spec.rb +510 -0
  85. data/wp2txt.gemspec +5 -3
  86. metadata +146 -18
  87. data/.rubocop.yml +0 -80
  88. data/data/output_samples/testdata_en.txt +0 -23002
  89. data/data/output_samples/testdata_en_category.txt +0 -132
  90. data/data/output_samples/testdata_en_summary.txt +0 -1376
  91. data/data/output_samples/testdata_ja.txt +0 -22774
  92. data/data/output_samples/testdata_ja_category.txt +0 -206
  93. data/data/output_samples/testdata_ja_summary.txt +0 -1560
  94. data/data/testdata_en.bz2 +0 -0
  95. data/data/testdata_ja.bz2 +0 -0
  96. data/image/screenshot.png +0 -0
data/spec/spec_helper.rb CHANGED
@@ -1,6 +1,69 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require "simplecov"
4
+ SimpleCov.start do
5
+ add_filter "/spec/"
6
+ add_group "Core", "lib/wp2txt"
7
+ minimum_coverage 20 # Temporarily lowered, will increase as we add tests
8
+ end
9
+
3
10
  require "rspec"
11
+ require "stringio"
12
+
13
+ # Load wp2txt modules
14
+ require_relative "../lib/wp2txt"
15
+ require_relative "../lib/wp2txt/article"
16
+ require_relative "../lib/wp2txt/utils"
17
+ require_relative "../lib/wp2txt/regex"
18
+ require_relative "../lib/wp2txt/multistream"
19
+ require_relative "../lib/wp2txt/config"
20
+ require_relative "../lib/wp2txt/template_expander"
21
+ require_relative "../lib/wp2txt/parser_functions"
4
22
 
5
23
  RSpec.configure do |config|
24
+ config.expect_with :rspec do |expectations|
25
+ expectations.include_chain_clauses_in_custom_matcher_descriptions = true
26
+ end
27
+
28
+ # Helper to suppress stderr output during tests
29
+ config.include Module.new {
30
+ def suppress_stderr
31
+ original_stderr = $stderr
32
+ $stderr = StringIO.new
33
+ yield
34
+ ensure
35
+ $stderr = original_stderr
36
+ end
37
+
38
+ def suppress_stdout
39
+ original_stdout = $stdout
40
+ $stdout = StringIO.new
41
+ yield
42
+ ensure
43
+ $stdout = original_stdout
44
+ end
45
+
46
+ def suppress_output
47
+ original_stdout = $stdout
48
+ original_stderr = $stderr
49
+ $stdout = StringIO.new
50
+ $stderr = StringIO.new
51
+ yield
52
+ ensure
53
+ $stdout = original_stdout
54
+ $stderr = original_stderr
55
+ end
56
+ }
57
+
58
+ config.mock_with :rspec do |mocks|
59
+ mocks.verify_partial_doubles = true
60
+ end
61
+
62
+ config.shared_context_metadata_behavior = :apply_to_host_groups
63
+ config.filter_run_when_matching :focus
64
+ config.example_status_persistence_file_path = "spec/examples.txt"
65
+ config.disable_monkey_patching!
66
+ config.warnings = false # Suppress warnings during test runs
67
+ config.order = :random
68
+ Kernel.srand config.seed
6
69
  end
data/spec/stream_processor_spec.rb ADDED
@@ -0,0 +1,579 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "spec_helper"
4
+ require "tempfile"
5
+ require "fileutils"
6
+
7
+ RSpec.describe Wp2txt::StreamProcessor do
8
+ let(:temp_dir) { Dir.mktmpdir }
9
+
10
+ after do
11
+ FileUtils.rm_rf(temp_dir)
12
+ end
13
+
14
+ describe "#each_page" do
15
+ context "with XML file input" do
16
+ let(:xml_content) do
17
+ <<~XML
18
+ <mediawiki>
19
+ <page>
20
+ <title>Test Article</title>
21
+ <revision>
22
+ <text>This is the article content.</text>
23
+ </revision>
24
+ </page>
25
+ <page>
26
+ <title>Second Article</title>
27
+ <revision>
28
+ <text>Second article content.</text>
29
+ </revision>
30
+ </page>
31
+ </mediawiki>
32
+ XML
33
+ end
34
+
35
+ let(:xml_file) { File.join(temp_dir, "test.xml") }
36
+
37
+ before do
38
+ File.write(xml_file, xml_content)
39
+ end
40
+
41
+ it "extracts pages from XML file" do
42
+ processor = described_class.new(xml_file)
43
+ pages = processor.each_page.to_a
44
+
45
+ expect(pages.size).to eq(2)
46
+ expect(pages[0][0]).to eq("Test Article")
47
+ expect(pages[0][1]).to include("article content")
48
+ expect(pages[1][0]).to eq("Second Article")
49
+ end
50
+
51
+ it "yields title and text for each page" do
52
+ processor = described_class.new(xml_file)
53
+ titles = []
54
+ texts = []
55
+
56
+ processor.each_page do |title, text|
57
+ titles << title
58
+ texts << text
59
+ end
60
+
61
+ expect(titles).to eq(["Test Article", "Second Article"])
62
+ expect(texts[0]).to include("article content")
63
+ end
64
+ end
65
+
66
+ context "with directory input" do
67
+ let(:xml_content1) do
68
+ <<~XML
69
+ <page>
70
+ <title>Article One</title>
71
+ <revision>
72
+ <text>Content one.</text>
73
+ </revision>
74
+ </page>
75
+ XML
76
+ end
77
+
78
+ let(:xml_content2) do
79
+ <<~XML
80
+ <page>
81
+ <title>Article Two</title>
82
+ <revision>
83
+ <text>Content two.</text>
84
+ </revision>
85
+ </page>
86
+ XML
87
+ end
88
+
89
+ before do
90
+ File.write(File.join(temp_dir, "part1.xml"), xml_content1)
91
+ File.write(File.join(temp_dir, "part2.xml"), xml_content2)
92
+ end
93
+
94
+ it "processes all XML files in directory" do
95
+ processor = described_class.new(temp_dir)
96
+ pages = processor.each_page.to_a
97
+
98
+ expect(pages.size).to eq(2)
99
+ titles = pages.map(&:first)
100
+ expect(titles).to include("Article One", "Article Two")
101
+ end
102
+ end
103
+
104
+ context "with special pages" do
105
+ let(:xml_content) do
106
+ <<~XML
107
+ <page>
108
+ <title>Normal Article</title>
109
+ <revision>
110
+ <text>Normal content.</text>
111
+ </revision>
112
+ </page>
113
+ <page>
114
+ <title>Wikipedia:Help</title>
115
+ <revision>
116
+ <text>Help content.</text>
117
+ </revision>
118
+ </page>
119
+ <page>
120
+ <title>File:Image.jpg</title>
121
+ <revision>
122
+ <text>File description.</text>
123
+ </revision>
124
+ </page>
125
+ XML
126
+ end
127
+
128
+ let(:xml_file) { File.join(temp_dir, "test.xml") }
129
+
130
+ before do
131
+ File.write(xml_file, xml_content)
132
+ end
133
+
134
+ it "skips pages with colon in title (special pages)" do
135
+ processor = described_class.new(xml_file)
136
+ pages = processor.each_page.to_a
137
+
138
+ expect(pages.size).to eq(1)
139
+ expect(pages[0][0]).to eq("Normal Article")
140
+ end
141
+ end
142
+
143
+ context "with HTML comments" do
144
+ let(:xml_content) do
145
+ <<~XML
146
+ <page>
147
+ <title>Article With Comments</title>
148
+ <revision>
149
+ <text>Before <!-- hidden comment --> after.</text>
150
+ </revision>
151
+ </page>
152
+ XML
153
+ end
154
+
155
+ let(:xml_file) { File.join(temp_dir, "test.xml") }
156
+
157
+ before do
158
+ File.write(xml_file, xml_content)
159
+ end
160
+
161
+ it "removes HTML comments from text" do
162
+ processor = described_class.new(xml_file)
163
+ pages = processor.each_page.to_a
164
+
165
+ expect(pages[0][1]).not_to include("hidden comment")
166
+ expect(pages[0][1]).to include("Before")
167
+ expect(pages[0][1]).to include("after")
168
+ end
169
+ end
170
+
171
+ context "returns enumerator when no block given" do
172
+ let(:xml_content) do
173
+ <<~XML
174
+ <page>
175
+ <title>Test</title>
176
+ <revision>
177
+ <text>Content.</text>
178
+ </revision>
179
+ </page>
180
+ XML
181
+ end
182
+
183
+ let(:xml_file) { File.join(temp_dir, "test.xml") }
184
+
185
+ before do
186
+ File.write(xml_file, xml_content)
187
+ end
188
+
189
+ it "returns an Enumerator" do
190
+ processor = described_class.new(xml_file)
191
+ result = processor.each_page
192
+
193
+ expect(result).to be_an(Enumerator)
194
+ expect(result.to_a.size).to eq(1)
195
+ end
196
+ end
197
+
198
+ context "with unsupported format" do
199
+ let(:unsupported_file) { File.join(temp_dir, "test.txt") }
200
+
201
+ before do
202
+ File.write(unsupported_file, "plain text content")
203
+ end
204
+
205
+ it "raises ArgumentError for unsupported format" do
206
+ processor = described_class.new(unsupported_file)
207
+ expect { processor.each_page.to_a }.to raise_error(ArgumentError, /Unsupported input format/)
208
+ end
209
+ end
210
+
211
+ context "with malformed XML" do
212
+ let(:xml_content) do
213
+ <<~XML
214
+ <page>
215
+ <title>Test Article</title>
216
+ <revision>
217
+ <text>Content with unclosed tag <b>
218
+ </revision>
219
+ </page>
220
+ XML
221
+ end
222
+
223
+ let(:xml_file) { File.join(temp_dir, "malformed.xml") }
224
+
225
+ before do
226
+ File.write(xml_file, xml_content)
227
+ end
228
+
229
+ it "skips malformed XML gracefully" do
230
+ processor = described_class.new(xml_file)
231
+ pages = processor.each_page.to_a
232
+ # Should not raise error, just skip malformed page
233
+ expect(pages).to be_an(Array)
234
+ end
235
+ end
236
+
237
+ context "with empty text node" do
238
+ let(:xml_content) do
239
+ <<~XML
240
+ <page>
241
+ <title>Empty Article</title>
242
+ <revision>
243
+ <text></text>
244
+ </revision>
245
+ </page>
246
+ XML
247
+ end
248
+
249
+ let(:xml_file) { File.join(temp_dir, "empty.xml") }
250
+
251
+ before do
252
+ File.write(xml_file, xml_content)
253
+ end
254
+
255
+ it "handles empty text" do
256
+ processor = described_class.new(xml_file)
257
+ pages = processor.each_page.to_a
258
+ expect(pages.size).to eq(1)
259
+ expect(pages[0][1]).to eq("")
260
+ end
261
+ end
262
+
263
+ context "with missing title" do
264
+ let(:xml_content) do
265
+ <<~XML
266
+ <page>
267
+ <revision>
268
+ <text>Content without title.</text>
269
+ </revision>
270
+ </page>
271
+ XML
272
+ end
273
+
274
+ let(:xml_file) { File.join(temp_dir, "no_title.xml") }
275
+
276
+ before do
277
+ File.write(xml_file, xml_content)
278
+ end
279
+
280
+ it "skips pages without title" do
281
+ processor = described_class.new(xml_file)
282
+ pages = processor.each_page.to_a
283
+ expect(pages).to be_empty
284
+ end
285
+ end
286
+
287
+ context "with multi-line HTML comments" do
288
+ let(:xml_content) do
289
+ <<~XML
290
+ <page>
291
+ <title>Multi Comment Article</title>
292
+ <revision>
293
+ <text>Before
294
+ <!--
295
+ Multi-line
296
+ comment
297
+ here
298
+ -->
299
+ After</text>
300
+ </revision>
301
+ </page>
302
+ XML
303
+ end
304
+
305
+ let(:xml_file) { File.join(temp_dir, "multiline_comment.xml") }
306
+
307
+ before do
308
+ File.write(xml_file, xml_content)
309
+ end
310
+
311
+ it "preserves newline count from multi-line comments" do
312
+ processor = described_class.new(xml_file)
313
+ pages = processor.each_page.to_a
314
+ expect(pages.size).to eq(1)
315
+ text = pages[0][1]
316
+ expect(text).not_to include("Multi-line")
317
+ expect(text).not_to include("comment")
318
+ # Check that newlines are preserved (original content has newlines)
319
+ expect(text.count("\n")).to be >= 1
320
+ end
321
+ end
322
+
323
+ context "with multiple pages in buffer" do
324
+ let(:xml_content) do
325
+ (1..10).map do |i|
326
+ <<~XML
327
+ <page>
328
+ <title>Article #{i}</title>
329
+ <revision>
330
+ <text>Content for article #{i}.</text>
331
+ </revision>
332
+ </page>
333
+ XML
334
+ end.join("\n")
335
+ end
336
+
337
+ let(:xml_file) { File.join(temp_dir, "many_pages.xml") }
338
+
339
+ before do
340
+ File.write(xml_file, xml_content)
341
+ end
342
+
343
+ it "processes all pages correctly" do
344
+ processor = described_class.new(xml_file)
345
+ pages = processor.each_page.to_a
346
+ expect(pages.size).to eq(10)
347
+ expect(pages.map(&:first)).to eq((1..10).map { |i| "Article #{i}" })
348
+ end
349
+ end
350
+
351
+ context "with redirect pages" do
352
+ let(:xml_content) do
353
+ <<~XML
354
+ <page>
355
+ <title>Normal Article</title>
356
+ <revision>
357
+ <text>This is a normal article with content.</text>
358
+ </revision>
359
+ </page>
360
+ <page>
361
+ <title>English Redirect</title>
362
+ <revision>
363
+ <text>#REDIRECT [[Target Article]]</text>
364
+ </revision>
365
+ </page>
366
+ <page>
367
+ <title>Japanese Redirect</title>
368
+ <revision>
369
+ <text>#転送 [[ターゲット記事]]</text>
370
+ </revision>
371
+ </page>
372
+ <page>
373
+ <title>Another Normal</title>
374
+ <revision>
375
+ <text>Another normal article.</text>
376
+ </revision>
377
+ </page>
378
+ <page>
379
+ <title>Fullwidth Hash Redirect</title>
380
+ <revision>
381
+ <text>＃REDIRECT [[Target]]</text>
382
+ </revision>
383
+ </page>
384
+ XML
385
+ end
386
+
387
+ let(:xml_file) { File.join(temp_dir, "redirects.xml") }
388
+
389
+ before do
390
+ File.write(xml_file, xml_content)
391
+ end
392
+
393
+ it "skips redirect pages by default" do
394
+ processor = described_class.new(xml_file)
395
+ pages = processor.each_page.to_a
396
+
397
+ expect(pages.size).to eq(2)
398
+ titles = pages.map(&:first)
399
+ expect(titles).to include("Normal Article", "Another Normal")
400
+ expect(titles).not_to include("English Redirect", "Japanese Redirect", "Fullwidth Hash Redirect")
401
+ end
402
+
403
+ it "counts skipped redirects" do
404
+ processor = described_class.new(xml_file)
405
+ processor.each_page.to_a
406
+
407
+ expect(processor.redirects_skipped).to eq(3)
408
+ end
409
+
410
+ it "includes redirect pages when skip_redirects is false" do
411
+ processor = described_class.new(xml_file, skip_redirects: false)
412
+ pages = processor.each_page.to_a
413
+
414
+ expect(pages.size).to eq(5)
415
+ expect(processor.redirects_skipped).to eq(0)
416
+ end
417
+
418
+ it "includes redirects_skipped in stats" do
419
+ processor = described_class.new(xml_file)
420
+ processor.each_page.to_a
421
+
422
+ stats = processor.stats
423
+ expect(stats[:redirects_skipped]).to eq(3)
424
+ expect(stats[:pages_processed]).to eq(2)
425
+ end
426
+ end
427
+ end
428
+
429
+ describe "#initialize" do
430
+ it "accepts input path" do
431
+ processor = described_class.new("/path/to/file.xml")
432
+ expect(processor.instance_variable_get(:@input_path)).to eq("/path/to/file.xml")
433
+ end
434
+
435
+ it "accepts bz2_gem option" do
436
+ processor = described_class.new("/path/to/file.bz2", bz2_gem: true)
437
+ expect(processor.instance_variable_get(:@bz2_gem)).to be true
438
+ end
439
+
440
+ it "defaults bz2_gem to false" do
441
+ processor = described_class.new("/path/to/file.bz2")
442
+ expect(processor.instance_variable_get(:@bz2_gem)).to be false
443
+ end
444
+ end
445
+
446
+ describe "private methods" do
447
+ let(:temp_dir) { Dir.mktmpdir }
448
+
449
+ after { FileUtils.rm_rf(temp_dir) }
450
+
451
+ describe "#find_bzip2_command" do
452
+ it "returns path to bzip2 command if available" do
453
+ xml_file = File.join(temp_dir, "test.xml")
454
+ File.write(xml_file, "<page></page>")
455
+ processor = described_class.new(xml_file)
456
+
457
+ # On most Unix systems, at least one bzip2 command should exist
458
+ result = processor.send(:find_bzip2_command)
459
+ # Result is either a path string or nil
460
+ expect(result.nil? || result.is_a?(String)).to be true
461
+ end
462
+ end
463
+
464
+ describe "#fill_buffer" do
465
+ let(:xml_content) do
466
+ <<~XML
467
+ <page>
468
+ <title>Buffer Test</title>
469
+ <revision>
470
+ <text>Test content for buffer.</text>
471
+ </revision>
472
+ </page>
473
+ XML
474
+ end
475
+
476
+ let(:xml_file) { File.join(temp_dir, "buffer_test.xml") }
477
+
478
+ before do
479
+ File.write(xml_file, xml_content)
480
+ end
481
+
482
+ it "fills buffer from file" do
483
+ processor = described_class.new(xml_file)
484
+ processor.instance_variable_set(:@buffer, +"")
485
+ processor.instance_variable_set(:@file_pointer, File.open(xml_file, "r:UTF-8"))
486
+
487
+ result = processor.send(:fill_buffer)
488
+ expect(result).to be true
489
+ expect(processor.instance_variable_get(:@buffer)).not_to be_empty
490
+ end
491
+
492
+ it "returns false when file is exhausted" do
493
+ processor = described_class.new(xml_file)
494
+ processor.instance_variable_set(:@buffer, +"")
495
+
496
+ # Open and read entire file
497
+ fp = File.open(xml_file, "r:UTF-8")
498
+ fp.read # Exhaust the file
499
+ processor.instance_variable_set(:@file_pointer, fp)
500
+
501
+ result = processor.send(:fill_buffer)
502
+ expect(result).to be false
503
+ end
504
+ end
505
+
506
+ describe "#extract_next_page" do
507
+ let(:xml_file) { File.join(temp_dir, "extract_test.xml") }
508
+
509
+ it "extracts page from buffer" do
510
+ xml_content = "<page><title>Test</title></page>"
511
+ File.write(xml_file, xml_content)
512
+
513
+ processor = described_class.new(xml_file)
514
+ processor.instance_variable_set(:@buffer, +"<page><title>Test</title></page>rest")
515
+ processor.instance_variable_set(:@file_pointer, File.open(xml_file, "r:UTF-8"))
516
+
517
+ page = processor.send(:extract_next_page)
518
+ expect(page).to eq("<page><title>Test</title></page>")
519
+ end
520
+
521
+ it "returns nil when no complete page in buffer" do
522
+ File.write(xml_file, "<incomplete>")
523
+
524
+ processor = described_class.new(xml_file)
525
+ processor.instance_variable_set(:@buffer, +"<page><title>Incomplete")
526
+ fp = File.open(xml_file, "r:UTF-8")
527
+ fp.read # Exhaust
528
+ processor.instance_variable_set(:@file_pointer, fp)
529
+
530
+ page = processor.send(:extract_next_page)
531
+ expect(page).to be_nil
532
+ end
533
+ end
534
+
535
+ describe "#parse_page_xml" do
536
+ let(:xml_file) { File.join(temp_dir, "parse_test.xml") }
537
+
538
+ before do
539
+ File.write(xml_file, "<page></page>")
540
+ end
541
+
542
+ it "parses valid page XML" do
543
+ processor = described_class.new(xml_file)
544
+ page_xml = <<~XML
545
+ <page>
546
+ <title>Test Article</title>
547
+ <revision>
548
+ <text>Article content here.</text>
549
+ </revision>
550
+ </page>
551
+ XML
552
+
553
+ result = processor.send(:parse_page_xml, page_xml)
554
+ expect(result).not_to be_nil
555
+ expect(result[0]).to eq("Test Article")
556
+ expect(result[1]).to include("Article content")
557
+ end
558
+
559
+ it "returns nil for page without text node" do
560
+ processor = described_class.new(xml_file)
561
+ page_xml = "<page><title>No Text</title></page>"
562
+
563
+ result = processor.send(:parse_page_xml, page_xml)
564
+ expect(result).to be_nil
565
+ end
566
+
567
+ it "handles severely malformed XML" do
568
+ processor = described_class.new(xml_file)
569
+ # This is intentionally broken XML that should trigger SyntaxError
570
+ page_xml = "<page><title>Test</title><revision><text>Content</page>"
571
+
572
+ # Should not raise, just return nil
573
+ result = processor.send(:parse_page_xml, page_xml)
574
+ # May return nil or may parse partially - either is acceptable
575
+ expect(result.nil? || result.is_a?(Array)).to be true
576
+ end
577
+ end
578
+ end
579
+ end