wp2txt 1.1.2 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. checksums.yaml +4 -4
  2. data/.dockerignore +12 -0
  3. data/.github/workflows/ci.yml +13 -13
  4. data/.gitignore +14 -0
  5. data/CHANGELOG.md +284 -0
  6. data/DEVELOPMENT.md +415 -0
  7. data/DEVELOPMENT_ja.md +415 -0
  8. data/Dockerfile +19 -10
  9. data/Gemfile +2 -8
  10. data/README.md +261 -121
  11. data/README_ja.md +375 -0
  12. data/Rakefile +4 -0
  13. data/bin/wp2txt +863 -159
  14. data/lib/wp2txt/article.rb +98 -13
  15. data/lib/wp2txt/bz2_validator.rb +239 -0
  16. data/lib/wp2txt/category_cache.rb +313 -0
  17. data/lib/wp2txt/cli.rb +319 -0
  18. data/lib/wp2txt/cli_ui.rb +428 -0
  19. data/lib/wp2txt/config.rb +158 -0
  20. data/lib/wp2txt/constants.rb +134 -0
  21. data/lib/wp2txt/data/html_entities.json +2135 -0
  22. data/lib/wp2txt/data/language_metadata.json +4769 -0
  23. data/lib/wp2txt/data/language_tiers.json +59 -0
  24. data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
  25. data/lib/wp2txt/data/template_aliases.json +193 -0
  26. data/lib/wp2txt/data/wikipedia_entities.json +12 -0
  27. data/lib/wp2txt/extractor.rb +545 -0
  28. data/lib/wp2txt/file_utils.rb +91 -0
  29. data/lib/wp2txt/formatter.rb +352 -0
  30. data/lib/wp2txt/global_data_cache.rb +353 -0
  31. data/lib/wp2txt/index_cache.rb +258 -0
  32. data/lib/wp2txt/magic_words.rb +353 -0
  33. data/lib/wp2txt/memory_monitor.rb +236 -0
  34. data/lib/wp2txt/multistream.rb +1383 -0
  35. data/lib/wp2txt/output_writer.rb +182 -0
  36. data/lib/wp2txt/parser_functions.rb +606 -0
  37. data/lib/wp2txt/ractor_worker.rb +215 -0
  38. data/lib/wp2txt/regex.rb +396 -12
  39. data/lib/wp2txt/section_extractor.rb +354 -0
  40. data/lib/wp2txt/stream_processor.rb +271 -0
  41. data/lib/wp2txt/template_expander.rb +830 -0
  42. data/lib/wp2txt/text_processing.rb +337 -0
  43. data/lib/wp2txt/utils.rb +629 -270
  44. data/lib/wp2txt/version.rb +1 -1
  45. data/lib/wp2txt.rb +53 -26
  46. data/scripts/benchmark_regex.rb +161 -0
  47. data/scripts/fetch_html_entities.rb +94 -0
  48. data/scripts/fetch_language_metadata.rb +180 -0
  49. data/scripts/fetch_mediawiki_data.rb +334 -0
  50. data/scripts/fetch_template_data.rb +186 -0
  51. data/scripts/profile_memory.rb +139 -0
  52. data/spec/article_spec.rb +402 -0
  53. data/spec/auto_download_spec.rb +314 -0
  54. data/spec/bz2_validator_spec.rb +193 -0
  55. data/spec/category_cache_spec.rb +226 -0
  56. data/spec/category_fetcher_spec.rb +504 -0
  57. data/spec/cleanup_spec.rb +197 -0
  58. data/spec/cli_options_spec.rb +678 -0
  59. data/spec/cli_spec.rb +876 -0
  60. data/spec/config_spec.rb +194 -0
  61. data/spec/constants_spec.rb +138 -0
  62. data/spec/file_utils_spec.rb +170 -0
  63. data/spec/fixtures/samples.rb +181 -0
  64. data/spec/formatter_sections_spec.rb +382 -0
  65. data/spec/global_data_cache_spec.rb +186 -0
  66. data/spec/index_cache_spec.rb +210 -0
  67. data/spec/integration_spec.rb +543 -0
  68. data/spec/magic_words_spec.rb +261 -0
  69. data/spec/markers_spec.rb +476 -0
  70. data/spec/memory_monitor_spec.rb +192 -0
  71. data/spec/multistream_spec.rb +690 -0
  72. data/spec/output_writer_spec.rb +400 -0
  73. data/spec/parser_functions_spec.rb +455 -0
  74. data/spec/ractor_worker_spec.rb +197 -0
  75. data/spec/regex_spec.rb +281 -0
  76. data/spec/section_extractor_spec.rb +397 -0
  77. data/spec/spec_helper.rb +63 -0
  78. data/spec/stream_processor_spec.rb +579 -0
  79. data/spec/template_data_spec.rb +246 -0
  80. data/spec/template_expander_spec.rb +472 -0
  81. data/spec/template_processing_spec.rb +217 -0
  82. data/spec/text_processing_spec.rb +312 -0
  83. data/spec/utils_spec.rb +195 -16
  84. data/spec/wp2txt_spec.rb +510 -0
  85. data/wp2txt.gemspec +5 -3
  86. metadata +146 -18
  87. data/.rubocop.yml +0 -80
  88. data/data/output_samples/testdata_en.txt +0 -23002
  89. data/data/output_samples/testdata_en_category.txt +0 -132
  90. data/data/output_samples/testdata_en_summary.txt +0 -1376
  91. data/data/output_samples/testdata_ja.txt +0 -22774
  92. data/data/output_samples/testdata_ja_category.txt +0 -206
  93. data/data/output_samples/testdata_ja_summary.txt +0 -1560
  94. data/data/testdata_en.bz2 +0 -0
  95. data/data/testdata_ja.bz2 +0 -0
  96. data/image/screenshot.png +0 -0
@@ -0,0 +1,510 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "spec_helper"
4
+ require "tempfile"
5
+ require "fileutils"
6
+
7
+ RSpec.describe Wp2txt do
8
+ let(:temp_dir) { Dir.mktmpdir }
9
+
10
+ after do
11
+ FileUtils.rm_rf(temp_dir) if temp_dir && Dir.exist?(temp_dir)
12
+ end
13
+
14
+ describe Wp2txt::Splitter do
15
+ let(:sample_xml) do
16
+ <<~XML
17
+ <mediawiki>
18
+ <page>
19
+ <title>Test Article 1</title>
20
+ <text>'''Test''' is a [[test]].</text>
21
+ </page>
22
+ <page>
23
+ <title>Test Article 2</title>
24
+ <text>Another '''article''' with [[links]].</text>
25
+ </page>
26
+ </mediawiki>
27
+ XML
28
+ end
29
+
30
+ let(:xml_file) do
31
+ file = File.join(temp_dir, "test_input.xml")
32
+ File.write(file, sample_xml)
33
+ file
34
+ end
35
+
36
+ describe "#initialize" do
37
+ it "creates a splitter with default parameters" do
38
+ splitter = Wp2txt::Splitter.new(xml_file, temp_dir)
39
+ expect(splitter).to be_a(Wp2txt::Splitter)
40
+ end
41
+
42
+ it "creates output file base from input file" do
43
+ splitter = Wp2txt::Splitter.new(xml_file, temp_dir)
44
+ expect(splitter.instance_variable_get(:@outfile_base)).to eq("test_input-")
45
+ end
46
+ end
47
+
48
+ describe "#command_exist?" do
49
+ let(:splitter) { Wp2txt::Splitter.new(xml_file, temp_dir) }
50
+
51
+ it "returns path for existing command" do
52
+ # 'ls' should exist on all Unix systems
53
+ result = suppress_stdout { splitter.command_exist?("ls") }
54
+ expect(result).to be_truthy
55
+ expect(result).to include("ls")
56
+ end
57
+
58
+ it "returns false for non-existing command" do
59
+ result = suppress_stdout { splitter.command_exist?("nonexistent_command_xyz123") }
60
+ expect(result).to be false
61
+ end
62
+ end
63
+
64
+ describe "#get_newline" do
65
+ let(:splitter) { Wp2txt::Splitter.new(xml_file, temp_dir) }
66
+
67
+ it "reads lines from file" do
68
+ # Reset buffer for testing
69
+ splitter.instance_variable_set(:@buffer, [+""])
70
+ line = splitter.get_newline
71
+ expect(line).to be_a(String)
72
+ end
73
+ end
74
+
75
+ describe "#split_file" do
76
+ it "splits XML file and creates output files" do
77
+ splitter = Wp2txt::Splitter.new(xml_file, temp_dir, 1) # 1MB split size
78
+ splitter.split_file
79
+
80
+ outfiles = splitter.instance_variable_get(:@outfiles)
81
+ expect(outfiles).not_to be_empty
82
+
83
+ # Check that output files were created and renamed to .xml
84
+ outfiles.each do |f|
85
+ xml_file_path = f.sub(/\d+$/, "") + "*.xml"
86
+ matching_files = Dir.glob(File.join(temp_dir, "*.xml"))
87
+ expect(matching_files).not_to be_empty
88
+ end
89
+ end
90
+ end
91
+
92
+ describe "#fill_buffer" do
93
+ let(:splitter) { Wp2txt::Splitter.new(xml_file, temp_dir) }
94
+
95
+ it "fills buffer with content from file" do
96
+ splitter.instance_variable_set(:@buffer, [+""])
97
+ result = splitter.fill_buffer
98
+ buffer = splitter.instance_variable_get(:@buffer)
99
+
100
+ expect(result).to be true
101
+ expect(buffer.size).to be >= 1
102
+ end
103
+ end
104
+ end
105
+
106
+ describe Wp2txt::Runner do
107
+ let(:sample_xml) do
108
+ <<~XML
109
+ <page>
110
+ <title>Test Article</title>
111
+ <revision>
112
+ <text>'''Test Article''' is about [[testing]].
113
+
114
+ == Section ==
115
+ This is content.
116
+
117
+ [[Category:Testing]]
118
+ </text>
119
+ </revision>
120
+ </page>
121
+ XML
122
+ end
123
+
124
+ let(:xml_file) do
125
+ file = File.join(temp_dir, "test_runner.xml")
126
+ File.write(file, sample_xml)
127
+ file
128
+ end
129
+
130
+ describe "#initialize" do
131
+ it "creates a runner" do
132
+ runner = Wp2txt::Runner.new(xml_file, temp_dir, false, false)
133
+ expect(runner).to be_a(Wp2txt::Runner)
134
+ end
135
+ end
136
+
137
+ describe "#prepare" do
138
+ it "sets up file pointer and output base" do
139
+ runner = Wp2txt::Runner.new(xml_file, temp_dir, false, false)
140
+ expect(runner.instance_variable_get(:@outfile_base)).to eq("test_runner")
141
+ expect(runner.instance_variable_get(:@file_pointer)).not_to be_nil
142
+ end
143
+ end
144
+
145
+ describe "#get_newline" do
146
+ let(:runner) { Wp2txt::Runner.new(xml_file, temp_dir, false, false) }
147
+
148
+ it "returns lines from file" do
149
+ runner.instance_variable_set(:@buffer, [+""])
150
+ line = runner.get_newline
151
+ expect(line).to be_a(String)
152
+ end
153
+ end
154
+
155
+ describe "#fill_buffer" do
156
+ let(:runner) { Wp2txt::Runner.new(xml_file, temp_dir, false, false) }
157
+
158
+ it "reads content into buffer" do
159
+ runner.instance_variable_set(:@buffer, [+""])
160
+ result = runner.fill_buffer
161
+ expect(result).to be true
162
+ end
163
+ end
164
+
165
+ describe "#get_page" do
166
+ let(:runner) { Wp2txt::Runner.new(xml_file, temp_dir, false, false) }
167
+
168
+ it "extracts page content" do
169
+ page = runner.get_page
170
+ expect(page).to be_a(String)
171
+ expect(page).to include("<page>")
172
+ expect(page).to include("</page>")
173
+ expect(page).to include("Test Article")
174
+ end
175
+
176
+ it "returns false when no more pages" do
177
+ runner.get_page # consume first page
178
+ result = runner.get_page
179
+ expect(result).to be false
180
+ end
181
+ end
182
+
183
+ describe "#extract_text" do
184
+ let(:multi_page_xml) do
185
+ <<~XML
186
+ <page>
187
+ <title>Article One</title>
188
+ <revision>
189
+ <text>'''Article One''' is first.</text>
190
+ </revision>
191
+ </page>
192
+ XML
193
+ end
194
+
195
+ let(:multi_page_file) do
196
+ file = File.join(temp_dir, "multi_page.xml")
197
+ File.write(file, multi_page_xml)
198
+ file
199
+ end
200
+
201
+ it "processes pages and calls block for each article" do
202
+ runner = Wp2txt::Runner.new(multi_page_file, temp_dir, false, false)
203
+ articles_processed = []
204
+
205
+ runner.extract_text do |article|
206
+ articles_processed << article.title
207
+ "processed: #{article.title}\n"
208
+ end
209
+
210
+ expect(articles_processed).to include("Article One")
211
+
212
+ # Check output file was created
213
+ output_file = File.join(temp_dir, "multi_page.txt")
214
+ expect(File.exist?(output_file)).to be true
215
+ end
216
+ end
217
+ end
218
+
219
+ describe "Module methods" do
220
+ include Wp2txt
221
+
222
+ describe "#rename" do
223
+ it "renames files with extension" do
224
+ # Create test files
225
+ files = []
226
+ 3.times do |i|
227
+ f = File.join(temp_dir, "testfile#{i}")
228
+ File.write(f, "content #{i}")
229
+ files << f
230
+ end
231
+
232
+ rename(files, "txt")
233
+
234
+ files.each_with_index do |f, i|
235
+ new_name = "#{f}.txt"
236
+ expect(File.exist?(new_name)).to be true
237
+ expect(File.read(new_name)).to eq("content #{i}")
238
+ end
239
+ end
240
+ end
241
+ end
242
+ end
243
+
244
+ RSpec.describe "Splitter with edge cases" do
245
+ let(:temp_dir) { Dir.mktmpdir }
246
+
247
+ after do
248
+ FileUtils.rm_rf(temp_dir) if temp_dir && Dir.exist?(temp_dir)
249
+ end
250
+
251
+ describe "empty file handling" do
252
+ let(:empty_file) do
253
+ file = File.join(temp_dir, "empty.xml")
254
+ File.write(file, "")
255
+ file
256
+ end
257
+
258
+ it "handles empty input file" do
259
+ expect {
260
+ splitter = Wp2txt::Splitter.new(empty_file, temp_dir)
261
+ splitter.split_file
262
+ }.not_to raise_error
263
+ end
264
+ end
265
+
266
+ describe "large content handling" do
267
+ let(:large_xml) do
268
+ content = +"<mediawiki>\n"
269
+ 50.times do |i|
270
+ content << "<page>\n"
271
+ content << " <title>Article #{i}</title>\n"
272
+ content << " <text>#{'x' * 1000} article #{i}</text>\n"
273
+ content << "</page>\n"
274
+ end
275
+ content << "</mediawiki>"
276
+ content
277
+ end
278
+
279
+ let(:large_file) do
280
+ file = File.join(temp_dir, "large.xml")
281
+ File.write(file, large_xml)
282
+ file
283
+ end
284
+
285
+ it "processes large files without error" do
286
+ expect {
287
+ splitter = Wp2txt::Splitter.new(large_file, temp_dir, 1)
288
+ splitter.split_file
289
+ }.not_to raise_error
290
+ end
291
+ end
292
+ end
293
+
294
+ RSpec.describe "Splitter additional tests" do
295
+ let(:temp_dir) { Dir.mktmpdir }
296
+
297
+ after do
298
+ FileUtils.rm_rf(temp_dir) if temp_dir && Dir.exist?(temp_dir)
299
+ end
300
+
301
+ describe "#file_size" do
302
+ let(:test_file) do
303
+ file = File.join(temp_dir, "test_size.xml")
304
+ File.write(file, "x" * 1000)
305
+ file
306
+ end
307
+
308
+ it "calculates file size" do
309
+ splitter = Wp2txt::Splitter.new(test_file, temp_dir)
310
+ size = splitter.file_size(File.open(test_file, "r"))
311
+ expect(size).to eq(1000)
312
+ end
313
+
314
+ it "handles empty file" do
315
+ empty_file = File.join(temp_dir, "empty.xml")
316
+ File.write(empty_file, "")
317
+ splitter = Wp2txt::Splitter.new(empty_file, temp_dir)
318
+ size = splitter.file_size(File.open(empty_file, "r"))
319
+ expect(size).to eq(0)
320
+ end
321
+ end
322
+
323
+ describe "#split_file edge cases" do
324
+ let(:single_page_xml) do
325
+ <<~XML
326
+ <mediawiki>
327
+ <page>
328
+ <title>Single Article</title>
329
+ <text>Content here.</text>
330
+ </page>
331
+ </mediawiki>
332
+ XML
333
+ end
334
+
335
+ let(:single_file) do
336
+ file = File.join(temp_dir, "single.xml")
337
+ File.write(file, single_page_xml)
338
+ file
339
+ end
340
+
341
+ it "handles single page file" do
342
+ splitter = Wp2txt::Splitter.new(single_file, temp_dir)
343
+ splitter.split_file
344
+
345
+ xml_files = Dir.glob(File.join(temp_dir, "*.xml"))
346
+ expect(xml_files.size).to be >= 1
347
+ end
348
+
349
+ it "creates output files with correct base name" do
350
+ splitter = Wp2txt::Splitter.new(single_file, temp_dir)
351
+ splitter.split_file
352
+
353
+ xml_files = Dir.glob(File.join(temp_dir, "single-*.xml"))
354
+ expect(xml_files).not_to be_empty
355
+ end
356
+ end
357
+
358
+ describe "#prepare" do
359
+ it "sets up file pointer for plain XML" do
360
+ xml_file = File.join(temp_dir, "test.xml")
361
+ File.write(xml_file, "<page></page>")
362
+ splitter = Wp2txt::Splitter.new(xml_file, temp_dir)
363
+
364
+ expect(splitter.instance_variable_get(:@file_pointer)).not_to be_nil
365
+ expect(splitter.instance_variable_get(:@outfile_base)).to eq("test-")
366
+ end
367
+ end
368
+ end
369
+
370
+ RSpec.describe "Runner additional tests" do
371
+ let(:temp_dir) { Dir.mktmpdir }
372
+
373
+ after do
374
+ FileUtils.rm_rf(temp_dir) if temp_dir && Dir.exist?(temp_dir)
375
+ end
376
+
377
+ describe "#extract_text with del_interfile" do
378
+ let(:xml_content) do
379
+ <<~XML
380
+ <page>
381
+ <title>Delete Test</title>
382
+ <revision>
383
+ <text>Test content.</text>
384
+ </revision>
385
+ </page>
386
+ XML
387
+ end
388
+
389
+ it "deletes intermediate file when del_interfile is true" do
390
+ xml_file = File.join(temp_dir, "to_delete.xml")
391
+ File.write(xml_file, xml_content)
392
+
393
+ runner = Wp2txt::Runner.new(xml_file, temp_dir, false, true)
394
+ runner.extract_text { |article| "#{article.title}\n" }
395
+
396
+ expect(File.exist?(xml_file)).to be false
397
+ end
398
+
399
+ it "keeps intermediate file when del_interfile is false" do
400
+ xml_file = File.join(temp_dir, "to_keep.xml")
401
+ File.write(xml_file, xml_content)
402
+
403
+ runner = Wp2txt::Runner.new(xml_file, temp_dir, false, false)
404
+ runner.extract_text { |article| "#{article.title}\n" }
405
+
406
+ expect(File.exist?(xml_file)).to be true
407
+ end
408
+ end
409
+
410
+ describe "#get_page edge cases" do
411
+ let(:incomplete_xml) do
412
+ <<~XML
413
+ <page>
414
+ <title>Incomplete</title>
415
+ <text>No closing page tag
416
+ XML
417
+ end
418
+
419
+ it "handles incomplete page" do
420
+ xml_file = File.join(temp_dir, "incomplete.xml")
421
+ File.write(xml_file, incomplete_xml)
422
+
423
+ runner = Wp2txt::Runner.new(xml_file, temp_dir, false, false)
424
+ result = runner.get_page
425
+ # Should return something even if incomplete
426
+ expect(result).to be_truthy
427
+ end
428
+ end
429
+ end
430
+
431
+ RSpec.describe "Runner edge cases" do
432
+ let(:temp_dir) { Dir.mktmpdir }
433
+
434
+ after do
435
+ FileUtils.rm_rf(temp_dir) if temp_dir && Dir.exist?(temp_dir)
436
+ end
437
+
438
+ describe "page with colon in title" do
439
+ let(:colon_title_xml) do
440
+ <<~XML
441
+ <page>
442
+ <title>Category:Test</title>
443
+ <revision>
444
+ <text>Category page content</text>
445
+ </revision>
446
+ </page>
447
+ <page>
448
+ <title>Normal Article</title>
449
+ <revision>
450
+ <text>Normal content</text>
451
+ </revision>
452
+ </page>
453
+ XML
454
+ end
455
+
456
+ let(:colon_file) do
457
+ file = File.join(temp_dir, "colon_test.xml")
458
+ File.write(file, colon_title_xml)
459
+ file
460
+ end
461
+
462
+ it "skips pages with colon in title (namespace pages)" do
463
+ runner = Wp2txt::Runner.new(colon_file, temp_dir, false, false)
464
+ titles = []
465
+
466
+ runner.extract_text do |article|
467
+ titles << article.title
468
+ "#{article.title}\n"
469
+ end
470
+
471
+ expect(titles).to include("Normal Article")
472
+ expect(titles).not_to include("Category:Test")
473
+ end
474
+ end
475
+
476
+ describe "page with HTML comments" do
477
+ let(:comment_xml) do
478
+ <<~XML
479
+ <page>
480
+ <title>Comment Test</title>
481
+ <revision>
482
+ <text>Before comment <!-- hidden
483
+ multiline
484
+ comment --> after comment</text>
485
+ </revision>
486
+ </page>
487
+ XML
488
+ end
489
+
490
+ let(:comment_file) do
491
+ file = File.join(temp_dir, "comment_test.xml")
492
+ File.write(file, comment_xml)
493
+ file
494
+ end
495
+
496
+ it "removes HTML comments preserving newlines" do
497
+ runner = Wp2txt::Runner.new(comment_file, temp_dir, false, false)
498
+ content = ""
499
+
500
+ runner.extract_text do |article|
501
+ content = article.elements.map { |e| e.last }.join("\n")
502
+ content
503
+ end
504
+
505
+ expect(content).to include("Before comment")
506
+ expect(content).to include("after comment")
507
+ expect(content).not_to include("hidden")
508
+ end
509
+ end
510
+ end
data/wp2txt.gemspec CHANGED
@@ -10,9 +10,8 @@ Gem::Specification.new do |s|
10
10
  s.homepage = "https://github.com/yohasebe/wp2txt"
11
11
  s.summary = "A command-line toolkit to extract text content and category data from Wikipedia dump files"
12
12
  s.description = "WP2TXT extracts text and category data from Wikipedia dump files (encoded in XML / compressed with Bzip2), removing MediaWiki markup and other metadata."
13
- s.rubyforge_project = "wp2txt"
14
13
  s.license = "MIT"
15
- s.required_ruby_version = Gem::Requirement.new(">= 2.6")
14
+ s.required_ruby_version = Gem::Requirement.new(">= 3.0")
16
15
  s.files = `git ls-files`.split("\n")
17
16
  s.files -= ["data/*", "image/*"]
18
17
  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
@@ -21,11 +20,14 @@ Gem::Specification.new do |s|
21
20
  s.add_development_dependency "bundler"
22
21
  s.add_development_dependency "rake"
23
22
  s.add_development_dependency "rspec"
23
+ s.add_development_dependency "simplecov"
24
+ s.add_development_dependency "webmock"
24
25
  s.add_dependency "htmlentities"
25
26
  s.add_dependency "nokogiri"
26
27
  s.add_dependency "optimist"
27
28
  s.add_dependency "parallel"
28
29
  s.add_dependency "pastel"
29
- s.add_dependency "ruby-progressbar"
30
+ s.add_dependency "tty-progressbar"
30
31
  s.add_dependency "tty-spinner"
32
+ s.add_dependency "sqlite3"
31
33
  end