wp2txt 1.1.3 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. checksums.yaml +4 -4
  2. data/.dockerignore +12 -0
  3. data/.github/workflows/ci.yml +13 -13
  4. data/.gitignore +14 -0
  5. data/CHANGELOG.md +284 -0
  6. data/DEVELOPMENT.md +415 -0
  7. data/DEVELOPMENT_ja.md +415 -0
  8. data/Dockerfile +19 -10
  9. data/Gemfile +2 -8
  10. data/README.md +259 -123
  11. data/README_ja.md +375 -0
  12. data/Rakefile +4 -0
  13. data/bin/wp2txt +863 -161
  14. data/lib/wp2txt/article.rb +98 -13
  15. data/lib/wp2txt/bz2_validator.rb +239 -0
  16. data/lib/wp2txt/category_cache.rb +313 -0
  17. data/lib/wp2txt/cli.rb +319 -0
  18. data/lib/wp2txt/cli_ui.rb +428 -0
  19. data/lib/wp2txt/config.rb +158 -0
  20. data/lib/wp2txt/constants.rb +134 -0
  21. data/lib/wp2txt/data/html_entities.json +2135 -0
  22. data/lib/wp2txt/data/language_metadata.json +4769 -0
  23. data/lib/wp2txt/data/language_tiers.json +59 -0
  24. data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
  25. data/lib/wp2txt/data/template_aliases.json +193 -0
  26. data/lib/wp2txt/data/wikipedia_entities.json +12 -0
  27. data/lib/wp2txt/extractor.rb +545 -0
  28. data/lib/wp2txt/file_utils.rb +91 -0
  29. data/lib/wp2txt/formatter.rb +352 -0
  30. data/lib/wp2txt/global_data_cache.rb +353 -0
  31. data/lib/wp2txt/index_cache.rb +258 -0
  32. data/lib/wp2txt/magic_words.rb +353 -0
  33. data/lib/wp2txt/memory_monitor.rb +236 -0
  34. data/lib/wp2txt/multistream.rb +1383 -0
  35. data/lib/wp2txt/output_writer.rb +182 -0
  36. data/lib/wp2txt/parser_functions.rb +606 -0
  37. data/lib/wp2txt/ractor_worker.rb +215 -0
  38. data/lib/wp2txt/regex.rb +396 -12
  39. data/lib/wp2txt/section_extractor.rb +354 -0
  40. data/lib/wp2txt/stream_processor.rb +271 -0
  41. data/lib/wp2txt/template_expander.rb +830 -0
  42. data/lib/wp2txt/text_processing.rb +337 -0
  43. data/lib/wp2txt/utils.rb +629 -270
  44. data/lib/wp2txt/version.rb +1 -1
  45. data/lib/wp2txt.rb +53 -26
  46. data/scripts/benchmark_regex.rb +161 -0
  47. data/scripts/fetch_html_entities.rb +94 -0
  48. data/scripts/fetch_language_metadata.rb +180 -0
  49. data/scripts/fetch_mediawiki_data.rb +334 -0
  50. data/scripts/fetch_template_data.rb +186 -0
  51. data/scripts/profile_memory.rb +139 -0
  52. data/spec/article_spec.rb +402 -0
  53. data/spec/auto_download_spec.rb +314 -0
  54. data/spec/bz2_validator_spec.rb +193 -0
  55. data/spec/category_cache_spec.rb +226 -0
  56. data/spec/category_fetcher_spec.rb +504 -0
  57. data/spec/cleanup_spec.rb +197 -0
  58. data/spec/cli_options_spec.rb +678 -0
  59. data/spec/cli_spec.rb +876 -0
  60. data/spec/config_spec.rb +194 -0
  61. data/spec/constants_spec.rb +138 -0
  62. data/spec/file_utils_spec.rb +170 -0
  63. data/spec/fixtures/samples.rb +181 -0
  64. data/spec/formatter_sections_spec.rb +382 -0
  65. data/spec/global_data_cache_spec.rb +186 -0
  66. data/spec/index_cache_spec.rb +210 -0
  67. data/spec/integration_spec.rb +543 -0
  68. data/spec/magic_words_spec.rb +261 -0
  69. data/spec/markers_spec.rb +476 -0
  70. data/spec/memory_monitor_spec.rb +192 -0
  71. data/spec/multistream_spec.rb +690 -0
  72. data/spec/output_writer_spec.rb +400 -0
  73. data/spec/parser_functions_spec.rb +455 -0
  74. data/spec/ractor_worker_spec.rb +197 -0
  75. data/spec/regex_spec.rb +281 -0
  76. data/spec/section_extractor_spec.rb +397 -0
  77. data/spec/spec_helper.rb +63 -0
  78. data/spec/stream_processor_spec.rb +579 -0
  79. data/spec/template_data_spec.rb +246 -0
  80. data/spec/template_expander_spec.rb +472 -0
  81. data/spec/template_processing_spec.rb +217 -0
  82. data/spec/text_processing_spec.rb +312 -0
  83. data/spec/utils_spec.rb +195 -16
  84. data/spec/wp2txt_spec.rb +510 -0
  85. data/wp2txt.gemspec +5 -3
  86. metadata +146 -18
  87. data/.rubocop.yml +0 -80
  88. data/data/output_samples/testdata_en.txt +0 -23002
  89. data/data/output_samples/testdata_en_category.txt +0 -132
  90. data/data/output_samples/testdata_en_summary.txt +0 -1376
  91. data/data/output_samples/testdata_ja.txt +0 -22774
  92. data/data/output_samples/testdata_ja_category.txt +0 -206
  93. data/data/output_samples/testdata_ja_summary.txt +0 -1560
  94. data/data/testdata_en.bz2 +0 -0
  95. data/data/testdata_ja.bz2 +0 -0
  96. data/image/screenshot.png +0 -0
@@ -0,0 +1,194 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "spec_helper"
4
+ require_relative "../lib/wp2txt/config"
5
+ require "tmpdir"
6
+ require "fileutils"
7
+
8
# Specs for Wp2txt::Config covering the default path, loading from YAML,
# saving, default-file creation, hash export and validation of values.
RSpec.describe Wp2txt::Config do
  # Fresh scratch directory per example; the +after+ hook deletes it again.
  let(:tmpdir) { Dir.mktmpdir("wp2txt_config_test_") }
  let(:config_path) { File.join(tmpdir, "config.yml") }

  after { FileUtils.rm_rf(tmpdir) }

  # Asserts, in the given order, that every reader named in +expected+
  # returns the paired value when called on +config+.
  def expect_settings(config, expected)
    expected.each do |reader, value|
      expect(config.public_send(reader)).to eq value
    end
  end

  describe ".default_path" do
    it "returns path in home directory" do
      path = described_class.default_path

      expect(path).to include(".wp2txt")
      expect(path).to end_with("config.yml")
    end
  end

  describe ".load" do
    context "when config file does not exist" do
      it "returns default configuration" do
        expect_settings(
          described_class.load(config_path),
          dump_expiry_days: 30,
          category_expiry_days: 7,
          cache_directory: Wp2txt::Config::DEFAULT_CACHE_DIR,
          default_format: "text",
          default_depth: 0
        )
      end
    end

    context "when config file exists" do
      it "loads settings from file" do
        yaml = <<~YAML
          cache:
            dump_expiry_days: 60
            category_expiry_days: 14
            directory: /custom/cache
          defaults:
            format: json
            depth: 2
        YAML
        File.write(config_path, yaml)

        expect_settings(
          described_class.load(config_path),
          dump_expiry_days: 60,
          category_expiry_days: 14,
          cache_directory: "/custom/cache",
          default_format: "json",
          default_depth: 2
        )
      end

      it "uses defaults for missing keys" do
        yaml = <<~YAML
          cache:
            dump_expiry_days: 45
        YAML
        File.write(config_path, yaml)

        expect_settings(
          described_class.load(config_path),
          dump_expiry_days: 45,
          category_expiry_days: 7, # default
          default_format: "text"   # default
        )
      end

      it "handles empty file" do
        File.write(config_path, "")

        expect(described_class.load(config_path).dump_expiry_days).to eq 30
      end

      it "handles malformed YAML gracefully" do
        File.write(config_path, "invalid: yaml: content: [")

        # A parse error is expected to fall back to the defaults.
        expect(described_class.load(config_path).dump_expiry_days).to eq 30
      end
    end
  end

  describe "#save" do
    it "writes configuration to file" do
      settings = {
        dump_expiry_days: 45,
        category_expiry_days: 10,
        cache_directory: "/my/cache",
        default_format: "json",
        default_depth: 1
      }

      described_class.new(**settings).save(config_path)

      expect(File.exist?(config_path)).to be true
      # Round-trip: what was saved must come back unchanged from .load.
      expect_settings(described_class.load(config_path), settings)
    end

    it "creates parent directories if needed" do
      nested_path = File.join(tmpdir, "nested", "dir", "config.yml")

      described_class.new.save(nested_path)

      expect(File.exist?(nested_path)).to be true
    end
  end

  describe ".create_default" do
    it "creates a config file with default values" do
      described_class.create_default(config_path)

      expect(File.exist?(config_path)).to be true
      written = File.read(config_path)
      expect(written).to include("dump_expiry_days: 30")
      expect(written).to include("category_expiry_days: 7")
    end

    it "does not overwrite existing file" do
      File.write(config_path, "custom: value")

      outcome = described_class.create_default(config_path)

      expect(outcome).to be false
      expect(File.read(config_path)).to eq "custom: value"
    end

    it "can force overwrite with force option" do
      File.write(config_path, "custom: value")

      outcome = described_class.create_default(config_path, force: true)

      expect(outcome).to be true
      expect(File.read(config_path)).to include("dump_expiry_days: 30")
    end
  end

  describe "#to_h" do
    it "returns hash representation" do
      exported = described_class.new(dump_expiry_days: 45).to_h

      expect(exported).to be_a(Hash)
      expect(exported[:cache][:dump_expiry_days]).to eq 45
      expect(exported[:defaults][:format]).to eq "text"
    end
  end

  describe "validation" do
    # Each table maps a constructor argument to the value the reader is
    # expected to report afterwards.
    it "clamps dump_expiry_days to valid range" do
      { -5 => 1, 400 => 365 }.each do |given, clamped|
        expect(described_class.new(dump_expiry_days: given).dump_expiry_days).to eq clamped
      end
    end

    it "clamps category_expiry_days to valid range" do
      { 0 => 1, 100 => 90 }.each do |given, clamped|
        expect(described_class.new(category_expiry_days: given).category_expiry_days).to eq clamped
      end
    end

    it "clamps default_depth to valid range" do
      { -1 => 0, 20 => 10 }.each do |given, clamped|
        expect(described_class.new(default_depth: given).default_depth).to eq clamped
      end
    end

    it "validates default_format" do
      { "invalid" => "text", "json" => "json" }.each do |given, accepted|
        expect(described_class.new(default_format: given).default_format).to eq accepted
      end
    end
  end
end
@@ -0,0 +1,138 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "spec_helper"
4
+ require "tempfile"
5
+
6
# Specs pinning the values of the constants on the Wp2txt module and the
# behavior of its small helper functions (days_to_seconds, file_fresh?,
# file_age_days, format_file_size).
RSpec.describe "Wp2txt Constants" do
  describe "Time Constants" do
    it "defines SECONDS_PER_DAY" do
      expect(Wp2txt::SECONDS_PER_DAY).to eq(86_400)
    end

    it "defines SECONDS_PER_HOUR" do
      expect(Wp2txt::SECONDS_PER_HOUR).to eq(3_600)
    end

    it "defines SECONDS_PER_MINUTE" do
      expect(Wp2txt::SECONDS_PER_MINUTE).to eq(60)
    end
  end

  describe "Cache Settings" do
    it "defines DEFAULT_DUMP_EXPIRY_DAYS" do
      expect(Wp2txt::DEFAULT_DUMP_EXPIRY_DAYS).to eq(30)
    end

    it "defines DEFAULT_CATEGORY_CACHE_EXPIRY_DAYS" do
      expect(Wp2txt::DEFAULT_CATEGORY_CACHE_EXPIRY_DAYS).to eq(7)
    end
  end

  describe "Network Settings" do
    it "defines DEFAULT_HTTP_TIMEOUT" do
      expect(Wp2txt::DEFAULT_HTTP_TIMEOUT).to eq(30)
    end

    it "defines DEFAULT_PROGRESS_INTERVAL" do
      expect(Wp2txt::DEFAULT_PROGRESS_INTERVAL).to eq(10)
    end

    it "defines INDEX_PROGRESS_THRESHOLD" do
      expect(Wp2txt::INDEX_PROGRESS_THRESHOLD).to eq(500_000)
    end

    it "defines DEFAULT_TOP_N_SECTIONS" do
      expect(Wp2txt::DEFAULT_TOP_N_SECTIONS).to eq(50)
    end

    it "defines RESUME_METADATA_MAX_AGE_DAYS" do
      expect(Wp2txt::RESUME_METADATA_MAX_AGE_DAYS).to eq(7)
    end

    it "defines MAX_HTTP_RETRIES" do
      expect(Wp2txt::MAX_HTTP_RETRIES).to eq(3)
    end
  end

  describe "Processing Limits" do
    it "defines MAX_NESTING_ITERATIONS" do
      expect(Wp2txt::MAX_NESTING_ITERATIONS).to eq(50_000)
    end

    it "defines DEFAULT_BUFFER_SIZE" do
      expect(Wp2txt::DEFAULT_BUFFER_SIZE).to eq(10_485_760) # 10 MB
    end
  end

  describe "File Size Units" do
    it "defines BYTES_PER_KB" do
      expect(Wp2txt::BYTES_PER_KB).to eq(1024)
    end

    it "defines BYTES_PER_MB" do
      expect(Wp2txt::BYTES_PER_MB).to eq(1024 * 1024)
    end

    it "defines BYTES_PER_GB" do
      expect(Wp2txt::BYTES_PER_GB).to eq(1024 * 1024 * 1024)
    end
  end

  describe ".days_to_seconds" do
    it "converts days to seconds" do
      expect(Wp2txt.days_to_seconds(1)).to eq(86_400)
      expect(Wp2txt.days_to_seconds(7)).to eq(7 * 86_400)
      expect(Wp2txt.days_to_seconds(0.5)).to eq(43_200)
    end
  end

  describe ".file_fresh?" do
    let(:temp_file) { Tempfile.new("test_file") }

    # close! closes the handle and then deletes the file; a bare unlink
    # would leave the descriptor open until the Tempfile is collected.
    after { temp_file.close! }

    it "returns true for recently created file" do
      expect(Wp2txt.file_fresh?(temp_file.path, 1)).to be true
    end

    it "returns false for non-existent file" do
      expect(Wp2txt.file_fresh?("/nonexistent/path", 1)).to be false
    end
  end

  describe ".file_age_days" do
    let(:temp_file) { Tempfile.new("test_file") }

    # Close before deleting, for the same reason as in ".file_fresh?".
    after { temp_file.close! }

    it "returns age in days for existing file" do
      age = Wp2txt.file_age_days(temp_file.path)
      expect(age).to be_a(Float)
      expect(age).to be >= 0
      expect(age).to be < 1 # File just created
    end

    it "returns nil for non-existent file" do
      expect(Wp2txt.file_age_days("/nonexistent/path")).to be_nil
    end
  end

  describe ".format_file_size" do
    it "formats bytes" do
      expect(Wp2txt.format_file_size(500)).to eq("500 B")
    end

    it "formats kilobytes" do
      expect(Wp2txt.format_file_size(2048)).to eq("2.0 KB")
    end

    it "formats megabytes" do
      expect(Wp2txt.format_file_size(5_242_880)).to eq("5.0 MB")
    end

    it "formats gigabytes" do
      expect(Wp2txt.format_file_size(2_147_483_648)).to eq("2.0 GB")
    end
  end
end
@@ -0,0 +1,170 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "spec_helper"
4
+ require "tempfile"
5
+ require "fileutils"
6
+
7
# Specs for the file helper methods that become callable as bare methods
# once Wp2txt is included: collect_files, correct_separator, sec_to_str,
# file_mod, batch_file_mod and rename.
RSpec.describe "Wp2txt FileUtils" do
  include Wp2txt

  describe "collect_files" do
    let(:temp_dir) { Dir.mktmpdir }

    after { FileUtils.remove_entry(temp_dir) }

    it "collects all files in a directory" do
      # Create test files
      { "file1.txt" => "content1", "file2.txt" => "content2" }.each do |name, body|
        File.write(File.join(temp_dir, name), body)
      end

      found = collect_files(temp_dir)
      expect(found).to include(a_string_ending_with("file1.txt"))
      expect(found).to include(a_string_ending_with("file2.txt"))
    end

    it "filters files by regex" do
      { "file1.txt" => "content1", "file2.rb" => "content2" }.each do |name, body|
        File.write(File.join(temp_dir, name), body)
      end

      found = collect_files(temp_dir, /\.txt$/)
      expect(found).to include(a_string_ending_with("file1.txt"))
      expect(found).not_to include(a_string_ending_with("file2.rb"))
    end

    it "returns sorted list" do
      # Written in reverse alphabetical order on purpose.
      %w[z_file.txt a_file.txt].each do |name|
        File.write(File.join(temp_dir, name), "")
      end

      found = collect_files(temp_dir, /\.txt$/)
      txt_only = found.select { |path| path.end_with?(".txt") }
      expect(txt_only).to eq(txt_only.sort)
    end
  end

  describe "correct_separator" do
    it "converts backslashes to forward slashes on non-Windows" do
      skip "Only runs on non-Windows" if RUBY_PLATFORM.index("win32")
      expect(correct_separator("path\\to\\file")).to eq("path/to/file")
    end

    it "handles arrays of paths" do
      skip "Only runs on non-Windows" if RUBY_PLATFORM.index("win32")
      converted = correct_separator(["path\\to\\file1", "path\\to\\file2"])
      expect(converted).to eq(["path/to/file1", "path/to/file2"])
    end

    it "returns nil for nil input" do
      expect(correct_separator(nil)).to be_nil
    end
  end

  describe "sec_to_str" do
    it("converts seconds to HH:MM:SS format") { expect(sec_to_str(3661)).to eq("01:01:01") }

    it("handles zero") { expect(sec_to_str(0)).to eq("00:00:00") }

    # One full day: hours are not wrapped at 24.
    it("handles large values") { expect(sec_to_str(86400)).to eq("24:00:00") }

    it("handles nil input") { expect(sec_to_str(nil)).to eq("--:--:--") }

    it("formats with leading zeros") { expect(sec_to_str(61)).to eq("00:01:01") }
  end

  describe "file_mod" do
    let(:temp_file) { Tempfile.new("test_file") }

    after do
      temp_file.close
      temp_file.unlink
      # Also remove a file named "temp" from the working directory if one
      # is present after the example.
      File.unlink("temp") if File.exist?("temp")
    end

    it "modifies file content using block" do
      temp_file.write("original content")
      temp_file.close

      file_mod(temp_file.path) { |text| text.upcase }

      expect(File.read(temp_file.path)).to eq("ORIGINAL CONTENT")
    end

    it "keeps backup when requested" do
      temp_file.write("original")
      temp_file.close

      file_mod(temp_file.path, true) { |_content| "modified" }

      expect(File.exist?(temp_file.path + ".bak")).to be true
      File.unlink(temp_file.path + ".bak")
    end
  end

  describe "batch_file_mod" do
    let(:temp_dir) { Dir.mktmpdir }

    after { FileUtils.remove_entry(temp_dir) }

    it "yields each file in directory" do
      %w[file1.txt file2.txt].each do |name|
        File.write(File.join(temp_dir, name), "")
      end

      seen = []
      batch_file_mod(temp_dir) { |path| seen << File.basename(path) }

      expect(seen).to include("file1.txt", "file2.txt")
    end

    it "yields single file if path is a file" do
      single = File.join(temp_dir, "single.txt")
      File.write(single, "")

      seen = []
      batch_file_mod(single) { |path| seen << path }

      expect(seen).to eq([single])
    end
  end

  describe "rename" do
    let(:temp_dir) { Dir.mktmpdir }

    after { FileUtils.remove_entry(temp_dir) }

    it "renames files with extension" do
      first = File.join(temp_dir, "test-1")
      second = File.join(temp_dir, "test-2")
      File.write(first, "")
      File.write(second, "")

      rename([first, second], "txt")

      expect(File.exist?(File.join(temp_dir, "test-1.txt"))).to be true
      expect(File.exist?(File.join(temp_dir, "test-2.txt"))).to be true
    end

    it "returns true on success" do
      only = File.join(temp_dir, "test-1")
      File.write(only, "")

      expect(rename([only])).to be true
    end
  end
end
@@ -0,0 +1,181 @@
1
+ # frozen_string_literal: true
2
+
3
# Test samples for multilingual content
module Wp2txt
  # Wikitext snippets used as spec fixtures. Each constant is a plain String;
  # the heredoc constants end with a newline, the quoted ones do not.
  module TestSamples
    # --- One short article per language, each with a bold title, a link,
    # --- a heading, a file link and a category (most also a redirect line).

    ENGLISH_ARTICLE = <<~WIKI
      '''Test Article''' is an [[English language|English]] article.
      == Section ==
      [[File:Example.jpg|thumb|A description]]
      [[Category:Tests]]
      [[Category:Examples]]
    WIKI

    JAPANESE_ARTICLE = <<~WIKI
      '''テスト記事'''は[[日本語]]の記事です。
      == セクション ==
      [[ファイル:Example.jpg|thumb|説明文]]
      [[カテゴリ:テスト]]
    WIKI

    GERMAN_ARTICLE = <<~WIKI
      '''Testartikel''' ist ein [[Deutsch|deutscher]] Artikel.
      == Abschnitt ==
      [[Datei:Bild.png|miniatur|Beschreibung]]
      [[Kategorie:Test]]
      #WEITERLEITUNG [[Andere Seite]]
    WIKI

    FRENCH_ARTICLE = <<~WIKI
      '''Article de test''' est un [[Français|article français]].
      == Section ==
      [[Fichier:Image.jpg|vignette|Description]]
      [[Catégorie:Test]]
      #REDIRECTION [[Autre page]]
    WIKI

    CHINESE_ARTICLE = <<~WIKI
      '''测试文章'''是一个[[中文]]文章。
      == 章节 ==
      [[文件:图片.jpg|缩略图|说明]]
      [[分类:测试]]
      #重定向 [[其他页面]]
    WIKI

    RUSSIAN_ARTICLE = <<~WIKI
      '''Тестовая статья''' — это [[Русский язык|русская]] статья.
      == Раздел ==
      [[Файл:Изображение.jpg|мини|Описание]]
      [[Категория:Тест]]
      #ПЕРЕНАПРАВЛЕНИЕ [[Другая страница]]
    WIKI

    KOREAN_ARTICLE = <<~WIKI
      '''테스트 문서'''는 [[한국어]] 문서입니다.
      == 섹션 ==
      [[파일:Example.jpg|섬네일|설명]]
      [[분류:테스트]]
      #넘겨주기 [[다른 문서]]
    WIKI

    ARABIC_ARTICLE = <<~WIKI
      '''مقالة اختبار''' هي [[اللغة العربية|مقالة عربية]].
      == قسم ==
      [[ملف:صورة.jpg|تصغير|وصف]]
      [[تصنيف:اختبار]]
      #تحويل [[صفحة أخرى]]
    WIKI

    # Edge cases
    EMOJI_CONTENT = "Text with emoji &#x1F600; and &#128512; symbols"
    DEEPLY_NESTED = "{{a|{{b|{{c|{{d|text}}}}}}}}"
    MALFORMED_MARKUP = "[[Unclosed link\n{{Unclosed template"

    # Complex nested structure
    NESTED_TEMPLATES = <<~WIKI
      {{Infobox person
      |name = Test Person
      |birth_date = {{Birth date|1990|1|15}}
      |occupation = [[Scientist]]
      }}
    WIKI

    # Table content
    TABLE_CONTENT = <<~WIKI
      {| class="wikitable"
      |-
      ! Header 1 !! Header 2
      |-
      | Cell 1 || Cell 2
      |}
    WIKI

    # Reference content
    REFERENCE_CONTENT = <<~WIKI
      This is text with a reference.<ref>Citation here</ref>
      Another reference.<ref name="test">Named citation</ref>
    WIKI

    # Multi-line link
    MULTILINE_LINK = <<~WIKI
      [[File:Example.jpg
      |thumb
      |200px
      |A very long caption that spans
      multiple lines]]
    WIKI

    # === Additional Edge Cases for v2.0.0 ===

    # Special characters in titles
    SPECIAL_TITLE_ARTICLE = <<~WIKI
      '''C++ (programming language)''' is a [[programming language]].
      '''O'Brien''' was an [[Irish people|Irish]] person.
      '''Rock & Roll''' is a music genre.
      [[Category:Programming languages]]
    WIKI

    # Very deeply nested templates (10 levels)
    VERY_DEEPLY_NESTED = "{{a|{{b|{{c|{{d|{{e|{{f|{{g|{{h|{{i|{{j|content}}}}}}}}}}}}}}}}}}}}"

    # Mixed multilingual content with emoji
    MIXED_CONTENT = <<~WIKI
      '''Test''' こんにちは 你好 مرحبا Привет 😀
      == Section セクション ==
      Text with emoji: &#x1F600; &#x1F4BB; &#x2764;
      [[Category:Test]][[カテゴリ:テスト]][[分类:测试]]
    WIKI

    # Complex wikilinks with pipes and brackets
    COMPLEX_LINKS = <<~WIKI
      [[File:Photo.jpg|thumb|200px|alt=Alt text|Caption with [[nested link]]]]
      [[Article|Display text with '''bold''' and ''italic'']]
      [[Category:Test|Sort key]]
    WIKI

    # Multiple consecutive templates
    CONSECUTIVE_TEMPLATES = <<~WIKI
      {{Stub}}{{Cleanup}}{{Unreferenced}}
      This article needs work.
      {{Infobox|title=Test}}
    WIKI

    # HTML entities mixed with character references
    HTML_ENTITIES_MIXED = <<~WIKI
      &nbsp;&lt;tag&gt;&amp;&quot;
      &#60;literal&#62;
      &#x3C;hex&#x3E;
      Japanese: &#x65E5;&#x672C;&#x8A9E;
    WIKI

    # Horizontal rules (various lengths)
    HORIZONTAL_RULES = <<~WIKI
      Text before
      ----
      Text between
      --------
      Text after
      --
      Not a rule
      ---
      Also not a rule
    WIKI

    # Headings with various formatting.
    # The level-5 heading line ends in one space after the closing "=====";
    # it is written as the \\s escape so editors that strip trailing
    # whitespace cannot silently remove the case the line is named for.
    COMPLEX_HEADINGS = <<~WIKI
      == Simple Heading ==
      === Heading with [[link]] ===
      ==== Heading with '''bold''' ====
      ===== Heading with trailing space =====\s
      == 日本語見出し ==
    WIKI

    # Redirect variations: canonical, lowercase keyword, no space before the
    # link, and several spaces before the link (the last line deliberately
    # has three spaces so it differs from the first).
    REDIRECT_VARIATIONS = <<~WIKI
      #REDIRECT [[Target]]
      #redirect [[lowercase]]
      #REDIRECT[[no space]]
      #REDIRECT   [[extra spaces]]
    WIKI
  end
end