wp2txt 1.1.3 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.dockerignore +12 -0
- data/.github/workflows/ci.yml +13 -13
- data/.gitignore +14 -0
- data/CHANGELOG.md +284 -0
- data/DEVELOPMENT.md +415 -0
- data/DEVELOPMENT_ja.md +415 -0
- data/Dockerfile +19 -10
- data/Gemfile +2 -8
- data/README.md +259 -123
- data/README_ja.md +375 -0
- data/Rakefile +4 -0
- data/bin/wp2txt +863 -161
- data/lib/wp2txt/article.rb +98 -13
- data/lib/wp2txt/bz2_validator.rb +239 -0
- data/lib/wp2txt/category_cache.rb +313 -0
- data/lib/wp2txt/cli.rb +319 -0
- data/lib/wp2txt/cli_ui.rb +428 -0
- data/lib/wp2txt/config.rb +158 -0
- data/lib/wp2txt/constants.rb +134 -0
- data/lib/wp2txt/data/html_entities.json +2135 -0
- data/lib/wp2txt/data/language_metadata.json +4769 -0
- data/lib/wp2txt/data/language_tiers.json +59 -0
- data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
- data/lib/wp2txt/data/template_aliases.json +193 -0
- data/lib/wp2txt/data/wikipedia_entities.json +12 -0
- data/lib/wp2txt/extractor.rb +545 -0
- data/lib/wp2txt/file_utils.rb +91 -0
- data/lib/wp2txt/formatter.rb +352 -0
- data/lib/wp2txt/global_data_cache.rb +353 -0
- data/lib/wp2txt/index_cache.rb +258 -0
- data/lib/wp2txt/magic_words.rb +353 -0
- data/lib/wp2txt/memory_monitor.rb +236 -0
- data/lib/wp2txt/multistream.rb +1383 -0
- data/lib/wp2txt/output_writer.rb +182 -0
- data/lib/wp2txt/parser_functions.rb +606 -0
- data/lib/wp2txt/ractor_worker.rb +215 -0
- data/lib/wp2txt/regex.rb +396 -12
- data/lib/wp2txt/section_extractor.rb +354 -0
- data/lib/wp2txt/stream_processor.rb +271 -0
- data/lib/wp2txt/template_expander.rb +830 -0
- data/lib/wp2txt/text_processing.rb +337 -0
- data/lib/wp2txt/utils.rb +629 -270
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +53 -26
- data/scripts/benchmark_regex.rb +161 -0
- data/scripts/fetch_html_entities.rb +94 -0
- data/scripts/fetch_language_metadata.rb +180 -0
- data/scripts/fetch_mediawiki_data.rb +334 -0
- data/scripts/fetch_template_data.rb +186 -0
- data/scripts/profile_memory.rb +139 -0
- data/spec/article_spec.rb +402 -0
- data/spec/auto_download_spec.rb +314 -0
- data/spec/bz2_validator_spec.rb +193 -0
- data/spec/category_cache_spec.rb +226 -0
- data/spec/category_fetcher_spec.rb +504 -0
- data/spec/cleanup_spec.rb +197 -0
- data/spec/cli_options_spec.rb +678 -0
- data/spec/cli_spec.rb +876 -0
- data/spec/config_spec.rb +194 -0
- data/spec/constants_spec.rb +138 -0
- data/spec/file_utils_spec.rb +170 -0
- data/spec/fixtures/samples.rb +181 -0
- data/spec/formatter_sections_spec.rb +382 -0
- data/spec/global_data_cache_spec.rb +186 -0
- data/spec/index_cache_spec.rb +210 -0
- data/spec/integration_spec.rb +543 -0
- data/spec/magic_words_spec.rb +261 -0
- data/spec/markers_spec.rb +476 -0
- data/spec/memory_monitor_spec.rb +192 -0
- data/spec/multistream_spec.rb +690 -0
- data/spec/output_writer_spec.rb +400 -0
- data/spec/parser_functions_spec.rb +455 -0
- data/spec/ractor_worker_spec.rb +197 -0
- data/spec/regex_spec.rb +281 -0
- data/spec/section_extractor_spec.rb +397 -0
- data/spec/spec_helper.rb +63 -0
- data/spec/stream_processor_spec.rb +579 -0
- data/spec/template_data_spec.rb +246 -0
- data/spec/template_expander_spec.rb +472 -0
- data/spec/template_processing_spec.rb +217 -0
- data/spec/text_processing_spec.rb +312 -0
- data/spec/utils_spec.rb +195 -16
- data/spec/wp2txt_spec.rb +510 -0
- data/wp2txt.gemspec +5 -3
- metadata +146 -18
- data/.rubocop.yml +0 -80
- data/data/output_samples/testdata_en.txt +0 -23002
- data/data/output_samples/testdata_en_category.txt +0 -132
- data/data/output_samples/testdata_en_summary.txt +0 -1376
- data/data/output_samples/testdata_ja.txt +0 -22774
- data/data/output_samples/testdata_ja_category.txt +0 -206
- data/data/output_samples/testdata_ja_summary.txt +0 -1560
- data/data/testdata_en.bz2 +0 -0
- data/data/testdata_ja.bz2 +0 -0
- data/image/screenshot.png +0 -0
|
@@ -0,0 +1,472 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "spec_helper"
|
|
4
|
+
|
|
5
|
+
RSpec.describe Wp2txt::TemplateExpander do
|
|
6
|
+
let(:expander) { described_class.new }
|
|
7
|
+
# Fixed reference date for age calculations
|
|
8
|
+
let(:reference_date) { Time.new(2024, 6, 15) }
|
|
9
|
+
let(:expander_with_date) { described_class.new(reference_date: reference_date) }
|
|
10
|
+
|
|
11
|
+
describe "date templates" do
|
|
12
|
+
describe "{{birth date}}" do
|
|
13
|
+
it "expands {{birth date|1990|5|15}} to formatted date" do
|
|
14
|
+
expect(expander.expand("{{birth date|1990|5|15}}")).to eq("May 15, 1990")
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
it "expands {{Birth date|1990|5|15}} (case-insensitive)" do
|
|
18
|
+
expect(expander.expand("{{Birth date|1990|5|15}}")).to eq("May 15, 1990")
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
it "handles single-digit day" do
|
|
22
|
+
expect(expander.expand("{{birth date|1990|5|5}}")).to eq("May 5, 1990")
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
it "handles different months" do
|
|
26
|
+
expect(expander.expand("{{birth date|2000|12|25}}")).to eq("December 25, 2000")
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
it "handles df=yes parameter (day first)" do
|
|
30
|
+
expect(expander.expand("{{birth date|1990|5|15|df=yes}}")).to eq("15 May 1990")
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
it "handles mf=yes parameter (month first, default)" do
|
|
34
|
+
expect(expander.expand("{{birth date|1990|5|15|mf=yes}}")).to eq("May 15, 1990")
|
|
35
|
+
end
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
describe "{{birth date and age}}" do
|
|
39
|
+
it "expands with calculated age" do
|
|
40
|
+
result = expander_with_date.expand("{{birth date and age|1990|5|15}}")
|
|
41
|
+
expect(result).to eq("May 15, 1990 (age 34)")
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
it "calculates age correctly when birthday hasn't occurred yet" do
|
|
45
|
+
result = expander_with_date.expand("{{birth date and age|1990|12|25}}")
|
|
46
|
+
expect(result).to eq("December 25, 1990 (age 33)")
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
it "handles df=yes parameter" do
|
|
50
|
+
result = expander_with_date.expand("{{birth date and age|1990|5|15|df=yes}}")
|
|
51
|
+
expect(result).to eq("15 May 1990 (age 34)")
|
|
52
|
+
end
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
describe "{{death date}}" do
|
|
56
|
+
it "expands {{death date|2020|3|1}} to formatted date" do
|
|
57
|
+
expect(expander.expand("{{death date|2020|3|1}}")).to eq("March 1, 2020")
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
it "handles df=yes parameter" do
|
|
61
|
+
expect(expander.expand("{{death date|2020|3|1|df=yes}}")).to eq("1 March 2020")
|
|
62
|
+
end
|
|
63
|
+
end
|
|
64
|
+
|
|
65
|
+
describe "{{death date and age}}" do
|
|
66
|
+
it "expands with age at death" do
|
|
67
|
+
result = expander.expand("{{death date and age|2020|3|1|1950|6|15}}")
|
|
68
|
+
expect(result).to eq("March 1, 2020 (aged 69)")
|
|
69
|
+
end
|
|
70
|
+
|
|
71
|
+
it "handles df=yes parameter" do
|
|
72
|
+
result = expander.expand("{{death date and age|2020|3|1|1950|6|15|df=yes}}")
|
|
73
|
+
expect(result).to eq("1 March 2020 (aged 69)")
|
|
74
|
+
end
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
describe "{{start date}}" do
|
|
78
|
+
it "expands to formatted date" do
|
|
79
|
+
expect(expander.expand("{{start date|2024|1|1}}")).to eq("January 1, 2024")
|
|
80
|
+
end
|
|
81
|
+
|
|
82
|
+
it "handles df=yes parameter" do
|
|
83
|
+
expect(expander.expand("{{start date|2024|1|1|df=yes}}")).to eq("1 January 2024")
|
|
84
|
+
end
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
describe "{{end date}}" do
|
|
88
|
+
it "expands to formatted date" do
|
|
89
|
+
expect(expander.expand("{{end date|2024|12|31}}")).to eq("December 31, 2024")
|
|
90
|
+
end
|
|
91
|
+
end
|
|
92
|
+
|
|
93
|
+
describe "{{date}}" do
|
|
94
|
+
it "expands simple date" do
|
|
95
|
+
expect(expander.expand("{{date|2024|6|15}}")).to eq("June 15, 2024")
|
|
96
|
+
end
|
|
97
|
+
|
|
98
|
+
it "handles year and month only" do
|
|
99
|
+
expect(expander.expand("{{date|2024|6}}")).to eq("June 2024")
|
|
100
|
+
end
|
|
101
|
+
|
|
102
|
+
it "handles year only" do
|
|
103
|
+
expect(expander.expand("{{date|2024}}")).to eq("2024")
|
|
104
|
+
end
|
|
105
|
+
end
|
|
106
|
+
end
|
|
107
|
+
|
|
108
|
+
describe "age templates" do
|
|
109
|
+
describe "{{age}}" do
|
|
110
|
+
it "calculates age from birth date" do
|
|
111
|
+
result = expander_with_date.expand("{{age|1990|5|15}}")
|
|
112
|
+
expect(result).to eq("34")
|
|
113
|
+
end
|
|
114
|
+
|
|
115
|
+
it "returns correct age when birthday hasn't occurred" do
|
|
116
|
+
result = expander_with_date.expand("{{age|1990|12|25}}")
|
|
117
|
+
expect(result).to eq("33")
|
|
118
|
+
end
|
|
119
|
+
end
|
|
120
|
+
|
|
121
|
+
describe "{{age in years}}" do
|
|
122
|
+
it "calculates age between two dates" do
|
|
123
|
+
result = expander.expand("{{age in years|1950|6|15|2020|3|1}}")
|
|
124
|
+
expect(result).to eq("69")
|
|
125
|
+
end
|
|
126
|
+
end
|
|
127
|
+
|
|
128
|
+
describe "{{age in days}}" do
|
|
129
|
+
it "calculates days between dates" do
|
|
130
|
+
result = expander.expand("{{age in days|2024|1|1|2024|1|10}}")
|
|
131
|
+
expect(result).to eq("9")
|
|
132
|
+
end
|
|
133
|
+
end
|
|
134
|
+
end
|
|
135
|
+
|
|
136
|
+
describe "convert templates" do
|
|
137
|
+
describe "length conversions" do
|
|
138
|
+
it "converts km to mi" do
|
|
139
|
+
result = expander.expand("{{convert|100|km|mi}}")
|
|
140
|
+
expect(result).to match(/100 km \(6[0-9](\.[0-9])? mi\)/)
|
|
141
|
+
end
|
|
142
|
+
|
|
143
|
+
it "converts mi to km" do
|
|
144
|
+
result = expander.expand("{{convert|100|mi|km}}")
|
|
145
|
+
expect(result).to match(/100 mi \(16[0-9](\.[0-9])? km\)/)
|
|
146
|
+
end
|
|
147
|
+
|
|
148
|
+
it "converts m to ft" do
|
|
149
|
+
result = expander.expand("{{convert|100|m|ft}}")
|
|
150
|
+
expect(result).to match(/100 m \(32[0-9](\.[0-9])? ft\)/)
|
|
151
|
+
end
|
|
152
|
+
|
|
153
|
+
it "converts ft to m" do
|
|
154
|
+
result = expander.expand("{{convert|100|ft|m}}")
|
|
155
|
+
expect(result).to match(/100 ft \(30(\.[0-9])? m\)/)
|
|
156
|
+
end
|
|
157
|
+
|
|
158
|
+
it "converts cm to in" do
|
|
159
|
+
result = expander.expand("{{convert|100|cm|in}}")
|
|
160
|
+
expect(result).to match(/100 cm \(39(\.[0-9])? in\)/)
|
|
161
|
+
end
|
|
162
|
+
|
|
163
|
+
it "converts in to cm" do
|
|
164
|
+
result = expander.expand("{{convert|10|in|cm}}")
|
|
165
|
+
expect(result).to match(/10 in \(25(\.[0-9])? cm\)/)
|
|
166
|
+
end
|
|
167
|
+
end
|
|
168
|
+
|
|
169
|
+
describe "weight conversions" do
|
|
170
|
+
it "converts kg to lb" do
|
|
171
|
+
result = expander.expand("{{convert|100|kg|lb}}")
|
|
172
|
+
expect(result).to match(/100 kg \(22[0-9](\.[0-9])? lb\)/)
|
|
173
|
+
end
|
|
174
|
+
|
|
175
|
+
it "converts lb to kg" do
|
|
176
|
+
result = expander.expand("{{convert|100|lb|kg}}")
|
|
177
|
+
expect(result).to match(/100 lb \(4[0-9](\.[0-9])? kg\)/)
|
|
178
|
+
end
|
|
179
|
+
|
|
180
|
+
it "converts g to oz" do
|
|
181
|
+
result = expander.expand("{{convert|100|g|oz}}")
|
|
182
|
+
expect(result).to match(/100 g \(3\.[0-9] oz\)/)
|
|
183
|
+
end
|
|
184
|
+
end
|
|
185
|
+
|
|
186
|
+
describe "temperature conversions" do
|
|
187
|
+
it "converts °C to °F" do
|
|
188
|
+
expect(expander.expand("{{convert|0|°C|°F}}")).to eq("0 °C (32 °F)")
|
|
189
|
+
end
|
|
190
|
+
|
|
191
|
+
it "converts C to F (without degree symbol)" do
|
|
192
|
+
expect(expander.expand("{{convert|100|C|F}}")).to eq("100 °C (212 °F)")
|
|
193
|
+
end
|
|
194
|
+
|
|
195
|
+
it "converts °F to °C" do
|
|
196
|
+
expect(expander.expand("{{convert|32|°F|°C}}")).to eq("32 °F (0 °C)")
|
|
197
|
+
end
|
|
198
|
+
|
|
199
|
+
it "converts F to C (without degree symbol)" do
|
|
200
|
+
expect(expander.expand("{{convert|212|F|C}}")).to eq("212 °F (100 °C)")
|
|
201
|
+
end
|
|
202
|
+
end
|
|
203
|
+
|
|
204
|
+
describe "area conversions" do
|
|
205
|
+
it "converts km2 to sqmi" do
|
|
206
|
+
result = expander.expand("{{convert|100|km2|sqmi}}")
|
|
207
|
+
expect(result).to match(/100 km² \(3[0-9](\.[0-9])? sq mi\)/)
|
|
208
|
+
end
|
|
209
|
+
|
|
210
|
+
it "converts sqmi to km2" do
|
|
211
|
+
result = expander.expand("{{convert|100|sqmi|km2}}")
|
|
212
|
+
expect(result).to match(/100 sq mi \(25[0-9](\.[0-9])? km²\)/)
|
|
213
|
+
end
|
|
214
|
+
|
|
215
|
+
it "converts ha to acre" do
|
|
216
|
+
result = expander.expand("{{convert|100|ha|acre}}")
|
|
217
|
+
expect(result).to match(/100 ha \(24[0-9](\.[0-9])? acres\)/)
|
|
218
|
+
end
|
|
219
|
+
end
|
|
220
|
+
|
|
221
|
+
describe "speed conversions" do
|
|
222
|
+
it "converts km/h to mph" do
|
|
223
|
+
result = expander.expand("{{convert|100|km/h|mph}}")
|
|
224
|
+
expect(result).to match(/100 km\/h \(6[0-9](\.[0-9])? mph\)/)
|
|
225
|
+
end
|
|
226
|
+
|
|
227
|
+
it "converts mph to km/h" do
|
|
228
|
+
result = expander.expand("{{convert|60|mph|km/h}}")
|
|
229
|
+
expect(result).to match(/60 mph \(9[0-9](\.[0-9])? km\/h\)/)
|
|
230
|
+
end
|
|
231
|
+
end
|
|
232
|
+
|
|
233
|
+
describe "edge cases" do
|
|
234
|
+
it "handles decimal values" do
|
|
235
|
+
result = expander.expand("{{convert|3.5|km|mi}}")
|
|
236
|
+
expect(result).to match(/3\.5 km \(2\.[0-9] mi\)/)
|
|
237
|
+
end
|
|
238
|
+
|
|
239
|
+
it "handles unknown units gracefully" do
|
|
240
|
+
expect(expander.expand("{{convert|100|foo|bar}}")).to eq("100 foo")
|
|
241
|
+
end
|
|
242
|
+
|
|
243
|
+
it "handles abbr=on parameter" do
|
|
244
|
+
result = expander.expand("{{convert|100|km|mi|abbr=on}}")
|
|
245
|
+
expect(result).to match(/100 km \(6[0-9](\.[0-9])? mi\)/)
|
|
246
|
+
end
|
|
247
|
+
end
|
|
248
|
+
end
|
|
249
|
+
|
|
250
|
+
describe "common templates" do
|
|
251
|
+
describe "{{circa}}" do
|
|
252
|
+
it "expands to c. prefix" do
|
|
253
|
+
expect(expander.expand("{{circa|1500}}")).to eq("c. 1500")
|
|
254
|
+
end
|
|
255
|
+
|
|
256
|
+
it "handles range" do
|
|
257
|
+
expect(expander.expand("{{circa|1500|1550}}")).to eq("c. 1500 – c. 1550")
|
|
258
|
+
end
|
|
259
|
+
end
|
|
260
|
+
|
|
261
|
+
describe "{{floruit}}" do
|
|
262
|
+
it "expands single year" do
|
|
263
|
+
expect(expander.expand("{{floruit|1500}}")).to eq("fl. 1500")
|
|
264
|
+
end
|
|
265
|
+
|
|
266
|
+
it "expands year range" do
|
|
267
|
+
expect(expander.expand("{{floruit|1500|1550}}")).to eq("fl. 1500–1550")
|
|
268
|
+
end
|
|
269
|
+
end
|
|
270
|
+
|
|
271
|
+
describe "{{reign}}" do
|
|
272
|
+
it "expands reign years" do
|
|
273
|
+
expect(expander.expand("{{reign|1500|1550}}")).to eq("r. 1500–1550")
|
|
274
|
+
end
|
|
275
|
+
end
|
|
276
|
+
|
|
277
|
+
describe "{{marriage}}" do
|
|
278
|
+
it "expands simple marriage" do
|
|
279
|
+
expect(expander.expand("{{marriage|Jane Doe|1990}}")).to eq("Jane Doe (m. 1990)")
|
|
280
|
+
end
|
|
281
|
+
|
|
282
|
+
it "expands marriage with end" do
|
|
283
|
+
expect(expander.expand("{{marriage|Jane Doe|1990|2020}}")).to eq("Jane Doe (m. 1990; div. 2020)")
|
|
284
|
+
end
|
|
285
|
+
|
|
286
|
+
it "handles widowed end reason" do
|
|
287
|
+
expect(expander.expand("{{marriage|Jane Doe|1990|2020|reason=widowed}}")).to eq("Jane Doe (m. 1990; wid. 2020)")
|
|
288
|
+
end
|
|
289
|
+
|
|
290
|
+
it "handles died end reason" do
|
|
291
|
+
expect(expander.expand("{{marriage|Jane Doe|1990|2020|reason=died}}")).to eq("Jane Doe (m. 1990; d. 2020)")
|
|
292
|
+
end
|
|
293
|
+
end
|
|
294
|
+
|
|
295
|
+
describe "{{played years}}" do
|
|
296
|
+
it "expands playing career span" do
|
|
297
|
+
expect(expander.expand("{{played years|2000|2020}}")).to eq("2000–2020")
|
|
298
|
+
end
|
|
299
|
+
end
|
|
300
|
+
|
|
301
|
+
describe "{{age in years and days}}" do
|
|
302
|
+
it "formats age with years and days" do
|
|
303
|
+
result = expander.expand("{{age in years and days|1990|1|1|2024|6|15}}")
|
|
304
|
+
expect(result).to match(/34 years, \d+ days/)
|
|
305
|
+
end
|
|
306
|
+
end
|
|
307
|
+
|
|
308
|
+
describe "{{time ago}}" do
|
|
309
|
+
it "formats time since date" do
|
|
310
|
+
result = expander_with_date.expand("{{time ago|2024|1|1}}")
|
|
311
|
+
expect(result).to match(/\d+ months ago/)
|
|
312
|
+
end
|
|
313
|
+
end
|
|
314
|
+
end
|
|
315
|
+
|
|
316
|
+
describe "formatting preservation" do
|
|
317
|
+
it "preserves text around templates" do
|
|
318
|
+
result = expander.expand("Born on {{birth date|1990|5|15}} in Tokyo")
|
|
319
|
+
expect(result).to eq("Born on May 15, 1990 in Tokyo")
|
|
320
|
+
end
|
|
321
|
+
|
|
322
|
+
it "handles multiple templates in one string" do
|
|
323
|
+
result = expander.expand("{{birth date|1990|5|15}} – {{death date|2020|3|1}}")
|
|
324
|
+
expect(result).to eq("May 15, 1990 – March 1, 2020")
|
|
325
|
+
end
|
|
326
|
+
|
|
327
|
+
it "handles nested templates" do
|
|
328
|
+
# NOTE: despite the example name, this input contains no nested template; it only checks that a single template embedded in plain text is expanded
|
|
329
|
+
result = expander.expand("Born {{circa|1500}}")
|
|
330
|
+
expect(result).to eq("Born c. 1500")
|
|
331
|
+
end
|
|
332
|
+
end
|
|
333
|
+
|
|
334
|
+
describe "unknown templates" do
|
|
335
|
+
it "returns empty for unknown templates" do
|
|
336
|
+
expect(expander.expand("{{unknown template|foo|bar}}")).to eq("")
|
|
337
|
+
end
|
|
338
|
+
|
|
339
|
+
it "can be configured to preserve unknown templates" do
|
|
340
|
+
exp = described_class.new(preserve_unknown: true)
|
|
341
|
+
expect(exp.expand("{{unknown|foo}}")).to eq("{{unknown|foo}}")
|
|
342
|
+
end
|
|
343
|
+
end
|
|
344
|
+
|
|
345
|
+
describe "coordinate templates" do
|
|
346
|
+
describe "{{coord}}" do
|
|
347
|
+
it "expands decimal coordinates" do
|
|
348
|
+
result = expander.expand("{{coord|40.7128|N|74.0060|W}}")
|
|
349
|
+
expect(result).to match(/40\.7128°\s*N.*74\.0060°\s*W/i)
|
|
350
|
+
end
|
|
351
|
+
|
|
352
|
+
it "expands DMS coordinates" do
|
|
353
|
+
result = expander.expand("{{coord|40|42|46|N|74|0|22|W}}")
|
|
354
|
+
expect(result).to match(/40°42['′]46["″]?\s*N.*74°0['′]22["″]?\s*W/i)
|
|
355
|
+
end
|
|
356
|
+
|
|
357
|
+
it "expands coordinates with display parameter" do
|
|
358
|
+
result = expander.expand("{{coord|51.5074|N|0.1278|W|display=title}}")
|
|
359
|
+
expect(result).to include("51.5074")
|
|
360
|
+
end
|
|
361
|
+
|
|
362
|
+
it "handles simple lat/lon format" do
|
|
363
|
+
result = expander.expand("{{coord|35.6762|139.6503}}")
|
|
364
|
+
expect(result).to include("35.6762")
|
|
365
|
+
expect(result).to include("139.6503")
|
|
366
|
+
end
|
|
367
|
+
end
|
|
368
|
+
end
|
|
369
|
+
|
|
370
|
+
describe "language templates" do
|
|
371
|
+
describe "{{lang}}" do
|
|
372
|
+
it "expands basic lang template" do
|
|
373
|
+
expect(expander.expand("{{lang|fr|Bonjour}}")).to eq("Bonjour")
|
|
374
|
+
end
|
|
375
|
+
|
|
376
|
+
it "expands with literal translation" do
|
|
377
|
+
result = expander.expand("{{lang|la|Carpe diem|lit=seize the day}}")
|
|
378
|
+
expect(result).to include("Carpe diem")
|
|
379
|
+
expect(result).to include("seize the day")
|
|
380
|
+
end
|
|
381
|
+
end
|
|
382
|
+
|
|
383
|
+
describe "{{lang-xx}}" do
|
|
384
|
+
it "expands lang-fr template" do
|
|
385
|
+
result = expander.expand("{{lang-fr|Bonjour}}")
|
|
386
|
+
expect(result).to match(/French.*Bonjour/i)
|
|
387
|
+
end
|
|
388
|
+
|
|
389
|
+
it "expands lang-de template" do
|
|
390
|
+
result = expander.expand("{{lang-de|Guten Tag}}")
|
|
391
|
+
expect(result).to match(/German.*Guten Tag/i)
|
|
392
|
+
end
|
|
393
|
+
|
|
394
|
+
it "expands lang-ja template" do
|
|
395
|
+
result = expander.expand("{{lang-ja|こんにちは}}")
|
|
396
|
+
expect(result).to match(/Japanese.*こんにちは/i)
|
|
397
|
+
end
|
|
398
|
+
|
|
399
|
+
it "expands lang-la template with literal" do
|
|
400
|
+
result = expander.expand("{{lang-la|Veni, vidi, vici|lit=I came, I saw, I conquered}}")
|
|
401
|
+
expect(result).to include("Latin")
|
|
402
|
+
expect(result).to include("Veni, vidi, vici")
|
|
403
|
+
expect(result).to include("I came, I saw, I conquered")
|
|
404
|
+
end
|
|
405
|
+
end
|
|
406
|
+
|
|
407
|
+
describe "{{transl}}" do
|
|
408
|
+
it "expands transliteration template" do
|
|
409
|
+
result = expander.expand("{{transl|ru|Moskva}}")
|
|
410
|
+
expect(result).to eq("Moskva")
|
|
411
|
+
end
|
|
412
|
+
end
|
|
413
|
+
|
|
414
|
+
describe "{{nihongo}}" do
|
|
415
|
+
it "expands nihongo template" do
|
|
416
|
+
result = expander.expand("{{nihongo|Tokyo|東京|Tōkyō}}")
|
|
417
|
+
expect(result).to include("Tokyo")
|
|
418
|
+
expect(result).to include("東京")
|
|
419
|
+
expect(result).to include("Tōkyō")
|
|
420
|
+
end
|
|
421
|
+
|
|
422
|
+
it "handles nihongo without romaji" do
|
|
423
|
+
result = expander.expand("{{nihongo|Tokyo|東京}}")
|
|
424
|
+
expect(result).to include("Tokyo")
|
|
425
|
+
expect(result).to include("東京")
|
|
426
|
+
end
|
|
427
|
+
end
|
|
428
|
+
end
|
|
429
|
+
|
|
430
|
+
describe "formatting templates" do
|
|
431
|
+
describe "{{nowrap}}" do
|
|
432
|
+
it "preserves text" do
|
|
433
|
+
expect(expander.expand("{{nowrap|100 km}}")).to eq("100 km")
|
|
434
|
+
end
|
|
435
|
+
end
|
|
436
|
+
|
|
437
|
+
describe "{{small}}" do
|
|
438
|
+
it "preserves text" do
|
|
439
|
+
expect(expander.expand("{{small|tiny text}}")).to eq("tiny text")
|
|
440
|
+
end
|
|
441
|
+
end
|
|
442
|
+
|
|
443
|
+
describe "{{em}}" do
|
|
444
|
+
it "preserves text (emphasis)" do
|
|
445
|
+
expect(expander.expand("{{em|important}}")).to eq("important")
|
|
446
|
+
end
|
|
447
|
+
end
|
|
448
|
+
|
|
449
|
+
describe "{{abbr}}" do
|
|
450
|
+
it "returns abbreviation" do
|
|
451
|
+
expect(expander.expand("{{abbr|HTML|Hypertext Markup Language}}")).to eq("HTML")
|
|
452
|
+
end
|
|
453
|
+
end
|
|
454
|
+
end
|
|
455
|
+
|
|
456
|
+
describe "integration with format_wiki" do
|
|
457
|
+
include Wp2txt
|
|
458
|
+
|
|
459
|
+
it "expands templates during format_wiki processing" do
|
|
460
|
+
input = "He was born on {{birth date|1990|5|15}}."
|
|
461
|
+
result = format_wiki(input, title: "Test", expand_templates: true)
|
|
462
|
+
expect(result).to include("May 15, 1990")
|
|
463
|
+
end
|
|
464
|
+
|
|
465
|
+
it "expands convert templates" do
|
|
466
|
+
input = "The mountain is {{convert|8848|m|ft}} tall."
|
|
467
|
+
result = format_wiki(input, title: "Test", expand_templates: true)
|
|
468
|
+
expect(result).to include("8848 m")
|
|
469
|
+
expect(result).to include("ft")
|
|
470
|
+
end
|
|
471
|
+
end
|
|
472
|
+
end
|
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "spec_helper"
|
|
4
|
+
|
|
5
|
+
RSpec.describe "Template Processing (Data-Driven)" do
|
|
6
|
+
include Wp2txt
|
|
7
|
+
|
|
8
|
+
# Helper that strips the enclosing "{{" and "}}" from a template string and
# returns the inner content (e.g. "name|arg|..."), for testing regex.
#
# The patterns are anchored to the whole string (\A / \Z) rather than to each
# line (^ / $), so braces that merely begin or end a line inside a multi-line
# template are left intact. \Z still tolerates a single trailing newline.
#
# @param str [String] template markup such as "{{sfn|Smith|2020}}"
# @return [String] the markup without its outer braces, e.g. "sfn|Smith|2020"
def template_content(str)
  str.sub(/\A\{\{/, "").sub(/\}\}\Z/, "")
end
|
|
12
|
+
|
|
13
|
+
describe "REMOVE_TEMPLATES_REGEX" do
|
|
14
|
+
it "is loaded from template_aliases.json" do
|
|
15
|
+
# Verify the constant exists and is a Regexp
|
|
16
|
+
expect(Wp2txt::REMOVE_TEMPLATES_REGEX).to be_a(Regexp)
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
it "matches English navigation templates" do
|
|
20
|
+
%w[sfn efn refn reflist notelist main portal].each do |template|
|
|
21
|
+
content = "#{template}|content"
|
|
22
|
+
expect(content).to match(Wp2txt::REMOVE_TEMPLATES_REGEX), "Expected '#{template}' to match"
|
|
23
|
+
end
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
it "matches hatnote templates" do
|
|
27
|
+
%w[about redirect distinguish further details].each do |template|
|
|
28
|
+
content = "#{template}|content"
|
|
29
|
+
expect(content).to match(Wp2txt::REMOVE_TEMPLATES_REGEX), "Expected '#{template}' to match"
|
|
30
|
+
end
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
it "matches Japanese navigation templates" do
|
|
34
|
+
expect("脚注ヘルプ").to match(Wp2txt::REMOVE_TEMPLATES_REGEX)
|
|
35
|
+
expect("関連項目|記事").to match(Wp2txt::REMOVE_TEMPLATES_REGEX)
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
it "matches German navigation templates" do
|
|
39
|
+
expect("Hauptartikel|Artikel").to match(Wp2txt::REMOVE_TEMPLATES_REGEX)
|
|
40
|
+
expect("Siehe auch|Artikel").to match(Wp2txt::REMOVE_TEMPLATES_REGEX)
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
it "matches French navigation templates" do
|
|
44
|
+
expect("Article principal|Article").to match(Wp2txt::REMOVE_TEMPLATES_REGEX)
|
|
45
|
+
expect("Voir aussi|Article").to match(Wp2txt::REMOVE_TEMPLATES_REGEX)
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
it "does not match citation templates" do
|
|
49
|
+
expect("cite web|url=...").not_to match(Wp2txt::REMOVE_TEMPLATES_REGEX)
|
|
50
|
+
expect("cite book|title=...").not_to match(Wp2txt::REMOVE_TEMPLATES_REGEX)
|
|
51
|
+
end
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
describe "AUTHORITY_CONTROL_REGEX" do
|
|
55
|
+
it "is loaded from template_aliases.json" do
|
|
56
|
+
expect(Wp2txt::AUTHORITY_CONTROL_REGEX).to be_a(Regexp)
|
|
57
|
+
end
|
|
58
|
+
|
|
59
|
+
it "matches English authority control" do
|
|
60
|
+
expect("Authority control").to match(Wp2txt::AUTHORITY_CONTROL_REGEX)
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
it "matches German Normdaten" do
|
|
64
|
+
expect("Normdaten").to match(Wp2txt::AUTHORITY_CONTROL_REGEX)
|
|
65
|
+
end
|
|
66
|
+
|
|
67
|
+
it "matches identifier templates" do
|
|
68
|
+
%w[VIAF LCCN GND ISNI ORCID].each do |id|
|
|
69
|
+
expect(id).to match(Wp2txt::AUTHORITY_CONTROL_REGEX)
|
|
70
|
+
end
|
|
71
|
+
end
|
|
72
|
+
end
|
|
73
|
+
|
|
74
|
+
describe "CLEANUP_REMNANTS_REGEX" do
|
|
75
|
+
it "is loaded from template_aliases.json" do
|
|
76
|
+
expect(Wp2txt::CLEANUP_REMNANTS_REGEX).to be_a(Regexp)
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
it "matches layout templates" do
|
|
80
|
+
%w[Clear Clearleft Clearright].each do |template|
|
|
81
|
+
expect(template).to match(Wp2txt::CLEANUP_REMNANTS_REGEX)
|
|
82
|
+
end
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
it "matches notelist variants" do
|
|
86
|
+
expect("notelist").to match(Wp2txt::CLEANUP_REMNANTS_REGEX)
|
|
87
|
+
expect("notelist2").to match(Wp2txt::CLEANUP_REMNANTS_REGEX)
|
|
88
|
+
end
|
|
89
|
+
end
|
|
90
|
+
|
|
91
|
+
describe "correct_inline_template" do
|
|
92
|
+
# Ruby text templates (読み仮名 equivalent)
|
|
93
|
+
describe "ruby text templates" do
|
|
94
|
+
it "handles Japanese 読み仮名 template" do
|
|
95
|
+
result = correct_inline_template("{{読み仮名|漢字|かんじ}}")
|
|
96
|
+
expect(result).to eq("漢字(かんじ)")
|
|
97
|
+
end
|
|
98
|
+
|
|
99
|
+
it "handles English ruby template" do
|
|
100
|
+
result = correct_inline_template("{{ruby|漢字|かんじ}}")
|
|
101
|
+
expect(result).to include("漢字")
|
|
102
|
+
end
|
|
103
|
+
end
|
|
104
|
+
|
|
105
|
+
# Interwiki link templates (仮リンク equivalent)
|
|
106
|
+
describe "interwiki link templates" do
|
|
107
|
+
it "handles Japanese 仮リンク template" do
|
|
108
|
+
result = correct_inline_template("{{仮リンク|表示名|en|English Article}}")
|
|
109
|
+
expect(result).to eq("表示名")
|
|
110
|
+
end
|
|
111
|
+
|
|
112
|
+
it "handles English ill template" do
|
|
113
|
+
result = correct_inline_template("{{ill|Display|ja|日本語記事}}")
|
|
114
|
+
expect(result).to eq("Display")
|
|
115
|
+
end
|
|
116
|
+
|
|
117
|
+
it "handles interlanguage link template" do
|
|
118
|
+
result = correct_inline_template("{{interlanguage link|Display|de|Deutscher Artikel}}")
|
|
119
|
+
expect(result).to eq("Display")
|
|
120
|
+
end
|
|
121
|
+
end
|
|
122
|
+
|
|
123
|
+
# Mixed script templates (nihongo equivalent)
|
|
124
|
+
describe "mixed script templates" do
|
|
125
|
+
it "handles nihongo template with all parts" do
|
|
126
|
+
result = correct_inline_template("{{nihongo|Tokyo|東京|Tōkyō}}")
|
|
127
|
+
expect(result).to eq("Tokyo (東京, Tōkyō)")
|
|
128
|
+
end
|
|
129
|
+
|
|
130
|
+
it "handles nihongo template with only kanji" do
|
|
131
|
+
result = correct_inline_template("{{nihongo|Tokyo|東京}}")
|
|
132
|
+
expect(result).to eq("Tokyo (東京)")
|
|
133
|
+
end
|
|
134
|
+
|
|
135
|
+
it "handles transl template" do
|
|
136
|
+
result = correct_inline_template("{{transl|ja|tōkyō}}")
|
|
137
|
+
expect(result).to eq("tōkyō")
|
|
138
|
+
end
|
|
139
|
+
end
|
|
140
|
+
|
|
141
|
+
# Convert templates
|
|
142
|
+
describe "convert templates" do
|
|
143
|
+
it "handles convert template" do
|
|
144
|
+
result = correct_inline_template("{{convert|100|km}}")
|
|
145
|
+
expect(result).to eq("100 km")
|
|
146
|
+
end
|
|
147
|
+
|
|
148
|
+
it "handles Japanese 単位変換 template" do
|
|
149
|
+
result = correct_inline_template("{{単位変換|100|km}}")
|
|
150
|
+
expect(result).to eq("100 km")
|
|
151
|
+
end
|
|
152
|
+
end
|
|
153
|
+
|
|
154
|
+
# Flag templates
|
|
155
|
+
describe "flag templates" do
|
|
156
|
+
it "removes flag templates" do
|
|
157
|
+
result = correct_inline_template("{{flag|Japan}}")
|
|
158
|
+
expect(result).to eq("")
|
|
159
|
+
end
|
|
160
|
+
|
|
161
|
+
it "removes flagicon templates" do
|
|
162
|
+
result = correct_inline_template("{{flagicon|USA}}")
|
|
163
|
+
expect(result).to eq("")
|
|
164
|
+
end
|
|
165
|
+
|
|
166
|
+
it "removes country code templates" do
|
|
167
|
+
result = correct_inline_template("{{JPN}}")
|
|
168
|
+
expect(result).to eq("")
|
|
169
|
+
end
|
|
170
|
+
end
|
|
171
|
+
|
|
172
|
+
# Formatting templates
|
|
173
|
+
describe "formatting templates" do
|
|
174
|
+
it "extracts content from small template" do
|
|
175
|
+
result = correct_inline_template("{{small|text}}")
|
|
176
|
+
expect(result).to eq("text")
|
|
177
|
+
end
|
|
178
|
+
|
|
179
|
+
it "extracts content from nowrap template" do
|
|
180
|
+
result = correct_inline_template("{{nowrap|text here}}")
|
|
181
|
+
expect(result).to eq("text here")
|
|
182
|
+
end
|
|
183
|
+
|
|
184
|
+
it "handles nbsp template" do
|
|
185
|
+
result = correct_inline_template("before{{nbsp}}after")
|
|
186
|
+
expect(result).to eq("before after")
|
|
187
|
+
end
|
|
188
|
+
end
|
|
189
|
+
end
|
|
190
|
+
|
|
191
|
+
describe "cleanup" do
|
|
192
|
+
it "removes authority control remnants" do
|
|
193
|
+
text = "Article content\n\nAuthority control\n\n"
|
|
194
|
+
result = cleanup(text)
|
|
195
|
+
expect(result).not_to include("Authority control")
|
|
196
|
+
end
|
|
197
|
+
|
|
198
|
+
it "removes Normdaten remnants" do
|
|
199
|
+
text = "Article content\n\nNormdaten\n\n"
|
|
200
|
+
result = cleanup(text)
|
|
201
|
+
expect(result).not_to include("Normdaten")
|
|
202
|
+
end
|
|
203
|
+
|
|
204
|
+
it "removes cleanup remnants like Clearleft" do
|
|
205
|
+
text = "Content\n\nClearleft\n\nMore content"
|
|
206
|
+
result = cleanup(text)
|
|
207
|
+
expect(result).not_to include("Clearleft")
|
|
208
|
+
end
|
|
209
|
+
|
|
210
|
+
it "removes sister project markers" do
|
|
211
|
+
text = "Content\n\nCommons:\n\nWiktionary:\n\n"
|
|
212
|
+
result = cleanup(text)
|
|
213
|
+
expect(result).not_to include("Commons:")
|
|
214
|
+
expect(result).not_to include("Wiktionary:")
|
|
215
|
+
end
|
|
216
|
+
end
|
|
217
|
+
end
|