wp2txt 1.1.3 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.dockerignore +12 -0
- data/.github/workflows/ci.yml +13 -13
- data/.gitignore +14 -0
- data/CHANGELOG.md +284 -0
- data/DEVELOPMENT.md +415 -0
- data/DEVELOPMENT_ja.md +415 -0
- data/Dockerfile +19 -10
- data/Gemfile +2 -8
- data/README.md +259 -123
- data/README_ja.md +375 -0
- data/Rakefile +4 -0
- data/bin/wp2txt +863 -161
- data/lib/wp2txt/article.rb +98 -13
- data/lib/wp2txt/bz2_validator.rb +239 -0
- data/lib/wp2txt/category_cache.rb +313 -0
- data/lib/wp2txt/cli.rb +319 -0
- data/lib/wp2txt/cli_ui.rb +428 -0
- data/lib/wp2txt/config.rb +158 -0
- data/lib/wp2txt/constants.rb +134 -0
- data/lib/wp2txt/data/html_entities.json +2135 -0
- data/lib/wp2txt/data/language_metadata.json +4769 -0
- data/lib/wp2txt/data/language_tiers.json +59 -0
- data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
- data/lib/wp2txt/data/template_aliases.json +193 -0
- data/lib/wp2txt/data/wikipedia_entities.json +12 -0
- data/lib/wp2txt/extractor.rb +545 -0
- data/lib/wp2txt/file_utils.rb +91 -0
- data/lib/wp2txt/formatter.rb +352 -0
- data/lib/wp2txt/global_data_cache.rb +353 -0
- data/lib/wp2txt/index_cache.rb +258 -0
- data/lib/wp2txt/magic_words.rb +353 -0
- data/lib/wp2txt/memory_monitor.rb +236 -0
- data/lib/wp2txt/multistream.rb +1383 -0
- data/lib/wp2txt/output_writer.rb +182 -0
- data/lib/wp2txt/parser_functions.rb +606 -0
- data/lib/wp2txt/ractor_worker.rb +215 -0
- data/lib/wp2txt/regex.rb +396 -12
- data/lib/wp2txt/section_extractor.rb +354 -0
- data/lib/wp2txt/stream_processor.rb +271 -0
- data/lib/wp2txt/template_expander.rb +830 -0
- data/lib/wp2txt/text_processing.rb +337 -0
- data/lib/wp2txt/utils.rb +629 -270
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +53 -26
- data/scripts/benchmark_regex.rb +161 -0
- data/scripts/fetch_html_entities.rb +94 -0
- data/scripts/fetch_language_metadata.rb +180 -0
- data/scripts/fetch_mediawiki_data.rb +334 -0
- data/scripts/fetch_template_data.rb +186 -0
- data/scripts/profile_memory.rb +139 -0
- data/spec/article_spec.rb +402 -0
- data/spec/auto_download_spec.rb +314 -0
- data/spec/bz2_validator_spec.rb +193 -0
- data/spec/category_cache_spec.rb +226 -0
- data/spec/category_fetcher_spec.rb +504 -0
- data/spec/cleanup_spec.rb +197 -0
- data/spec/cli_options_spec.rb +678 -0
- data/spec/cli_spec.rb +876 -0
- data/spec/config_spec.rb +194 -0
- data/spec/constants_spec.rb +138 -0
- data/spec/file_utils_spec.rb +170 -0
- data/spec/fixtures/samples.rb +181 -0
- data/spec/formatter_sections_spec.rb +382 -0
- data/spec/global_data_cache_spec.rb +186 -0
- data/spec/index_cache_spec.rb +210 -0
- data/spec/integration_spec.rb +543 -0
- data/spec/magic_words_spec.rb +261 -0
- data/spec/markers_spec.rb +476 -0
- data/spec/memory_monitor_spec.rb +192 -0
- data/spec/multistream_spec.rb +690 -0
- data/spec/output_writer_spec.rb +400 -0
- data/spec/parser_functions_spec.rb +455 -0
- data/spec/ractor_worker_spec.rb +197 -0
- data/spec/regex_spec.rb +281 -0
- data/spec/section_extractor_spec.rb +397 -0
- data/spec/spec_helper.rb +63 -0
- data/spec/stream_processor_spec.rb +579 -0
- data/spec/template_data_spec.rb +246 -0
- data/spec/template_expander_spec.rb +472 -0
- data/spec/template_processing_spec.rb +217 -0
- data/spec/text_processing_spec.rb +312 -0
- data/spec/utils_spec.rb +195 -16
- data/spec/wp2txt_spec.rb +510 -0
- data/wp2txt.gemspec +5 -3
- metadata +146 -18
- data/.rubocop.yml +0 -80
- data/data/output_samples/testdata_en.txt +0 -23002
- data/data/output_samples/testdata_en_category.txt +0 -132
- data/data/output_samples/testdata_en_summary.txt +0 -1376
- data/data/output_samples/testdata_ja.txt +0 -22774
- data/data/output_samples/testdata_ja_category.txt +0 -206
- data/data/output_samples/testdata_ja_summary.txt +0 -1560
- data/data/testdata_en.bz2 +0 -0
- data/data/testdata_ja.bz2 +0 -0
- data/image/screenshot.png +0 -0
data/spec/cli_spec.rb
ADDED
|
@@ -0,0 +1,876 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "spec_helper"
|
|
4
|
+
require_relative "../lib/wp2txt/cli"
|
|
5
|
+
require "tmpdir"
|
|
6
|
+
|
|
7
|
+
# Load the CLI app class for testing
|
|
8
|
+
require_relative "../lib/wp2txt"
|
|
9
|
+
require_relative "../lib/wp2txt/utils"
|
|
10
|
+
|
|
11
|
+
RSpec.describe Wp2txt::CLI do
  describe ".parse_options" do
    # Calls parse_options with +flags+ followed by "-o <dir>", where <dir> is a
    # temporary directory that Dir.mktmpdir removes once the block finishes.
    # Returns the value produced by parse_options.
    def parse_in_tmpdir(*flags)
      Dir.mktmpdir do |dir|
        described_class.parse_options([*flags, "-o", dir])
      end
    end

    # Expects parsing +flags+ (plus a temporary "-o" directory) to terminate
    # with SystemExit. The call is wrapped in suppress_stderr, a helper that is
    # not defined in this file (presumably provided by spec_helper).
    def expect_rejected(*flags)
      suppress_stderr do
        expect { parse_in_tmpdir(*flags) }.to raise_error(SystemExit)
      end
    end

    context "with --from-category option" do
      it "requires --lang" do
        # Deliberately parsed without "-o": only --from-category is supplied.
        suppress_stderr do
          expect do
            described_class.parse_options(["--from-category=Test"])
          end.to raise_error(SystemExit)
        end
      end

      it "cannot be used with --input" do
        Dir.mktmpdir do |dir|
          # Create a dummy file so that --input points at an existing path
          dummy_file = File.join(dir, "test.bz2")
          File.write(dummy_file, "test")

          suppress_stderr do
            expect do
              described_class.parse_options([
                "--from-category=Test",
                "--lang=en",
                "--input=#{dummy_file}",
                "-o", dir
              ])
            end.to raise_error(SystemExit)
          end
        end
      end

      it "cannot be used with --articles" do
        expect_rejected("--from-category=Test", "--lang=en", "--articles=Article1")
      end

      it "accepts valid options" do
        opts = parse_in_tmpdir("--from-category=Japanese cities", "--lang=en", "--depth=2")

        expect(opts[:from_category]).to eq "Japanese cities"
        expect(opts[:lang]).to eq "en"
        expect(opts[:depth]).to eq 2
      end
    end

    context "with --depth option" do
      it "defaults to 0" do
        opts = parse_in_tmpdir("--from-category=Test", "--lang=en")

        expect(opts[:depth]).to eq 0
      end

      it "rejects negative values" do
        expect_rejected("--from-category=Test", "--lang=en", "--depth=-1")
      end

      it "warns when depth > 3" do
        # stderr is intentionally not suppressed here: the warning is the assertion.
        expect do
          parse_in_tmpdir("--from-category=Test", "--lang=en", "--depth=4")
        end.to output(/Warning.*depth.*3/i).to_stderr
      end
    end

    context "with --dry-run option" do
      it "requires --from-category" do
        expect_rejected("--lang=en", "--dry-run")
      end

      it "works with --from-category" do
        opts = parse_in_tmpdir("--from-category=Test", "--lang=en", "--dry-run")

        expect(opts[:dry_run]).to be true
      end
    end

    context "with --yes option" do
      it "requires --from-category" do
        expect_rejected("--lang=en", "--yes")
      end

      it "works with --from-category" do
        opts = parse_in_tmpdir("--from-category=Test", "--lang=en", "--yes")

        expect(opts[:yes]).to be true
      end
    end

    context "with --update-cache option" do
      it "defaults to false" do
        opts = parse_in_tmpdir("--from-category=Test", "--lang=en")

        expect(opts[:update_cache]).to be false
      end

      it "can be set to true" do
        opts = parse_in_tmpdir("--from-category=Test", "--lang=en", "--update-cache")

        expect(opts[:update_cache]).to be true
      end

      it "accepts short form -U" do
        opts = parse_in_tmpdir("--from-category=Test", "--lang=en", "-U")

        expect(opts[:update_cache]).to be true
      end
    end

    context "extraction mode mutual exclusion" do
      it "rejects --category-only with --summary-only" do
        expect_rejected("--lang=en", "--category-only", "--summary-only")
      end

      it "rejects --category-only with --metadata-only" do
        expect_rejected("--lang=en", "--category-only", "--metadata-only")
      end

      it "rejects --summary-only with --metadata-only" do
        expect_rejected("--lang=en", "--summary-only", "--metadata-only")
      end

      it "rejects all three extraction modes combined" do
        expect_rejected("--lang=en", "--category-only", "--summary-only", "--metadata-only")
      end

      it "rejects --sections with --category-only" do
        expect_rejected("--lang=en", "--category-only", "--sections=Plot")
      end

      it "rejects --sections with --summary-only" do
        expect_rejected("--lang=en", "--summary-only", "--sections=Plot")
      end

      it "rejects --sections with --metadata-only" do
        expect_rejected("--lang=en", "--metadata-only", "--sections=Plot")
      end

      it "rejects --section-stats with --category-only" do
        expect_rejected("--lang=en", "--section-stats", "--category-only")
      end

      it "rejects --section-stats with --summary-only" do
        expect_rejected("--lang=en", "--section-stats", "--summary-only")
      end

      it "allows single extraction mode" do
        opts = parse_in_tmpdir("--lang=en", "--category-only")
        expect(opts[:category_only]).to be true
      end
    end

    context "with content filtering options" do
      it "parses --table option (defaults to false)" do
        opts = parse_in_tmpdir("--lang=en")
        expect(opts[:table]).to be false
      end

      it "enables --table option" do
        opts = parse_in_tmpdir("--lang=en", "--table")
        expect(opts[:table]).to be true
      end

      it "parses --pre option (defaults to false)" do
        opts = parse_in_tmpdir("--lang=en")
        expect(opts[:pre]).to be false
      end

      it "enables --pre option with short form -p" do
        opts = parse_in_tmpdir("--lang=en", "-p")
        expect(opts[:pre]).to be true
      end

      it "parses --multiline option (defaults to false)" do
        opts = parse_in_tmpdir("--lang=en")
        expect(opts[:multiline]).to be false
      end

      it "enables --multiline option" do
        opts = parse_in_tmpdir("--lang=en", "--multiline")
        expect(opts[:multiline]).to be true
      end

      it "allows combining content filtering options" do
        opts = parse_in_tmpdir("--lang=en", "--table", "--pre", "--multiline", "--list")
        expect(opts[:table]).to be true
        expect(opts[:pre]).to be true
        expect(opts[:multiline]).to be true
        expect(opts[:list]).to be true
      end
    end

    context "with section extraction options" do
      it "parses --sections option" do
        opts = parse_in_tmpdir("--lang=en", "--sections=summary,Plot,Reception")

        expect(opts[:sections]).to eq("summary,Plot,Reception")
      end

      it "parses --no-section-aliases option" do
        opts = parse_in_tmpdir("--lang=en", "--sections=Plot", "--no-section-aliases")

        expect(opts[:no_section_aliases]).to be true
      end

      it "parses --show-matched-sections option" do
        opts = parse_in_tmpdir(
          "--lang=en",
          "--sections=Plot",
          "--show-matched-sections",
          "--format=json"
        )

        expect(opts[:show_matched_sections]).to be true
      end

      it "rejects --show-matched-sections without JSON format" do
        expect_rejected(
          "--lang=en",
          "--sections=Plot",
          "--show-matched-sections",
          "--format=text"
        )
      end

      it "parses --section-stats option" do
        opts = parse_in_tmpdir("--lang=en", "--section-stats")

        expect(opts[:section_stats]).to be true
      end

      it "rejects --section-stats with --sections" do
        expect_rejected("--lang=en", "--section-stats", "--sections=Plot")
      end

      it "rejects --section-stats with --metadata-only" do
        expect_rejected("--lang=en", "--section-stats", "--metadata-only")
      end
    end

    context "with --alias-file option" do
      # Separate from the "-o" directory: holds the alias file under test.
      let(:temp_dir) { Dir.mktmpdir }
      let(:alias_file) { File.join(temp_dir, "aliases.yml") }

      after { FileUtils.remove_entry(temp_dir) }

      it "parses --alias-file option" do
        File.write(alias_file, "Plot:\n - Synopsis\n")
        opts = parse_in_tmpdir("--lang=en", "--sections=Plot", "--alias-file=#{alias_file}")

        expect(opts[:alias_file]).to eq(alias_file)
      end

      it "rejects non-existent alias file" do
        expect_rejected("--lang=en", "--sections=Plot", "--alias-file=/nonexistent/file.yml")
      end

      it "rejects invalid YAML alias file" do
        File.write(alias_file, "invalid: yaml: {{")
        expect_rejected("--lang=en", "--sections=Plot", "--alias-file=#{alias_file}")
      end
    end
  end
end
|
|
595
|
+
|
|
596
|
+
# Test the WpApp class methods
|
|
597
|
+
# Spec-local harness: mixes in Wp2txt and turns an article into output text
# according to a config hash of boolean switches (:title, :heading, :list,
# :table, :redirect, :category, :category_only, :summary_only).
class TestWpApp
  include Wp2txt

  # Entry point. Rewrites article.title through format_wiki (the article is
  # mutated), then picks one of three layouts:
  #   * :category_only            -> title and categories on a single line
  #   * :category + has categories -> body followed by a CATEGORIES: trailer
  #   * otherwise                  -> body only
  def format_article(article, config)
    article.title = format_wiki(article.title, config)

    return format_category_only(article) if config[:category_only]

    wants_categories = config[:category] && !article.categories.empty?
    return format_with_categories(article, config) if wants_categories

    format_full_article(article, config)
  end

  # Returns "<title>\t<cat1>, <cat2>, ...\n".
  def format_category_only(article)
    "#{article.title}\t#{article.categories.join(", ")}\n"
  end

  # Returns the rendered body followed by a "CATEGORIES: ..." trailer; the
  # "[[title]]" header is prepended only when config[:title] is truthy.
  def format_with_categories(article, config)
    header = "\n[[#{article.title}]]\n\n"

    body = article.elements.each_with_object(+"") do |element, text|
      rendered = process_element(element, config)
      text << rendered if rendered
    end
    body << "\nCATEGORIES: " << article.categories.join(", ") << "\n\n"

    return body unless config[:title]

    header + body
  end

  # Returns the rendered body, prefixed with the "[[title]]" header only when
  # config[:title] is truthy.
  def format_full_article(article, config)
    header = "\n[[#{article.title}]]\n\n"

    body = article.elements.each_with_object(+"") do |element, text|
      rendered = process_element(element, config)
      text << rendered if rendered
    end

    return body unless config[:title]

    header + body
  end

  # Renders one [type, content] element, or returns nil when the element type
  # is disabled in +config+ or not handled here. Headings and paragraphs go
  # through format_wiki; tables, lists and redirects are emitted as-is.
  def process_element(element, config)
    kind, text = element

    case kind
    when :mw_heading
      # Headings are dropped both in summary-only mode and when disabled.
      if config[:summary_only] || !config[:heading]
        nil
      else
        format_wiki(text, config) + "\n"
      end
    when :mw_paragraph
      format_wiki(text, config) + "\n"
    when :mw_table, :mw_htable
      text + "\n" if config[:table]
    when :mw_unordered, :mw_ordered, :mw_definition
      text + "\n" if config[:list]
    when :mw_redirect
      text + "\n\n" if config[:redirect]
    end
  end
end
|
|
676
|
+
|
|
677
|
+
RSpec.describe "CLI format_article" do
  let(:app) { TestWpApp.new }

  # Two sections, one lead paragraph and two category links.
  let(:sample_wiki) do
    <<~WIKI
      '''Test Article''' is about [[testing]].

      == Section One ==
      This is paragraph one.

      == Section Two ==
      This is paragraph two.

      [[Category:Testing]]
      [[Category:Examples]]
    WIKI
  end

  let(:article) { Wp2txt::Article.new(sample_wiki, "Test Article") }

  # Baseline switches; individual examples override single keys via merge.
  let(:default_config) do
    {
      title: true,
      heading: true,
      list: false,
      table: false,
      redirect: false,
      category: true,
      category_only: false,
      summary_only: false
    }
  end

  describe "format_with_categories" do
    it "includes both body text and categories" do
      output = app.format_article(article, default_config)

      # Title
      expect(output).to include("[[Test Article]]")

      # Body text
      expect(output).to include("is about", "Section One", "paragraph one")

      # Categories
      expect(output).to include("CATEGORIES:", "Testing", "Examples")
    end

    it "places categories after body text" do
      output = app.format_article(article, default_config)

      expect(output.index("CATEGORIES:")).to be > output.index("paragraph")
    end
  end

  describe "format_category_only" do
    it "outputs only title and categories without body" do
      output = app.format_article(article, default_config.merge(category_only: true))

      # Title and categories are present
      expect(output).to include("Test Article", "Testing")

      # Body text is absent
      expect(output).not_to include("paragraph", "Section One")
    end
  end

  describe "format_full_article without categories" do
    it "outputs body without categories section when article has no categories" do
      plain_article = Wp2txt::Article.new("'''Simple''' article with no categories.", "Simple")

      output = app.format_article(plain_article, default_config)

      expect(output).to include("[[Simple]]", "article with no categories")
      expect(output).not_to include("CATEGORIES:")
    end
  end

  describe "summary_only mode" do
    it "excludes headings when summary_only is true" do
      output = app.format_article(article, default_config.merge(summary_only: true))

      # Lead paragraph is kept
      expect(output).to include("is about")

      # Section headings are dropped
      expect(output).not_to include("Section One", "Section Two")
    end
  end

  describe "heading option" do
    it "excludes headings when heading is false" do
      output = app.format_article(article, default_config.merge(heading: false))

      # Paragraph content is kept
      expect(output).to include("is about")

      # Headings are dropped
      expect(output).not_to include("Section One")
    end
  end

  describe "redirect handling" do
    let(:redirect_wiki) { "#REDIRECT [[Target Article]]" }
    let(:redirect_article) { Wp2txt::Article.new(redirect_wiki, "Redirect Test") }

    it "excludes redirect by default" do
      output = app.format_article(redirect_article, default_config)

      expect(output).not_to include("REDIRECT", "Target Article")
    end

    it "includes redirect when redirect option is true" do
      redirect_config = default_config.merge(redirect: true, category: false)

      expect(app.format_article(redirect_article, redirect_config)).to include("REDIRECT")
    end
  end
end
|
|
810
|
+
|
|
811
|
+
RSpec.describe "End-to-end article processing" do
  include Wp2txt

  # Wikitext fixture mixing a template, inline markup, headings, a list,
  # a table, a <ref> tag and two category links.
  let(:complex_article) do
    <<~WIKI
      {{Infobox
      |name = Test
      }}
      '''Complex Article''' is a [[test]] with '''bold''' and ''italic''.

      == History ==
      The history section with a [[link|display text]].

      == Features ==
      * Feature one
      * Feature two

      {| class="wikitable"
      |-
      | Cell 1 || Cell 2
      |}

      == References ==
      <ref>Citation</ref>

      [[Category:Complex]]
      [[Category:Test Articles]]
    WIKI
  end

  it "correctly processes complex articles with categories" do
    parsed = Wp2txt::Article.new(complex_article, "Complex Article")

    # Elements were extracted
    expect(parsed.elements).not_to be_empty

    # Both categories were collected
    category_names = parsed.categories.flatten
    expect(category_names).to include("Complex")
    expect(category_names).to include("Test Articles")

    # Headings and paragraphs are both among the element types
    element_types = parsed.elements.map(&:first)
    expect(element_types).to include(:mw_heading)
    expect(element_types).to include(:mw_paragraph)
  end

  it "extracts body text correctly through format_wiki" do
    parsed = Wp2txt::Article.new(complex_article, "Complex Article")

    # Collect the paragraph elements
    paragraph_elements = parsed.elements.select { |type, _content| type == :mw_paragraph }
    expect(paragraph_elements).not_to be_empty

    # Format the content of the first paragraph
    _type, raw_text = paragraph_elements.first.first, paragraph_elements.first.last
    plain_text = format_wiki(raw_text)

    # The visible text survives
    expect(plain_text).to include("Complex Article")
    expect(plain_text).to include("test")

    # Raw wiki markup does not
    expect(plain_text).not_to include("'''")
    expect(plain_text).not_to include("[[")
  end
end
|