wp2txt 1.1.3 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.dockerignore +12 -0
- data/.github/workflows/ci.yml +13 -13
- data/.gitignore +14 -0
- data/CHANGELOG.md +284 -0
- data/DEVELOPMENT.md +415 -0
- data/DEVELOPMENT_ja.md +415 -0
- data/Dockerfile +19 -10
- data/Gemfile +2 -8
- data/README.md +259 -123
- data/README_ja.md +375 -0
- data/Rakefile +4 -0
- data/bin/wp2txt +863 -161
- data/lib/wp2txt/article.rb +98 -13
- data/lib/wp2txt/bz2_validator.rb +239 -0
- data/lib/wp2txt/category_cache.rb +313 -0
- data/lib/wp2txt/cli.rb +319 -0
- data/lib/wp2txt/cli_ui.rb +428 -0
- data/lib/wp2txt/config.rb +158 -0
- data/lib/wp2txt/constants.rb +134 -0
- data/lib/wp2txt/data/html_entities.json +2135 -0
- data/lib/wp2txt/data/language_metadata.json +4769 -0
- data/lib/wp2txt/data/language_tiers.json +59 -0
- data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
- data/lib/wp2txt/data/template_aliases.json +193 -0
- data/lib/wp2txt/data/wikipedia_entities.json +12 -0
- data/lib/wp2txt/extractor.rb +545 -0
- data/lib/wp2txt/file_utils.rb +91 -0
- data/lib/wp2txt/formatter.rb +352 -0
- data/lib/wp2txt/global_data_cache.rb +353 -0
- data/lib/wp2txt/index_cache.rb +258 -0
- data/lib/wp2txt/magic_words.rb +353 -0
- data/lib/wp2txt/memory_monitor.rb +236 -0
- data/lib/wp2txt/multistream.rb +1383 -0
- data/lib/wp2txt/output_writer.rb +182 -0
- data/lib/wp2txt/parser_functions.rb +606 -0
- data/lib/wp2txt/ractor_worker.rb +215 -0
- data/lib/wp2txt/regex.rb +396 -12
- data/lib/wp2txt/section_extractor.rb +354 -0
- data/lib/wp2txt/stream_processor.rb +271 -0
- data/lib/wp2txt/template_expander.rb +830 -0
- data/lib/wp2txt/text_processing.rb +337 -0
- data/lib/wp2txt/utils.rb +629 -270
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +53 -26
- data/scripts/benchmark_regex.rb +161 -0
- data/scripts/fetch_html_entities.rb +94 -0
- data/scripts/fetch_language_metadata.rb +180 -0
- data/scripts/fetch_mediawiki_data.rb +334 -0
- data/scripts/fetch_template_data.rb +186 -0
- data/scripts/profile_memory.rb +139 -0
- data/spec/article_spec.rb +402 -0
- data/spec/auto_download_spec.rb +314 -0
- data/spec/bz2_validator_spec.rb +193 -0
- data/spec/category_cache_spec.rb +226 -0
- data/spec/category_fetcher_spec.rb +504 -0
- data/spec/cleanup_spec.rb +197 -0
- data/spec/cli_options_spec.rb +678 -0
- data/spec/cli_spec.rb +876 -0
- data/spec/config_spec.rb +194 -0
- data/spec/constants_spec.rb +138 -0
- data/spec/file_utils_spec.rb +170 -0
- data/spec/fixtures/samples.rb +181 -0
- data/spec/formatter_sections_spec.rb +382 -0
- data/spec/global_data_cache_spec.rb +186 -0
- data/spec/index_cache_spec.rb +210 -0
- data/spec/integration_spec.rb +543 -0
- data/spec/magic_words_spec.rb +261 -0
- data/spec/markers_spec.rb +476 -0
- data/spec/memory_monitor_spec.rb +192 -0
- data/spec/multistream_spec.rb +690 -0
- data/spec/output_writer_spec.rb +400 -0
- data/spec/parser_functions_spec.rb +455 -0
- data/spec/ractor_worker_spec.rb +197 -0
- data/spec/regex_spec.rb +281 -0
- data/spec/section_extractor_spec.rb +397 -0
- data/spec/spec_helper.rb +63 -0
- data/spec/stream_processor_spec.rb +579 -0
- data/spec/template_data_spec.rb +246 -0
- data/spec/template_expander_spec.rb +472 -0
- data/spec/template_processing_spec.rb +217 -0
- data/spec/text_processing_spec.rb +312 -0
- data/spec/utils_spec.rb +195 -16
- data/spec/wp2txt_spec.rb +510 -0
- data/wp2txt.gemspec +5 -3
- metadata +146 -18
- data/.rubocop.yml +0 -80
- data/data/output_samples/testdata_en.txt +0 -23002
- data/data/output_samples/testdata_en_category.txt +0 -132
- data/data/output_samples/testdata_en_summary.txt +0 -1376
- data/data/output_samples/testdata_ja.txt +0 -22774
- data/data/output_samples/testdata_ja_category.txt +0 -206
- data/data/output_samples/testdata_ja_summary.txt +0 -1560
- data/data/testdata_en.bz2 +0 -0
- data/data/testdata_ja.bz2 +0 -0
- data/image/screenshot.png +0 -0
|
@@ -0,0 +1,678 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "spec_helper"
|
|
4
|
+
require_relative "../lib/wp2txt"
|
|
5
|
+
require_relative "../lib/wp2txt/utils"
|
|
6
|
+
|
|
7
|
+
# Simulate CLI app for testing options.
#
# Test-only stand-in that turns a parsed article into plain text according to
# a configuration hash. `format_wiki` is not defined here; it is presumably
# supplied by the Wp2txt mixin (confirm against lib/wp2txt).
class CLITestApp
  include Wp2txt

  # Default configuration matching bin/wp2txt defaults
  DEFAULT_CONFIG = {
    title: true,
    heading: true,
    list: false,
    table: false,
    redirect: false,
    category: true,
    category_only: false,
    summary_only: false,
    marker: true,
    extract_citations: false
  }.freeze

  # @return [Hash] an unfrozen shallow copy of DEFAULT_CONFIG that callers may
  #   merge overrides into
  def self.default_config
    DEFAULT_CONFIG.dup
  end

  # Formats +article+ using the layout selected by +config+.
  #
  # Layout priority: category-only output, then body plus a CATEGORIES
  # section (when categories are enabled and present), then body only.
  #
  # @param article [#title, #title=, #categories, #elements]
  # @param config [Hash] option flags keyed like DEFAULT_CONFIG
  # @return [String] the formatted text
  def format_article(article, config)
    # Side effect: the article's title is overwritten with its formatted form.
    article.title = format_wiki(article.title, config)

    if config[:category_only]
      format_category_only(article)
    elsif config[:category] && !article.categories.empty?
      format_with_categories(article, config)
    else
      format_full_article(article, config)
    end
  end

  # @return [String] "<title>\t<category>, <category>...\n"
  def format_category_only(article)
    "#{article.title}\t#{article.categories.join(", ")}\n"
  end

  # @return [String] the rendered body followed by a "CATEGORIES: ..." section,
  #   prefixed with the title header when config[:title] is truthy
  def format_with_categories(article, config)
    contents = render_article_elements(article, config)
    contents << "\nCATEGORIES: " << article.categories.join(", ") << "\n\n"
    with_title_header(article, contents, config)
  end

  # @return [String] the rendered body, prefixed with the title header when
  #   config[:title] is truthy
  def format_full_article(article, config)
    with_title_header(article, render_article_elements(article, config), config)
  end

  # Renders a single parsed element.
  #
  # @param element [Array] a two-item (type, content) pair
  # @param config [Hash] option flags keyed like DEFAULT_CONFIG
  # @return [String, nil] the text to emit, or nil when the element is
  #   filtered out by +config+ or its type is not rendered at all
  def process_element(element, config)
    type, content = element
    case type
    when :mw_heading
      return nil if config[:summary_only]
      return nil unless config[:heading]

      format_wiki(content, config) + "\n"
    when :mw_paragraph
      format_wiki(content, config) + "\n"
    when :mw_table, :mw_htable
      # Tables are emitted as-is (no format_wiki pass).
      content + "\n" if config[:table]
    when :mw_unordered, :mw_ordered, :mw_definition
      # List items are emitted as-is (no format_wiki pass).
      content + "\n" if config[:list]
    when :mw_redirect
      content + "\n\n" if config[:redirect]
    end
    # Every other type (including :mw_isolated_template and :mw_isolated_tag)
    # falls through the case and yields nil.
  end

  private

  # Concatenates the non-nil results of process_element for every element.
  def render_article_elements(article, config)
    article.elements.each_with_object(+"") do |element, body|
      line = process_element(element, config)
      body << line if line
    end
  end

  # Prepends the "[[title]]" header to +contents+ when config[:title] is truthy.
  def with_title_header(article, contents, config)
    config[:title] ? "\n[[#{article.title}]]\n\n" + contents : contents
  end
end
|
|
107
|
+
|
|
108
|
+
RSpec.describe "CLI Options" do
|
|
109
|
+
# Formatter under test.
let(:app) { CLITestApp.new }

# Single-line redirect page and the article parsed from it.
let(:redirect_wiki) { "#REDIRECT [[Target Page]]" }
let(:redirect_article) { Wp2txt::Article.new(redirect_wiki, "Redirect Source") }

# Standard test article with various elements: lead paragraph, headings,
# unordered and ordered list items, a table and two category links.
let(:full_article_wiki) do
  <<~WIKITEXT
    '''Test Article''' is about [[testing]] software.

    == Introduction ==
    This is the introduction paragraph.

    == Features ==
    The features section.

    * Feature one
    * Feature two
    # Step one
    # Step two

    {| class="wikitable"
    |-
    | Cell 1 || Cell 2
    |}

    == See Also ==
    Related content here.

    [[Category:Software]]
    [[Category:Testing]]
  WIKITEXT
end
let(:article) { Wp2txt::Article.new(full_article_wiki, "Test Article") }
|
|
144
|
+
|
|
145
|
+
describe "Default configuration values" do
  let(:defaults) { CLITestApp.default_config }

  # One example per option; each checks the exact boolean default.
  {
    title: true,
    heading: true,
    list: false,
    table: false,
    redirect: false,
    category: true,
    category_only: false,
    summary_only: false,
    marker: true,
    extract_citations: false
  }.each do |option, expected|
    it "#{option} defaults to #{expected}" do
      expect(defaults[option]).to be expected
    end
  end
end
|
|
188
|
+
|
|
189
|
+
describe "--title / -t option" do
  # Output of formatting the shared article with the context's settings.
  let(:rendered) { app.format_article(article, settings) }

  context "when title is true (default)" do
    let(:settings) { CLITestApp.default_config }

    it "includes article title in output" do
      expect(rendered).to include("[[Test Article]]")
    end
  end

  context "when title is false" do
    let(:settings) { CLITestApp.default_config.merge(title: false) }

    it "excludes article title from output" do
      expect(rendered).not_to include("[[Test Article]]")
    end

    it "still includes body content" do
      expect(rendered).to include("testing")
    end
  end
end
|
|
215
|
+
|
|
216
|
+
describe "--heading / -d option" do
  # Output of formatting the shared article with the context's settings.
  let(:rendered) { app.format_article(article, settings) }

  context "when heading is true (default)" do
    let(:settings) { CLITestApp.default_config }

    it "includes section headings in output" do
      expect(rendered).to include("Introduction", "Features", "See Also")
    end
  end

  context "when heading is false" do
    let(:settings) { CLITestApp.default_config.merge(heading: false) }

    it "excludes section headings from output" do
      expect(rendered).not_to include("Introduction")
      expect(rendered).not_to include("Features")
    end

    it "still includes paragraph content" do
      expect(rendered).to include("introduction paragraph")
    end
  end
end
|
|
245
|
+
|
|
246
|
+
describe "--list / -l option" do
  # Output of formatting the shared article with the context's settings.
  let(:rendered) { app.format_article(article, settings) }

  context "when list is false (default)" do
    let(:settings) { CLITestApp.default_config }

    it "excludes list items from output" do
      expect(rendered).not_to include("Feature one")
      expect(rendered).not_to include("Step one")
    end
  end

  context "when list is true" do
    let(:settings) { CLITestApp.default_config.merge(list: true) }

    it "includes unordered list items" do
      expect(rendered).to include("Feature one", "Feature two")
    end

    it "includes ordered list items" do
      expect(rendered).to include("Step one", "Step two")
    end
  end
end
|
|
275
|
+
|
|
276
|
+
describe "--table option" do
  # Output of formatting the shared article with the context's settings.
  let(:rendered) { app.format_article(article, settings) }

  context "when table is false (default)" do
    let(:settings) { CLITestApp.default_config }

    it "excludes table content from output" do
      # Table raw content should not appear
      expect(rendered).not_to include("Cell 1")
    end
  end

  context "when table is true" do
    let(:settings) { CLITestApp.default_config.merge(table: true) }

    it "includes table content in output" do
      expect(rendered).to include("Cell 1")
    end
  end
end
|
|
296
|
+
|
|
297
|
+
describe "--redirect / -e option" do
  # Both contexts format the redirect page with categories switched off.
  let(:rendered) { app.format_article(redirect_article, settings) }

  context "when redirect is false (default)" do
    let(:settings) { CLITestApp.default_config.merge(category: false) }

    it "excludes redirect information" do
      expect(rendered).not_to include("REDIRECT")
      expect(rendered).not_to include("Target Page")
    end
  end

  context "when redirect is true" do
    let(:settings) { CLITestApp.default_config.merge(redirect: true, category: false) }

    it "includes redirect information" do
      expect(rendered).to include("REDIRECT")
    end
  end
end
|
|
317
|
+
|
|
318
|
+
describe "--category / -a option" do
  # Output of formatting the shared article with the context's settings.
  let(:rendered) { app.format_article(article, settings) }

  context "when category is true (default)" do
    let(:settings) { CLITestApp.default_config }

    it "includes categories in output" do
      expect(rendered).to include("CATEGORIES:", "Software", "Testing")
    end

    it "also includes body text" do
      expect(rendered).to include("testing", "introduction paragraph")
    end
  end

  context "when category is false" do
    let(:settings) { CLITestApp.default_config.merge(category: false) }

    it "excludes categories section from output" do
      expect(rendered).not_to include("CATEGORIES:")
    end

    it "still includes body text" do
      expect(rendered).to include("testing")
    end
  end
end
|
|
354
|
+
|
|
355
|
+
describe "--category-only / -g option" do
  # Output of formatting the shared article with the context's settings.
  let(:rendered) { app.format_article(article, settings) }

  context "when category_only is false (default)" do
    let(:settings) { CLITestApp.default_config }

    it "includes full article content" do
      expect(rendered).to include("testing", "Introduction")
    end
  end

  context "when category_only is true" do
    let(:settings) { CLITestApp.default_config.merge(category_only: true) }

    it "outputs only title and categories" do
      expect(rendered).to include("Test Article", "Software", "Testing")
    end

    it "excludes body text" do
      expect(rendered).not_to include("introduction paragraph")
      expect(rendered).not_to include("Features")
    end

    it "uses tab-separated format" do
      expect(rendered).to include("\t")
    end
  end
end
|
|
392
|
+
|
|
393
|
+
describe "--summary-only / -s option" do
  # Output of formatting the shared article with the context's settings.
  let(:rendered) { app.format_article(article, settings) }

  context "when summary_only is false (default)" do
    let(:settings) { CLITestApp.default_config }

    it "includes all headings" do
      expect(rendered).to include("Introduction", "Features", "See Also")
    end
  end

  context "when summary_only is true" do
    let(:settings) { CLITestApp.default_config.merge(summary_only: true) }

    it "excludes section headings" do
      expect(rendered).not_to include("Introduction")
      expect(rendered).not_to include("Features")
    end

    it "includes first paragraph (summary)" do
      expect(rendered).to include("testing")
    end

    it "includes categories if category option is true" do
      expect(rendered).to include("CATEGORIES:")
    end
  end
end
|
|
429
|
+
|
|
430
|
+
describe "Option combinations" do
  # Formats the shared test article with the default configuration plus
  # the given overrides.
  def render_with(overrides)
    app.format_article(article, CLITestApp.default_config.merge(overrides))
  end

  it "category + title both false outputs only body" do
    text = render_with(category: false, title: false)

    expect(text).not_to include("[[Test Article]]")
    expect(text).not_to include("CATEGORIES:")
    expect(text).to include("testing")
  end

  it "summary_only + category outputs summary with categories" do
    text = render_with(summary_only: true, category: true)

    expect(text).to include("testing")
    expect(text).to include("CATEGORIES:")
    expect(text).not_to include("Introduction")
  end

  it "heading false + list true shows lists but not headings" do
    text = render_with(heading: false, list: true)

    expect(text).not_to include("Introduction")
    expect(text).to include("Feature one")
  end

  it "all content options enabled shows everything" do
    text = render_with(heading: true, list: true, table: true, redirect: true)

    expect(text).to include("Introduction")
    expect(text).to include("Feature one")
    expect(text).to include("Cell 1")
  end

  it "category_only takes precedence over other content options" do
    text = render_with(category_only: true, heading: true, list: true)

    # Should only have title and categories
    expect(text).to include("Test Article")
    expect(text).to include("Software")
    expect(text).not_to include("Introduction")
    expect(text).not_to include("Feature one")
  end
end
|
|
486
|
+
|
|
487
|
+
describe "Edge cases" do
  # Parses +wikitext+ into an article titled +title+ and formats it with the
  # default configuration plus +overrides+.
  def format_page(wikitext, title, overrides = {})
    page = Wp2txt::Article.new(wikitext, title)
    app.format_article(page, CLITestApp.default_config.merge(overrides))
  end

  it "handles article with no categories when category is true" do
    text = format_page("'''Simple''' article without categories.", "Simple")

    # Should use format_full_article (no CATEGORIES section)
    expect(text).to include("[[Simple]]")
    expect(text).to include("article without categories")
    expect(text).not_to include("CATEGORIES:")
  end

  it "handles empty article" do
    text = format_page("", "Empty", category: false)

    expect(text).to include("[[Empty]]")
  end

  it "handles article with only categories" do
    text = format_page("[[Category:Test]][[Category:Example]]", "Categories Only")

    expect(text).to include("CATEGORIES:")
    expect(text).to include("Test")
  end

  it "handles article with deeply nested markup" do
    # Parsing happens outside the expectation so that only the formatting
    # call is checked for exceptions.
    page = Wp2txt::Article.new("{{outer|{{inner|{{deep|content}}}}}} and [[link|[[nested]]]]", "Nested")
    settings = CLITestApp.default_config.merge(category: false)

    # Should not raise error
    expect { app.format_article(page, settings) }.not_to raise_error
  end

  it "handles article with special characters in title" do
    text = format_page("Content here.", "C++ Programming", category: false)

    expect(text).to include("C++ Programming")
  end

  it "handles Unicode content correctly" do
    text = format_page("'''日本語記事''' は [[テスト]] です。\n[[カテゴリ:日本語]]", "日本語")

    # NOTE(review): the title itself is "日本語", so this inclusion check may
    # be satisfied by the title header alone - confirm that is intended.
    expect(text).to include("日本語")
    expect(text.valid_encoding?).to be true
  end
end
|
|
549
|
+
|
|
550
|
+
describe "--extract-citations option" do
  include Wp2txt

  # Test with inline citations in paragraph text
  let(:inline_citation_wiki) do
    <<~WIKITEXT
      '''Test Article''' is about testing.

      The main source is {{cite book |last=Smith |first=John |title=The Book Title |year=2020}}.

      Another reference: {{cite web |title=Web Page |url=http://example.com |date=2021-05-15}}.
    WIKITEXT
  end

  let(:inline_citation_article) { Wp2txt::Article.new(inline_citation_wiki, "Test Article") }

  # Output of formatting the citation article with the context's settings.
  let(:rendered) { app.format_article(inline_citation_article, settings) }

  context "when extract_citations is false (default)" do
    let(:settings) { CLITestApp.default_config }

    it "removes citations from text" do
      expect(rendered).not_to include("Smith")
      expect(rendered).not_to include("The Book Title")
      expect(rendered).to include("The main source is")
    end
  end

  context "when extract_citations is true" do
    let(:settings) { CLITestApp.default_config.merge(extract_citations: true) }

    it "extracts formatted citations" do
      expect(rendered).to include("Smith", "The Book Title", "2020")
    end

    it "extracts multiple citations" do
      expect(rendered).to include("Smith", "Web Page")
    end
  end

  # Test format_wiki directly for [REFERENCES] marker
  describe "format_wiki with references" do
    it "replaces {{reflist}} with [REFERENCES] marker by default" do
      expect(format_wiki("Text\n{{reflist}}")).to include("[REFERENCES]")
    end

    it "replaces {{refbegin}}...{{refend}} with [REFERENCES] marker by default" do
      expect(format_wiki("{{refbegin}}\n* Citation\n{{refend}}")).to include("[REFERENCES]")
    end

    it "extracts citations when extract_citations is true" do
      formatted = format_wiki("{{cite book |last=Author |title=Book |year=2020}}", extract_citations: true)

      expect(formatted).to include("Author", "Book")
    end
  end
end
|
|
618
|
+
end
|
|
619
|
+
|
|
620
|
+
RSpec.describe "Article element type coverage" do
  include Wp2txt

  describe "All element types are parsed correctly" do
    # Element type => minimal wikitext expected to produce an element of
    # that type when parsed by Wp2txt::Article.
    {
      mw_heading: "== Heading ==",
      mw_paragraph: "Simple paragraph text.",
      mw_unordered: "* List item",
      mw_ordered: "# Numbered item",
      mw_definition: "; Term\n: Definition",
      mw_table: "{| class=\"wikitable\"\n|-\n| Cell\n|}",
      mw_redirect: "#REDIRECT [[Target]]",
      mw_blank: "Text\n\nMore text",
      mw_isolated_template: "{{Stub}}"
    }.each do |element_type, wikitext|
      it "detects :#{element_type}" do
        parsed = Wp2txt::Article.new(wikitext, "Test")
        detected_types = parsed.elements.map(&:first)

        expect(detected_types).to include(element_type)
      end
    end
  end
end
|