wp2txt 1.1.3 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.dockerignore +12 -0
- data/.github/workflows/ci.yml +13 -13
- data/.gitignore +14 -0
- data/CHANGELOG.md +284 -0
- data/DEVELOPMENT.md +415 -0
- data/DEVELOPMENT_ja.md +415 -0
- data/Dockerfile +19 -10
- data/Gemfile +2 -8
- data/README.md +259 -123
- data/README_ja.md +375 -0
- data/Rakefile +4 -0
- data/bin/wp2txt +863 -161
- data/lib/wp2txt/article.rb +98 -13
- data/lib/wp2txt/bz2_validator.rb +239 -0
- data/lib/wp2txt/category_cache.rb +313 -0
- data/lib/wp2txt/cli.rb +319 -0
- data/lib/wp2txt/cli_ui.rb +428 -0
- data/lib/wp2txt/config.rb +158 -0
- data/lib/wp2txt/constants.rb +134 -0
- data/lib/wp2txt/data/html_entities.json +2135 -0
- data/lib/wp2txt/data/language_metadata.json +4769 -0
- data/lib/wp2txt/data/language_tiers.json +59 -0
- data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
- data/lib/wp2txt/data/template_aliases.json +193 -0
- data/lib/wp2txt/data/wikipedia_entities.json +12 -0
- data/lib/wp2txt/extractor.rb +545 -0
- data/lib/wp2txt/file_utils.rb +91 -0
- data/lib/wp2txt/formatter.rb +352 -0
- data/lib/wp2txt/global_data_cache.rb +353 -0
- data/lib/wp2txt/index_cache.rb +258 -0
- data/lib/wp2txt/magic_words.rb +353 -0
- data/lib/wp2txt/memory_monitor.rb +236 -0
- data/lib/wp2txt/multistream.rb +1383 -0
- data/lib/wp2txt/output_writer.rb +182 -0
- data/lib/wp2txt/parser_functions.rb +606 -0
- data/lib/wp2txt/ractor_worker.rb +215 -0
- data/lib/wp2txt/regex.rb +396 -12
- data/lib/wp2txt/section_extractor.rb +354 -0
- data/lib/wp2txt/stream_processor.rb +271 -0
- data/lib/wp2txt/template_expander.rb +830 -0
- data/lib/wp2txt/text_processing.rb +337 -0
- data/lib/wp2txt/utils.rb +629 -270
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +53 -26
- data/scripts/benchmark_regex.rb +161 -0
- data/scripts/fetch_html_entities.rb +94 -0
- data/scripts/fetch_language_metadata.rb +180 -0
- data/scripts/fetch_mediawiki_data.rb +334 -0
- data/scripts/fetch_template_data.rb +186 -0
- data/scripts/profile_memory.rb +139 -0
- data/spec/article_spec.rb +402 -0
- data/spec/auto_download_spec.rb +314 -0
- data/spec/bz2_validator_spec.rb +193 -0
- data/spec/category_cache_spec.rb +226 -0
- data/spec/category_fetcher_spec.rb +504 -0
- data/spec/cleanup_spec.rb +197 -0
- data/spec/cli_options_spec.rb +678 -0
- data/spec/cli_spec.rb +876 -0
- data/spec/config_spec.rb +194 -0
- data/spec/constants_spec.rb +138 -0
- data/spec/file_utils_spec.rb +170 -0
- data/spec/fixtures/samples.rb +181 -0
- data/spec/formatter_sections_spec.rb +382 -0
- data/spec/global_data_cache_spec.rb +186 -0
- data/spec/index_cache_spec.rb +210 -0
- data/spec/integration_spec.rb +543 -0
- data/spec/magic_words_spec.rb +261 -0
- data/spec/markers_spec.rb +476 -0
- data/spec/memory_monitor_spec.rb +192 -0
- data/spec/multistream_spec.rb +690 -0
- data/spec/output_writer_spec.rb +400 -0
- data/spec/parser_functions_spec.rb +455 -0
- data/spec/ractor_worker_spec.rb +197 -0
- data/spec/regex_spec.rb +281 -0
- data/spec/section_extractor_spec.rb +397 -0
- data/spec/spec_helper.rb +63 -0
- data/spec/stream_processor_spec.rb +579 -0
- data/spec/template_data_spec.rb +246 -0
- data/spec/template_expander_spec.rb +472 -0
- data/spec/template_processing_spec.rb +217 -0
- data/spec/text_processing_spec.rb +312 -0
- data/spec/utils_spec.rb +195 -16
- data/spec/wp2txt_spec.rb +510 -0
- data/wp2txt.gemspec +5 -3
- metadata +146 -18
- data/.rubocop.yml +0 -80
- data/data/output_samples/testdata_en.txt +0 -23002
- data/data/output_samples/testdata_en_category.txt +0 -132
- data/data/output_samples/testdata_en_summary.txt +0 -1376
- data/data/output_samples/testdata_ja.txt +0 -22774
- data/data/output_samples/testdata_ja_category.txt +0 -206
- data/data/output_samples/testdata_ja_summary.txt +0 -1560
- data/data/testdata_en.bz2 +0 -0
- data/data/testdata_ja.bz2 +0 -0
- data/image/screenshot.png +0 -0
|
@@ -0,0 +1,455 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "spec_helper"
|
|
4
|
+
|
|
5
|
+
RSpec.describe Wp2txt::ParserFunctions do
|
|
6
|
+
let(:parser) { described_class.new }
|
|
7
|
+
|
|
8
|
+
describe "#if" do
|
|
9
|
+
it "returns then-value when condition is non-empty" do
|
|
10
|
+
expect(parser.evaluate("{{#if:yes|true|false}}")).to eq("true")
|
|
11
|
+
end
|
|
12
|
+
|
|
13
|
+
it "returns else-value when condition is empty" do
|
|
14
|
+
expect(parser.evaluate("{{#if:|true|false}}")).to eq("false")
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
it "returns else-value when condition is whitespace only" do
|
|
18
|
+
expect(parser.evaluate("{{#if: |true|false}}")).to eq("false")
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
it "returns then-value with text condition" do
|
|
22
|
+
expect(parser.evaluate("{{#if:something|yes|no}}")).to eq("yes")
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
it "returns empty when no else-value and condition is empty" do
|
|
26
|
+
expect(parser.evaluate("{{#if:|true}}")).to eq("")
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
it "handles nested #if" do
|
|
30
|
+
expect(parser.evaluate("{{#if:x|{{#if:y|inner|}}|outer}}")).to eq("inner")
|
|
31
|
+
end
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
describe "#ifeq" do
|
|
35
|
+
it "returns then-value when strings are equal" do
|
|
36
|
+
expect(parser.evaluate("{{#ifeq:foo|foo|equal|not equal}}")).to eq("equal")
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
it "returns else-value when strings differ" do
|
|
40
|
+
expect(parser.evaluate("{{#ifeq:foo|bar|equal|not equal}}")).to eq("not equal")
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
it "handles numeric comparison" do
|
|
44
|
+
expect(parser.evaluate("{{#ifeq:01|1|equal|not equal}}")).to eq("equal")
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
it "handles case-sensitive comparison" do
|
|
48
|
+
expect(parser.evaluate("{{#ifeq:Foo|foo|equal|not equal}}")).to eq("not equal")
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
it "trims whitespace in comparison" do
|
|
52
|
+
expect(parser.evaluate("{{#ifeq: foo |foo|equal|not equal}}")).to eq("equal")
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
it "returns empty when no else-value and not equal" do
|
|
56
|
+
expect(parser.evaluate("{{#ifeq:a|b|equal}}")).to eq("")
|
|
57
|
+
end
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
describe "#switch" do
|
|
61
|
+
it "returns matching case value" do
|
|
62
|
+
expect(parser.evaluate("{{#switch:b|a=first|b=second|c=third}}")).to eq("second")
|
|
63
|
+
end
|
|
64
|
+
|
|
65
|
+
it "returns default value when no match" do
|
|
66
|
+
expect(parser.evaluate("{{#switch:x|a=first|b=second|#default=none}}")).to eq("none")
|
|
67
|
+
end
|
|
68
|
+
|
|
69
|
+
it "returns last unnamed value as default" do
|
|
70
|
+
expect(parser.evaluate("{{#switch:x|a=first|b=second|fallback}}")).to eq("fallback")
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
it "returns empty when no match and no default" do
|
|
74
|
+
expect(parser.evaluate("{{#switch:x|a=first|b=second}}")).to eq("")
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
it "handles fall-through cases" do
|
|
78
|
+
expect(parser.evaluate("{{#switch:b|a|b|c=result}}")).to eq("result")
|
|
79
|
+
end
|
|
80
|
+
|
|
81
|
+
it "handles numeric matching" do
|
|
82
|
+
expect(parser.evaluate("{{#switch:2|1=one|2=two|3=three}}")).to eq("two")
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
it "trims whitespace in comparisons" do
|
|
86
|
+
expect(parser.evaluate("{{#switch: b |a=first| b =second}}")).to eq("second")
|
|
87
|
+
end
|
|
88
|
+
end
|
|
89
|
+
|
|
90
|
+
describe "#ifexpr" do
|
|
91
|
+
it "returns then-value when expression is non-zero" do
|
|
92
|
+
expect(parser.evaluate("{{#ifexpr:1|yes|no}}")).to eq("yes")
|
|
93
|
+
end
|
|
94
|
+
|
|
95
|
+
it "returns else-value when expression is zero" do
|
|
96
|
+
expect(parser.evaluate("{{#ifexpr:0|yes|no}}")).to eq("no")
|
|
97
|
+
end
|
|
98
|
+
|
|
99
|
+
it "evaluates simple arithmetic" do
|
|
100
|
+
expect(parser.evaluate("{{#ifexpr:2+2=4|yes|no}}")).to eq("yes")
|
|
101
|
+
end
|
|
102
|
+
|
|
103
|
+
it "evaluates comparison operators" do
|
|
104
|
+
expect(parser.evaluate("{{#ifexpr:5>3|yes|no}}")).to eq("yes")
|
|
105
|
+
end
|
|
106
|
+
|
|
107
|
+
it "handles negative results" do
|
|
108
|
+
expect(parser.evaluate("{{#ifexpr:3-5|yes|no}}")).to eq("yes")
|
|
109
|
+
end
|
|
110
|
+
end
|
|
111
|
+
|
|
112
|
+
describe "#expr" do
|
|
113
|
+
it "evaluates addition" do
|
|
114
|
+
expect(parser.evaluate("{{#expr:2+3}}")).to eq("5")
|
|
115
|
+
end
|
|
116
|
+
|
|
117
|
+
it "evaluates subtraction" do
|
|
118
|
+
expect(parser.evaluate("{{#expr:10-3}}")).to eq("7")
|
|
119
|
+
end
|
|
120
|
+
|
|
121
|
+
it "evaluates multiplication" do
|
|
122
|
+
expect(parser.evaluate("{{#expr:4*5}}")).to eq("20")
|
|
123
|
+
end
|
|
124
|
+
|
|
125
|
+
it "evaluates division" do
|
|
126
|
+
expect(parser.evaluate("{{#expr:20/4}}")).to eq("5")
|
|
127
|
+
end
|
|
128
|
+
|
|
129
|
+
it "evaluates modulo" do
|
|
130
|
+
expect(parser.evaluate("{{#expr:17 mod 5}}")).to eq("2")
|
|
131
|
+
end
|
|
132
|
+
|
|
133
|
+
it "evaluates parentheses" do
|
|
134
|
+
expect(parser.evaluate("{{#expr:(2+3)*4}}")).to eq("20")
|
|
135
|
+
end
|
|
136
|
+
|
|
137
|
+
it "evaluates power" do
|
|
138
|
+
expect(parser.evaluate("{{#expr:2^3}}")).to eq("8")
|
|
139
|
+
end
|
|
140
|
+
|
|
141
|
+
it "handles decimal results" do
|
|
142
|
+
result = parser.evaluate("{{#expr:10/3}}")
|
|
143
|
+
expect(result.to_f).to be_within(0.01).of(3.33)
|
|
144
|
+
end
|
|
145
|
+
|
|
146
|
+
it "handles comparison operators returning 1 or 0" do
|
|
147
|
+
expect(parser.evaluate("{{#expr:5>3}}")).to eq("1")
|
|
148
|
+
expect(parser.evaluate("{{#expr:5<3}}")).to eq("0")
|
|
149
|
+
end
|
|
150
|
+
|
|
151
|
+
it "handles equality comparison" do
|
|
152
|
+
expect(parser.evaluate("{{#expr:5=5}}")).to eq("1")
|
|
153
|
+
expect(parser.evaluate("{{#expr:5=6}}")).to eq("0")
|
|
154
|
+
end
|
|
155
|
+
|
|
156
|
+
it "handles and/or operators" do
|
|
157
|
+
expect(parser.evaluate("{{#expr:1 and 1}}")).to eq("1")
|
|
158
|
+
expect(parser.evaluate("{{#expr:1 and 0}}")).to eq("0")
|
|
159
|
+
expect(parser.evaluate("{{#expr:0 or 1}}")).to eq("1")
|
|
160
|
+
end
|
|
161
|
+
|
|
162
|
+
it "handles unary minus" do
|
|
163
|
+
expect(parser.evaluate("{{#expr:-5}}")).to eq("-5")
|
|
164
|
+
end
|
|
165
|
+
|
|
166
|
+
# An unparseable expression evaluates to an empty string; no error marker is emitted.
it "returns empty string for invalid expressions" do
  expect(parser.evaluate("{{#expr:invalid}}")).to eq("")
end
|
|
169
|
+
end
|
|
170
|
+
|
|
171
|
+
describe "#len" do
|
|
172
|
+
it "returns string length" do
|
|
173
|
+
expect(parser.evaluate("{{#len:hello}}")).to eq("5")
|
|
174
|
+
end
|
|
175
|
+
|
|
176
|
+
it "counts unicode characters" do
|
|
177
|
+
expect(parser.evaluate("{{#len:日本語}}")).to eq("3")
|
|
178
|
+
end
|
|
179
|
+
|
|
180
|
+
it "returns 0 for empty string" do
|
|
181
|
+
expect(parser.evaluate("{{#len:}}")).to eq("0")
|
|
182
|
+
end
|
|
183
|
+
end
|
|
184
|
+
|
|
185
|
+
describe "#pos" do
|
|
186
|
+
it "returns position of substring" do
|
|
187
|
+
expect(parser.evaluate("{{#pos:hello|l}}")).to eq("2")
|
|
188
|
+
end
|
|
189
|
+
|
|
190
|
+
it "returns empty when not found" do
|
|
191
|
+
expect(parser.evaluate("{{#pos:hello|x}}")).to eq("")
|
|
192
|
+
end
|
|
193
|
+
|
|
194
|
+
it "returns position of first occurrence" do
|
|
195
|
+
expect(parser.evaluate("{{#pos:hello|l}}")).to eq("2")
|
|
196
|
+
end
|
|
197
|
+
end
|
|
198
|
+
|
|
199
|
+
describe "#sub" do
|
|
200
|
+
it "extracts substring from start" do
|
|
201
|
+
expect(parser.evaluate("{{#sub:hello|0|3}}")).to eq("hel")
|
|
202
|
+
end
|
|
203
|
+
|
|
204
|
+
it "extracts substring from position" do
|
|
205
|
+
expect(parser.evaluate("{{#sub:hello|2|3}}")).to eq("llo")
|
|
206
|
+
end
|
|
207
|
+
|
|
208
|
+
it "handles negative start (from end)" do
|
|
209
|
+
expect(parser.evaluate("{{#sub:hello|-2}}")).to eq("lo")
|
|
210
|
+
end
|
|
211
|
+
|
|
212
|
+
it "handles length beyond string" do
|
|
213
|
+
expect(parser.evaluate("{{#sub:hello|0|100}}")).to eq("hello")
|
|
214
|
+
end
|
|
215
|
+
end
|
|
216
|
+
|
|
217
|
+
describe "#replace" do
|
|
218
|
+
it "replaces substring" do
|
|
219
|
+
expect(parser.evaluate("{{#replace:hello world|world|universe}}")).to eq("hello universe")
|
|
220
|
+
end
|
|
221
|
+
|
|
222
|
+
it "replaces all occurrences" do
|
|
223
|
+
expect(parser.evaluate("{{#replace:ababa|a|x}}")).to eq("xbxbx")
|
|
224
|
+
end
|
|
225
|
+
|
|
226
|
+
it "handles empty replacement" do
|
|
227
|
+
expect(parser.evaluate("{{#replace:hello|l|}}")).to eq("heo")
|
|
228
|
+
end
|
|
229
|
+
end
|
|
230
|
+
|
|
231
|
+
describe "#titleparts" do
|
|
232
|
+
it "extracts first part of title" do
|
|
233
|
+
expect(parser.evaluate("{{#titleparts:Talk:Foo/Bar/Baz|1}}")).to eq("Talk:Foo")
|
|
234
|
+
end
|
|
235
|
+
|
|
236
|
+
it "extracts multiple parts" do
|
|
237
|
+
expect(parser.evaluate("{{#titleparts:Talk:Foo/Bar/Baz|2}}")).to eq("Talk:Foo/Bar")
|
|
238
|
+
end
|
|
239
|
+
|
|
240
|
+
it "extracts from offset" do
|
|
241
|
+
expect(parser.evaluate("{{#titleparts:Talk:Foo/Bar/Baz|1|1}}")).to eq("Bar")
|
|
242
|
+
end
|
|
243
|
+
|
|
244
|
+
it "handles negative count (from end)" do
|
|
245
|
+
expect(parser.evaluate("{{#titleparts:Talk:Foo/Bar/Baz|-1}}")).to eq("Talk:Foo/Bar")
|
|
246
|
+
end
|
|
247
|
+
end
|
|
248
|
+
|
|
249
|
+
describe "#time" do
|
|
250
|
+
let(:parser_with_date) { described_class.new(reference_date: Time.new(2024, 6, 15, 10, 30, 45)) }
|
|
251
|
+
|
|
252
|
+
it "formats year" do
|
|
253
|
+
expect(parser_with_date.evaluate("{{#time:Y}}")).to eq("2024")
|
|
254
|
+
end
|
|
255
|
+
|
|
256
|
+
it "formats month name" do
|
|
257
|
+
expect(parser_with_date.evaluate("{{#time:F}}")).to eq("June")
|
|
258
|
+
end
|
|
259
|
+
|
|
260
|
+
it "formats day" do
|
|
261
|
+
expect(parser_with_date.evaluate("{{#time:j}}")).to eq("15")
|
|
262
|
+
end
|
|
263
|
+
|
|
264
|
+
it "formats full date" do
|
|
265
|
+
expect(parser_with_date.evaluate("{{#time:Y-m-d}}")).to eq("2024-06-15")
|
|
266
|
+
end
|
|
267
|
+
|
|
268
|
+
it "parses input date" do
|
|
269
|
+
expect(parser.evaluate("{{#time:Y|2020-05-15}}")).to eq("2020")
|
|
270
|
+
end
|
|
271
|
+
end
|
|
272
|
+
|
|
273
|
+
describe "integration with template_expander" do
|
|
274
|
+
include Wp2txt
|
|
275
|
+
|
|
276
|
+
it "expands parser functions in format_wiki" do
|
|
277
|
+
input = "Result: {{#if:yes|shown|hidden}}"
|
|
278
|
+
result = format_wiki(input, title: "Test", expand_templates: true)
|
|
279
|
+
expect(result).to include("Result: shown")
|
|
280
|
+
end
|
|
281
|
+
|
|
282
|
+
it "handles parser functions within templates" do
|
|
283
|
+
input = "{{#switch:2|1=one|2=two|3=three}}"
|
|
284
|
+
result = format_wiki(input, title: "Test", expand_templates: true)
|
|
285
|
+
expect(result).to eq("two")
|
|
286
|
+
end
|
|
287
|
+
|
|
288
|
+
it "handles nested parser functions and templates" do
|
|
289
|
+
input = "{{#if:yes|{{circa|1500}}|unknown}}"
|
|
290
|
+
result = format_wiki(input, title: "Test", expand_templates: true)
|
|
291
|
+
expect(result).to eq("c. 1500")
|
|
292
|
+
end
|
|
293
|
+
end
|
|
294
|
+
|
|
295
|
+
describe "edge cases" do
|
|
296
|
+
it "handles malformed parser function gracefully" do
|
|
297
|
+
expect(parser.evaluate("{{#if:}}")).to eq("")
|
|
298
|
+
end
|
|
299
|
+
|
|
300
|
+
it "handles unknown parser function" do
|
|
301
|
+
expect(parser.evaluate("{{#unknown:foo|bar}}")).to eq("")
|
|
302
|
+
end
|
|
303
|
+
|
|
304
|
+
it "handles deeply nested functions" do
|
|
305
|
+
result = parser.evaluate("{{#if:x|{{#ifeq:a|a|{{#switch:1|1=deep}}|}}|}}")
|
|
306
|
+
expect(result).to eq("deep")
|
|
307
|
+
end
|
|
308
|
+
|
|
309
|
+
it "preserves text around parser functions" do
|
|
310
|
+
expect(parser.evaluate("Before {{#if:x|middle|}} after")).to eq("Before middle after")
|
|
311
|
+
end
|
|
312
|
+
end
|
|
313
|
+
|
|
314
|
+
# New parser functions for WikiExtractor parity
|
|
315
|
+
describe "#iferror" do
|
|
316
|
+
it "returns then-value when input contains error class" do
|
|
317
|
+
expect(parser.evaluate("{{#iferror:<span class=\"error\">Error</span>|error found|no error}}")).to eq("error found")
|
|
318
|
+
end
|
|
319
|
+
|
|
320
|
+
it "returns else-value when input is normal" do
|
|
321
|
+
expect(parser.evaluate("{{#iferror:normal text|error|no error}}")).to eq("no error")
|
|
322
|
+
end
|
|
323
|
+
|
|
324
|
+
it "returns empty when no else-value and no error" do
|
|
325
|
+
expect(parser.evaluate("{{#iferror:normal text|error}}")).to eq("")
|
|
326
|
+
end
|
|
327
|
+
|
|
328
|
+
it "returns input when no then-value and no error" do
|
|
329
|
+
expect(parser.evaluate("{{#iferror:normal text}}")).to eq("normal text")
|
|
330
|
+
end
|
|
331
|
+
end
|
|
332
|
+
|
|
333
|
+
describe "#rpos" do
  # #rpos reports the zero-based index of the LAST occurrence of the needle.
  it "returns position of last occurrence" do
    expect(parser.evaluate("{{#rpos:abcabc|b}}")).to eq("4")
  end

  # Unlike #pos, which yields an empty string on a miss, #rpos yields "-1".
  it "returns -1 when not found" do
    expect(parser.evaluate("{{#rpos:hello|x}}")).to eq("-1")
  end

  # "hello" contains "l" at indices 2 and 3: #pos reports 2, #rpos reports 3.
  it "returns the last index when the needle is repeated (where #pos returns the first)" do
    expect(parser.evaluate("{{#rpos:hello|l}}")).to eq("3")
  end
end
|
|
346
|
+
|
|
347
|
+
describe "#count" do
|
|
348
|
+
it "counts occurrences of substring" do
|
|
349
|
+
expect(parser.evaluate("{{#count:abcabc|a}}")).to eq("2")
|
|
350
|
+
end
|
|
351
|
+
|
|
352
|
+
it "returns 0 when not found" do
|
|
353
|
+
expect(parser.evaluate("{{#count:hello|x}}")).to eq("0")
|
|
354
|
+
end
|
|
355
|
+
|
|
356
|
+
# "aaaa" holds three overlapping "aa" matches but only two non-overlapping ones;
# the expected value of 2 pins the non-overlapping count.
it "counts non-overlapping occurrences" do
  expect(parser.evaluate("{{#count:aaaa|aa}}")).to eq("2")
end
|
|
359
|
+
end
|
|
360
|
+
|
|
361
|
+
describe "#explode" do
|
|
362
|
+
it "splits and returns nth element" do
|
|
363
|
+
expect(parser.evaluate("{{#explode:a,b,c|,|1}}")).to eq("b")
|
|
364
|
+
end
|
|
365
|
+
|
|
366
|
+
it "returns first element by default" do
|
|
367
|
+
expect(parser.evaluate("{{#explode:a-b-c|-}}")).to eq("a")
|
|
368
|
+
end
|
|
369
|
+
|
|
370
|
+
it "handles negative index (from end)" do
|
|
371
|
+
expect(parser.evaluate("{{#explode:a,b,c|,|-1}}")).to eq("c")
|
|
372
|
+
end
|
|
373
|
+
|
|
374
|
+
it "returns empty for out of bounds" do
|
|
375
|
+
expect(parser.evaluate("{{#explode:a,b|,|5}}")).to eq("")
|
|
376
|
+
end
|
|
377
|
+
end
|
|
378
|
+
|
|
379
|
+
describe "#urldecode" do
|
|
380
|
+
it "decodes URL-encoded string" do
|
|
381
|
+
expect(parser.evaluate("{{#urldecode:Hello%20World}}")).to eq("Hello World")
|
|
382
|
+
end
|
|
383
|
+
|
|
384
|
+
it "decodes special characters" do
|
|
385
|
+
expect(parser.evaluate("{{#urldecode:%26%3D%3F}}")).to eq("&=?")
|
|
386
|
+
end
|
|
387
|
+
|
|
388
|
+
it "handles already decoded string" do
|
|
389
|
+
expect(parser.evaluate("{{#urldecode:hello}}")).to eq("hello")
|
|
390
|
+
end
|
|
391
|
+
end
|
|
392
|
+
|
|
393
|
+
describe "#urlencode" do
|
|
394
|
+
it "encodes string for URL" do
|
|
395
|
+
expect(parser.evaluate("{{#urlencode:Hello World}}")).to eq("Hello%20World")
|
|
396
|
+
end
|
|
397
|
+
|
|
398
|
+
it "encodes special characters" do
|
|
399
|
+
expect(parser.evaluate("{{#urlencode:a&b=c}}")).to eq("a%26b%3Dc")
|
|
400
|
+
end
|
|
401
|
+
end
|
|
402
|
+
|
|
403
|
+
describe "#padleft" do
|
|
404
|
+
it "pads string on left" do
|
|
405
|
+
expect(parser.evaluate("{{#padleft:7|3|0}}")).to eq("007")
|
|
406
|
+
end
|
|
407
|
+
|
|
408
|
+
it "does not truncate if already longer" do
|
|
409
|
+
expect(parser.evaluate("{{#padleft:hello|3|x}}")).to eq("hello")
|
|
410
|
+
end
|
|
411
|
+
|
|
412
|
+
it "uses space as default padding" do
|
|
413
|
+
expect(parser.evaluate("{{#padleft:a|3}}")).to eq(" a")
|
|
414
|
+
end
|
|
415
|
+
end
|
|
416
|
+
|
|
417
|
+
describe "#padright" do
|
|
418
|
+
it "pads string on right" do
|
|
419
|
+
expect(parser.evaluate("{{#padright:7|3|0}}")).to eq("700")
|
|
420
|
+
end
|
|
421
|
+
|
|
422
|
+
it "does not truncate if already longer" do
|
|
423
|
+
expect(parser.evaluate("{{#padright:hello|3|x}}")).to eq("hello")
|
|
424
|
+
end
|
|
425
|
+
end
|
|
426
|
+
|
|
427
|
+
describe "enhanced #time" do
|
|
428
|
+
let(:parser_with_date) { described_class.new(reference_date: Time.new(2024, 6, 15, 14, 30, 45)) }
|
|
429
|
+
|
|
430
|
+
it "formats 12-hour time" do
|
|
431
|
+
expect(parser_with_date.evaluate("{{#time:g:i a}}")).to eq("2:30 pm")
|
|
432
|
+
end
|
|
433
|
+
|
|
434
|
+
it "formats ISO week number" do
|
|
435
|
+
expect(parser_with_date.evaluate("{{#time:W}}")).to eq("24")
|
|
436
|
+
end
|
|
437
|
+
|
|
438
|
+
it "formats day of week" do
|
|
439
|
+
expect(parser_with_date.evaluate("{{#time:l}}")).to eq("Saturday")
|
|
440
|
+
end
|
|
441
|
+
|
|
442
|
+
it "formats short day of week" do
|
|
443
|
+
expect(parser_with_date.evaluate("{{#time:D}}")).to eq("Sat")
|
|
444
|
+
end
|
|
445
|
+
|
|
446
|
+
it "formats ordinal day suffix" do
|
|
447
|
+
expect(parser_with_date.evaluate("{{#time:jS}}")).to eq("15th")
|
|
448
|
+
end
|
|
449
|
+
|
|
450
|
+
it "formats timezone" do
|
|
451
|
+
result = parser_with_date.evaluate("{{#time:T}}")
|
|
452
|
+
expect(result).not_to be_empty
|
|
453
|
+
end
|
|
454
|
+
end
|
|
455
|
+
end
|
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "spec_helper"
|
|
4
|
+
require "wp2txt/ractor_worker"
|
|
5
|
+
|
|
6
|
+
RSpec.describe Wp2txt::RactorWorker do
|
|
7
|
+
describe "MINIMUM_RUBY_VERSION" do
|
|
8
|
+
it "is set to 4.0" do
|
|
9
|
+
expect(described_class::MINIMUM_RUBY_VERSION).to eq("4.0")
|
|
10
|
+
end
|
|
11
|
+
end
|
|
12
|
+
|
|
13
|
+
describe "OPERATIONS" do
|
|
14
|
+
it "includes expected operations" do
|
|
15
|
+
expect(described_class::OPERATIONS).to include(:process_article)
|
|
16
|
+
expect(described_class::OPERATIONS).to include(:double)
|
|
17
|
+
expect(described_class::OPERATIONS).to include(:fib)
|
|
18
|
+
end
|
|
19
|
+
|
|
20
|
+
it "does not include removed operations" do
|
|
21
|
+
expect(described_class::OPERATIONS).not_to include(:regex_transform)
|
|
22
|
+
expect(described_class::OPERATIONS).not_to include(:format_wiki)
|
|
23
|
+
end
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
describe ".ruby_version_sufficient?" do
|
|
27
|
+
it "returns boolean based on Ruby version" do
|
|
28
|
+
if Gem::Version.new(RUBY_VERSION) < Gem::Version.new("4.0")
|
|
29
|
+
expect(described_class.ruby_version_sufficient?).to be false
|
|
30
|
+
else
|
|
31
|
+
expect(described_class.ruby_version_sufficient?).to be true
|
|
32
|
+
end
|
|
33
|
+
end
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
describe ".available?" do
|
|
37
|
+
it "returns a boolean" do
|
|
38
|
+
result = described_class.available?
|
|
39
|
+
expect([true, false]).to include(result)
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
it "caches the result" do
|
|
43
|
+
result1 = described_class.available?
|
|
44
|
+
result2 = described_class.available?
|
|
45
|
+
expect(result1).to eq(result2)
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
# Only meaningful on Ruby < 4.0. On newer Rubies the example is reported as
# skipped instead of passing without running any expectation.
it "returns false on Ruby < 4.0" do
  skip "requires Ruby < 4.0" unless Gem::Version.new(RUBY_VERSION) < Gem::Version.new("4.0")

  # Drop @available (presumably the memoized answer) so the check is recomputed here.
  if described_class.instance_variable_defined?(:@available)
    described_class.remove_instance_variable(:@available)
  end
  expect(described_class.available?).to be false
end
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
describe ".optimal_workers" do
|
|
59
|
+
it "returns a positive integer" do
|
|
60
|
+
result = described_class.optimal_workers
|
|
61
|
+
expect(result).to be_a(Integer)
|
|
62
|
+
expect(result).to be >= 1
|
|
63
|
+
end
|
|
64
|
+
|
|
65
|
+
it "does not exceed CPU count" do
|
|
66
|
+
result = described_class.optimal_workers
|
|
67
|
+
expect(result).to be <= Etc.nprocessors
|
|
68
|
+
end
|
|
69
|
+
end
|
|
70
|
+
|
|
71
|
+
describe ".deep_freeze" do
|
|
72
|
+
it "freezes a hash" do
|
|
73
|
+
hash = { a: 1, b: "hello" }
|
|
74
|
+
frozen = described_class.deep_freeze(hash)
|
|
75
|
+
expect(frozen).to be_frozen
|
|
76
|
+
expect(frozen[:b]).to be_frozen
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
it "freezes nested structures" do
|
|
80
|
+
nested = { a: [1, 2, { b: "c" }] }
|
|
81
|
+
frozen = described_class.deep_freeze(nested)
|
|
82
|
+
expect(frozen).to be_frozen
|
|
83
|
+
expect(frozen[:a]).to be_frozen
|
|
84
|
+
expect(frozen[:a][2]).to be_frozen
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
it "handles already frozen objects" do
|
|
88
|
+
str = "hello".freeze
|
|
89
|
+
expect { described_class.deep_freeze(str) }.not_to raise_error
|
|
90
|
+
end
|
|
91
|
+
end
|
|
92
|
+
|
|
93
|
+
describe ".process_single" do
|
|
94
|
+
it "processes :double operation" do
|
|
95
|
+
result = described_class.process_single(5, :double, {})
|
|
96
|
+
expect(result).to eq(10)
|
|
97
|
+
end
|
|
98
|
+
|
|
99
|
+
it "processes :fib operation" do
|
|
100
|
+
result = described_class.process_single(10, :fib, {})
|
|
101
|
+
expect(result).to eq(55)
|
|
102
|
+
end
|
|
103
|
+
|
|
104
|
+
it "raises error for unknown operation" do
|
|
105
|
+
expect {
|
|
106
|
+
described_class.process_single(1, :unknown_op, {})
|
|
107
|
+
}.to raise_error(/Unknown operation/)
|
|
108
|
+
end
|
|
109
|
+
end
|
|
110
|
+
|
|
111
|
+
describe ".parallel_process" do
|
|
112
|
+
context "with simple operations" do
|
|
113
|
+
it "processes items with :double operation" do
|
|
114
|
+
items = [1, 2, 3, 4, 5]
|
|
115
|
+
results = described_class.parallel_process(
|
|
116
|
+
items,
|
|
117
|
+
operation: :double,
|
|
118
|
+
config: {},
|
|
119
|
+
num_workers: 2
|
|
120
|
+
)
|
|
121
|
+
expect(results).to eq([2, 4, 6, 8, 10])
|
|
122
|
+
end
|
|
123
|
+
|
|
124
|
+
it "returns empty array for empty input" do
|
|
125
|
+
results = described_class.parallel_process(
|
|
126
|
+
[],
|
|
127
|
+
operation: :double,
|
|
128
|
+
config: {}
|
|
129
|
+
)
|
|
130
|
+
expect(results).to eq([])
|
|
131
|
+
end
|
|
132
|
+
|
|
133
|
+
it "handles single item (falls back to sequential)" do
|
|
134
|
+
results = described_class.parallel_process(
|
|
135
|
+
[5],
|
|
136
|
+
operation: :double,
|
|
137
|
+
config: {}
|
|
138
|
+
)
|
|
139
|
+
expect(results).to eq([10])
|
|
140
|
+
end
|
|
141
|
+
|
|
142
|
+
it "preserves result order" do
|
|
143
|
+
items = [5, 3, 7, 1, 9]
|
|
144
|
+
results = described_class.parallel_process(
|
|
145
|
+
items,
|
|
146
|
+
operation: :double,
|
|
147
|
+
config: {},
|
|
148
|
+
num_workers: 4
|
|
149
|
+
)
|
|
150
|
+
expect(results).to eq([10, 6, 14, 2, 18])
|
|
151
|
+
end
|
|
152
|
+
end
|
|
153
|
+
|
|
154
|
+
context "with process_article operation" do
|
|
155
|
+
let(:config) { { format: :text, title: true, heading: true, category: true } }
|
|
156
|
+
|
|
157
|
+
it "processes multiple articles" do
|
|
158
|
+
items = [
|
|
159
|
+
["Article1", "Text one. [[Category:C1]]", false],
|
|
160
|
+
["Article2", "Text two. [[Category:C2]]", false]
|
|
161
|
+
]
|
|
162
|
+
results = described_class.parallel_process(
|
|
163
|
+
items,
|
|
164
|
+
operation: :process_article,
|
|
165
|
+
config: config,
|
|
166
|
+
num_workers: 2
|
|
167
|
+
)
|
|
168
|
+
expect(results.size).to eq(2)
|
|
169
|
+
expect(results.compact.size).to eq(2)
|
|
170
|
+
expect(results[0]).to include("Article1")
|
|
171
|
+
expect(results[1]).to include("Article2")
|
|
172
|
+
end
|
|
173
|
+
end
|
|
174
|
+
end
|
|
175
|
+
|
|
176
|
+
describe ".process_articles" do
|
|
177
|
+
let(:config) { { format: :text, title: true, heading: true, category: true } }
|
|
178
|
+
|
|
179
|
+
it "processes pages as [title, text] pairs" do
|
|
180
|
+
pages = [
|
|
181
|
+
["Test1", "Content one. [[Category:Cat]]"],
|
|
182
|
+
["Test2", "Content two. [[Category:Cat]]"]
|
|
183
|
+
]
|
|
184
|
+
results = described_class.process_articles(pages, config: config, num_workers: 2)
|
|
185
|
+
expect(results.size).to eq(2)
|
|
186
|
+
expect(results.compact.size).to eq(2)
|
|
187
|
+
end
|
|
188
|
+
|
|
189
|
+
it "includes article titles in output" do
|
|
190
|
+
pages = [
|
|
191
|
+
["MyTitle", "Some content here."]
|
|
192
|
+
]
|
|
193
|
+
results = described_class.process_articles(pages, config: config)
|
|
194
|
+
expect(results.first).to include("MyTitle")
|
|
195
|
+
end
|
|
196
|
+
end
|
|
197
|
+
end
|