wp2txt 1.1.3 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.dockerignore +12 -0
- data/.github/workflows/ci.yml +13 -13
- data/.gitignore +14 -0
- data/CHANGELOG.md +284 -0
- data/DEVELOPMENT.md +415 -0
- data/DEVELOPMENT_ja.md +415 -0
- data/Dockerfile +19 -10
- data/Gemfile +2 -8
- data/README.md +259 -123
- data/README_ja.md +375 -0
- data/Rakefile +4 -0
- data/bin/wp2txt +863 -161
- data/lib/wp2txt/article.rb +98 -13
- data/lib/wp2txt/bz2_validator.rb +239 -0
- data/lib/wp2txt/category_cache.rb +313 -0
- data/lib/wp2txt/cli.rb +319 -0
- data/lib/wp2txt/cli_ui.rb +428 -0
- data/lib/wp2txt/config.rb +158 -0
- data/lib/wp2txt/constants.rb +134 -0
- data/lib/wp2txt/data/html_entities.json +2135 -0
- data/lib/wp2txt/data/language_metadata.json +4769 -0
- data/lib/wp2txt/data/language_tiers.json +59 -0
- data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
- data/lib/wp2txt/data/template_aliases.json +193 -0
- data/lib/wp2txt/data/wikipedia_entities.json +12 -0
- data/lib/wp2txt/extractor.rb +545 -0
- data/lib/wp2txt/file_utils.rb +91 -0
- data/lib/wp2txt/formatter.rb +352 -0
- data/lib/wp2txt/global_data_cache.rb +353 -0
- data/lib/wp2txt/index_cache.rb +258 -0
- data/lib/wp2txt/magic_words.rb +353 -0
- data/lib/wp2txt/memory_monitor.rb +236 -0
- data/lib/wp2txt/multistream.rb +1383 -0
- data/lib/wp2txt/output_writer.rb +182 -0
- data/lib/wp2txt/parser_functions.rb +606 -0
- data/lib/wp2txt/ractor_worker.rb +215 -0
- data/lib/wp2txt/regex.rb +396 -12
- data/lib/wp2txt/section_extractor.rb +354 -0
- data/lib/wp2txt/stream_processor.rb +271 -0
- data/lib/wp2txt/template_expander.rb +830 -0
- data/lib/wp2txt/text_processing.rb +337 -0
- data/lib/wp2txt/utils.rb +629 -270
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +53 -26
- data/scripts/benchmark_regex.rb +161 -0
- data/scripts/fetch_html_entities.rb +94 -0
- data/scripts/fetch_language_metadata.rb +180 -0
- data/scripts/fetch_mediawiki_data.rb +334 -0
- data/scripts/fetch_template_data.rb +186 -0
- data/scripts/profile_memory.rb +139 -0
- data/spec/article_spec.rb +402 -0
- data/spec/auto_download_spec.rb +314 -0
- data/spec/bz2_validator_spec.rb +193 -0
- data/spec/category_cache_spec.rb +226 -0
- data/spec/category_fetcher_spec.rb +504 -0
- data/spec/cleanup_spec.rb +197 -0
- data/spec/cli_options_spec.rb +678 -0
- data/spec/cli_spec.rb +876 -0
- data/spec/config_spec.rb +194 -0
- data/spec/constants_spec.rb +138 -0
- data/spec/file_utils_spec.rb +170 -0
- data/spec/fixtures/samples.rb +181 -0
- data/spec/formatter_sections_spec.rb +382 -0
- data/spec/global_data_cache_spec.rb +186 -0
- data/spec/index_cache_spec.rb +210 -0
- data/spec/integration_spec.rb +543 -0
- data/spec/magic_words_spec.rb +261 -0
- data/spec/markers_spec.rb +476 -0
- data/spec/memory_monitor_spec.rb +192 -0
- data/spec/multistream_spec.rb +690 -0
- data/spec/output_writer_spec.rb +400 -0
- data/spec/parser_functions_spec.rb +455 -0
- data/spec/ractor_worker_spec.rb +197 -0
- data/spec/regex_spec.rb +281 -0
- data/spec/section_extractor_spec.rb +397 -0
- data/spec/spec_helper.rb +63 -0
- data/spec/stream_processor_spec.rb +579 -0
- data/spec/template_data_spec.rb +246 -0
- data/spec/template_expander_spec.rb +472 -0
- data/spec/template_processing_spec.rb +217 -0
- data/spec/text_processing_spec.rb +312 -0
- data/spec/utils_spec.rb +195 -16
- data/spec/wp2txt_spec.rb +510 -0
- data/wp2txt.gemspec +5 -3
- metadata +146 -18
- data/.rubocop.yml +0 -80
- data/data/output_samples/testdata_en.txt +0 -23002
- data/data/output_samples/testdata_en_category.txt +0 -132
- data/data/output_samples/testdata_en_summary.txt +0 -1376
- data/data/output_samples/testdata_ja.txt +0 -22774
- data/data/output_samples/testdata_ja_category.txt +0 -206
- data/data/output_samples/testdata_ja_summary.txt +0 -1560
- data/data/testdata_en.bz2 +0 -0
- data/data/testdata_ja.bz2 +0 -0
- data/image/screenshot.png +0 -0
|
@@ -0,0 +1,476 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "spec_helper"
|
|
4
|
+
|
|
5
|
+
# Specs for the placeholder markers (e.g. [MATH], [CODE], [TABLE]) expected in
# the output of format_wiki in place of special wiki content.
# NOTE(review): format_wiki is presumably supplied by the included Wp2txt
# module — confirm against the library.
RSpec.describe "Wp2txt Markers" do
  include Wp2txt

  # The marker type lists must partition MARKER_TYPES into inline and block.
  describe "marker classification constants" do
    it "defines INLINE_MARKERS" do
      expect(Wp2txt::INLINE_MARKERS).to be_a(Array)
      expect(Wp2txt::INLINE_MARKERS).to include(:math, :chem, :ipa, :code)
    end

    it "defines BLOCK_MARKERS" do
      expect(Wp2txt::BLOCK_MARKERS).to be_a(Array)
      expect(Wp2txt::BLOCK_MARKERS).to include(:table, :infobox, :navbox, :codeblock)
    end

    it "has no overlap between inline and block markers" do
      overlap = Wp2txt::INLINE_MARKERS & Wp2txt::BLOCK_MARKERS
      expect(overlap).to be_empty
    end

    it "includes all marker types in either inline or block" do
      all_classified = Wp2txt::INLINE_MARKERS + Wp2txt::BLOCK_MARKERS
      expect(all_classified.sort).to eq(Wp2txt::MARKER_TYPES.sort)
    end
  end

  # Default behavior: markers are ON
  describe "marker replacement (default: enabled)" do
    describe "MATH marker" do
      it "replaces <math> tags with [MATH]" do
        input = "The equation <math>E = mc^2</math> is famous."
        result = format_wiki(input)
        expect(result).to include("[MATH]")
        expect(result).not_to include("<math>")
        expect(result).not_to include("E = mc^2")
      end

      it "replaces {{math}} templates with [MATH]" do
        input = "The formula {{math|x^2 + y^2 = z^2}} is well known."
        result = format_wiki(input)
        expect(result).to include("[MATH]")
        expect(result).not_to include("{{math")
      end

      it "replaces {{mvar}} templates with [MATH]" do
        input = "Let {{mvar|n}} be an integer."
        result = format_wiki(input)
        expect(result).to include("[MATH]")
      end
    end

    describe "CODE marker (inline)" do
      it "replaces <code> tags with [CODE]" do
        input = "Use <code>printf()</code> to print."
        result = format_wiki(input)
        expect(result).to include("[CODE]")
        expect(result).not_to include("<code>")
      end

      it "handles inline code in sentence" do
        input = "The variable <code>x</code> and <code>y</code> are integers."
        result = format_wiki(input)
        expect(result).to include("[CODE]")
        # Neither of the two <code> elements may leak raw markup.
        expect(result).not_to include("<code>")
        expect(result).to include("are integers")
      end
    end

    describe "CODEBLOCK marker (block)" do
      it "replaces <syntaxhighlight> tags with [CODEBLOCK]" do
        input = "<syntaxhighlight lang=\"python\">def hello():\n print('Hello')</syntaxhighlight>"
        result = format_wiki(input)
        expect(result).to include("[CODEBLOCK]")
        expect(result).not_to include("<syntaxhighlight")
        expect(result).not_to include("[CODE]")
      end

      it "replaces <source> tags with [CODEBLOCK]" do
        input = "<source lang=\"ruby\">puts 'hello'</source>"
        result = format_wiki(input)
        expect(result).to include("[CODEBLOCK]")
        expect(result).not_to include("<source")
        expect(result).not_to include("[CODE]")
      end

      it "replaces <pre> tags with [CODEBLOCK]" do
        input = "<pre>some preformatted code</pre>"
        result = format_wiki(input)
        expect(result).to include("[CODEBLOCK]")
        expect(result).not_to include("<pre>")
        expect(result).not_to include("[CODE]")
      end

      it "handles multiple codeblocks" do
        input = "<syntaxhighlight>code1</syntaxhighlight>\n\n<source>code2</source>"
        result = format_wiki(input)
        expect(result.scan("[CODEBLOCK]").count).to eq(2)
      end
    end

    describe "CHEM marker" do
      it "replaces <chem> tags with [CHEM]" do
        input = "Water is <chem>H2O</chem>."
        result = format_wiki(input)
        expect(result).to include("[CHEM]")
        expect(result).not_to include("<chem>")
      end

      it "replaces {{chem}} templates with [CHEM]" do
        input = "The reaction produces {{chem|CO|2}}."
        result = format_wiki(input)
        expect(result).to include("[CHEM]")
      end

      it "replaces {{ce}} templates with [CHEM]" do
        input = "Salt is {{ce|NaCl}}."
        result = format_wiki(input)
        expect(result).to include("[CHEM]")
      end
    end

    describe "TABLE marker" do
      it "replaces wiki tables with [TABLE]" do
        input = "Data:\n{| class=\"wikitable\"\n|-\n! Header\n|-\n| Cell\n|}\nMore text."
        result = format_wiki(input)
        expect(result).to include("[TABLE]")
        expect(result).not_to include("{|")
        expect(result).not_to include("|}")
      end

      it "replaces <table> tags with [TABLE]" do
        input = "Data: <table><tr><td>Cell</td></tr></table> more."
        result = format_wiki(input)
        expect(result).to include("[TABLE]")
        expect(result).not_to include("<table>")
      end
    end

    describe "SCORE marker" do
      it "replaces <score> tags with [SCORE]" do
        input = "The melody: <score>\\relative c' { c d e f g }</score>"
        result = format_wiki(input)
        expect(result).to include("[SCORE]")
        expect(result).not_to include("<score>")
      end
    end

    describe "TIMELINE marker" do
      it "replaces <timeline> tags with [TIMELINE]" do
        input = "History:\n<timeline>\nImageSize = width:800\n</timeline>\nEnd."
        result = format_wiki(input)
        expect(result).to include("[TIMELINE]")
        expect(result).not_to include("<timeline>")
      end
    end

    describe "GRAPH marker" do
      it "replaces <graph> tags with [GRAPH]" do
        input = "Chart: <graph>{\"data\": []}</graph> shown above."
        result = format_wiki(input)
        expect(result).to include("[GRAPH]")
        expect(result).not_to include("<graph>")
      end
    end

    describe "IPA marker" do
      it "replaces {{IPA}} templates with [IPA]" do
        input = "Pronounced {{IPA|/həˈloʊ/}}."
        result = format_wiki(input)
        expect(result).to include("[IPA]")
      end

      it "replaces {{IPAc-en}} templates with [IPA]" do
        input = "Say {{IPAc-en|ˈ|h|ɛ|l|oʊ}}."
        result = format_wiki(input)
        expect(result).to include("[IPA]")
      end
    end

    describe "INFOBOX marker" do
      it "replaces {{Infobox}} templates with [INFOBOX]" do
        input = "{{Infobox person\n|name = John\n|birth_date = 1990\n}}\nJohn is a person."
        result = format_wiki(input)
        expect(result).to include("[INFOBOX]")
        expect(result).not_to include("{{Infobox")
        expect(result).not_to include("name = John")
      end

      it "handles nested templates in infobox" do
        input = "{{Infobox country\n|name = {{flag|Japan}}\n|capital = Tokyo\n}}"
        result = format_wiki(input)
        expect(result).to include("[INFOBOX]")
        expect(result).not_to include("{{Infobox")
      end
    end

    describe "NAVBOX marker" do
      it "replaces {{Navbox}} templates with [NAVBOX]" do
        input = "Text\n{{Navbox\n|title = Navigation\n|list1 = Item1\n}}"
        result = format_wiki(input)
        expect(result).to include("[NAVBOX]")
        expect(result).not_to include("{{Navbox")
      end
    end

    describe "GALLERY marker" do
      it "replaces <gallery> tags with [GALLERY]" do
        input = "Images:\n<gallery>\nFile:Test.jpg|Caption\nFile:Test2.jpg|Caption2\n</gallery>"
        result = format_wiki(input)
        expect(result).to include("[GALLERY]")
        expect(result).not_to include("<gallery>")
      end
    end

    describe "SIDEBAR marker" do
      it "replaces {{Sidebar}} templates with [SIDEBAR]" do
        input = "{{Sidebar\n|title = Test\n|content = Text\n}}"
        result = format_wiki(input)
        expect(result).to include("[SIDEBAR]")
        expect(result).not_to include("{{Sidebar")
      end
    end

    describe "MAPFRAME marker" do
      it "replaces <mapframe> tags with [MAPFRAME]" do
        input = "Map: <mapframe latitude=\"51.5\" longitude=\"-0.1\">data</mapframe>"
        result = format_wiki(input)
        expect(result).to include("[MAPFRAME]")
        expect(result).not_to include("<mapframe")
      end
    end

    describe "IMAGEMAP marker" do
      it "replaces <imagemap> tags with [IMAGEMAP]" do
        input = "<imagemap>\nImage:Test.png|100px\nrect 0 0 100 100 [[Link]]\n</imagemap>"
        result = format_wiki(input)
        expect(result).to include("[IMAGEMAP]")
        expect(result).not_to include("<imagemap>")
      end
    end

    describe "REFERENCES marker" do
      it "replaces {{reflist}} templates with [REFERENCES]" do
        input = "Text with citations.\n\n== References ==\n{{reflist}}"
        result = format_wiki(input)
        expect(result).to include("[REFERENCES]")
        expect(result).not_to include("{{reflist")
      end

      it "replaces {{Reflist}} with parameters with [REFERENCES]" do
        input = "== References ==\n{{Reflist|30em}}"
        result = format_wiki(input)
        expect(result).to include("[REFERENCES]")
      end

      it "replaces <references/> self-closing tag with [REFERENCES]" do
        input = "== References ==\n<references/>"
        result = format_wiki(input)
        expect(result).to include("[REFERENCES]")
        expect(result).not_to include("<references")
      end

      it "replaces <references>...</references> tag with [REFERENCES]" do
        input = "== References ==\n<references>\n<ref name=\"test\">Content</ref>\n</references>"
        result = format_wiki(input)
        expect(result).to include("[REFERENCES]")
        expect(result).not_to include("<references>")
      end

      it "replaces {{refbegin}}...{{refend}} blocks with [REFERENCES]" do
        input = "== Bibliography ==\n{{refbegin}}\n* Book one\n* Book two\n{{refend}}"
        result = format_wiki(input)
        expect(result).to include("[REFERENCES]")
        expect(result).not_to include("{{refbegin")
        expect(result).not_to include("{{refend")
        expect(result).not_to include("Book one")
      end

      it "handles {{refbegin}} with parameters" do
        input = "{{refbegin|30em|indent=yes}}\n* Citation\n{{refend}}"
        result = format_wiki(input)
        expect(result).to include("[REFERENCES]")
        expect(result).not_to include("Citation")
      end
    end

    # With extract_citations: true the key citation fields are expected to
    # remain in the output; without it the citation text is expected to go.
    describe "Citation extraction (extract_citations option)" do
      it "extracts author, title, year from {{cite book}}" do
        input = "{{cite book |last=Smith |first=John |title=The Book Title |year=2020}}"
        result = format_wiki(input, extract_citations: true)
        expect(result).to include("Smith")
        expect(result).to include("The Book Title")
        expect(result).to include("2020")
      end

      it "extracts from {{cite web}}" do
        input = "{{cite web |title=Web Page Title |url=http://example.com |date=2021-05-15}}"
        result = format_wiki(input, extract_citations: true)
        expect(result).to include("Web Page Title")
        expect(result).to include("2021")
      end

      it "extracts from {{cite news}}" do
        input = "{{cite news |last=Reporter |title=News Article |newspaper=Daily News |date=2022-03-20}}"
        result = format_wiki(input, extract_citations: true)
        expect(result).to include("Reporter")
        expect(result).to include("News Article")
        expect(result).to include("2022")
      end

      it "extracts from {{cite journal}}" do
        input = "{{cite journal |last=Scientist |title=Research Paper |journal=Nature |year=2023}}"
        result = format_wiki(input, extract_citations: true)
        expect(result).to include("Scientist")
        expect(result).to include("Research Paper")
        expect(result).to include("2023")
      end

      it "extracts from {{Citation}}" do
        input = "{{Citation |last=Doe |first=Jane |title=Article Title |year=2019 |publisher=Publisher Name}}"
        result = format_wiki(input, extract_citations: true)
        expect(result).to include("Doe")
        expect(result).to include("Article Title")
        expect(result).to include("2019")
      end

      it "handles multiple citations" do
        input = "* {{cite book |last=Author1 |title=Book One |year=2001}}\n* {{cite book |last=Author2 |title=Book Two |year=2002}}"
        result = format_wiki(input, extract_citations: true)
        expect(result).to include("Author1")
        expect(result).to include("Book One")
        expect(result).to include("Author2")
        expect(result).to include("Book Two")
      end

      it "extracts citations from refbegin/refend blocks" do
        input = "{{refbegin}}\n* {{cite book |last=Smith |title=Book Title |year=2020}}\n{{refend}}"
        result = format_wiki(input, extract_citations: true)
        expect(result).to include("Smith")
        expect(result).to include("Book Title")
        expect(result).not_to include("{{refbegin")
        expect(result).not_to include("{{refend")
      end

      it "removes citations when extract_citations is false (default)" do
        input = "Text. {{cite book |last=Smith |title=Book |year=2020}}"
        result = format_wiki(input)
        expect(result).not_to include("Smith")
        expect(result).not_to include("Book")
      end

      it "handles citations with only title" do
        input = "{{cite web |title=Untitled Page |url=http://example.com}}"
        result = format_wiki(input, extract_citations: true)
        expect(result).to include("Untitled Page")
      end

      # The input uses the numbered last1/first1 parameters, so the example
      # is named after them.
      it "handles last1/first1 format" do
        input = "{{cite book |last1=Primary |first1=Author |title=Multi-Author Book |year=2021}}"
        result = format_wiki(input, extract_citations: true)
        expect(result).to include("Primary")
        expect(result).to include("Multi-Author Book")
      end
    end
  end

  # Markers can be disabled
  describe "marker replacement (disabled)" do
    it "removes content without markers when markers disabled" do
      input = "The equation <math>E = mc^2</math> is famous."
      result = format_wiki(input, markers: false)
      expect(result).not_to include("[MATH]")
      expect(result).not_to include("<math>")
      expect(result).not_to include("E = mc^2")
    end

    it "removes all marker types when disabled" do
      input = "<code>x</code> <chem>H2O</chem> <score>notes</score>"
      result = format_wiki(input, markers: false)
      expect(result).not_to include("[CODE]")
      expect(result).not_to include("[CHEM]")
      expect(result).not_to include("[SCORE]")
    end

    it "removes codeblock when disabled" do
      input = "<syntaxhighlight lang=\"python\">print('hello')</syntaxhighlight>"
      result = format_wiki(input, markers: false)
      expect(result).not_to include("[CODEBLOCK]")
      expect(result).not_to include("<syntaxhighlight")
    end

    it "removes infobox when markers disabled" do
      input = "{{Infobox person\n|name = John\n}}\nText."
      result = format_wiki(input, markers: false)
      expect(result).not_to include("[INFOBOX]")
      expect(result).not_to include("{{Infobox")
      expect(result).to include("Text")
    end

    it "removes navbox when markers disabled" do
      input = "Text.\n{{Navbox\n|title = Nav\n}}"
      result = format_wiki(input, markers: false)
      expect(result).not_to include("[NAVBOX]")
      expect(result).not_to include("{{Navbox")
    end

    it "removes gallery when markers disabled" do
      input = "<gallery>\nFile:Test.jpg\n</gallery>"
      result = format_wiki(input, markers: false)
      expect(result).not_to include("[GALLERY]")
      expect(result).not_to include("<gallery>")
    end

    it "removes references when markers disabled" do
      input = "Text.\n{{reflist}}"
      result = format_wiki(input, markers: false)
      expect(result).not_to include("[REFERENCES]")
      expect(result).not_to include("{{reflist")
    end
  end

  # Selective markers
  describe "selective marker replacement" do
    it "enables only specified markers" do
      input = "<math>x</math> and <code>y</code>"
      result = format_wiki(input, markers: [:math])
      expect(result).to include("[MATH]")
      expect(result).not_to include("[CODE]")
      expect(result).not_to include("<code>")
    end

    it "accepts array of marker symbols" do
      input = "<math>x</math> <code>y</code> <chem>H2O</chem>"
      result = format_wiki(input, markers: [:math, :code])
      expect(result).to include("[MATH]")
      expect(result).to include("[CODE]")
      expect(result).not_to include("[CHEM]")
    end

    it "distinguishes code and codeblock markers" do
      input = "<code>inline</code>\n<syntaxhighlight>block</syntaxhighlight>"
      result = format_wiki(input, markers: [:code])
      expect(result).to include("[CODE]")
      expect(result).not_to include("[CODEBLOCK]")
      expect(result).not_to include("block")
    end

    it "enables codeblock independently from code" do
      input = "<code>inline</code>\n<syntaxhighlight>block</syntaxhighlight>"
      result = format_wiki(input, markers: [:codeblock])
      expect(result).to include("[CODEBLOCK]")
      expect(result).not_to include("[CODE]")
      expect(result).not_to include("inline")
    end
  end

  # Multiple markers in one text
  describe "multiple markers" do
    it "handles multiple marker types in same text" do
      input = "Formula <math>E=mc^2</math>, code <code>x=1</code>, and water <chem>H2O</chem>."
      result = format_wiki(input)
      expect(result).to include("[MATH]")
      expect(result).to include("[CODE]")
      expect(result).to include("[CHEM]")
    end

    it "handles nested content correctly" do
      input = "{| class=\"wikitable\"\n|-\n| <math>x^2</math>\n|}"
      result = format_wiki(input)
      expect(result).to include("[TABLE]")
      # Math inside table is processed with the table, so neither raw table
      # markup nor the raw <math> tag may survive in the output.
      expect(result).not_to include("{|")
      expect(result).not_to include("<math>")
    end
  end
end
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "etc"
require "spec_helper"
require "wp2txt/memory_monitor"
|
|
5
|
+
|
|
6
|
+
# Specs for the class-level API of Wp2txt::MemoryMonitor: memory readings,
# buffer sizing, formatting helpers and parallel-processing recommendations.
# The examples read the live host's memory and CPU count (Etc.nprocessors),
# so they assert types, bounds and relationships, not exact values.
RSpec.describe Wp2txt::MemoryMonitor do
  describe ".current_memory_usage" do
    it "returns a non-negative integer" do
      usage = described_class.current_memory_usage
      expect(usage).to be_a(Integer)
      expect(usage).to be >= 0
    end
  end

  describe ".total_system_memory" do
    it "returns a positive integer" do
      total = described_class.total_system_memory
      expect(total).to be_a(Integer)
      expect(total).to be > 0
    end

    it "returns at least 1 GB (reasonable minimum for running tests)" do
      total = described_class.total_system_memory
      one_gb = 1024 * 1024 * 1024
      expect(total).to be >= one_gb
    end
  end

  describe ".available_memory" do
    it "returns a positive integer" do
      available = described_class.available_memory
      expect(available).to be_a(Integer)
      expect(available).to be > 0
    end

    it "is less than or equal to total system memory" do
      available = described_class.available_memory
      total = described_class.total_system_memory
      expect(available).to be <= total
    end
  end

  describe ".memory_usage_percent" do
    it "returns a float between 0 and 100" do
      percent = described_class.memory_usage_percent
      expect(percent).to be_a(Float)
      # be_between is inclusive on both ends by default.
      expect(percent).to be_between(0, 100)
    end
  end

  describe ".memory_low?" do
    it "returns a boolean" do
      result = described_class.memory_low?
      expect(result).to be(true).or be(false)
    end
  end

  describe ".optimal_buffer_size" do
    it "returns an integer within bounds" do
      size = described_class.optimal_buffer_size
      expect(size).to be_a(Integer)
      expect(size).to be_between(described_class::MIN_BUFFER_SIZE, described_class::MAX_BUFFER_SIZE)
    end

    it "returns a multiple of 1 MB" do
      size = described_class.optimal_buffer_size
      one_mb = 1_048_576
      expect(size % one_mb).to eq(0)
    end
  end

  describe ".memory_stats" do
    it "returns a hash with expected keys" do
      stats = described_class.memory_stats
      expect(stats).to be_a(Hash)
      # For a Hash, include(:key, ...) asserts that every listed key exists.
      expect(stats).to include(
        :current_usage_mb,
        :total_system_mb,
        :available_mb,
        :usage_percent,
        :recommended_buffer_mb,
        :low_memory
      )
    end

    it "returns numeric values for memory metrics" do
      stats = described_class.memory_stats
      expect(stats[:current_usage_mb]).to be_a(Numeric)
      expect(stats[:total_system_mb]).to be_a(Numeric)
      expect(stats[:available_mb]).to be_a(Numeric)
      expect(stats[:usage_percent]).to be_a(Numeric)
      expect(stats[:recommended_buffer_mb]).to be_a(Numeric)
    end
  end

  describe ".format_memory" do
    it "formats bytes" do
      expect(described_class.format_memory(500)).to eq("500 B")
    end

    it "formats kilobytes" do
      expect(described_class.format_memory(2048)).to eq("2.0 KB")
    end

    it "formats megabytes" do
      expect(described_class.format_memory(5_242_880)).to eq("5.0 MB")
    end

    it "formats gigabytes" do
      expect(described_class.format_memory(2_147_483_648)).to eq("2.0 GB")
    end
  end

  describe ".gc_if_needed" do
    it "returns a boolean" do
      result = described_class.gc_if_needed
      expect(result).to be(true).or be(false)
    end
  end

  describe "constants" do
    it "has reasonable threshold values" do
      expect(described_class::LOW_MEMORY_THRESHOLD_MB).to be > 0
      expect(described_class::HIGH_MEMORY_THRESHOLD_MB).to be > described_class::LOW_MEMORY_THRESHOLD_MB
      expect(described_class::TARGET_MEMORY_USAGE_PERCENT).to be_between(50, 90)
    end

    it "has reasonable buffer size bounds" do
      expect(described_class::MIN_BUFFER_SIZE).to be > 0
      expect(described_class::MAX_BUFFER_SIZE).to be > described_class::MIN_BUFFER_SIZE
      expect(described_class::DEFAULT_BUFFER_SIZE).to be_between(
        described_class::MIN_BUFFER_SIZE,
        described_class::MAX_BUFFER_SIZE
      )
    end

    it "has reasonable memory per process value" do
      expect(described_class::MEMORY_PER_PROCESS_MB).to be_between(100, 1000)
    end
  end

  describe ".optimal_processes" do
    it "returns a positive integer" do
      result = described_class.optimal_processes
      expect(result).to be_a(Integer)
      expect(result).to be >= 1
    end

    it "returns a value less than or equal to CPU cores" do
      result = described_class.optimal_processes
      cores = Etc.nprocessors
      expect(result).to be <= cores
    end

    it "accepts custom memory_per_process_mb parameter" do
      # With very high memory requirement, should return fewer processes
      high_mem = described_class.optimal_processes(memory_per_process_mb: 10_000)
      low_mem = described_class.optimal_processes(memory_per_process_mb: 100)
      expect(high_mem).to be <= low_mem
    end

    it "returns at least 1 even with extreme memory constraints" do
      result = described_class.optimal_processes(memory_per_process_mb: 1_000_000)
      expect(result).to be >= 1
    end
  end

  describe ".parallel_processing_info" do
    it "returns a hash with expected keys" do
      info = described_class.parallel_processing_info
      expect(info).to be_a(Hash)
      expect(info).to include(
        :cpu_cores,
        :available_memory_mb,
        :memory_per_process_mb,
        :optimal_processes,
        :max_by_cpu,
        :max_by_memory
      )
    end

    it "returns consistent values" do
      info = described_class.parallel_processing_info
      expect(info[:cpu_cores]).to eq(Etc.nprocessors)
      expect(info[:memory_per_process_mb]).to eq(described_class::MEMORY_PER_PROCESS_MB)
      # NOTE(review): optimal_processes is evaluated a second time here; if it
      # depends on a live memory reading the two values could differ between
      # calls — confirm, and stub the memory reading if this proves flaky.
      expect(info[:optimal_processes]).to eq(described_class.optimal_processes)
    end

    it "returns positive values for all numeric fields" do
      info = described_class.parallel_processing_info
      expect(info[:cpu_cores]).to be > 0
      expect(info[:available_memory_mb]).to be > 0
      expect(info[:optimal_processes]).to be > 0
    end
  end
end
|