wp2txt 1.1.3 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.dockerignore +12 -0
- data/.github/workflows/ci.yml +13 -13
- data/.gitignore +14 -0
- data/CHANGELOG.md +284 -0
- data/DEVELOPMENT.md +415 -0
- data/DEVELOPMENT_ja.md +415 -0
- data/Dockerfile +19 -10
- data/Gemfile +2 -8
- data/README.md +259 -123
- data/README_ja.md +375 -0
- data/Rakefile +4 -0
- data/bin/wp2txt +863 -161
- data/lib/wp2txt/article.rb +98 -13
- data/lib/wp2txt/bz2_validator.rb +239 -0
- data/lib/wp2txt/category_cache.rb +313 -0
- data/lib/wp2txt/cli.rb +319 -0
- data/lib/wp2txt/cli_ui.rb +428 -0
- data/lib/wp2txt/config.rb +158 -0
- data/lib/wp2txt/constants.rb +134 -0
- data/lib/wp2txt/data/html_entities.json +2135 -0
- data/lib/wp2txt/data/language_metadata.json +4769 -0
- data/lib/wp2txt/data/language_tiers.json +59 -0
- data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
- data/lib/wp2txt/data/template_aliases.json +193 -0
- data/lib/wp2txt/data/wikipedia_entities.json +12 -0
- data/lib/wp2txt/extractor.rb +545 -0
- data/lib/wp2txt/file_utils.rb +91 -0
- data/lib/wp2txt/formatter.rb +352 -0
- data/lib/wp2txt/global_data_cache.rb +353 -0
- data/lib/wp2txt/index_cache.rb +258 -0
- data/lib/wp2txt/magic_words.rb +353 -0
- data/lib/wp2txt/memory_monitor.rb +236 -0
- data/lib/wp2txt/multistream.rb +1383 -0
- data/lib/wp2txt/output_writer.rb +182 -0
- data/lib/wp2txt/parser_functions.rb +606 -0
- data/lib/wp2txt/ractor_worker.rb +215 -0
- data/lib/wp2txt/regex.rb +396 -12
- data/lib/wp2txt/section_extractor.rb +354 -0
- data/lib/wp2txt/stream_processor.rb +271 -0
- data/lib/wp2txt/template_expander.rb +830 -0
- data/lib/wp2txt/text_processing.rb +337 -0
- data/lib/wp2txt/utils.rb +629 -270
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +53 -26
- data/scripts/benchmark_regex.rb +161 -0
- data/scripts/fetch_html_entities.rb +94 -0
- data/scripts/fetch_language_metadata.rb +180 -0
- data/scripts/fetch_mediawiki_data.rb +334 -0
- data/scripts/fetch_template_data.rb +186 -0
- data/scripts/profile_memory.rb +139 -0
- data/spec/article_spec.rb +402 -0
- data/spec/auto_download_spec.rb +314 -0
- data/spec/bz2_validator_spec.rb +193 -0
- data/spec/category_cache_spec.rb +226 -0
- data/spec/category_fetcher_spec.rb +504 -0
- data/spec/cleanup_spec.rb +197 -0
- data/spec/cli_options_spec.rb +678 -0
- data/spec/cli_spec.rb +876 -0
- data/spec/config_spec.rb +194 -0
- data/spec/constants_spec.rb +138 -0
- data/spec/file_utils_spec.rb +170 -0
- data/spec/fixtures/samples.rb +181 -0
- data/spec/formatter_sections_spec.rb +382 -0
- data/spec/global_data_cache_spec.rb +186 -0
- data/spec/index_cache_spec.rb +210 -0
- data/spec/integration_spec.rb +543 -0
- data/spec/magic_words_spec.rb +261 -0
- data/spec/markers_spec.rb +476 -0
- data/spec/memory_monitor_spec.rb +192 -0
- data/spec/multistream_spec.rb +690 -0
- data/spec/output_writer_spec.rb +400 -0
- data/spec/parser_functions_spec.rb +455 -0
- data/spec/ractor_worker_spec.rb +197 -0
- data/spec/regex_spec.rb +281 -0
- data/spec/section_extractor_spec.rb +397 -0
- data/spec/spec_helper.rb +63 -0
- data/spec/stream_processor_spec.rb +579 -0
- data/spec/template_data_spec.rb +246 -0
- data/spec/template_expander_spec.rb +472 -0
- data/spec/template_processing_spec.rb +217 -0
- data/spec/text_processing_spec.rb +312 -0
- data/spec/utils_spec.rb +195 -16
- data/spec/wp2txt_spec.rb +510 -0
- data/wp2txt.gemspec +5 -3
- metadata +146 -18
- data/.rubocop.yml +0 -80
- data/data/output_samples/testdata_en.txt +0 -23002
- data/data/output_samples/testdata_en_category.txt +0 -132
- data/data/output_samples/testdata_en_summary.txt +0 -1376
- data/data/output_samples/testdata_ja.txt +0 -22774
- data/data/output_samples/testdata_ja_category.txt +0 -206
- data/data/output_samples/testdata_ja_summary.txt +0 -1560
- data/data/testdata_en.bz2 +0 -0
- data/data/testdata_ja.bz2 +0 -0
- data/image/screenshot.png +0 -0
|
@@ -0,0 +1,261 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "spec_helper"
|
|
4
|
+
|
|
5
|
+
RSpec.describe Wp2txt::MagicWordExpander do
|
|
6
|
+
let(:title) { "Test Article" }
|
|
7
|
+
let(:namespace) { "" }
|
|
8
|
+
let(:dump_date) { Time.new(2024, 6, 15, 14, 30, 45) }
|
|
9
|
+
let(:expander) { described_class.new(title, namespace: namespace, dump_date: dump_date) }
|
|
10
|
+
|
|
11
|
+
describe "#expand" do
|
|
12
|
+
context "page context magic words" do
|
|
13
|
+
it "expands {{PAGENAME}}" do
|
|
14
|
+
expect(expander.expand("{{PAGENAME}}")).to eq("Test Article")
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
it "expands {{pagename}} (case-insensitive)" do
|
|
18
|
+
expect(expander.expand("{{pagename}}")).to eq("Test Article")
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
it "expands {{PAGENAMEE}} with URL encoding" do
|
|
22
|
+
expect(expander.expand("{{PAGENAMEE}}")).to eq("Test_Article")
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
it "expands {{FULLPAGENAME}} without namespace" do
|
|
26
|
+
expect(expander.expand("{{FULLPAGENAME}}")).to eq("Test Article")
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
context "with namespace" do
|
|
30
|
+
let(:namespace) { "Wikipedia" }
|
|
31
|
+
|
|
32
|
+
it "expands {{FULLPAGENAME}} with namespace" do
|
|
33
|
+
expect(expander.expand("{{FULLPAGENAME}}")).to eq("Wikipedia:Test Article")
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
it "expands {{NAMESPACE}}" do
|
|
37
|
+
expect(expander.expand("{{NAMESPACE}}")).to eq("Wikipedia")
|
|
38
|
+
end
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
context "with subpage title" do
|
|
42
|
+
let(:title) { "Main Page/Subpage/Deep" }
|
|
43
|
+
|
|
44
|
+
it "expands {{BASEPAGENAME}} to parent" do
|
|
45
|
+
expect(expander.expand("{{BASEPAGENAME}}")).to eq("Main Page/Subpage")
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
it "expands {{ROOTPAGENAME}} to root" do
|
|
49
|
+
expect(expander.expand("{{ROOTPAGENAME}}")).to eq("Main Page")
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
it "expands {{SUBPAGENAME}} to last part" do
|
|
53
|
+
expect(expander.expand("{{SUBPAGENAME}}")).to eq("Deep")
|
|
54
|
+
end
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
it "expands {{TALKPAGENAME}}" do
|
|
58
|
+
expect(expander.expand("{{TALKPAGENAME}}")).to eq("Talk:Test Article")
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
it "expands {{NAMESPACENUMBER}} for main namespace" do
|
|
62
|
+
expect(expander.expand("{{NAMESPACENUMBER}}")).to eq("0")
|
|
63
|
+
end
|
|
64
|
+
end
|
|
65
|
+
|
|
66
|
+
context "date/time magic words" do
|
|
67
|
+
it "expands {{CURRENTYEAR}}" do
|
|
68
|
+
expect(expander.expand("{{CURRENTYEAR}}")).to eq("2024")
|
|
69
|
+
end
|
|
70
|
+
|
|
71
|
+
it "expands {{CURRENTMONTH}} with zero padding" do
|
|
72
|
+
expect(expander.expand("{{CURRENTMONTH}}")).to eq("06")
|
|
73
|
+
end
|
|
74
|
+
|
|
75
|
+
it "expands {{CURRENTMONTH1}} without zero padding" do
|
|
76
|
+
expect(expander.expand("{{CURRENTMONTH1}}")).to eq("6")
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
it "expands {{CURRENTMONTHNAME}}" do
|
|
80
|
+
expect(expander.expand("{{CURRENTMONTHNAME}}")).to eq("June")
|
|
81
|
+
end
|
|
82
|
+
|
|
83
|
+
it "expands {{CURRENTMONTHABBREV}}" do
|
|
84
|
+
expect(expander.expand("{{CURRENTMONTHABBREV}}")).to eq("Jun")
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
it "expands {{CURRENTDAY}}" do
|
|
88
|
+
expect(expander.expand("{{CURRENTDAY}}")).to eq("15")
|
|
89
|
+
end
|
|
90
|
+
|
|
91
|
+
it "expands {{CURRENTDAY2}} with zero padding" do
|
|
92
|
+
dump_date_single_digit = Time.new(2024, 6, 5)
|
|
93
|
+
exp = described_class.new(title, dump_date: dump_date_single_digit)
|
|
94
|
+
expect(exp.expand("{{CURRENTDAY2}}")).to eq("05")
|
|
95
|
+
end
|
|
96
|
+
|
|
97
|
+
it "expands {{CURRENTDOW}} (day of week)" do
|
|
98
|
+
# June 15, 2024 is a Saturday (6)
|
|
99
|
+
expect(expander.expand("{{CURRENTDOW}}")).to eq("6")
|
|
100
|
+
end
|
|
101
|
+
|
|
102
|
+
it "expands {{CURRENTDAYNAME}}" do
|
|
103
|
+
expect(expander.expand("{{CURRENTDAYNAME}}")).to eq("Saturday")
|
|
104
|
+
end
|
|
105
|
+
|
|
106
|
+
it "expands {{CURRENTTIME}}" do
|
|
107
|
+
expect(expander.expand("{{CURRENTTIME}}")).to eq("14:30")
|
|
108
|
+
end
|
|
109
|
+
|
|
110
|
+
it "expands {{CURRENTHOUR}}" do
|
|
111
|
+
expect(expander.expand("{{CURRENTHOUR}}")).to eq("14")
|
|
112
|
+
end
|
|
113
|
+
|
|
114
|
+
it "expands {{CURRENTTIMESTAMP}}" do
|
|
115
|
+
expect(expander.expand("{{CURRENTTIMESTAMP}}")).to eq("20240615143045")
|
|
116
|
+
end
|
|
117
|
+
|
|
118
|
+
it "expands {{LOCALYEAR}} (same as CURRENTYEAR)" do
|
|
119
|
+
expect(expander.expand("{{LOCALYEAR}}")).to eq("2024")
|
|
120
|
+
end
|
|
121
|
+
end
|
|
122
|
+
|
|
123
|
+
context "string functions" do
|
|
124
|
+
it "expands {{lc:TEXT}}" do
|
|
125
|
+
expect(expander.expand("{{lc:HELLO WORLD}}")).to eq("hello world")
|
|
126
|
+
end
|
|
127
|
+
|
|
128
|
+
it "expands {{uc:text}}" do
|
|
129
|
+
expect(expander.expand("{{uc:hello world}}")).to eq("HELLO WORLD")
|
|
130
|
+
end
|
|
131
|
+
|
|
132
|
+
it "expands {{lcfirst:TEXT}}" do
|
|
133
|
+
expect(expander.expand("{{lcfirst:HELLO}}")).to eq("hELLO")
|
|
134
|
+
end
|
|
135
|
+
|
|
136
|
+
it "expands {{ucfirst:text}}" do
|
|
137
|
+
expect(expander.expand("{{ucfirst:hello}}")).to eq("Hello")
|
|
138
|
+
end
|
|
139
|
+
|
|
140
|
+
it "expands {{urlencode:...}}" do
|
|
141
|
+
expect(expander.expand("{{urlencode:hello world}}")).to eq("hello_world")
|
|
142
|
+
end
|
|
143
|
+
|
|
144
|
+
it "expands {{anchorencode:...}}" do
|
|
145
|
+
expect(expander.expand("{{anchorencode:hello world}}")).to eq("hello_world")
|
|
146
|
+
end
|
|
147
|
+
|
|
148
|
+
it "expands {{padleft:...}}" do
|
|
149
|
+
expect(expander.expand("{{padleft:7|3|0}}")).to eq("007")
|
|
150
|
+
end
|
|
151
|
+
|
|
152
|
+
it "expands {{padright:...}}" do
|
|
153
|
+
expect(expander.expand("{{padright:7|3|0}}")).to eq("700")
|
|
154
|
+
end
|
|
155
|
+
|
|
156
|
+
it "expands {{formatnum:...}} with thousand separators" do
|
|
157
|
+
expect(expander.expand("{{formatnum:12345}}")).to eq("12,345")
|
|
158
|
+
expect(expander.expand("{{formatnum:1234567}}")).to eq("1,234,567")
|
|
159
|
+
expect(expander.expand("{{formatnum:1234.56}}")).to eq("1,234.56")
|
|
160
|
+
end
|
|
161
|
+
|
|
162
|
+
it "expands {{formatnum:...|R}} to remove formatting" do
|
|
163
|
+
expect(expander.expand("{{formatnum:1,234,567|R}}")).to eq("1234567")
|
|
164
|
+
end
|
|
165
|
+
end
|
|
166
|
+
|
|
167
|
+
context "#titleparts parser function" do
|
|
168
|
+
it "extracts first N segments" do
|
|
169
|
+
expect(expander.expand("{{#titleparts:A/B/C|2}}")).to eq("A/B")
|
|
170
|
+
end
|
|
171
|
+
|
|
172
|
+
it "extracts from offset" do
|
|
173
|
+
expect(expander.expand("{{#titleparts:A/B/C|1|2}}")).to eq("B")
|
|
174
|
+
end
|
|
175
|
+
|
|
176
|
+
it "handles negative count (all but last N)" do
|
|
177
|
+
expect(expander.expand("{{#titleparts:A/B/C/D|-1}}")).to eq("A/B/C")
|
|
178
|
+
end
|
|
179
|
+
|
|
180
|
+
it "returns full path without parameters" do
|
|
181
|
+
expect(expander.expand("{{#titleparts:A/B/C}}")).to eq("A/B/C")
|
|
182
|
+
end
|
|
183
|
+
end
|
|
184
|
+
|
|
185
|
+
context "multiple magic words in one string" do
|
|
186
|
+
it "expands all magic words" do
|
|
187
|
+
input = "Page: {{PAGENAME}}, Year: {{CURRENTYEAR}}, Month: {{CURRENTMONTHNAME}}"
|
|
188
|
+
expected = "Page: Test Article, Year: 2024, Month: June"
|
|
189
|
+
expect(expander.expand(input)).to eq(expected)
|
|
190
|
+
end
|
|
191
|
+
|
|
192
|
+
it "handles mixed case and functions" do
|
|
193
|
+
input = "{{uc:{{PAGENAME}}}} in {{CURRENTYEAR}}"
|
|
194
|
+
# The uc: function uppercases the inner PAGENAME result
|
|
195
|
+
result = expander.expand(input)
|
|
196
|
+
expect(result).to include("TEST ARTICLE")
|
|
197
|
+
expect(result).to include("2024")
|
|
198
|
+
end
|
|
199
|
+
end
|
|
200
|
+
|
|
201
|
+
context "unrecognized magic words" do
|
|
202
|
+
it "leaves unrecognized magic words unchanged" do
|
|
203
|
+
expect(expander.expand("{{UNKNOWNMAGICWORD}}")).to eq("{{UNKNOWNMAGICWORD}}")
|
|
204
|
+
end
|
|
205
|
+
|
|
206
|
+
it "leaves template calls unchanged" do
|
|
207
|
+
expect(expander.expand("{{Infobox|name=test}}")).to eq("{{Infobox|name=test}}")
|
|
208
|
+
end
|
|
209
|
+
end
|
|
210
|
+
|
|
211
|
+
context "edge cases" do
|
|
212
|
+
it "handles nil input" do
|
|
213
|
+
expect(expander.expand(nil)).to eq(nil)
|
|
214
|
+
end
|
|
215
|
+
|
|
216
|
+
it "handles empty string" do
|
|
217
|
+
expect(expander.expand("")).to eq("")
|
|
218
|
+
end
|
|
219
|
+
|
|
220
|
+
it "handles text without magic words" do
|
|
221
|
+
expect(expander.expand("Hello World")).to eq("Hello World")
|
|
222
|
+
end
|
|
223
|
+
|
|
224
|
+
it "handles magic words with extra whitespace" do
|
|
225
|
+
expect(expander.expand("{{ PAGENAME }}")).to eq("Test Article")
|
|
226
|
+
end
|
|
227
|
+
end
|
|
228
|
+
end
|
|
229
|
+
|
|
230
|
+
describe "integration with format_wiki" do
|
|
231
|
+
include Wp2txt
|
|
232
|
+
|
|
233
|
+
it "expands magic words when title is provided in config" do
|
|
234
|
+
input = "This article is about {{PAGENAME}}."
|
|
235
|
+
result = format_wiki(input, title: "Ruby Programming")
|
|
236
|
+
expect(result).to include("Ruby Programming")
|
|
237
|
+
expect(result).not_to include("{{PAGENAME}}")
|
|
238
|
+
end
|
|
239
|
+
|
|
240
|
+
it "does not expand magic words without title in config" do
|
|
241
|
+
input = "This article is about {{PAGENAME}}."
|
|
242
|
+
result = format_wiki(input)
|
|
243
|
+
# Without title, the magic word might be removed as template or left as-is
|
|
244
|
+
# The important thing is it doesn't crash
|
|
245
|
+
expect(result).to be_a(String)
|
|
246
|
+
end
|
|
247
|
+
|
|
248
|
+
it "expands date magic words with current time when dump_date not specified" do
|
|
249
|
+
input = "Year: {{CURRENTYEAR}}"
|
|
250
|
+
result = format_wiki(input, title: "Test")
|
|
251
|
+
expect(result).to match(/Year: \d{4}/)
|
|
252
|
+
end
|
|
253
|
+
|
|
254
|
+
it "expands string functions" do
|
|
255
|
+
input = "{{uc:hello}} {{lc:WORLD}}"
|
|
256
|
+
result = format_wiki(input, title: "Test")
|
|
257
|
+
expect(result).to include("HELLO")
|
|
258
|
+
expect(result).to include("world")
|
|
259
|
+
end
|
|
260
|
+
end
|
|
261
|
+
end
|