wp2txt 1.1.3 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.dockerignore +12 -0
- data/.github/workflows/ci.yml +13 -13
- data/.gitignore +14 -0
- data/CHANGELOG.md +284 -0
- data/DEVELOPMENT.md +415 -0
- data/DEVELOPMENT_ja.md +415 -0
- data/Dockerfile +19 -10
- data/Gemfile +2 -8
- data/README.md +259 -123
- data/README_ja.md +375 -0
- data/Rakefile +4 -0
- data/bin/wp2txt +863 -161
- data/lib/wp2txt/article.rb +98 -13
- data/lib/wp2txt/bz2_validator.rb +239 -0
- data/lib/wp2txt/category_cache.rb +313 -0
- data/lib/wp2txt/cli.rb +319 -0
- data/lib/wp2txt/cli_ui.rb +428 -0
- data/lib/wp2txt/config.rb +158 -0
- data/lib/wp2txt/constants.rb +134 -0
- data/lib/wp2txt/data/html_entities.json +2135 -0
- data/lib/wp2txt/data/language_metadata.json +4769 -0
- data/lib/wp2txt/data/language_tiers.json +59 -0
- data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
- data/lib/wp2txt/data/template_aliases.json +193 -0
- data/lib/wp2txt/data/wikipedia_entities.json +12 -0
- data/lib/wp2txt/extractor.rb +545 -0
- data/lib/wp2txt/file_utils.rb +91 -0
- data/lib/wp2txt/formatter.rb +352 -0
- data/lib/wp2txt/global_data_cache.rb +353 -0
- data/lib/wp2txt/index_cache.rb +258 -0
- data/lib/wp2txt/magic_words.rb +353 -0
- data/lib/wp2txt/memory_monitor.rb +236 -0
- data/lib/wp2txt/multistream.rb +1383 -0
- data/lib/wp2txt/output_writer.rb +182 -0
- data/lib/wp2txt/parser_functions.rb +606 -0
- data/lib/wp2txt/ractor_worker.rb +215 -0
- data/lib/wp2txt/regex.rb +396 -12
- data/lib/wp2txt/section_extractor.rb +354 -0
- data/lib/wp2txt/stream_processor.rb +271 -0
- data/lib/wp2txt/template_expander.rb +830 -0
- data/lib/wp2txt/text_processing.rb +337 -0
- data/lib/wp2txt/utils.rb +629 -270
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +53 -26
- data/scripts/benchmark_regex.rb +161 -0
- data/scripts/fetch_html_entities.rb +94 -0
- data/scripts/fetch_language_metadata.rb +180 -0
- data/scripts/fetch_mediawiki_data.rb +334 -0
- data/scripts/fetch_template_data.rb +186 -0
- data/scripts/profile_memory.rb +139 -0
- data/spec/article_spec.rb +402 -0
- data/spec/auto_download_spec.rb +314 -0
- data/spec/bz2_validator_spec.rb +193 -0
- data/spec/category_cache_spec.rb +226 -0
- data/spec/category_fetcher_spec.rb +504 -0
- data/spec/cleanup_spec.rb +197 -0
- data/spec/cli_options_spec.rb +678 -0
- data/spec/cli_spec.rb +876 -0
- data/spec/config_spec.rb +194 -0
- data/spec/constants_spec.rb +138 -0
- data/spec/file_utils_spec.rb +170 -0
- data/spec/fixtures/samples.rb +181 -0
- data/spec/formatter_sections_spec.rb +382 -0
- data/spec/global_data_cache_spec.rb +186 -0
- data/spec/index_cache_spec.rb +210 -0
- data/spec/integration_spec.rb +543 -0
- data/spec/magic_words_spec.rb +261 -0
- data/spec/markers_spec.rb +476 -0
- data/spec/memory_monitor_spec.rb +192 -0
- data/spec/multistream_spec.rb +690 -0
- data/spec/output_writer_spec.rb +400 -0
- data/spec/parser_functions_spec.rb +455 -0
- data/spec/ractor_worker_spec.rb +197 -0
- data/spec/regex_spec.rb +281 -0
- data/spec/section_extractor_spec.rb +397 -0
- data/spec/spec_helper.rb +63 -0
- data/spec/stream_processor_spec.rb +579 -0
- data/spec/template_data_spec.rb +246 -0
- data/spec/template_expander_spec.rb +472 -0
- data/spec/template_processing_spec.rb +217 -0
- data/spec/text_processing_spec.rb +312 -0
- data/spec/utils_spec.rb +195 -16
- data/spec/wp2txt_spec.rb +510 -0
- data/wp2txt.gemspec +5 -3
- metadata +146 -18
- data/.rubocop.yml +0 -80
- data/data/output_samples/testdata_en.txt +0 -23002
- data/data/output_samples/testdata_en_category.txt +0 -132
- data/data/output_samples/testdata_en_summary.txt +0 -1376
- data/data/output_samples/testdata_ja.txt +0 -22774
- data/data/output_samples/testdata_ja_category.txt +0 -206
- data/data/output_samples/testdata_ja_summary.txt +0 -1560
- data/data/testdata_en.bz2 +0 -0
- data/data/testdata_ja.bz2 +0 -0
- data/image/screenshot.png +0 -0
|
@@ -0,0 +1,246 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "spec_helper"
|
|
4
|
+
|
|
5
|
+
# Sanity checks for the "extension_tags" list stored in
# lib/wp2txt/data/mediawiki_aliases.json.
RSpec.describe "MediaWiki Data" do
  describe "extension_tags in mediawiki_aliases.json" do
    # Parse the JSON file once for the whole group; the examples only read it.
    before(:all) do
      json_file = File.join(__dir__, "..", "lib", "wp2txt", "data", "mediawiki_aliases.json")
      raw_json = File.read(json_file)
      @mediawiki_data = JSON.parse(raw_json)
    end

    it "contains extension_tags array" do
      extension_tags = @mediawiki_data["extension_tags"]
      expect(extension_tags).to be_an(Array)
    end

    it "contains common extension tags" do
      extension_tags = @mediawiki_data["extension_tags"]
      # One expectation per tag, checked in this order.
      %w[gallery timeline imagemap math ref syntaxhighlight].each do |tag_name|
        expect(extension_tags).to include(tag_name)
      end
    end

    it "contains German-specific tag" do
      # "abschnitt" is German for "section"
      expect(@mediawiki_data["extension_tags"]).to include("abschnitt")
    end
  end
end
|
|
32
|
+
|
|
33
|
+
# Checks that lib/wp2txt/data/template_aliases.json exists, parses into a Hash,
# and lists the expected template names under each category key.
RSpec.describe "Template Data" do
  # Built once as a plain local so the before(:all) hook and the examples share
  # a single definition of the path; `let` values are not meant to be read from
  # before(:all) hooks, which is why the path cannot simply come from `let`.
  template_aliases_path = File.join(__dir__, "..", "lib", "wp2txt", "data", "template_aliases.json")

  let(:data_path) { template_aliases_path }

  describe "template_aliases.json" do
    before(:all) do
      # @data stays nil when the file is missing, so the hook itself does not
      # raise and the "exists" example can report the missing file.
      @data = JSON.parse(File.read(template_aliases_path)) if File.exist?(template_aliases_path)
    end

    it "exists" do
      expect(File.exist?(data_path)).to be true
    end

    it "has valid JSON structure" do
      expect(@data).to be_a(Hash)
    end

    it "has meta information" do
      expect(@data["meta"]).to be_a(Hash)
      expect(@data["meta"]["generated_at"]).to be_a(String)
    end

    describe "remove_templates category" do
      it "exists and is an array" do
        expect(@data["remove_templates"]).to be_an(Array)
      end

      it "contains known navigation templates" do
        templates = @data["remove_templates"].map(&:downcase)
        # These should be removed in all languages
        expect(templates).to include("reflist", "refbegin", "refend", "notelist")
      end

      it "contains hatnote templates" do
        templates = @data["remove_templates"].map(&:downcase)
        expect(templates).to include("main", "see also", "further", "about")
      end
    end

    describe "authority_control category" do
      it "exists and is an array" do
        expect(@data["authority_control"]).to be_an(Array)
      end

      it "contains English authority control templates" do
        templates = @data["authority_control"].map(&:downcase)
        expect(templates).to include("authority control")
      end

      it "contains German Normdaten template" do
        templates = @data["authority_control"].map(&:downcase)
        expect(templates).to include("normdaten")
      end

      it "contains identifier templates" do
        templates = @data["authority_control"].map(&:downcase)
        expect(templates).to include("viaf", "lccn", "gnd", "isni", "orcid")
      end
    end

    describe "cleanup_remnants category" do
      it "exists and is an array" do
        expect(@data["cleanup_remnants"]).to be_an(Array)
      end

      it "contains layout templates" do
        templates = @data["cleanup_remnants"].map(&:downcase)
        expect(templates).to include("clear", "clearleft", "clearright")
      end
    end

    describe "citation_templates category" do
      it "exists and is an array" do
        expect(@data["citation_templates"]).to be_an(Array)
      end

      it "contains cite templates in multiple languages" do
        templates = @data["citation_templates"].map(&:downcase)
        # English names. The original spec notes that Japanese Wikipedia uses
        # the same "cite web" name, so no separate Japanese check is made here.
        expect(templates).to include("cite web", "cite book", "citation")
      end
    end

    describe "ruby_text_templates category" do
      it "exists and is an array" do
        expect(@data["ruby_text_templates"]).to be_an(Array)
      end

      it "contains Japanese ruby template" do
        expect(@data["ruby_text_templates"]).to include("読み仮名")
      end

      it "contains English ruby templates" do
        templates = @data["ruby_text_templates"].map(&:downcase)
        expect(templates).to include("ruby", "ruby-ja")
      end
    end

    describe "interwiki_link_templates category" do
      it "exists and is an array" do
        expect(@data["interwiki_link_templates"]).to be_an(Array)
      end

      it "contains Japanese 仮リンク template" do
        expect(@data["interwiki_link_templates"]).to include("仮リンク")
      end

      it "contains interlanguage link templates" do
        templates = @data["interwiki_link_templates"].map(&:downcase)
        # "ill" is the short form of "interlanguage link"
        expect(templates).to include("ill", "interlanguage link")
      end
    end

    describe "mixed_script_templates category" do
      it "exists and is an array" do
        expect(@data["mixed_script_templates"]).to be_an(Array)
      end

      it "contains nihongo template" do
        templates = @data["mixed_script_templates"].map(&:downcase)
        expect(templates).to include("nihongo")
      end

      it "contains transliteration templates" do
        templates = @data["mixed_script_templates"].map(&:downcase)
        expect(templates).to include("transl")
      end
    end

    describe "formatting_templates category" do
      it "exists and is an array" do
        expect(@data["formatting_templates"]).to be_an(Array)
      end

      it "contains size templates" do
        templates = @data["formatting_templates"].map(&:downcase)
        expect(templates).to include("small", "smaller", "large", "larger")
      end

      it "contains spacing templates" do
        templates = @data["formatting_templates"].map(&:downcase)
        expect(templates).to include("nbsp", "nowrap")
      end
    end

    describe "flag_templates category" do
      it "exists and is an array" do
        expect(@data["flag_templates"]).to be_an(Array)
      end

      it "contains flag template variants" do
        templates = @data["flag_templates"].map(&:downcase)
        expect(templates).to include("flag", "flagicon", "flagcountry")
      end
    end

    describe "portal_templates category" do
      it "exists and is an array" do
        expect(@data["portal_templates"]).to be_an(Array)
      end

      it "contains portal templates in multiple languages" do
        templates = @data["portal_templates"].map(&:downcase)
        expect(templates).to include("portal")
        # Japanese: checked against the list as stored, without downcasing
        expect(@data["portal_templates"]).to include("ウィキポータルリンク")
      end
    end

    describe "sister_project_templates category" do
      it "exists and is an array" do
        expect(@data["sister_project_templates"]).to be_an(Array)
      end

      it "contains commons templates" do
        templates = @data["sister_project_templates"].map(&:downcase)
        expect(templates).to include("commons", "commons category")
      end

      it "contains wiktionary templates" do
        templates = @data["sister_project_templates"].map(&:downcase)
        expect(templates).to include("wiktionary")
      end
    end
  end
end
|