wp2txt 1.1.3 → 2.1.0
This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/.dockerignore +12 -0
- data/.github/workflows/ci.yml +13 -13
- data/.gitignore +14 -0
- data/CHANGELOG.md +284 -0
- data/DEVELOPMENT.md +415 -0
- data/DEVELOPMENT_ja.md +415 -0
- data/Dockerfile +19 -10
- data/Gemfile +2 -8
- data/README.md +259 -123
- data/README_ja.md +375 -0
- data/Rakefile +4 -0
- data/bin/wp2txt +863 -161
- data/lib/wp2txt/article.rb +98 -13
- data/lib/wp2txt/bz2_validator.rb +239 -0
- data/lib/wp2txt/category_cache.rb +313 -0
- data/lib/wp2txt/cli.rb +319 -0
- data/lib/wp2txt/cli_ui.rb +428 -0
- data/lib/wp2txt/config.rb +158 -0
- data/lib/wp2txt/constants.rb +134 -0
- data/lib/wp2txt/data/html_entities.json +2135 -0
- data/lib/wp2txt/data/language_metadata.json +4769 -0
- data/lib/wp2txt/data/language_tiers.json +59 -0
- data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
- data/lib/wp2txt/data/template_aliases.json +193 -0
- data/lib/wp2txt/data/wikipedia_entities.json +12 -0
- data/lib/wp2txt/extractor.rb +545 -0
- data/lib/wp2txt/file_utils.rb +91 -0
- data/lib/wp2txt/formatter.rb +352 -0
- data/lib/wp2txt/global_data_cache.rb +353 -0
- data/lib/wp2txt/index_cache.rb +258 -0
- data/lib/wp2txt/magic_words.rb +353 -0
- data/lib/wp2txt/memory_monitor.rb +236 -0
- data/lib/wp2txt/multistream.rb +1383 -0
- data/lib/wp2txt/output_writer.rb +182 -0
- data/lib/wp2txt/parser_functions.rb +606 -0
- data/lib/wp2txt/ractor_worker.rb +215 -0
- data/lib/wp2txt/regex.rb +396 -12
- data/lib/wp2txt/section_extractor.rb +354 -0
- data/lib/wp2txt/stream_processor.rb +271 -0
- data/lib/wp2txt/template_expander.rb +830 -0
- data/lib/wp2txt/text_processing.rb +337 -0
- data/lib/wp2txt/utils.rb +629 -270
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +53 -26
- data/scripts/benchmark_regex.rb +161 -0
- data/scripts/fetch_html_entities.rb +94 -0
- data/scripts/fetch_language_metadata.rb +180 -0
- data/scripts/fetch_mediawiki_data.rb +334 -0
- data/scripts/fetch_template_data.rb +186 -0
- data/scripts/profile_memory.rb +139 -0
- data/spec/article_spec.rb +402 -0
- data/spec/auto_download_spec.rb +314 -0
- data/spec/bz2_validator_spec.rb +193 -0
- data/spec/category_cache_spec.rb +226 -0
- data/spec/category_fetcher_spec.rb +504 -0
- data/spec/cleanup_spec.rb +197 -0
- data/spec/cli_options_spec.rb +678 -0
- data/spec/cli_spec.rb +876 -0
- data/spec/config_spec.rb +194 -0
- data/spec/constants_spec.rb +138 -0
- data/spec/file_utils_spec.rb +170 -0
- data/spec/fixtures/samples.rb +181 -0
- data/spec/formatter_sections_spec.rb +382 -0
- data/spec/global_data_cache_spec.rb +186 -0
- data/spec/index_cache_spec.rb +210 -0
- data/spec/integration_spec.rb +543 -0
- data/spec/magic_words_spec.rb +261 -0
- data/spec/markers_spec.rb +476 -0
- data/spec/memory_monitor_spec.rb +192 -0
- data/spec/multistream_spec.rb +690 -0
- data/spec/output_writer_spec.rb +400 -0
- data/spec/parser_functions_spec.rb +455 -0
- data/spec/ractor_worker_spec.rb +197 -0
- data/spec/regex_spec.rb +281 -0
- data/spec/section_extractor_spec.rb +397 -0
- data/spec/spec_helper.rb +63 -0
- data/spec/stream_processor_spec.rb +579 -0
- data/spec/template_data_spec.rb +246 -0
- data/spec/template_expander_spec.rb +472 -0
- data/spec/template_processing_spec.rb +217 -0
- data/spec/text_processing_spec.rb +312 -0
- data/spec/utils_spec.rb +195 -16
- data/spec/wp2txt_spec.rb +510 -0
- data/wp2txt.gemspec +5 -3
- metadata +146 -18
- data/.rubocop.yml +0 -80
- data/data/output_samples/testdata_en.txt +0 -23002
- data/data/output_samples/testdata_en_category.txt +0 -132
- data/data/output_samples/testdata_en_summary.txt +0 -1376
- data/data/output_samples/testdata_ja.txt +0 -22774
- data/data/output_samples/testdata_ja_category.txt +0 -206
- data/data/output_samples/testdata_ja_summary.txt +0 -1560
- data/data/testdata_en.bz2 +0 -0
- data/data/testdata_ja.bz2 +0 -0
- data/image/screenshot.png +0 -0
data/scripts/fetch_mediawiki_data.rb
@@ -0,0 +1,336 @@
+# frozen_string_literal: true
+
+# Fetches magic words, namespace data, and interwiki info from Wikipedia APIs
+# Usage: ruby scripts/fetch_mediawiki_data.rb
+#
+# This script queries the MediaWiki API for all Wikipedia language editions
+# and extracts:
+# - Magic words (redirect, defaultsort, displaytitle, image options, etc.)
+# - Double-underscore behavior switches (__NOTOC__, __TOC__, etc.)
+# - Namespace names and aliases (Category, File, Template, etc.)
+# - Interwiki map (for sister projects)
+
+require "net/http"
+require "json"
+require "fileutils"
+require "set"  # explicit require for Ruby < 3.2
+require "time" # Time#iso8601
+
+# Fetch all Wikipedia language codes from Wikimedia sitematrix API
+def fetch_all_wikipedia_languages
+  uri = URI("https://meta.wikimedia.org/w/api.php")
+  params = {
+    action: "sitematrix",
+    smtype: "language",
+    format: "json"
+  }
+  uri.query = URI.encode_www_form(params)
+
+  response = Net::HTTP.get_response(uri)
+  return [] unless response.is_a?(Net::HTTPSuccess)
+
+  data = JSON.parse(response.body)
+  languages = []
+
+  data["sitematrix"].each do |key, val|
+    next unless key.match?(/^\d+$/) && val.is_a?(Hash) && val["site"]
+
+    # Check if this language has a Wikipedia (code: 'wiki')
+    has_wikipedia = val["site"].any? { |site| site["code"] == "wiki" }
+    languages << val["code"] if has_wikipedia
+  end
+
+  languages.sort
+rescue StandardError => e
+  warn "Error fetching language list: #{e.message}"
+  []
+end
+
+# Magic word types we care about for text processing
+RELEVANT_MAGIC_WORDS = %w[
+  redirect
+  notoc noeditsection nogallery forcetoc toc nocontentconvert nocc
+  notitleconvert notc displaytitle defaultsort
+  img_thumbnail img_manualthumb img_right img_left img_none img_center
+  img_framed img_frameless img_page img_upright img_border img_baseline
+  img_sub img_super img_top img_text_top img_middle img_bottom img_text_bottom
+  img_link img_alt img_class img_lang
+].freeze
+
+# All namespace IDs (negative = special, 0 = main article, positive = other)
+# We want to collect all non-article namespaces for filtering
+# ID 0 is main namespace (articles), we want everything else
+NON_ARTICLE_NAMESPACE_IDS = [
+  -2, # Media
+  -1, # Special
+  # 0 is main article namespace - skip
+  1,  # Talk
+  2,  # User
+  3,  # User talk
+  4,  # Project (Wikipedia)
+  5,  # Project talk
+  6,  # File
+  7,  # File talk
+  8,  # MediaWiki
+  9,  # MediaWiki talk
+  10, # Template
+  11, # Template talk
+  12, # Help
+  13, # Help talk
+  14, # Category
+  15, # Category talk
+  # 100+ are custom namespaces per wiki (Portal, WikiProject, Module, etc.)
+].freeze
+
+# We still need specific namespace collections for targeted operations
+SPECIFIC_NAMESPACE_IDS = {
+  "file" => 6,
+  "category" => 14,
+  "template" => 10,
+  "wikipedia" => 4,
+  "help" => 12,
+  "portal" => 100, # May not exist in all wikis
+  "module" => 828  # May not exist in all wikis
+}.freeze
+
+def fetch_siteinfo(lang)
+  uri = URI("https://#{lang}.wikipedia.org/w/api.php")
+  params = {
+    action: "query",
+    meta: "siteinfo",
+    siprop: "magicwords|namespaces|namespacealiases|interwikimap|extensiontags",
+    format: "json"
+  }
+  uri.query = URI.encode_www_form(params)
+
+  response = Net::HTTP.get_response(uri)
+  return nil unless response.is_a?(Net::HTTPSuccess)
+
+  JSON.parse(response.body)
+rescue StandardError => e
+  warn "  Error fetching #{lang}: #{e.message}"
+  nil
+end
+
+def extract_extension_tags(data)
+  return [] unless data&.dig("query", "extensiontags")
+
+  # Extension tags come as ["<tag>", ...], extract just the tag name
+  data["query"]["extensiontags"].map do |tag|
+    tag.gsub(/^<|>$/, "")
+  end.compact.uniq
+end
+
+def extract_magic_words(data)
+  return {} unless data&.dig("query", "magicwords")
+
+  result = {}
+  double_underscore = []
+
+  data["query"]["magicwords"].each do |mw|
+    name = mw["name"]
+    aliases = mw["aliases"] || []
+
+    # Collect double-underscore magic words (behavior switches)
+    double_underscore_aliases = aliases.select { |a| a.start_with?("__") && a.end_with?("__") }
+    double_underscore.concat(double_underscore_aliases) unless double_underscore_aliases.empty?
+
+    # Only keep specific magic words we care about
+    next unless RELEVANT_MAGIC_WORDS.include?(name)
+
+    # Remove # prefix for redirect (we'll add it in regex)
+    if name == "redirect"
+      aliases = aliases.map { |a| a.sub(/^[##]/, "") }
+    end
+    result[name] = aliases.uniq
+  end
+
+  # Add double-underscore as a special category
+  result["double_underscore"] = double_underscore.uniq unless double_underscore.empty?
+
+  result
+end
+
+def extract_namespaces(data)
+  return {} unless data&.dig("query", "namespaces")
+
+  result = {}
+  all_non_article = []
+
+  # Get main namespace names
+  data["query"]["namespaces"].each do |id, ns|
+    id_int = id.to_i
+
+    # Collect all non-article namespace names
+    if id_int != 0 && ns["*"] && !ns["*"].empty?
+      all_non_article << ns["*"]
+      all_non_article << ns["canonical"] if ns["canonical"] && !ns["canonical"].empty?
+    end
+
+    # Also collect specific namespaces by ID
+    SPECIFIC_NAMESPACE_IDS.each do |key, target_id|
+      next unless id_int == target_id
+
+      result[key] ||= []
+      result[key] << ns["canonical"] if ns["canonical"] && !ns["canonical"].empty?
+      result[key] << ns["*"] if ns["*"] && !ns["*"].empty? && ns["*"] != ns["canonical"]
+    end
+  end
+
+  # Get namespace aliases
+  (data["query"]["namespacealiases"] || []).each do |alias_info|
+    id = alias_info["id"]
+    alias_name = alias_info["*"]
+    next unless alias_name && !alias_name.empty?
+
+    # Add to all non-article namespaces
+    all_non_article << alias_name if id != 0
+
+    # Add to specific namespace collections
+    SPECIFIC_NAMESPACE_IDS.each do |key, target_id|
+      next unless id == target_id
+
+      result[key] ||= []
+      result[key] << alias_name
+    end
+  end
+
+  # Store all non-article namespace names
+  result["non_article"] = all_non_article.uniq
+
+  # Deduplicate all collections
+  result.transform_values!(&:uniq)
+  result
+end
+
+def extract_interwiki(data)
+  return {} unless data&.dig("query", "interwikimap")
+
+  result = {}
+  sister_projects = []
+
+  # Known Wikimedia sister project prefixes
+  wikimedia_projects = %w[
+    commons wikibooks wikinews wikiquote wikisource
+    wikiversity wikivoyage wiktionary wikidata wikispecies
+    meta mediawiki mediawikiwiki species
+  ]
+
+  data["query"]["interwikimap"].each do |iw|
+    prefix = iw["prefix"]
+    url = iw["url"] || ""
+
+    # Check if this is a Wikimedia project
+    is_wikimedia = wikimedia_projects.include?(prefix) ||
+                   url.include?("wikimedia.org") ||
+                   url.include?("wikipedia.org") ||
+                   url.include?("wikibooks.org") ||
+                   url.include?("wikinews.org") ||
+                   url.include?("wikiquote.org") ||
+                   url.include?("wikisource.org") ||
+                   url.include?("wikiversity.org") ||
+                   url.include?("wikivoyage.org") ||
+                   url.include?("wiktionary.org") ||
+                   url.include?("wikidata.org") ||
+                   url.include?("wikispecies.org") ||
+                   url.include?("mediawiki.org")
+
+    sister_projects << prefix if is_wikimedia
+  end
+
+  result["sister_projects"] = sister_projects.uniq
+  result
+end
+
+def main
+  puts "Fetching list of all Wikipedia languages..."
+  languages = fetch_all_wikipedia_languages
+
+  if languages.empty?
+    warn "Failed to fetch language list. Aborting."
+    exit 1
+  end
+
+  puts "Found #{languages.size} Wikipedia editions. Fetching data..."
+
+  all_magic_words = Hash.new { |h, k| h[k] = Set.new }
+  all_namespaces = Hash.new { |h, k| h[k] = Set.new }
+  all_interwiki = Hash.new { |h, k| h[k] = Set.new }
+  all_extension_tags = Set.new
+  successful = 0
+  failed = []
+
+  languages.each_with_index do |lang, idx|
+    print "\r  Processing: #{lang.ljust(10)} (#{idx + 1}/#{languages.size})"
+    $stdout.flush
+
+    data = fetch_siteinfo(lang)
+    unless data
+      failed << lang
+      next
+    end
+
+    # Merge magic words
+    extract_magic_words(data).each do |name, aliases|
+      aliases.each { |a| all_magic_words[name] << a }
+    end
+
+    # Merge namespaces
+    extract_namespaces(data).each do |name, aliases|
+      aliases.each { |a| all_namespaces[name] << a }
+    end
+
+    # Merge interwiki
+    extract_interwiki(data).each do |name, prefixes|
+      prefixes.each { |p| all_interwiki[name] << p }
+    end
+
+    # Merge extension tags (consistent across all wikis, but collect from all to be safe)
+    extract_extension_tags(data).each { |tag| all_extension_tags << tag }
+
+    successful += 1
+    sleep 0.05 # Rate limiting (faster since we have many languages)
+  end
+
+  puts "\n  Successfully fetched: #{successful}/#{languages.size}"
+  puts "  Failed: #{failed.size} (#{failed.first(10).join(', ')}#{failed.size > 10 ? '...' : ''})" if failed.any?
+
+  # Convert Sets to sorted Arrays
+  result = {
+    "meta" => {
+      "generated_at" => Time.now.utc.iso8601,
+      "source" => "MediaWiki API (siteinfo via Wikimedia sitematrix)",
+      "languages_queried" => languages.size,
+      "languages_successful" => successful
+    },
+    "magic_words" => all_magic_words.transform_values { |v| v.to_a.sort },
+    "namespaces" => all_namespaces.transform_values { |v| v.to_a.sort },
+    "interwiki" => all_interwiki.transform_values { |v| v.to_a.sort },
+    "extension_tags" => all_extension_tags.to_a.sort
+  }
+
+  # Write output
+  output_path = File.join(__dir__, "..", "lib", "wp2txt", "data", "mediawiki_aliases.json")
+  FileUtils.mkdir_p(File.dirname(output_path))
+
+  File.write(output_path, JSON.pretty_generate(result))
+  puts "\nData written to: #{output_path}"
+
+  # Summary
+  puts "\n=== Summary ==="
+  puts "Magic Words:"
+  result["magic_words"].each do |name, aliases|
+    puts "  #{name}: #{aliases.size} aliases"
+  end
+  puts "\nNamespaces:"
+  result["namespaces"].each do |name, aliases|
+    puts "  #{name}: #{aliases.size} aliases"
+  end
+  puts "\nInterwiki:"
+  result["interwiki"].each do |name, prefixes|
+    puts "  #{name}: #{prefixes.size} prefixes"
+  end
+  puts "\nExtension Tags: #{result["extension_tags"].size} tags"
+end
+
+main if __FILE__ == $PROGRAM_NAME
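Note: the categories written by main above ("magic_words", "namespaces", "interwiki", "extension_tags") define the shape of mediawiki_aliases.json. A minimal sketch of how a consumer might read it back, assuming only that shape; the regex-building code here is illustrative and not part of wp2txt:

    require "json"

    data = JSON.parse(File.read("lib/wp2txt/data/mediawiki_aliases.json"))

    # The fetch script strips the leading "#" from redirect aliases,
    # so a matcher re-attaches it here.
    aliases = data["magic_words"]["redirect"] || []
    redirect_re = /\A#\s*(?:#{aliases.map { |a| Regexp.escape(a) }.join("|")})/i

    puts redirect_re.match?("#REDIRECT [[Main Page]]") # => true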
data/scripts/fetch_template_data.rb
@@ -0,0 +1,187 @@
+# frozen_string_literal: true
+
+# Fetches template aliases and redirects from Wikipedia APIs
+# Usage: ruby scripts/fetch_template_data.rb
+#
+# This script augments the existing template_aliases.json by:
+# - Fetching redirects for known templates (to discover aliases)
+# - Validating that templates exist
+# - Merging new aliases from multiple Wikipedia language editions
+#
+# Note: Unlike magic words and namespaces (which come from MediaWiki siteinfo),
+# templates are wiki-specific pages. This script queries a subset of major
+# Wikipedia editions to collect common template aliases.
+
+require "net/http"
+require "json"
+require "fileutils"
+require "set"
+require "time" # Time#iso8601
+
+# Languages to query for template data
+TEMPLATE_LANGUAGES = %w[en ja de fr es it ru zh pt nl pl ar ko].freeze
+
+# Base template names to look up redirects for (English)
+# These are canonical names; we'll find their translations and aliases
+BASE_TEMPLATES = {
+  "remove_templates" => %w[
+    Reflist Notelist Sfn Efn Main See_also Further About Portal
+  ],
+  "authority_control" => %w[
+    Authority_control Normdaten Persondata
+  ],
+  "citation_templates" => %w[
+    Cite_web Cite_book Cite_news Cite_journal Citation
+  ],
+  "sister_project_templates" => %w[
+    Commons Commons_category Wiktionary Wikiquote Wikisource
+  ]
+}.freeze
+
+# Fetch template redirects from a specific Wikipedia
+def fetch_template_redirects(lang, template_name)
+  uri = URI("https://#{lang}.wikipedia.org/w/api.php")
+  params = {
+    action: "query",
+    titles: "Template:#{template_name}",
+    prop: "redirects",
+    rdlimit: "max",
+    format: "json"
+  }
+  uri.query = URI.encode_www_form(params)
+
+  response = Net::HTTP.get_response(uri)
+  return [] unless response.is_a?(Net::HTTPSuccess)
+
+  data = JSON.parse(response.body)
+  pages = data.dig("query", "pages") || {}
+
+  redirects = []
+  pages.each_value do |page|
+    next if page["missing"]
+
+    # Add the page title itself (normalized)
+    page_title = page["title"]&.sub(/^Template:/, "")
+    redirects << page_title if page_title
+
+    # Add all redirects
+    (page["redirects"] || []).each do |rd|
+      rd_title = rd["title"]&.sub(/^Template:/, "")
+      redirects << rd_title if rd_title
+    end
+  end
+
+  redirects.compact.uniq
+rescue StandardError => e
+  warn "  Error fetching #{lang}:Template:#{template_name}: #{e.message}"
+  []
+end
+
+# Fetch all templates in a category
+def fetch_category_members(lang, category_name, limit: 100)
+  uri = URI("https://#{lang}.wikipedia.org/w/api.php")
+  params = {
+    action: "query",
+    list: "categorymembers",
+    cmtitle: "Category:#{category_name}",
+    cmtype: "page",
+    cmnamespace: "10", # Template namespace
+    cmlimit: limit.to_s,
+    format: "json"
+  }
+  uri.query = URI.encode_www_form(params)
+
+  response = Net::HTTP.get_response(uri)
+  return [] unless response.is_a?(Net::HTTPSuccess)
+
+  data = JSON.parse(response.body)
+  members = data.dig("query", "categorymembers") || []
+
+  members.map { |m| m["title"]&.sub(/^Template:/, "") }.compact
+rescue StandardError => e
+  warn "  Error fetching category #{category_name}: #{e.message}"
+  []
+end
+
+def main
+  puts "Template Data Fetcher"
+  puts "=" * 50
+
+  # Load existing data
+  data_path = File.join(__dir__, "..", "lib", "wp2txt", "data", "template_aliases.json")
+  existing_data = if File.exist?(data_path)
+                    JSON.parse(File.read(data_path))
+                  else
+                    { "meta" => {} }
+                  end
+
+  # Convert arrays to sets for efficient merging
+  categories = {}
+  existing_data.each do |key, value|
+    next if key == "meta"
+    categories[key] = Set.new(value) if value.is_a?(Array)
+  end
+
+  puts "\nFetching template redirects from #{TEMPLATE_LANGUAGES.size} Wikipedia editions..."
+
+  # Fetch redirects for base templates
+  BASE_TEMPLATES.each do |category, templates|
+    puts "\n#{category}:"
+    templates.each do |template|
+      TEMPLATE_LANGUAGES.each do |lang|
+        print "  #{lang}:#{template}..."
+        aliases = fetch_template_redirects(lang, template)
+        if aliases.any?
+          categories[category] ||= Set.new
+          aliases.each { |a| categories[category] << a }
+          puts " #{aliases.size} aliases"
+        else
+          puts " not found"
+        end
+        sleep 0.1 # Rate limiting
+      end
+    end
+  end
+
+  # Fetch citation templates from category (English)
+  puts "\nFetching citation templates from category..."
+  citation_members = fetch_category_members("en", "Citation templates")
+  if citation_members.any?
+    categories["citation_templates"] ||= Set.new
+    citation_members.each { |t| categories["citation_templates"] << t }
+    puts "  Found #{citation_members.size} citation templates"
+  end
+
+  # Fetch hatnote templates
+  puts "\nFetching hatnote templates from category..."
+  hatnote_members = fetch_category_members("en", "Hatnote templates")
+  if hatnote_members.any?
+    categories["remove_templates"] ||= Set.new
+    hatnote_members.each { |t| categories["remove_templates"] << t }
+    puts "  Found #{hatnote_members.size} hatnote templates"
+  end
+
+  # Convert sets back to sorted arrays
+  result = { "meta" => existing_data["meta"] || {} }
+  result["meta"]["generated_at"] = Time.now.utc.iso8601
+  result["meta"]["source"] = "Manual curation + MediaWiki API (templates)"
+  result["meta"]["languages_queried"] = TEMPLATE_LANGUAGES
+
+  categories.each do |key, set|
+    result[key] = set.to_a.sort_by(&:downcase)
+  end
+
+  # Write output
+  File.write(data_path, JSON.pretty_generate(result))
+  puts "\n" + "=" * 50
+  puts "Data written to: #{data_path}"
+
+  # Summary
+  puts "\n=== Summary ==="
+  result.each do |key, value|
+    next if key == "meta"
+    puts "  #{key}: #{value.size} templates"
+  end
+end
+
+main if __FILE__ == $PROGRAM_NAME
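Note: a minimal sketch of reading the refreshed template_aliases.json back, assuming only the category keys this script writes ("remove_templates", "citation_templates", etc.); the lookup helper is illustrative, not part of wp2txt:

    require "json"
    require "set"

    data = JSON.parse(File.read("lib/wp2txt/data/template_aliases.json"))

    # Downcase both sides for a forgiving lookup, mirroring the
    # case-insensitive sort (sort_by(&:downcase)) used when writing.
    removable = Set.new((data["remove_templates"] || []).map(&:downcase))

    puts removable.include?("reflist") # => true once Reflist has been fetched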
data/scripts/profile_memory.rb
@@ -0,0 +1,139 @@
+# frozen_string_literal: true
+
+# Memory profiling script for wp2txt streaming operations
+# Demonstrates the MemoryMonitor module and adaptive buffer sizing
+#
+# Usage: ruby scripts/profile_memory.rb [input_file.xml]
+
+$LOAD_PATH.unshift(File.expand_path("../lib", __dir__))
+require "wp2txt"
+require "wp2txt/memory_monitor"
+require "wp2txt/stream_processor"
+
+def print_separator(title = nil)
+  puts
+  puts "=" * 60
+  puts title if title
+  puts "=" * 60
+end
+
+def profile_memory_monitor
+  print_separator("MemoryMonitor Module Test")
+
+  puts "System Memory Information:"
+  puts "-" * 40
+
+  stats = Wp2txt::MemoryMonitor.memory_stats
+  stats.each do |key, value|
+    formatted_key = key.to_s.gsub("_", " ").capitalize
+    puts "  #{formatted_key}: #{value}"
+  end
+
+  puts
+  puts "Raw values:"
+  puts "  Current usage: #{Wp2txt::MemoryMonitor.format_memory(Wp2txt::MemoryMonitor.current_memory_usage)}"
+  puts "  Total system: #{Wp2txt::MemoryMonitor.format_memory(Wp2txt::MemoryMonitor.total_system_memory)}"
+  puts "  Available: #{Wp2txt::MemoryMonitor.format_memory(Wp2txt::MemoryMonitor.available_memory)}"
+  puts "  Optimal buffer: #{Wp2txt::MemoryMonitor.format_memory(Wp2txt::MemoryMonitor.optimal_buffer_size)}"
+end
+
+def profile_stream_processor(input_file)
+  print_separator("StreamProcessor Memory Profile")
+
+  unless File.exist?(input_file)
+    puts "File not found: #{input_file}"
+    puts "Creating a sample XML file for testing..."
+
+    # Create a sample file for testing
+    sample_xml = <<~XML
+      <mediawiki>
+      #{(1..100).map { |i| <<~PAGE
+        <page>
+          <title>Test Article #{i}</title>
+          <revision>
+            <text>This is test content for article #{i}. #{"Lorem ipsum " * 100}</text>
+          </revision>
+        </page>
+      PAGE
+      }.join}
+      </mediawiki>
+    XML
+
+    input_file = "/tmp/wp2txt_test_sample.xml"
+    File.write(input_file, sample_xml)
+    puts "Created sample file: #{input_file} (#{File.size(input_file)} bytes)"
+  end
+
+  puts
+  puts "Processing: #{input_file}"
+  puts "File size: #{Wp2txt::MemoryMonitor.format_memory(File.size(input_file))}"
+  puts
+
+  # Test with adaptive buffer
+  puts "Testing with adaptive buffer sizing:"
+  puts "-" * 40
+
+  processor = Wp2txt::StreamProcessor.new(input_file, adaptive_buffer: true)
+  puts "Initial buffer size: #{Wp2txt::MemoryMonitor.format_memory(processor.buffer_size)}"
+
+  start_time = Time.now
+  start_memory = Wp2txt::MemoryMonitor.current_memory_usage
+
+  page_count = 0
+  processor.each_page do |title, _text|
+    page_count += 1
+    if page_count % 10 == 0
+      stats = processor.stats
+      puts "  Processed #{page_count} pages, buffer: #{Wp2txt::MemoryMonitor.format_memory(stats[:buffer_size])}"
+    end
+  end
+
+  end_time = Time.now
+  end_memory = Wp2txt::MemoryMonitor.current_memory_usage
+
+  puts
+  puts "Final Statistics:"
+  final_stats = processor.stats
+  puts "  Pages processed: #{final_stats[:pages_processed]}"
+  puts "  Bytes read: #{Wp2txt::MemoryMonitor.format_memory(final_stats[:bytes_read])}"
+  puts "  Final buffer size: #{Wp2txt::MemoryMonitor.format_memory(final_stats[:buffer_size])}"
+  puts "  Processing time: #{(end_time - start_time).round(3)}s"
+  puts "  Memory delta: #{Wp2txt::MemoryMonitor.format_memory(end_memory - start_memory)}"
+
+  # Test without adaptive buffer
+  puts
+  puts "Testing with fixed buffer sizing (10 MB):"
+  puts "-" * 40
+
+  processor2 = Wp2txt::StreamProcessor.new(input_file, adaptive_buffer: false)
+  puts "Fixed buffer size: #{Wp2txt::MemoryMonitor.format_memory(processor2.buffer_size)}"
+
+  start_time = Time.now
+  start_memory = Wp2txt::MemoryMonitor.current_memory_usage
+
+  page_count = 0
+  processor2.each_page do |_title, _text|
+    page_count += 1
+  end
+
+  end_time = Time.now
+  end_memory = Wp2txt::MemoryMonitor.current_memory_usage
+
+  final_stats = processor2.stats
+  puts "  Pages processed: #{final_stats[:pages_processed]}"
+  puts "  Processing time: #{(end_time - start_time).round(3)}s"
+  puts "  Memory delta: #{Wp2txt::MemoryMonitor.format_memory(end_memory - start_memory)}"
+end
+
+def main
+  profile_memory_monitor
+
+  input_file = ARGV[0] || "/tmp/wp2txt_test_sample.xml"
+  profile_stream_processor(input_file)
+
+  print_separator("Memory Profiling Complete")
+  puts "Current memory: #{Wp2txt::MemoryMonitor.format_memory(Wp2txt::MemoryMonitor.current_memory_usage)}"
+  puts "Memory low?: #{Wp2txt::MemoryMonitor.memory_low?}"
+end
+
+main
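Note: the profiler uses only public calls that appear in this diff (MemoryMonitor.format_memory, StreamProcessor#each_page, StreamProcessor#stats, StreamProcessor#buffer_size). The smallest equivalent usage, as a sketch assuming a dump file at pages.xml:

    $LOAD_PATH.unshift(File.expand_path("../lib", __dir__))
    require "wp2txt"
    require "wp2txt/memory_monitor"
    require "wp2txt/stream_processor"

    # Adaptive buffering re-tunes the read buffer as available memory changes.
    processor = Wp2txt::StreamProcessor.new("pages.xml", adaptive_buffer: true)
    processor.each_page do |title, text|
      puts "#{title}: #{text.bytesize} bytes"
    end
    puts "Buffer ended at #{Wp2txt::MemoryMonitor.format_memory(processor.stats[:buffer_size])}"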