wp2txt 1.1.3 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. checksums.yaml +4 -4
  2. data/.dockerignore +12 -0
  3. data/.github/workflows/ci.yml +13 -13
  4. data/.gitignore +14 -0
  5. data/CHANGELOG.md +284 -0
  6. data/DEVELOPMENT.md +415 -0
  7. data/DEVELOPMENT_ja.md +415 -0
  8. data/Dockerfile +19 -10
  9. data/Gemfile +2 -8
  10. data/README.md +259 -123
  11. data/README_ja.md +375 -0
  12. data/Rakefile +4 -0
  13. data/bin/wp2txt +863 -161
  14. data/lib/wp2txt/article.rb +98 -13
  15. data/lib/wp2txt/bz2_validator.rb +239 -0
  16. data/lib/wp2txt/category_cache.rb +313 -0
  17. data/lib/wp2txt/cli.rb +319 -0
  18. data/lib/wp2txt/cli_ui.rb +428 -0
  19. data/lib/wp2txt/config.rb +158 -0
  20. data/lib/wp2txt/constants.rb +134 -0
  21. data/lib/wp2txt/data/html_entities.json +2135 -0
  22. data/lib/wp2txt/data/language_metadata.json +4769 -0
  23. data/lib/wp2txt/data/language_tiers.json +59 -0
  24. data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
  25. data/lib/wp2txt/data/template_aliases.json +193 -0
  26. data/lib/wp2txt/data/wikipedia_entities.json +12 -0
  27. data/lib/wp2txt/extractor.rb +545 -0
  28. data/lib/wp2txt/file_utils.rb +91 -0
  29. data/lib/wp2txt/formatter.rb +352 -0
  30. data/lib/wp2txt/global_data_cache.rb +353 -0
  31. data/lib/wp2txt/index_cache.rb +258 -0
  32. data/lib/wp2txt/magic_words.rb +353 -0
  33. data/lib/wp2txt/memory_monitor.rb +236 -0
  34. data/lib/wp2txt/multistream.rb +1383 -0
  35. data/lib/wp2txt/output_writer.rb +182 -0
  36. data/lib/wp2txt/parser_functions.rb +606 -0
  37. data/lib/wp2txt/ractor_worker.rb +215 -0
  38. data/lib/wp2txt/regex.rb +396 -12
  39. data/lib/wp2txt/section_extractor.rb +354 -0
  40. data/lib/wp2txt/stream_processor.rb +271 -0
  41. data/lib/wp2txt/template_expander.rb +830 -0
  42. data/lib/wp2txt/text_processing.rb +337 -0
  43. data/lib/wp2txt/utils.rb +629 -270
  44. data/lib/wp2txt/version.rb +1 -1
  45. data/lib/wp2txt.rb +53 -26
  46. data/scripts/benchmark_regex.rb +161 -0
  47. data/scripts/fetch_html_entities.rb +94 -0
  48. data/scripts/fetch_language_metadata.rb +180 -0
  49. data/scripts/fetch_mediawiki_data.rb +334 -0
  50. data/scripts/fetch_template_data.rb +186 -0
  51. data/scripts/profile_memory.rb +139 -0
  52. data/spec/article_spec.rb +402 -0
  53. data/spec/auto_download_spec.rb +314 -0
  54. data/spec/bz2_validator_spec.rb +193 -0
  55. data/spec/category_cache_spec.rb +226 -0
  56. data/spec/category_fetcher_spec.rb +504 -0
  57. data/spec/cleanup_spec.rb +197 -0
  58. data/spec/cli_options_spec.rb +678 -0
  59. data/spec/cli_spec.rb +876 -0
  60. data/spec/config_spec.rb +194 -0
  61. data/spec/constants_spec.rb +138 -0
  62. data/spec/file_utils_spec.rb +170 -0
  63. data/spec/fixtures/samples.rb +181 -0
  64. data/spec/formatter_sections_spec.rb +382 -0
  65. data/spec/global_data_cache_spec.rb +186 -0
  66. data/spec/index_cache_spec.rb +210 -0
  67. data/spec/integration_spec.rb +543 -0
  68. data/spec/magic_words_spec.rb +261 -0
  69. data/spec/markers_spec.rb +476 -0
  70. data/spec/memory_monitor_spec.rb +192 -0
  71. data/spec/multistream_spec.rb +690 -0
  72. data/spec/output_writer_spec.rb +400 -0
  73. data/spec/parser_functions_spec.rb +455 -0
  74. data/spec/ractor_worker_spec.rb +197 -0
  75. data/spec/regex_spec.rb +281 -0
  76. data/spec/section_extractor_spec.rb +397 -0
  77. data/spec/spec_helper.rb +63 -0
  78. data/spec/stream_processor_spec.rb +579 -0
  79. data/spec/template_data_spec.rb +246 -0
  80. data/spec/template_expander_spec.rb +472 -0
  81. data/spec/template_processing_spec.rb +217 -0
  82. data/spec/text_processing_spec.rb +312 -0
  83. data/spec/utils_spec.rb +195 -16
  84. data/spec/wp2txt_spec.rb +510 -0
  85. data/wp2txt.gemspec +5 -3
  86. metadata +146 -18
  87. data/.rubocop.yml +0 -80
  88. data/data/output_samples/testdata_en.txt +0 -23002
  89. data/data/output_samples/testdata_en_category.txt +0 -132
  90. data/data/output_samples/testdata_en_summary.txt +0 -1376
  91. data/data/output_samples/testdata_ja.txt +0 -22774
  92. data/data/output_samples/testdata_ja_category.txt +0 -206
  93. data/data/output_samples/testdata_ja_summary.txt +0 -1560
  94. data/data/testdata_en.bz2 +0 -0
  95. data/data/testdata_ja.bz2 +0 -0
  96. data/image/screenshot.png +0 -0
@@ -0,0 +1,334 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Fetches magic words, namespace data, and interwiki info from Wikipedia APIs
4
+ # Usage: ruby scripts/fetch_mediawiki_data.rb
5
+ #
6
+ # This script queries the MediaWiki API for all Wikipedia language editions
7
+ # and extracts:
8
+ # - Magic words (redirect, defaultsort, displaytitle, image options, etc.)
9
+ # - Double-underscore behavior switches (__NOTOC__, __TOC__, etc.)
10
+ # - Namespace names and aliases (Category, File, Template, etc.)
11
+ # - Interwiki map (for sister projects)
12
+
13
require "net/http"
require "json"
require "fileutils"
# Explicit requires for stdlib features used later in this script:
# - Set (the alias accumulators in main) is only autoloaded on Ruby >= 3.2
# - Time#iso8601 (the "generated_at" metadata stamp) lives in the "time" stdlib
require "set"
require "time"
16
+
17
# Queries the Wikimedia sitematrix API and returns a sorted Array of
# language codes that have a Wikipedia edition (a site with code "wiki").
# Returns [] on HTTP failure or any unexpected error.
def fetch_all_wikipedia_languages
  endpoint = URI("https://meta.wikimedia.org/w/api.php")
  endpoint.query = URI.encode_www_form(
    action: "sitematrix",
    smtype: "language",
    format: "json"
  )

  reply = Net::HTTP.get_response(endpoint)
  return [] unless reply.is_a?(Net::HTTPSuccess)

  matrix = JSON.parse(reply.body)["sitematrix"]

  codes = matrix.filter_map do |key, entry|
    # Numeric keys hold language entries; other keys (e.g. "count") are skipped.
    next unless key.match?(/^\d+$/) && entry.is_a?(Hash) && entry["site"]

    # Keep the language only if one of its sites is a Wikipedia ("wiki").
    entry["code"] if entry["site"].any? { |site| site["code"] == "wiki" }
  end

  codes.sort
rescue StandardError => e
  warn "Error fetching language list: #{e.message}"
  []
end
46
+
47
# Magic word types we care about for text processing
RELEVANT_MAGIC_WORDS = %w[
  redirect
  notoc noeditsection nogallery forcetoc toc nocontentconvert nocc
  notitleconvert notc displaytitle defaultsort
  img_thumbnail img_manualthumb img_right img_left img_none img_center
  img_framed img_frameless img_page img_upright img_border img_baseline
  img_sub img_super img_top img_text_top img_middle img_bottom img_text_bottom
  img_link img_alt img_class img_lang
].freeze

# All namespace IDs (negative = special, 0 = main article, positive = other)
# We want to collect all non-article namespaces for filtering
# ID 0 is main namespace (articles), we want everything else
# NOTE(review): this constant is not referenced elsewhere in this script —
# confirm whether a consumer uses it or whether it is dead data.
NON_ARTICLE_NAMESPACE_IDS = [
  -2, # Media
  -1, # Special
  # 0 is main article namespace - skip
  1,  # Talk
  2,  # User
  3,  # User talk
  4,  # Project (Wikipedia)
  5,  # Project talk
  6,  # File
  7,  # File talk
  8,  # MediaWiki
  9,  # MediaWiki talk
  10, # Template
  11, # Template talk
  12, # Help
  13, # Help talk
  14, # Category
  15, # Category talk
  # 100+ are custom namespaces per wiki (Portal, WikiProject, Module, etc.)
].freeze

# We still need specific namespace collections for targeted operations
# Maps an alias-category name => canonical MediaWiki namespace ID.
SPECIFIC_NAMESPACE_IDS = {
  "file" => 6,
  "category" => 14,
  "template" => 10,
  "wikipedia" => 4,
  "help" => 12,
  "portal" => 100, # May not exist in all wikis
  "module" => 828 # May not exist in all wikis
}.freeze
93
+
94
# Fetches siteinfo (magic words, namespaces plus aliases, the interwiki
# map, and extension tags) for one Wikipedia language edition.
# Returns the parsed JSON Hash, or nil on HTTP failure / any error.
def fetch_siteinfo(lang)
  endpoint = URI("https://#{lang}.wikipedia.org/w/api.php")
  endpoint.query = URI.encode_www_form(
    action: "query",
    meta: "siteinfo",
    siprop: "magicwords|namespaces|namespacealiases|interwikimap|extensiontags",
    format: "json"
  )

  reply = Net::HTTP.get_response(endpoint)
  reply.is_a?(Net::HTTPSuccess) ? JSON.parse(reply.body) : nil
rescue StandardError => e
  warn " Error fetching #{lang}: #{e.message}"
  nil
end
112
+
113
# Normalizes the siteinfo "extensiontags" list (entries like "<ref>")
# into bare, de-duplicated tag names (like "ref").
# Returns [] when the data is absent.
def extract_extension_tags(data)
  wrapped_tags = data&.dig("query", "extensiontags")
  return [] unless wrapped_tags

  # Strip the surrounding angle brackets from each entry.
  wrapped_tags.map { |wrapped| wrapped.gsub(/^<|>$/, "") }.compact.uniq
end
121
+
122
# Extracts the magic-word aliases relevant to text processing from a
# siteinfo response.
#
# Returns a Hash of magic-word name => [unique aliases], plus a synthetic
# "double_underscore" entry collecting every __BEHAVIOR_SWITCH__ alias
# seen on any magic word. Returns {} when the data is absent.
def extract_magic_words(data)
  return {} unless data&.dig("query", "magicwords")

  result = {}
  double_underscore = []

  data["query"]["magicwords"].each do |mw|
    name = mw["name"]
    aliases = mw["aliases"] || []

    # Behavior switches look like __NOTOC__; collect them across all entries.
    double_underscore_aliases = aliases.select { |a| a.start_with?("__") && a.end_with?("__") }
    double_underscore.concat(double_underscore_aliases) unless double_underscore_aliases.empty?

    # Only keep specific magic words we care about
    next unless RELEVANT_MAGIC_WORDS.include?(name)

    # Remove the leading hash prefix for redirect (the consumer adds it
    # back in its own regex). FIX: the character class previously read
    # [##] — a duplicated ASCII hash — so the fullwidth hash "＃" used by
    # some CJK wikis was never stripped; match both forms.
    if name == "redirect"
      aliases = aliases.map { |a| a.sub(/^[#＃]/, "") }
    end
    result[name] = aliases.uniq
  end

  # Add double-underscore as a special category
  result["double_underscore"] = double_underscore.uniq unless double_underscore.empty?

  result
end
151
+
152
# Collects namespace names and aliases from a siteinfo response.
#
# The returned Hash contains:
#   - one entry per SPECIFIC_NAMESPACE_IDS key that was found, e.g.
#     "category" => ["Category", "Kategorie", ...]
#   - "non_article" => every namespace name/alias whose id is not 0
# All values are de-duplicated. Returns {} when the data is absent.
def extract_namespaces(data)
  return {} unless data&.dig("query", "namespaces")

  result = {}
  non_article = []

  data["query"]["namespaces"].each do |id, ns|
    numeric_id = id.to_i
    localized = ns["*"]
    canonical = ns["canonical"]

    # Anything outside the main namespace (id 0) is a non-article namespace.
    if numeric_id != 0 && localized && !localized.empty?
      non_article << localized
      non_article << canonical if canonical && !canonical.empty?
    end

    # Record names for the namespaces we track individually.
    key = SPECIFIC_NAMESPACE_IDS.key(numeric_id)
    next unless key

    result[key] ||= []
    result[key] << canonical if canonical && !canonical.empty?
    result[key] << localized if localized && !localized.empty? && localized != canonical
  end

  # Fold in the localized namespace aliases.
  (data["query"]["namespacealiases"] || []).each do |alias_info|
    alias_name = alias_info["*"]
    next unless alias_name && !alias_name.empty?

    alias_id = alias_info["id"]
    non_article << alias_name if alias_id != 0

    key = SPECIFIC_NAMESPACE_IDS.key(alias_id)
    if key
      result[key] ||= []
      result[key] << alias_name
    end
  end

  result["non_article"] = non_article.uniq
  result.transform_values!(&:uniq)
  result
end
203
+
204
# Identifies Wikimedia sister-project interwiki prefixes in a siteinfo
# response. An entry counts as a sister project when its prefix is a
# known Wikimedia prefix or its URL points at a Wikimedia-family domain.
# Returns { "sister_projects" => [unique prefixes] }, or {} when absent.
def extract_interwiki(data)
  return {} unless data&.dig("query", "interwikimap")

  known_prefixes = %w[
    commons wikibooks wikinews wikiquote wikisource
    wikiversity wikivoyage wiktionary wikidata wikispecies
    meta mediawiki mediawikiwiki species
  ]
  wikimedia_domains = %w[
    wikimedia.org wikipedia.org wikibooks.org wikinews.org
    wikiquote.org wikisource.org wikiversity.org wikivoyage.org
    wiktionary.org wikidata.org wikispecies.org mediawiki.org
  ]

  wikimedia_entries = data["query"]["interwikimap"].select do |entry|
    url = entry["url"] || ""
    known_prefixes.include?(entry["prefix"]) ||
      wikimedia_domains.any? { |domain| url.include?(domain) }
  end

  { "sister_projects" => wikimedia_entries.map { |entry| entry["prefix"] }.uniq }
end
242
+
243
# Orchestrates the full data fetch:
#   1. Lists every Wikipedia language edition via the sitematrix API.
#   2. Fetches siteinfo for each edition and merges magic words,
#      namespaces, interwiki prefixes, and extension tags into Sets.
#   3. Writes the merged, sorted data to
#      lib/wp2txt/data/mediawiki_aliases.json and prints a summary.
# Exits with status 1 if the language list cannot be fetched.
def main
  puts "Fetching list of all Wikipedia languages..."
  languages = fetch_all_wikipedia_languages

  if languages.empty?
    warn "Failed to fetch language list. Aborting."
    exit 1
  end

  puts "Found #{languages.size} Wikipedia editions. Fetching data..."

  # Accumulators: name => Set of aliases, merged across all languages.
  # NOTE(review): Set and Time#iso8601 are used below — verify that the
  # "set" and "time" stdlib libraries are required at the top of this file
  # (Set is only autoloaded on Ruby >= 3.2).
  all_magic_words = Hash.new { |h, k| h[k] = Set.new }
  all_namespaces = Hash.new { |h, k| h[k] = Set.new }
  all_interwiki = Hash.new { |h, k| h[k] = Set.new }
  all_extension_tags = Set.new
  successful = 0
  failed = []

  languages.each_with_index do |lang, idx|
    # \r keeps the progress display on one line.
    print "\r Processing: #{lang.ljust(10)} (#{idx + 1}/#{languages.size})"
    $stdout.flush

    data = fetch_siteinfo(lang)
    unless data
      failed << lang
      next
    end

    # Merge magic words
    extract_magic_words(data).each do |name, aliases|
      aliases.each { |a| all_magic_words[name] << a }
    end

    # Merge namespaces
    extract_namespaces(data).each do |name, aliases|
      aliases.each { |a| all_namespaces[name] << a }
    end

    # Merge interwiki
    extract_interwiki(data).each do |name, prefixes|
      prefixes.each { |p| all_interwiki[name] << p }
    end

    # Merge extension tags (consistent across all wikis, but collect from all to be safe)
    extract_extension_tags(data).each { |tag| all_extension_tags << tag }

    successful += 1
    sleep 0.05 # Rate limiting (faster since we have many languages)
  end

  puts "\n Successfully fetched: #{successful}/#{languages.size}"
  puts " Failed: #{failed.size} (#{failed.first(10).join(', ')}#{failed.size > 10 ? '...' : ''})" if failed.any?

  # Convert Sets to sorted Arrays
  result = {
    "meta" => {
      "generated_at" => Time.now.utc.iso8601,
      "source" => "MediaWiki API (siteinfo via Wikimedia sitematrix)",
      "languages_queried" => languages.size,
      "languages_successful" => successful
    },
    "magic_words" => all_magic_words.transform_values { |v| v.to_a.sort },
    "namespaces" => all_namespaces.transform_values { |v| v.to_a.sort },
    "interwiki" => all_interwiki.transform_values { |v| v.to_a.sort },
    "extension_tags" => all_extension_tags.to_a.sort
  }

  # Write output
  output_path = File.join(__dir__, "..", "lib", "wp2txt", "data", "mediawiki_aliases.json")
  FileUtils.mkdir_p(File.dirname(output_path))

  File.write(output_path, JSON.pretty_generate(result))
  puts "\nData written to: #{output_path}"

  # Summary
  puts "\n=== Summary ==="
  puts "Magic Words:"
  result["magic_words"].each do |name, aliases|
    puts " #{name}: #{aliases.size} aliases"
  end
  puts "\nNamespaces:"
  result["namespaces"].each do |name, aliases|
    puts " #{name}: #{aliases.size} aliases"
  end
  puts "\nInterwiki:"
  result["interwiki"].each do |name, prefixes|
    puts " #{name}: #{prefixes.size} prefixes"
  end
  puts "\nExtension Tags: #{result["extension_tags"].size} tags"
end

# Only run when executed directly (allows requiring this file in tests).
main if __FILE__ == $PROGRAM_NAME
@@ -0,0 +1,186 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Fetches template aliases and redirects from Wikipedia APIs
4
+ # Usage: ruby scripts/fetch_template_data.rb
5
+ #
6
+ # This script augments the existing template_aliases.json by:
7
+ # - Fetching redirects for known templates (to discover aliases)
8
+ # - Validating that templates exist
9
+ # - Merging new aliases from multiple Wikipedia language editions
10
+ #
11
+ # Note: Unlike magic words and namespaces (which come from MediaWiki siteinfo),
12
+ # templates are wiki-specific pages. This script queries a subset of major
13
+ # Wikipedia editions to collect common template aliases.
14
+
15
require "net/http"
require "json"
require "fileutils"
require "set"
# Time#iso8601 (used for the "generated_at" metadata stamp) lives in the
# "time" stdlib and is not available from Ruby core alone.
require "time"
19
+
20
+ # Languages to query for template data
21
+ TEMPLATE_LANGUAGES = %w[en ja de fr es it ru zh pt nl pl ar ko].freeze
22
+
23
+ # Base template names to look up redirects for (English)
24
+ # These are canonical names; we'll find their translations and aliases
25
+ BASE_TEMPLATES = {
26
+ "remove_templates" => %w[
27
+ Reflist Notelist Sfn Efn Main See_also Further About Portal
28
+ ],
29
+ "authority_control" => %w[
30
+ Authority_control Normdaten Persondata
31
+ ],
32
+ "citation_templates" => %w[
33
+ Cite_web Cite_book Cite_news Cite_journal Citation
34
+ ],
35
+ "sister_project_templates" => %w[
36
+ Commons Commons_category Wiktionary Wikiquote Wikisource
37
+ ]
38
+ }.freeze
39
+
40
# Queries one Wikipedia edition for the redirects pointing at
# Template:<template_name>. Returns the template's own normalized name
# plus every redirect title, all with the "Template:" prefix stripped.
# Returns [] when the page is missing, on HTTP failure, or on any error.
def fetch_template_redirects(lang, template_name)
  endpoint = URI("https://#{lang}.wikipedia.org/w/api.php")
  endpoint.query = URI.encode_www_form(
    action: "query",
    titles: "Template:#{template_name}",
    prop: "redirects",
    rdlimit: "max",
    format: "json"
  )

  reply = Net::HTTP.get_response(endpoint)
  return [] unless reply.is_a?(Net::HTTPSuccess)

  pages = JSON.parse(reply.body).dig("query", "pages") || {}

  names = []
  pages.each_value do |page|
    next if page["missing"]

    # The canonical page title itself counts as an alias.
    names << page["title"]&.sub(/^Template:/, "")

    # Every redirect title is an alias too.
    (page["redirects"] || []).each do |redirect|
      names << redirect["title"]&.sub(/^Template:/, "")
    end
  end

  names.compact.uniq
rescue StandardError => e
  warn " Error fetching #{lang}:Template:#{template_name}: #{e.message}"
  []
end
+ end
78
+
79
# Fetches up to `limit` template pages that are members of
# Category:<category_name> on the given Wikipedia edition, returning
# their names with the "Template:" prefix stripped.
# Returns [] on HTTP failure or any unexpected error.
def fetch_category_members(lang, category_name, limit: 100)
  endpoint = URI("https://#{lang}.wikipedia.org/w/api.php")
  endpoint.query = URI.encode_www_form(
    action: "query",
    list: "categorymembers",
    cmtitle: "Category:#{category_name}",
    cmtype: "page",
    cmnamespace: "10", # Template namespace
    cmlimit: limit.to_s,
    format: "json"
  )

  reply = Net::HTTP.get_response(endpoint)
  return [] unless reply.is_a?(Net::HTTPSuccess)

  members = JSON.parse(reply.body).dig("query", "categorymembers") || []
  members.filter_map { |member| member["title"]&.sub(/^Template:/, "") }
rescue StandardError => e
  warn " Error fetching category #{category_name}: #{e.message}"
  []
end
104
+
105
# Entry point: loads the existing template_aliases.json, augments each
# alias category with template redirects gathered from TEMPLATE_LANGUAGES
# and with members of the English "Citation templates" / "Hatnote
# templates" categories, then writes the merged, sorted data back to the
# same file and prints a per-category summary.
def main
  puts "Template Data Fetcher"
  puts "=" * 50

  # Load existing data
  data_path = File.join(__dir__, "..", "lib", "wp2txt", "data", "template_aliases.json")
  existing_data = if File.exist?(data_path)
                    JSON.parse(File.read(data_path))
                  else
                    { "meta" => {} }
                  end

  # Convert arrays to sets for efficient merging
  categories = {}
  existing_data.each do |key, value|
    next if key == "meta"
    categories[key] = Set.new(value) if value.is_a?(Array)
  end

  puts "\nFetching template redirects from #{TEMPLATE_LANGUAGES.size} Wikipedia editions..."

  # Fetch redirects for base templates
  BASE_TEMPLATES.each do |category, templates|
    puts "\n#{category}:"
    templates.each do |template|
      TEMPLATE_LANGUAGES.each do |lang|
        print " #{lang}:#{template}..."
        aliases = fetch_template_redirects(lang, template)
        if aliases.any?
          categories[category] ||= Set.new
          aliases.each { |a| categories[category] << a }
          puts " #{aliases.size} aliases"
        else
          puts " not found"
        end
        sleep 0.1 # Rate limiting
      end
    end
  end

  # Fetch citation templates from category (English)
  puts "\nFetching citation templates from category..."
  citation_members = fetch_category_members("en", "Citation templates")
  if citation_members.any?
    categories["citation_templates"] ||= Set.new
    citation_members.each { |t| categories["citation_templates"] << t }
    puts " Found #{citation_members.size} citation templates"
  end

  # Fetch hatnote templates
  puts "\nFetching hatnote templates from category..."
  hatnote_members = fetch_category_members("en", "Hatnote templates")
  if hatnote_members.any?
    categories["remove_templates"] ||= Set.new
    hatnote_members.each { |t| categories["remove_templates"] << t }
    puts " Found #{hatnote_members.size} hatnote templates"
  end

  # Convert sets back to sorted arrays
  # NOTE(review): Time#iso8601 needs the "time" stdlib — verify it is
  # required at the top of this file.
  result = { "meta" => existing_data["meta"] || {} }
  result["meta"]["generated_at"] = Time.now.utc.iso8601
  result["meta"]["source"] = "Manual curation + MediaWiki API (templates)"
  result["meta"]["languages_queried"] = TEMPLATE_LANGUAGES

  categories.each do |key, set|
    # Case-insensitive sort keeps e.g. "about" adjacent to "About".
    result[key] = set.to_a.sort_by(&:downcase)
  end

  # Write output
  File.write(data_path, JSON.pretty_generate(result))
  puts "\n" + "=" * 50
  puts "Data written to: #{data_path}"

  # Summary
  puts "\n=== Summary ==="
  result.each do |key, value|
    next if key == "meta"
    puts " #{key}: #{value.size} templates"
  end
end

# Only run when executed directly (allows requiring this file in tests).
main if __FILE__ == $PROGRAM_NAME
@@ -0,0 +1,139 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Memory profiling script for wp2txt streaming operations
4
+ # Demonstrates the MemoryMonitor module and adaptive buffer sizing
5
+ #
6
+ # Usage: ruby scripts/profile_memory.rb [input_file.xml]
7
+
8
+ $LOAD_PATH.unshift(File.expand_path("../lib", __dir__))
9
+ require "wp2txt"
10
+ require "wp2txt/memory_monitor"
11
+ require "wp2txt/stream_processor"
12
+
13
# Prints a blank line followed by a 60-character "=" rule, the optional
# title (when given), and a closing rule.
def print_separator(title = nil)
  bar = "=" * 60
  banner = ["", bar]
  banner << title if title
  banner << bar
  banner.each { |line| puts line }
end
19
+
20
# Prints a formatted report of system memory statistics from the
# Wp2txt::MemoryMonitor module, followed by the raw byte values
# (current usage, total system, available, optimal buffer size).
def profile_memory_monitor
  print_separator("MemoryMonitor Module Test")

  puts "System Memory Information:"
  puts "-" * 40

  stats = Wp2txt::MemoryMonitor.memory_stats
  stats.each do |key, value|
    # e.g. :total_system => "Total system"
    formatted_key = key.to_s.gsub("_", " ").capitalize
    puts " #{formatted_key}: #{value}"
  end

  puts
  puts "Raw values:"
  puts " Current usage: #{Wp2txt::MemoryMonitor.format_memory(Wp2txt::MemoryMonitor.current_memory_usage)}"
  puts " Total system: #{Wp2txt::MemoryMonitor.format_memory(Wp2txt::MemoryMonitor.total_system_memory)}"
  puts " Available: #{Wp2txt::MemoryMonitor.format_memory(Wp2txt::MemoryMonitor.available_memory)}"
  puts " Optimal buffer: #{Wp2txt::MemoryMonitor.format_memory(Wp2txt::MemoryMonitor.optimal_buffer_size)}"
end
39
+
40
# Profiles Wp2txt::StreamProcessor over input_file twice — once with
# adaptive buffer sizing, once with a fixed buffer — printing page counts,
# buffer sizes, timing, and process-memory deltas for each run.
#
# If input_file does not exist, a ~100-page sample MediaWiki XML dump is
# generated at /tmp/wp2txt_test_sample.xml and profiled instead.
def profile_stream_processor(input_file)
  print_separator("StreamProcessor Memory Profile")

  unless File.exist?(input_file)
    puts "File not found: #{input_file}"
    puts "Creating a sample XML file for testing..."

    # Create a sample file for testing
    sample_xml = <<~XML
      <mediawiki>
      #{(1..100).map { |i| <<~PAGE
        <page>
          <title>Test Article #{i}</title>
          <revision>
            <text>This is test content for article #{i}. #{"Lorem ipsum " * 100}</text>
          </revision>
        </page>
      PAGE
      }.join}
      </mediawiki>
    XML

    input_file = "/tmp/wp2txt_test_sample.xml"
    File.write(input_file, sample_xml)
    puts "Created sample file: #{input_file} (#{File.size(input_file)} bytes)"
  end

  puts
  puts "Processing: #{input_file}"
  puts "File size: #{Wp2txt::MemoryMonitor.format_memory(File.size(input_file))}"
  puts

  # Test with adaptive buffer
  puts "Testing with adaptive buffer sizing:"
  puts "-" * 40

  processor = Wp2txt::StreamProcessor.new(input_file, adaptive_buffer: true)
  puts "Initial buffer size: #{Wp2txt::MemoryMonitor.format_memory(processor.buffer_size)}"

  # Wall-clock time; process memory sampled before/after for the delta.
  start_time = Time.now
  start_memory = Wp2txt::MemoryMonitor.current_memory_usage

  page_count = 0
  processor.each_page do |title, _text|
    page_count += 1
    # Report the (possibly adapting) buffer size every 10 pages.
    if page_count % 10 == 0
      stats = processor.stats
      puts " Processed #{page_count} pages, buffer: #{Wp2txt::MemoryMonitor.format_memory(stats[:buffer_size])}"
    end
  end

  end_time = Time.now
  end_memory = Wp2txt::MemoryMonitor.current_memory_usage

  puts
  puts "Final Statistics:"
  final_stats = processor.stats
  puts " Pages processed: #{final_stats[:pages_processed]}"
  puts " Bytes read: #{Wp2txt::MemoryMonitor.format_memory(final_stats[:bytes_read])}"
  puts " Final buffer size: #{Wp2txt::MemoryMonitor.format_memory(final_stats[:buffer_size])}"
  puts " Processing time: #{(end_time - start_time).round(3)}s"
  puts " Memory delta: #{Wp2txt::MemoryMonitor.format_memory(end_memory - start_memory)}"

  # Test without adaptive buffer
  # NOTE(review): the heading claims a 10 MB fixed buffer, but the actual
  # size is whatever StreamProcessor defaults to with
  # adaptive_buffer: false — confirm against StreamProcessor.
  puts
  puts "Testing with fixed buffer sizing (10 MB):"
  puts "-" * 40

  processor2 = Wp2txt::StreamProcessor.new(input_file, adaptive_buffer: false)
  puts "Fixed buffer size: #{Wp2txt::MemoryMonitor.format_memory(processor2.buffer_size)}"

  start_time = Time.now
  start_memory = Wp2txt::MemoryMonitor.current_memory_usage

  page_count = 0
  processor2.each_page do |_title, _text|
    page_count += 1
  end

  end_time = Time.now
  end_memory = Wp2txt::MemoryMonitor.current_memory_usage

  final_stats = processor2.stats
  puts " Pages processed: #{final_stats[:pages_processed]}"
  puts " Processing time: #{(end_time - start_time).round(3)}s"
  puts " Memory delta: #{Wp2txt::MemoryMonitor.format_memory(end_memory - start_memory)}"
end
127
+
128
# Entry point: prints the system memory report, profiles streaming over
# the given (or default) input file, then prints a final memory snapshot.
def main
  profile_memory_monitor

  input_file = ARGV[0] || "/tmp/wp2txt_test_sample.xml"
  profile_stream_processor(input_file)

  print_separator("Memory Profiling Complete")
  puts "Current memory: #{Wp2txt::MemoryMonitor.format_memory(Wp2txt::MemoryMonitor.current_memory_usage)}"
  puts "Memory low?: #{Wp2txt::MemoryMonitor.memory_low?}"
end

# FIX: guard the invocation so the script can be required without side
# effects, consistent with the other scripts in scripts/ (e.g.
# fetch_mediawiki_data.rb); previously `main` ran unconditionally.
main if __FILE__ == $PROGRAM_NAME