wp2txt 1.1.3 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. checksums.yaml +4 -4
  2. data/.dockerignore +12 -0
  3. data/.github/workflows/ci.yml +13 -13
  4. data/.gitignore +14 -0
  5. data/CHANGELOG.md +284 -0
  6. data/DEVELOPMENT.md +415 -0
  7. data/DEVELOPMENT_ja.md +415 -0
  8. data/Dockerfile +19 -10
  9. data/Gemfile +2 -8
  10. data/README.md +259 -123
  11. data/README_ja.md +375 -0
  12. data/Rakefile +4 -0
  13. data/bin/wp2txt +863 -161
  14. data/lib/wp2txt/article.rb +98 -13
  15. data/lib/wp2txt/bz2_validator.rb +239 -0
  16. data/lib/wp2txt/category_cache.rb +313 -0
  17. data/lib/wp2txt/cli.rb +319 -0
  18. data/lib/wp2txt/cli_ui.rb +428 -0
  19. data/lib/wp2txt/config.rb +158 -0
  20. data/lib/wp2txt/constants.rb +134 -0
  21. data/lib/wp2txt/data/html_entities.json +2135 -0
  22. data/lib/wp2txt/data/language_metadata.json +4769 -0
  23. data/lib/wp2txt/data/language_tiers.json +59 -0
  24. data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
  25. data/lib/wp2txt/data/template_aliases.json +193 -0
  26. data/lib/wp2txt/data/wikipedia_entities.json +12 -0
  27. data/lib/wp2txt/extractor.rb +545 -0
  28. data/lib/wp2txt/file_utils.rb +91 -0
  29. data/lib/wp2txt/formatter.rb +352 -0
  30. data/lib/wp2txt/global_data_cache.rb +353 -0
  31. data/lib/wp2txt/index_cache.rb +258 -0
  32. data/lib/wp2txt/magic_words.rb +353 -0
  33. data/lib/wp2txt/memory_monitor.rb +236 -0
  34. data/lib/wp2txt/multistream.rb +1383 -0
  35. data/lib/wp2txt/output_writer.rb +182 -0
  36. data/lib/wp2txt/parser_functions.rb +606 -0
  37. data/lib/wp2txt/ractor_worker.rb +215 -0
  38. data/lib/wp2txt/regex.rb +396 -12
  39. data/lib/wp2txt/section_extractor.rb +354 -0
  40. data/lib/wp2txt/stream_processor.rb +271 -0
  41. data/lib/wp2txt/template_expander.rb +830 -0
  42. data/lib/wp2txt/text_processing.rb +337 -0
  43. data/lib/wp2txt/utils.rb +629 -270
  44. data/lib/wp2txt/version.rb +1 -1
  45. data/lib/wp2txt.rb +53 -26
  46. data/scripts/benchmark_regex.rb +161 -0
  47. data/scripts/fetch_html_entities.rb +94 -0
  48. data/scripts/fetch_language_metadata.rb +180 -0
  49. data/scripts/fetch_mediawiki_data.rb +334 -0
  50. data/scripts/fetch_template_data.rb +186 -0
  51. data/scripts/profile_memory.rb +139 -0
  52. data/spec/article_spec.rb +402 -0
  53. data/spec/auto_download_spec.rb +314 -0
  54. data/spec/bz2_validator_spec.rb +193 -0
  55. data/spec/category_cache_spec.rb +226 -0
  56. data/spec/category_fetcher_spec.rb +504 -0
  57. data/spec/cleanup_spec.rb +197 -0
  58. data/spec/cli_options_spec.rb +678 -0
  59. data/spec/cli_spec.rb +876 -0
  60. data/spec/config_spec.rb +194 -0
  61. data/spec/constants_spec.rb +138 -0
  62. data/spec/file_utils_spec.rb +170 -0
  63. data/spec/fixtures/samples.rb +181 -0
  64. data/spec/formatter_sections_spec.rb +382 -0
  65. data/spec/global_data_cache_spec.rb +186 -0
  66. data/spec/index_cache_spec.rb +210 -0
  67. data/spec/integration_spec.rb +543 -0
  68. data/spec/magic_words_spec.rb +261 -0
  69. data/spec/markers_spec.rb +476 -0
  70. data/spec/memory_monitor_spec.rb +192 -0
  71. data/spec/multistream_spec.rb +690 -0
  72. data/spec/output_writer_spec.rb +400 -0
  73. data/spec/parser_functions_spec.rb +455 -0
  74. data/spec/ractor_worker_spec.rb +197 -0
  75. data/spec/regex_spec.rb +281 -0
  76. data/spec/section_extractor_spec.rb +397 -0
  77. data/spec/spec_helper.rb +63 -0
  78. data/spec/stream_processor_spec.rb +579 -0
  79. data/spec/template_data_spec.rb +246 -0
  80. data/spec/template_expander_spec.rb +472 -0
  81. data/spec/template_processing_spec.rb +217 -0
  82. data/spec/text_processing_spec.rb +312 -0
  83. data/spec/utils_spec.rb +195 -16
  84. data/spec/wp2txt_spec.rb +510 -0
  85. data/wp2txt.gemspec +5 -3
  86. metadata +146 -18
  87. data/.rubocop.yml +0 -80
  88. data/data/output_samples/testdata_en.txt +0 -23002
  89. data/data/output_samples/testdata_en_category.txt +0 -132
  90. data/data/output_samples/testdata_en_summary.txt +0 -1376
  91. data/data/output_samples/testdata_ja.txt +0 -22774
  92. data/data/output_samples/testdata_ja_category.txt +0 -206
  93. data/data/output_samples/testdata_ja_summary.txt +0 -1560
  94. data/data/testdata_en.bz2 +0 -0
  95. data/data/testdata_ja.bz2 +0 -0
  96. data/image/screenshot.png +0 -0
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Wp2txt
4
- VERSION = "1.1.3"
4
+ VERSION = "2.1.0"
5
5
  end
data/lib/wp2txt.rb CHANGED
@@ -3,17 +3,24 @@
3
3
  require "nokogiri"
4
4
  require_relative "wp2txt/article"
5
5
  require_relative "wp2txt/utils"
6
+ require_relative "wp2txt/stream_processor"
7
+ require_relative "wp2txt/output_writer"
6
8
 
7
9
  module Wp2txt
8
10
  class Splitter
9
11
  include Wp2txt
10
- def initialize(input_file, output_dir = ".", tfile_size = 10, bz2_gem = false)
12
+
13
+ attr_reader :size_read, :file_index
14
+
15
+ def initialize(input_file, output_dir = ".", tfile_size = 10, bz2_gem = false, &progress_callback)
11
16
  @fp = nil
12
17
  @input_file = input_file
13
18
  @output_dir = output_dir
14
19
  @tfile_size = tfile_size
15
20
  require "bzip2-ruby" if bz2_gem
16
21
  @bz2_gem = bz2_gem
22
+ @progress_callback = progress_callback
23
+ @last_progress_time = Time.now
17
24
  prepare
18
25
  end
19
26
 
@@ -26,7 +33,7 @@ module Wp2txt
26
33
  loop do
27
34
  begin
28
35
  a = file.read(unit)
29
- rescue StandardError
36
+ rescue IOError, Errno::EIO, Errno::ENOENT
30
37
  a = nil
31
38
  end
32
39
  break unless a
@@ -46,20 +53,22 @@ module Wp2txt
46
53
  # check if a given command exists: return the path if it does, return false if not
47
54
  def command_exist?(command)
48
55
  basename = File.basename(command)
49
- path = +""
50
56
  print "Checking #{basename}: "
51
57
  begin
52
- if open("| which #{command} 2>/dev/null") { |f| path = f.gets.strip }
53
- puts "detected [#{path}]"
54
- path.strip
55
- elsif open("| which #{basename} 2>/dev/null") { |f| path = f.gets.strip }
56
- puts "detected [#{path}]"
57
- path.strip
58
- else
58
+ # Use IO.popen instead of open("| ...") for Ruby 4.0 compatibility
59
+ path = IO.popen(["which", command], err: File::NULL, &:read).strip
60
+ if path.empty?
61
+ path = IO.popen(["which", basename], err: File::NULL, &:read).strip
62
+ end
63
+
64
+ if path.empty?
59
65
  puts "#{basename} not found"
60
66
  false
67
+ else
68
+ puts "detected [#{path}]"
69
+ path
61
70
  end
62
- rescue StandardError
71
+ rescue Errno::ENOENT, Errno::EPIPE, IOError
63
72
  puts "#{basename} not found"
64
73
  false
65
74
  end
@@ -75,13 +84,13 @@ module Wp2txt
75
84
  if @bz2_gem
76
85
  file = Bzip2::Reader.new File.open(@input_file, "r:UTF-8")
77
86
  elsif Gem.win_platform?
78
- file = IO.popen("bunzip2.exe -c #{@input_file}")
87
+ file = IO.popen(["bunzip2.exe", "-c", @input_file])
79
88
  elsif (bzpath = command_exist?("lbzip2") || command_exist?("pbzip2") || command_exist?("bzip2"))
80
- file = IO.popen("#{bzpath} -c -d #{@input_file}")
89
+ file = IO.popen([bzpath, "-c", "-d", @input_file])
81
90
  end
82
91
  else # meaning that it is a text file
83
92
  @infile_size = File.stat(@input_file).size
84
- file = open(@input_file)
93
+ file = File.open(@input_file, "r:UTF-8")
85
94
  end
86
95
 
87
96
  # create basename of output file
@@ -101,7 +110,7 @@ module Wp2txt
101
110
  loop do
102
111
  begin
103
112
  new_lines = @file_pointer.read(10_485_760)
104
- rescue StandardError
113
+ rescue IOError, Errno::EIO, Errno::ENOENT, Errno::EPIPE
105
114
  return nil
106
115
  end
107
116
  return nil unless new_lines
@@ -114,9 +123,10 @@ module Wp2txt
114
123
 
115
124
  new_first_line = temp_buf.shift
116
125
  @buffer.last << new_first_line
117
- @buffer << +"" if new_first_line[-1, 1] == "\n" # new_first_line.index("\n")
118
- @buffer += temp_buf unless temp_buf.empty?
119
- @buffer << +"" if @buffer.last[-1, 1] == "\n" # @buffer.last.index("\n")
126
+ # Use end_with? instead of [-1, 1] for clarity and performance
127
+ @buffer << +"" if new_first_line.end_with?("\n")
128
+ @buffer.concat(temp_buf) unless temp_buf.empty?
129
+ @buffer << +"" if @buffer.last.end_with?("\n")
120
130
  break if @buffer.size > 1
121
131
  end
122
132
  true
@@ -144,6 +154,10 @@ module Wp2txt
144
154
  @total_size += text.bytesize
145
155
  output_text << text
146
156
  end_flag = true if @total_size > (@tfile_size * 1024 * 1024)
157
+
158
+ # Report progress every 5 seconds
159
+ report_progress
160
+
147
161
  # never close the file until the end of the page even if end_flag is on
148
162
  next unless end_flag && %r{</page} =~ text
149
163
 
@@ -157,7 +171,7 @@ module Wp2txt
157
171
  @outfiles << outfilename
158
172
  @fp = File.open(outfilename, "w")
159
173
  end
160
- @fp.puts(output_text) if output_text != ""
174
+ @fp.puts(output_text) unless output_text.empty?
161
175
  @fp.close
162
176
 
163
177
  if outfilename && File.size(outfilename).zero?
@@ -167,6 +181,18 @@ module Wp2txt
167
181
 
168
182
  rename(@outfiles, "xml")
169
183
  end
184
+
185
+ private
186
+
187
# Calls the progress callback with the current @size_read and @file_index,
# but only when a callback was supplied and at least 5 seconds have passed
# since the last report. Returns the callback's result when it fires and
# nil otherwise.
def report_progress
  if @progress_callback
    current_time = Time.now
    elapsed = current_time - @last_progress_time

    if elapsed >= 5
      # Remember when we last reported so the next call is throttled again.
      @last_progress_time = current_time
      @progress_callback.call(@size_read, @file_index)
    end
  end
end
170
196
  end
171
197
 
172
198
  class Runner
@@ -183,7 +209,7 @@ module Wp2txt
183
209
 
184
210
  def prepare
185
211
  @infile_size = File.stat(@input_file).size
186
- file = open(@input_file)
212
+ file = File.open(@input_file, "r:UTF-8")
187
213
  @file_pointer = file
188
214
  @outfile_base = File.basename(@input_file, ".*")
189
215
  @total_size = 0
@@ -194,7 +220,7 @@ module Wp2txt
194
220
  loop do
195
221
  begin
196
222
  new_lines = @file_pointer.read(10_485_760)
197
- rescue StandardError
223
+ rescue IOError, Errno::EIO, Errno::ENOENT, Errno::EPIPE
198
224
  return nil
199
225
  end
200
226
  return nil unless new_lines
@@ -206,10 +232,11 @@ module Wp2txt
206
232
  temp_buf << ss.rest unless ss.eos?
207
233
 
208
234
  new_first_line = temp_buf.shift
209
- @buffer.last << new_first_line
210
- @buffer << +"" if new_first_line[-1, 1] == "\n" # new_first_line.index("\n")
211
- @buffer += temp_buf unless temp_buf.empty?
212
- @buffer << +"" if @buffer.last[-1, 1] == "\n" # @buffer.last.index("\n")
235
+ @buffer.last << new_first_line
236
+ # Use end_with? instead of [-1, 1] for clarity and performance
237
+ @buffer << +"" if new_first_line.end_with?("\n")
238
+ @buffer.concat(temp_buf) unless temp_buf.empty?
239
+ @buffer << +"" if @buffer.last.end_with?("\n")
213
240
  break if @buffer.size > 1
214
241
  end
215
242
  true
@@ -247,7 +274,7 @@ module Wp2txt
247
274
  else
248
275
  page.force_encoding("utf-8")
249
276
  end
250
- rescue StandardError
277
+ rescue ::Encoding::InvalidByteSequenceError, ::Encoding::UndefinedConversionError
251
278
  page
252
279
  end
253
280
 
@@ -0,0 +1,161 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Benchmark script for wp2txt regex performance
4
+ # Times cleanup() and format_wiki() so runs can be compared across regex changes (e.g. pre-compiled vs inline patterns)
5
+ #
6
+ # Usage: ruby scripts/benchmark_regex.rb
7
+
8
+ require "benchmark"
9
+ begin
10
+ require "benchmark/ips"
11
+ rescue LoadError
12
+ # benchmark-ips is optional
13
+ end
14
+
15
+ # Add lib to load path
16
+ $LOAD_PATH.unshift(File.expand_path("../lib", __dir__))
17
+ require "wp2txt"
18
+ require "wp2txt/article"
19
+
20
+ # Sample Wikipedia-like content for benchmarking
21
+ SAMPLE_TEXT = <<~WIKI
22
+ {{Infobox person
23
+ | name = Test Person
24
+ | birth_date = 1980-01-01
25
+ | occupation = Writer
26
+ }}
27
+ '''Test Article''' is a [[test]] article with various [[wiki markup|markup]].
28
+
29
+ == Section 1 ==
30
+ This section has some {{cite web|url=http://example.com|title=Example}} references.
31
+ There are also [[Category:Test]] links and [[File:Image.jpg|thumb|A caption]].
32
+
33
+ === Subsection ===
34
+ More content with '''bold''' and ''italic'' text.
35
+ * List item 1
36
+ * List item 2
37
+ # Numbered item
38
+
39
+ == Section 2 ==
40
+ {| class="wikitable"
41
+ |-
42
+ ! Header 1 !! Header 2
43
+ |-
44
+ | Cell 1 || Cell 2
45
+ |}
46
+
47
+ Some text with &nbsp; entities and &#x266A; characters.
48
+ Also has <ref name="test">Reference content</ref> and <nowiki>[[preserved]]</nowiki>.
49
+
50
+ {{DEFAULTSORT:Test Article}}
51
+ [[Category:Articles]]
52
+ [[Category:Tests]]
53
+ WIKI
54
+
55
+ # Create multiple copies for more realistic benchmarking
56
+ LARGE_TEXT = (SAMPLE_TEXT * 100).freeze
57
+
58
+ class BenchmarkRunner
59
+ include Wp2txt
60
+
61
+ def initialize
62
+ @nowikis = {}
63
+ end
64
+
65
+ def run_cleanup(text)
66
+ cleanup(text.dup)
67
+ end
68
+
69
+ def run_full_format(text)
70
+ format_wiki(text.dup)
71
+ end
72
+ end
73
+
74
# Prints a three-line section banner: a 60-character rule, the title, and
# the same rule again.
def benchmark_banner(title, rule_char = "-")
  rule = rule_char * 60
  puts rule
  puts title
  puts rule
end

# Returns the approximate number of objects allocated per invocation of the
# given block, averaged over +iterations+ calls.
def benchmark_allocations_per_call(iterations = 100)
  GC.start
  # GC.stat(:key) returns just the counter; GC.stat[:key] would build a full
  # stats Hash and add its own allocations to the measurement.
  before = GC.stat(:total_allocated_objects)
  iterations.times { yield }
  (GC.stat(:total_allocated_objects) - before) / iterations
end

# Runs the whole benchmark suite and prints the results to stdout:
# environment info, a warmup pass, wall-clock timings for cleanup and
# format_wiki on the small and large samples, an optional iterations-per-second
# comparison (only when benchmark-ips was loaded), and an approximate
# allocations-per-call figure for each method.
def run_benchmarks
  benchmark_banner("wp2txt Regex Performance Benchmark", "=")
  puts
  puts "Ruby version: #{RUBY_VERSION}"
  puts "Sample text size: #{SAMPLE_TEXT.bytesize} bytes"
  puts "Large text size: #{LARGE_TEXT.bytesize} bytes"
  puts

  runner = BenchmarkRunner.new

  benchmark_banner("Warmup (JIT compilation, method caching)")
  5.times { runner.run_cleanup(SAMPLE_TEXT) }
  5.times { runner.run_full_format(SAMPLE_TEXT) }
  puts "Done."
  puts

  benchmark_banner("Benchmark: cleanup() method")
  Benchmark.bm(20) do |x|
    x.report("cleanup (small):") do
      1000.times { runner.run_cleanup(SAMPLE_TEXT) }
    end

    x.report("cleanup (large):") do
      10.times { runner.run_cleanup(LARGE_TEXT) }
    end
  end

  puts
  benchmark_banner("Benchmark: format_wiki() method (full pipeline)")
  Benchmark.bm(20) do |x|
    x.report("format_wiki (small):") do
      1000.times { runner.run_full_format(SAMPLE_TEXT) }
    end

    x.report("format_wiki (large):") do
      10.times { runner.run_full_format(LARGE_TEXT) }
    end
  end

  # Benchmark::IPS only exists when the optional benchmark-ips gem loaded.
  if defined?(Benchmark::IPS)
    puts
    benchmark_banner("IPS Benchmark (iterations per second)")

    Benchmark.ips do |x|
      x.report("cleanup") { runner.run_cleanup(SAMPLE_TEXT) }
      x.report("format_wiki") { runner.run_full_format(SAMPLE_TEXT) }
      x.compare!
    end
  end

  puts
  benchmark_banner("Memory profile (approximate)")

  cleanup_allocations = benchmark_allocations_per_call { runner.run_cleanup(SAMPLE_TEXT) }
  puts "cleanup() allocations per call: ~#{cleanup_allocations}"

  format_allocations = benchmark_allocations_per_call { runner.run_full_format(SAMPLE_TEXT) }
  puts "format_wiki() allocations per call: ~#{format_allocations}"

  puts
  benchmark_banner("Benchmark complete", "=")
end
160
+
161
+ run_benchmarks
@@ -0,0 +1,94 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Fetches HTML named character references from WHATWG HTML specification
4
+ # Usage: ruby scripts/fetch_html_entities.rb
5
+ #
6
+ # This script downloads the official entities.json from WHATWG and converts
7
+ # it into a format suitable for wp2txt text processing.
8
+
9
+ require "net/http"
10
+ require "json"
11
+ require "fileutils"
12
+
13
+ WHATWG_ENTITIES_URL = "https://html.spec.whatwg.org/entities.json"
14
+
15
# Downloads the entities JSON from WHATWG_ENTITIES_URL and returns it parsed.
# Returns nil (after printing a warning to stderr) when the response is not a
# success or when any StandardError is raised while fetching or parsing.
def fetch_whatwg_entities
  puts "Fetching entities from WHATWG HTML specification..."
  response = Net::HTTP.get_response(URI(WHATWG_ENTITIES_URL))

  if response.is_a?(Net::HTTPSuccess)
    JSON.parse(response.body)
  else
    warn "Failed to fetch entities: HTTP #{response.code}"
    nil
  end
rescue StandardError => error
  warn "Error fetching entities: #{error.message}"
  nil
end
30
+
31
# Reduces the raw entity table to a flat name => characters Hash.
#
# raw_data - Hash mapping an entity name (e.g. "&alpha;") to a Hash that
#            carries the replacement text under the "characters" key.
#
# Only names ending in ";" are kept; legacy forms without the semicolon
# (e.g. "&nbsp") are skipped, as are entries whose "characters" value is
# missing or empty. Keys are stored exactly as given, including the leading
# "&" and trailing ";".
#
# Returns a new Hash in the iteration order of raw_data.
def convert_entities(raw_data)
  raw_data.each_with_object({}) do |(name, info), entities|
    next unless name.end_with?(";")

    characters = info["characters"]
    next if characters.nil? || characters.empty?

    entities[name] = characters
  end
end
52
+
53
# Script entry point: fetches the WHATWG entity table, converts it with
# convert_entities, writes the result (plus a "meta" section) as pretty JSON
# to lib/wp2txt/data/html_entities.json relative to this script, and prints
# a short summary. Exits with status 1 when the fetch fails.
def main
  # Time#iso8601 is provided by the "time" stdlib extension on Ruby versions
  # before 3.4. NOTE(review): another required library may already load it
  # transitively; requiring it here is harmless either way.
  require "time"

  raw_data = fetch_whatwg_entities
  if raw_data.nil?
    warn "Failed to fetch entities. Aborting."
    exit 1
  end

  puts "Processing #{raw_data.size} raw entries..."

  entities = convert_entities(raw_data)
  puts "Converted to #{entities.size} standard entities (with semicolon)"

  result = {
    "meta" => {
      "generated_at" => Time.now.utc.iso8601,
      "source" => WHATWG_ENTITIES_URL,
      "description" => "HTML named character references from WHATWG HTML specification",
      "total_entities" => entities.size
    },
    "entities" => entities.sort.to_h
  }

  # Write output
  output_path = File.join(__dir__, "..", "lib", "wp2txt", "data", "html_entities.json")
  FileUtils.mkdir_p(File.dirname(output_path))

  File.write(output_path, JSON.pretty_generate(result))
  puts "\nData written to: #{output_path}"

  # Summary - count a few recognisable groups. The patterns match the stored
  # key form, which keeps the leading "&" and trailing ";".
  names = entities.keys
  greek = names.grep(/&(alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega);/i)
  math = names.grep(/&(sum|prod|int|infin|nabla|part|forall|exist|empty|isin|notin|cap|cup|sub|sup|oplus|otimes);/i)
  arrows = names.grep(/arr;$/i)

  puts "\n=== Summary ==="
  puts "Total entities: #{entities.size}"
  puts "Greek letters: #{greek.size}"
  puts "Math symbols: #{math.size}"
  puts "Arrows: #{arrows.size}"
end
93
+
94
+ main if __FILE__ == $PROGRAM_NAME
@@ -0,0 +1,180 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Fetches Wikipedia language metadata from Wikimedia APIs
4
+ # Usage: ruby scripts/fetch_language_metadata.rb
5
+ #
6
+ # This script queries the Wikimedia sitematrix API to get all Wikipedia
7
+ # language editions and their statistics (article counts, etc.)
8
+
9
+ require "net/http"
10
+ require "json"
11
+ require "fileutils"
12
+
13
+ # Fetch all Wikipedia languages with statistics from sitematrix API
14
+ def fetch_wikipedia_languages
15
+ uri = URI("https://meta.wikimedia.org/w/api.php")
16
+ params = {
17
+ action: "sitematrix",
18
+ smtype: "language",
19
+ format: "json"
20
+ }
21
+ uri.query = URI.encode_www_form(params)
22
+
23
+ response = Net::HTTP.get_response(uri)
24
+ return {} unless response.is_a?(Net::HTTPSuccess)
25
+
26
+ data = JSON.parse(response.body)
27
+ languages = {}
28
+
29
+ data["sitematrix"].each do |key, val|
30
+ next unless key.match?(/^\d+$/) && val.is_a?(Hash) && val["site"]
31
+
32
+ # Find Wikipedia site info
33
+ wiki_site = val["site"].find { |site| site["code"] == "wiki" }
34
+ next unless wiki_site
35
+
36
+ lang_code = val["code"]
37
+ languages[lang_code] = {
38
+ "name" => val["name"],
39
+ "localname" => val["localname"],
40
+ "url" => wiki_site["url"],
41
+ "dbname" => wiki_site["dbname"],
42
+ "closed" => wiki_site["closed"] || false,
43
+ "private" => wiki_site["private"] || false
44
+ }
45
+ end
46
+
47
+ languages
48
+ rescue StandardError => e
49
+ warn "Error fetching sitematrix: #{e.message}"
50
+ {}
51
+ end
52
+
53
+ # Fetch article statistics for a specific Wikipedia
54
+ def fetch_wiki_statistics(lang_code)
55
+ uri = URI("https://#{lang_code}.wikipedia.org/w/api.php")
56
+ params = {
57
+ action: "query",
58
+ meta: "siteinfo",
59
+ siprop: "statistics",
60
+ format: "json"
61
+ }
62
+ uri.query = URI.encode_www_form(params)
63
+
64
+ response = Net::HTTP.get_response(uri)
65
+ return nil unless response.is_a?(Net::HTTPSuccess)
66
+
67
+ data = JSON.parse(response.body)
68
+ stats = data.dig("query", "statistics")
69
+ return nil unless stats
70
+
71
+ {
72
+ "articles" => stats["articles"],
73
+ "pages" => stats["pages"],
74
+ "edits" => stats["edits"],
75
+ "users" => stats["users"],
76
+ "activeusers" => stats["activeusers"]
77
+ }
78
+ rescue StandardError
79
+ nil
80
+ end
81
+
82
+ def main
83
+ puts "Fetching Wikipedia language list..."
84
+ languages = fetch_wikipedia_languages
85
+
86
+ if languages.empty?
87
+ warn "Failed to fetch language list. Aborting."
88
+ exit 1
89
+ end
90
+
91
+ # Filter out closed/private wikis
92
+ active_languages = languages.reject { |_, info| info["closed"] || info["private"] }
93
+ puts "Found #{active_languages.size} active Wikipedia editions."
94
+
95
+ puts "Fetching statistics for each Wikipedia (this may take a few minutes)..."
96
+ successful = 0
97
+ failed = []
98
+
99
+ active_languages.each_with_index do |(lang_code, info), idx|
100
+ print "\r Processing: #{lang_code.ljust(10)} (#{idx + 1}/#{active_languages.size})"
101
+ $stdout.flush
102
+
103
+ stats = fetch_wiki_statistics(lang_code)
104
+ if stats
105
+ info.merge!(stats)
106
+ successful += 1
107
+ else
108
+ failed << lang_code
109
+ end
110
+
111
+ sleep 0.05 # Rate limiting
112
+ end
113
+
114
+ puts "\n Successfully fetched: #{successful}/#{active_languages.size}"
115
+ puts " Failed: #{failed.size} (#{failed.first(10).join(', ')}#{failed.size > 10 ? '...' : ''})" if failed.any?
116
+
117
+ # Categorize by size
118
+ size_categories = {
119
+ "large" => [], # 1M+ articles
120
+ "medium" => [], # 100K-1M articles
121
+ "small" => [], # 10K-100K articles
122
+ "mini" => [] # <10K articles
123
+ }
124
+
125
+ active_languages.each do |lang_code, info|
126
+ articles = info["articles"] || 0
127
+ category = if articles >= 1_000_000
128
+ "large"
129
+ elsif articles >= 100_000
130
+ "medium"
131
+ elsif articles >= 10_000
132
+ "small"
133
+ else
134
+ "mini"
135
+ end
136
+ size_categories[category] << lang_code
137
+ info["size_category"] = category
138
+ end
139
+
140
+ # Build result
141
+ result = {
142
+ "meta" => {
143
+ "generated_at" => Time.now.utc.iso8601,
144
+ "source" => "Wikimedia sitematrix + siteinfo APIs",
145
+ "total_languages" => active_languages.size,
146
+ "statistics_fetched" => successful
147
+ },
148
+ "size_summary" => {
149
+ "large" => size_categories["large"].size,
150
+ "medium" => size_categories["medium"].size,
151
+ "small" => size_categories["small"].size,
152
+ "mini" => size_categories["mini"].size
153
+ },
154
+ "languages" => active_languages.sort_by { |_, info| -(info["articles"] || 0) }.to_h
155
+ }
156
+
157
+ # Write output
158
+ output_path = File.join(__dir__, "..", "lib", "wp2txt", "data", "language_metadata.json")
159
+ FileUtils.mkdir_p(File.dirname(output_path))
160
+
161
+ File.write(output_path, JSON.pretty_generate(result))
162
+ puts "\nData written to: #{output_path}"
163
+
164
+ # Summary
165
+ puts "\n=== Summary ==="
166
+ puts "Total active Wikipedias: #{active_languages.size}"
167
+ puts "Size categories:"
168
+ puts " Large (1M+ articles): #{size_categories['large'].size} - #{size_categories['large'].first(5).join(', ')}..."
169
+ puts " Medium (100K-1M): #{size_categories['medium'].size}"
170
+ puts " Small (10K-100K): #{size_categories['small'].size}"
171
+ puts " Mini (<10K): #{size_categories['mini'].size}"
172
+
173
+ # Top 20 by article count
174
+ puts "\nTop 20 Wikipedias by article count:"
175
+ active_languages.sort_by { |_, info| -(info["articles"] || 0) }.first(20).each_with_index do |(code, info), idx|
176
+ puts " #{(idx + 1).to_s.rjust(2)}. #{code.ljust(5)} #{info['name'].to_s.ljust(20)} #{(info['articles'] || 0).to_s.rjust(10)} articles"
177
+ end
178
+ end
179
+
180
+ main if __FILE__ == $PROGRAM_NAME